Ruby 4.1.0dev (2026-03-05 revision 8a87cebd1874f8f9f68af8928191ee3f0d97bb28)
string.c (8a87cebd1874f8f9f68af8928191ee3f0d97bb28)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
131#define RUBY_MAX_CHAR_LEN 16
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* Flag `str` as non-embedded (STR_NOEMBED) and clear the STR_SHARED,
 * STR_SHARED_ROOT and STR_BORROWED flags.  `str` is expanded twice. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* Flag `str` as embedded by clearing STR_NOEMBED; STR_SHARED and STR_NOFREE
 * are cleared in the same operation. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` into the length field of `str`.  Nothing else is touched: the
 * buffer, the terminator and the flags are left as they are. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
149#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/*
 * Write the string terminator at `ptr`.  One NUL byte is always stored;
 * when `termlen` is greater than one, `termlen` bytes starting at `ptr`
 * are zeroed.  Each argument is evaluated exactly once, `ptr` first.
 */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_dst = (ptr);\
    const int term_fill_width = (termlen);\
    term_fill_dst[0] = '\0';\
    if (UNLIKELY(term_fill_width > 1)) {\
        memset(term_fill_dst, 0, term_fill_width);\
    }\
} while (0)
157
/*
 * Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * of its current encoding (TERM_LEN).
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/*
 * Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embed area is large enough.
 *   Otherwise a heap buffer of capacity + termlen bytes is allocated, the
 *   whole embed area is copied into it, and the string is switched to the
 *   non-embedded representation with the length preserved.
 * - Heap string: the buffer is reallocated in place; the string must not
 *   be shared (asserted).
 *
 * `capacity` and `termlen` are parenthesized at every use so that
 * expression arguments (e.g. `a ? b : c`) keep their meaning.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
181
/*
 * Make `str` refer to `shared_str` as its shared root.  Does nothing when
 * `str` is a fake string (STR_FAKESTR).
 *
 * The two assertions require the buffer pointer of `str` to lie within
 * [RSTRING_PTR(shared_str), RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)].
 * The root is stored in as.heap.aux.shared through RB_OBJ_WRITE, `str` gets
 * STR_SHARED and `shared_str` gets STR_SHARED_ROOT.  A root whose class is 0
 * is additionally flagged STR_BORROWED.
 */
182#define STR_SET_SHARED(str, shared_str) do { \
183 if (!FL_TEST(str, STR_FAKESTR)) { \
184 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
185 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
186 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
187 FL_SET((str), STR_SHARED); \
188 rb_gc_register_pinning_obj(str); \
189 FL_SET((shared_str), STR_SHARED_ROOT); \
190 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
191 FL_SET_RAW((shared_str), STR_BORROWED); \
192 } \
193} while (0)
194
195#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
196#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
197/* TODO: include the terminator size in capa. */
198
199#define STR_ENC_GET(str) get_encoding(str)
200
201#if !defined SHARABLE_MIDDLE_SUBSTRING
202# define SHARABLE_MIDDLE_SUBSTRING 0
203#endif
204#if !SHARABLE_MIDDLE_SUBSTRING
205#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
206#else
207#define SHARABLE_SUBSTRING_P(beg, len, end) 1
208#endif
209
210
211static inline long
212str_embed_capa(VALUE str)
213{
214 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
215}
216
217bool
218rb_str_reembeddable_p(VALUE str)
219{
220 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
221}
222
223static inline size_t
224rb_str_embed_size(long capa, long termlen)
225{
226 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
227 if (size < sizeof(struct RString)) size = sizeof(struct RString);
228 return size;
229}
230
231size_t
232rb_str_size_as_embedded(VALUE str)
233{
234 size_t real_size;
235 if (STR_EMBED_P(str)) {
236 size_t capa = RSTRING(str)->len;
237 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
238
239 real_size = rb_str_embed_size(capa, TERM_LEN(str));
240 }
241 /* if the string is not currently embedded, but it can be embedded, how
242 * much space would it require */
243 else if (rb_str_reembeddable_p(str)) {
244 size_t capa = RSTRING(str)->as.heap.aux.capa;
245 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
246
247 real_size = rb_str_embed_size(capa, TERM_LEN(str));
248 }
249 else {
250 real_size = sizeof(struct RString);
251 }
252
253 return real_size;
254}
255
/* Whether the GC can allocate an object large enough to embed `len`
 * content bytes plus `termlen` terminator bytes. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(object_size);
}
261
262static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
263static VALUE str_new_frozen(VALUE klass, VALUE orig);
264static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
265static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
266static VALUE str_new(VALUE klass, const char *ptr, long len);
267static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
268static inline void str_modifiable(VALUE str);
269static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
270static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
271
272static inline void
273str_make_independent(VALUE str)
274{
275 long len = RSTRING_LEN(str);
276 int termlen = TERM_LEN(str);
277 str_make_independent_expand((str), len, 0L, termlen);
278}
279
280static inline int str_dependent_p(VALUE str);
281
282void
283rb_str_make_independent(VALUE str)
284{
285 if (str_dependent_p(str)) {
286 str_make_independent(str);
287 }
288}
289
290void
291rb_str_make_embedded(VALUE str)
292{
293 RUBY_ASSERT(rb_str_reembeddable_p(str));
294 RUBY_ASSERT(!STR_EMBED_P(str));
295
296 int termlen = TERM_LEN(str);
297 char *buf = RSTRING(str)->as.heap.ptr;
298 long old_capa = RSTRING(str)->as.heap.aux.capa + termlen;
299 long len = RSTRING(str)->len;
300
301 STR_SET_EMBED(str);
302 STR_SET_LEN(str, len);
303
304 if (len > 0) {
305 memcpy(RSTRING_PTR(str), buf, len);
306 SIZED_FREE_N(buf, old_capa);
307 }
308
309 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
310}
311
/* Print a diagnostic to stderr saying that `func` is about to return a
 * NULL string pointer.  Only reports; it does not abort or raise. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
321
322/* symbols for [up|down|swap]case/capitalize options */
323static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
324
325static rb_encoding *
326get_encoding(VALUE str)
327{
328 return rb_enc_from_index(ENCODING_GET(str));
329}
330
331static void
332mustnot_broken(VALUE str)
333{
334 if (is_broken_string(str)) {
335 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
336 }
337}
338
339static void
340mustnot_wchar(VALUE str)
341{
342 rb_encoding *enc = STR_ENC_GET(str);
343 if (rb_enc_mbminlen(enc) > 1) {
344 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
345 }
346}
347
348static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
349
350#if SIZEOF_LONG == SIZEOF_VOIDP
351#define PRECOMPUTED_FAKESTR_HASH 1
352#else
353#endif
354
355static inline bool
356BARE_STRING_P(VALUE str)
357{
358 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
359}
360
361static inline st_index_t
362str_do_hash(VALUE str)
363{
364 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
365 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
366 if (e && !is_ascii_string(str)) {
367 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
368 }
369 return h;
370}
371
372static VALUE
373str_store_precomputed_hash(VALUE str, st_index_t hash)
374{
375 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
376 RUBY_ASSERT(STR_EMBED_P(str));
377
378#if RUBY_DEBUG
379 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
380 size_t free_bytes = str_embed_capa(str) - used_bytes;
381 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
382#endif
383
384 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
385
386 FL_SET(str, STR_PRECOMPUTED_HASH);
387
388 return str;
389}
390
/*
 * Return the fstring (table-registered string) for `str`.
 *
 * `str` must be a T_STRING (Check_Type).  A string already flagged
 * RSTRING_FSTR is returned unchanged.
 *
 * Non-bare strings (see BARE_STRING_P) are never replaced by the table
 * entry; the caller gets `str` itself back, frozen:
 *   - embedded ones are only frozen;
 *   - shared roots that are not themselves shared are returned as they are;
 *   - the rest are registered and then re-pointed at the registered string
 *     through str_replace_shared_without_enc().
 * Bare strings yield the value returned by register_fstring().
 *
 * NOTE(review): one line of this function is absent from this listing
 * (inside the STR_SHARED_ROOT branch).
 */
391VALUE
392rb_fstring(VALUE str)
393{
394 VALUE fstr;
395 int bare;
396
397 Check_Type(str, T_STRING);
398
399 if (FL_TEST(str, RSTRING_FSTR))
400 return str;
401
402 bare = BARE_STRING_P(str);
403 if (!bare) {
404 if (STR_EMBED_P(str)) {
405 OBJ_FREEZE(str);
406 return str;
407 }
408
409 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
411 return str;
412 }
413 }
414
/* Resize to the current length unless frozen, NOFREE or chilled;
 * presumably this trims unused capacity before registration -- confirm. */
415 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
416 rb_str_resize(str, RSTRING_LEN(str));
417
418 fstr = register_fstring(str, false, false);
419
420 if (!bare) {
421 str_replace_shared_without_enc(str, fstr);
422 OBJ_FREEZE(str);
423 return str;
424 }
425 return fstr;
426}
427
428static VALUE fstring_table_obj;
429
430static VALUE
431fstring_concurrent_set_hash(VALUE str)
432{
433#ifdef PRECOMPUTED_FAKESTR_HASH
434 st_index_t h;
435 if (FL_TEST_RAW(str, STR_FAKESTR)) {
436 // register_fstring precomputes the hash and stores it in capa for fake strings
437 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
438 }
439 else {
440 h = rb_str_hash(str);
441 }
442 // rb_str_hash doesn't include the encoding for ascii only strings, so
443 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
444 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
445#else
446 return (VALUE)rb_str_hash(str);
447#endif
448}
449
/*
 * Comparison callback of the fstring table.  Two strings match when they
 * have the same byte length, the same encoding index and identical bytes.
 *
 * NOTE(review): two lines before the RSTRING_GETMEM calls are absent from
 * this listing.
 */
451static bool
452fstring_concurrent_set_cmp(VALUE a, VALUE b)
453{
454 long alen, blen;
455 const char *aptr, *bptr;
456
458
459 RSTRING_GETMEM(a, aptr, alen);
460 RSTRING_GETMEM(b, bptr, blen);
/* The length check comes first, so memcmp only runs on equal lengths. */
461 return (alen == blen &&
462 ENCODING_GET(a) == ENCODING_GET(b) &&
463 memcmp(aptr, bptr, alen) == 0);
464}
465
467 bool copy;
468 bool force_precompute_hash;
469};
470
/*
 * Creation callback of the fstring table: build the object that will be
 * stored for the lookup key `str`.  `data` points to a
 * struct fstr_create_arg.
 *
 * Fake strings (STR_FAKESTR) are turned into real String objects:
 *   - arg->copy set: the bytes are copied.  When arg->force_precompute_hash
 *     is set and contents + hash are embeddable, an embedded string with a
 *     precomputed hash is built; otherwise a plain copy is made and, with
 *     PRECOMPUTED_FAKESTR_HASH, the hash cached in the fake string's capa
 *     field is stored if the copy has room for it.
 *   - arg->copy unset: a string referring to the same buffer is created
 *     with str_new_static().
 *   The result is frozen.
 * Real strings are replaced by a frozen copy when not frozen (or chilled),
 * made independent when shared, and replaced again when not bare.
 *
 * Finally the coderange read on entry is set on the result, RSTRING_FSTR
 * is set, and the object is marked shareable if it was not already.
 *
 * NOTE(review): a few lines of this function are absent from this listing.
 */
471static VALUE
472fstring_concurrent_set_create(VALUE str, void *data)
473{
474 struct fstr_create_arg *arg = data;
475
476 // Unless the string is empty or binary, its coderange has been precomputed.
477 int coderange = ENC_CODERANGE(str);
478
479 if (FL_TEST_RAW(str, STR_FAKESTR)) {
480 if (arg->copy) {
481 VALUE new_str;
482 long len = RSTRING_LEN(str);
/* capa reserves room for the hash stored after the terminator. */
483 long capa = len + sizeof(st_index_t);
484 int term_len = TERM_LEN(str);
485
486 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
487 new_str = str_alloc_embed(rb_cString, capa + term_len);
488 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
489 STR_SET_LEN(new_str, RSTRING_LEN(str));
490 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
491 rb_enc_copy(new_str, str);
492 str_store_precomputed_hash(new_str, str_do_hash(str));
493 }
494 else {
495 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
496 rb_enc_copy(new_str, str);
497#ifdef PRECOMPUTED_FAKESTR_HASH
498 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
499 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
500 }
501#endif
502 }
503 str = new_str;
504 }
505 else {
506 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
507 RSTRING(str)->len,
508 ENCODING_GET(str));
509 }
510 OBJ_FREEZE(str);
511 }
512 else {
513 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
514 str = str_new_frozen(rb_cString, str);
515 }
516 if (STR_SHARED_P(str)) { /* str should not be shared */
517 /* shared substring */
518 str_make_independent(str);
520 }
521 if (!BARE_STRING_P(str)) {
522 str = str_new_frozen(rb_cString, str);
523 }
524 }
525
526 ENC_CODERANGE_SET(str, coderange);
527 RBASIC(str)->flags |= RSTRING_FSTR;
528 if (!RB_OBJ_SHAREABLE_P(str)) {
529 RB_OBJ_SET_SHAREABLE(str);
530 }
531 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
534 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
535 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
537 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
538
539 return str;
540}
541
542static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
543 .hash = fstring_concurrent_set_hash,
544 .cmp = fstring_concurrent_set_cmp,
545 .create = fstring_concurrent_set_create,
546 .free = NULL,
547};
548
549void
550Init_fstring_table(void)
551{
552 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
553 rb_gc_register_address(&fstring_table_obj);
554}
555
/*
 * Look `str` up in the fstring table, inserting a new entry when there is
 * none, and return the table's string.
 *
 * `copy` and `force_precompute_hash` are forwarded to the creation
 * callback through a struct fstr_create_arg; in the code shown here they
 * only influence how a fake string is materialised.
 *
 * NOTE(review): a few lines near the trailing assertions are absent from
 * this listing.
 */
556static VALUE
557register_fstring(VALUE str, bool copy, bool force_precompute_hash)
558{
559 struct fstr_create_arg args = {
560 .copy = copy,
561 .force_precompute_hash = force_precompute_hash
562 };
563
/* Same condition as the one that defines PRECOMPUTED_FAKESTR_HASH: the
 * hash is cached in the `long` capa field of the fake string. */
564#if SIZEOF_VOIDP == SIZEOF_LONG
565 if (FL_TEST_RAW(str, STR_FAKESTR)) {
566 // if the string hasn't been interned, we'll need the hash twice, so we
567 // compute it once and store it in capa
568 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
569 }
570#endif
571
572 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
573
574 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
576 RUBY_ASSERT(OBJ_FROZEN(result));
578 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
579 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
581
582 return result;
583}
584
585bool
586rb_obj_is_fstring_table(VALUE obj)
587{
588 ASSERT_vm_locking();
589
590 return obj == fstring_table_obj;
591}
592
/*
 * Remove the fstring `obj` from the fstring table (matched by identity)
 * and clear its RSTRING_FSTR flag.
 *
 * Asserts that the VM lock is held with a barrier, that `obj` is flagged
 * RSTRING_FSTR and that it is not a shared string.
 *
 * NOTE(review): one line between the two RUBY_ASSERTs is absent from this
 * listing.
 */
593void
594rb_gc_free_fstring(VALUE obj)
595{
596 ASSERT_vm_locking_with_barrier();
597
598 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
600 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
601
602 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
603
604 RB_DEBUG_COUNTER_INC(obj_str_fstr);
605
606 FL_UNSET(obj, RSTRING_FSTR);
607}
608
609void
610rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
611{
612 if (fstring_table_obj) {
613 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
614 }
615}
616
/*
 * Initialise the caller-provided `fake_str` as a fake String whose buffer
 * is `name` (`len` bytes) with encoding index `encidx`, and return it as a
 * VALUE.
 *
 * The object is flagged T_STRING, non-embedded, NOFREE and FAKESTR, gets
 * the root shape and class String.  The bytes are not copied: the heap
 * pointer refers directly to `name`.  A NULL `name` is replaced by "".
 *
 * NOTE(review): one line inside the `!name` branch is absent from this
 * listing.
 */
617static VALUE
618setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
619{
620 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
621 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
622
623 if (!name) {
625 name = "";
626 }
627
628 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
629
630 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
631 fake_str->len = len;
632 fake_str->as.heap.ptr = (char *)name;
633 fake_str->as.heap.aux.capa = len;
634 return (VALUE)fake_str;
635}
636
637/*
638 * set up a fake string which refers a static string literal.
639 */
640VALUE
641rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
642{
643 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
644}
645
646/*
647 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
648 * shared string which refers a static string literal. `ptr` must
649 * point a constant string.
650 */
651VALUE
652rb_fstring_new(const char *ptr, long len)
653{
654 struct RString fake_str = {RBASIC_INIT};
655 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
656}
657
658VALUE
659rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
660{
661 struct RString fake_str = {RBASIC_INIT};
662 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
663}
664
665VALUE
666rb_fstring_cstr(const char *ptr)
667{
668 return rb_fstring_new(ptr, strlen(ptr));
669}
670
/*
 * Whether `str` can be processed one byte per character.
 *
 * True for ASCII-8BIT and US-ASCII strings, for strings whose recorded
 * coderange is 7bit, and for encodings whose maximum character length is
 * one byte.  The answer is conservative: false does not prove that the
 * string contains multibyte characters.
 *
 * NOTE(review): the statement of the UTF-8 case is absent from this
 * listing; per the remaining comment it concerns scanning the coderange
 * when it is unknown -- confirm against the full source.
 */
671static inline bool
672single_byte_optimizable(VALUE str)
673{
674 int encindex = ENCODING_GET(str);
675 switch (encindex) {
676 case ENCINDEX_ASCII_8BIT:
677 case ENCINDEX_US_ASCII:
678 return true;
679 case ENCINDEX_UTF_8:
680 // For UTF-8 it's worth scanning the string coderange when unknown.
682 }
683 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
684 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
685 return true;
686 }
687
688 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
689 return true;
690 }
691
692 /* Conservative. Possibly single byte.
693 * "\xa1" in Shift_JIS for example. */
694 return false;
695}
696
698
/*
 * Return a pointer to the first byte in [p, e) that has its high bit
 * (0x80) set, or NULL when every byte is 7-bit.
 *
 * The range is scanned one machine word at a time, testing each word
 * against NONASCII_MASK (0x80 repeated in every byte).  When unaligned
 * word access is not available, the bytes before the first word boundary
 * are checked one by one first.  The bytes left over after the last full
 * word (fewer than a word) are checked by the final switch.
 */
699static inline const char *
700search_nonascii(const char *p, const char *e)
701{
702 const char *s, *t;
703
704#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
705# if SIZEOF_UINTPTR_T == 8
706# define NONASCII_MASK UINT64_C(0x8080808080808080)
707# elif SIZEOF_UINTPTR_T == 4
708# define NONASCII_MASK UINT32_C(0x80808080)
709# else
710# error "don't know what to do."
711# endif
712#else
713# if SIZEOF_UINTPTR_T == 8
714# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
715# elif SIZEOF_UINTPTR_T == 4
716# define NONASCII_MASK 0x80808080UL /* or...? */
717# else
718# error "don't know what to do."
719# endif
720#endif
721
/* Word-wise scan; without unaligned access it is only entered when at
 * least one pointer-sized run of bytes is available. */
722 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
723#if !UNALIGNED_WORD_ACCESS
724 if ((uintptr_t)p % SIZEOF_VOIDP) {
/* l = number of bytes up to the next word boundary.  p is advanced
 * first, then the skipped bytes are tested through negative offsets;
 * the cases fall through on purpose. */
725 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
726 p += l;
727 switch (l) {
728 default: UNREACHABLE;
729#if SIZEOF_VOIDP > 4
730 case 7: if (p[-7]&0x80) return p-7;
731 case 6: if (p[-6]&0x80) return p-6;
732 case 5: if (p[-5]&0x80) return p-5;
733 case 4: if (p[-4]&0x80) return p-4;
734#endif
735 case 3: if (p[-3]&0x80) return p-3;
736 case 2: if (p[-2]&0x80) return p-2;
737 case 1: if (p[-1]&0x80) return p-1;
738 case 0: break;
739 }
740 }
741#endif
742#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
743#define aligned_ptr(value) \
744 __builtin_assume_aligned((value), sizeof(uintptr_t))
745#else
746#define aligned_ptr(value) (value)
747#endif
748 s = aligned_ptr(p);
/* t is the limit for starting a full word read that stays before e. */
749 t = (e - (SIZEOF_VOIDP-1));
750#undef aligned_ptr
751 for (;s < t; s += sizeof(uintptr_t)) {
752 uintptr_t word;
/* memcpy rather than a pointer cast to load the word. */
753 memcpy(&word, s, sizeof(word));
754 if (word & NONASCII_MASK) {
/* Locate the lowest-addressed flagged byte: count leading zero bits on
 * big-endian, trailing zero bits otherwise, and convert bits to bytes. */
755#ifdef WORDS_BIGENDIAN
756 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
757#else
758 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
759#endif
760 }
761 }
762 p = (const char *)s;
763 }
764
/* Remaining tail, addressed backwards from e; the cases fall through on
 * purpose. */
765 switch (e - p) {
766 default: UNREACHABLE;
767#if SIZEOF_VOIDP > 4
768 case 7: if (e[-7]&0x80) return e-7;
769 case 6: if (e[-6]&0x80) return e-6;
770 case 5: if (e[-5]&0x80) return e-5;
771 case 4: if (e[-4]&0x80) return e-4;
772#endif
773 case 3: if (e[-3]&0x80) return e-3;
774 case 2: if (e[-2]&0x80) return e-2;
775 case 1: if (e[-1]&0x80) return e-1;
776 case 0: return NULL;
777 }
778}
779
/*
 * Scan the `len` bytes at `p` under encoding `enc` and return an
 * ENC_CODERANGE_* value.
 *
 * As far as this listing shows: for ASCII-compatible encodings the result
 * is ENC_CODERANGE_7BIT when no byte has its high bit set; otherwise
 * characters are measured with rb_enc_precise_mbclen(), skipping ASCII
 * runs with search_nonascii(), and ENC_CODERANGE_VALID is returned at the
 * end.  Other encodings are walked character by character.
 *
 * NOTE(review): three lines are absent from this listing, one after each
 * of the first search_nonascii() and the two rb_enc_precise_mbclen()
 * calls; presumably they return early for ASCII-8BIT and for invalid
 * byte sequences -- confirm against the full source.
 */
780static int
781coderange_scan(const char *p, long len, rb_encoding *enc)
782{
783 const char *e = p + len;
784
785 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
786 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
787 p = search_nonascii(p, e);
789 }
790
791 if (rb_enc_asciicompat(enc)) {
792 p = search_nonascii(p, e);
793 if (!p) return ENC_CODERANGE_7BIT;
794 for (;;) {
795 int ret = rb_enc_precise_mbclen(p, e, enc);
797 p += MBCLEN_CHARFOUND_LEN(ret);
798 if (p == e) break;
799 p = search_nonascii(p, e);
800 if (!p) break;
801 }
802 }
803 else {
804 while (p < e) {
805 int ret = rb_enc_precise_mbclen(p, e, enc);
807 p += MBCLEN_CHARFOUND_LEN(ret);
808 }
809 }
810 return ENC_CODERANGE_VALID;
811}
812
/*
 * Scan [s, e) under encoding `enc`, refining the coderange in `*cr`, and
 * return how many bytes were consumed.
 *
 * `*cr` is both input and output.  If it is already ENC_CODERANGE_BROKEN
 * the whole range is reported as consumed without scanning.  For
 * ASCII-compatible encodings with no high-bit byte, `*cr` becomes 7BIT
 * unless it was already VALID.  When rb_enc_precise_mbclen() does not find
 * a complete character, the offset of that character (p - s) is returned;
 * otherwise e - s is returned.
 *
 * NOTE(review): several lines that presumably assign `*cr` are absent
 * from this listing (after the ASCII-8BIT search, inside both
 * `!MBCLEN_CHARFOUND_P` branches, and before the final return).
 */
813long
814rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
815{
816 const char *p = s;
817
818 if (*cr == ENC_CODERANGE_BROKEN)
819 return e - s;
820
821 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
822 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
823 if (*cr == ENC_CODERANGE_VALID) return e - s;
824 p = search_nonascii(p, e);
826 return e - s;
827 }
828 else if (rb_enc_asciicompat(enc)) {
829 p = search_nonascii(p, e);
830 if (!p) {
831 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
832 return e - s;
833 }
834 for (;;) {
835 int ret = rb_enc_precise_mbclen(p, e, enc);
836 if (!MBCLEN_CHARFOUND_P(ret)) {
838 return p - s;
839 }
840 p += MBCLEN_CHARFOUND_LEN(ret);
841 if (p == e) break;
842 p = search_nonascii(p, e);
843 if (!p) break;
844 }
845 }
846 else {
847 while (p < e) {
848 int ret = rb_enc_precise_mbclen(p, e, enc);
849 if (!MBCLEN_CHARFOUND_P(ret)) {
851 return p - s;
852 }
853 p += MBCLEN_CHARFOUND_LEN(ret);
854 }
855 }
857 return e - s;
858}
859
860static inline void
861str_enc_copy(VALUE str1, VALUE str2)
862{
863 rb_enc_set_index(str1, ENCODING_GET(str2));
864}
865
866/* Like str_enc_copy, but does not check frozen status of str1.
867 * You should use this only if you're certain that str1 is not frozen. */
868static inline void
869str_enc_copy_direct(VALUE str1, VALUE str2)
870{
871 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
872 if (inlined_encoding == ENCODING_INLINE_MAX) {
873 rb_enc_set_index(str1, rb_enc_get_index(str2));
874 }
875 else {
876 ENCODING_SET_INLINED(str1, inlined_encoding);
877 }
878}
879
880static void
881rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
882{
883 /* this function is designed for copying encoding and coderange
884 * from src to new string "dest" which is made from the part of src.
885 */
886 str_enc_copy(dest, src);
887 if (RSTRING_LEN(dest) == 0) {
888 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
890 else
892 return;
893 }
894 switch (ENC_CODERANGE(src)) {
897 break;
899 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
900 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
902 else
904 break;
905 default:
906 break;
907 }
908}
909
/*
 * Copy the encoding of `src` onto `dest` with str_enc_copy().
 *
 * NOTE(review): one statement after the encoding copy is absent from this
 * listing; the function name suggests the coderange is copied as well --
 * confirm against the full source.
 */
910static void
911rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
912{
913 str_enc_copy(dest, src);
915}
916
917static int
918enc_coderange_scan(VALUE str, rb_encoding *enc)
919{
920 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
921}
922
923int
924rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
925{
926 return enc_coderange_scan(str, enc);
927}
928
929int
931{
932 int cr = ENC_CODERANGE(str);
933
934 if (cr == ENC_CODERANGE_UNKNOWN) {
935 cr = enc_coderange_scan(str, get_encoding(str));
936 ENC_CODERANGE_SET(str, cr);
937 }
938 return cr;
939}
940
941static inline bool
942rb_enc_str_asciicompat(VALUE str)
943{
944 int encindex = ENCODING_GET_INLINED(str);
945 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
946}
947
948int
950{
951 switch(ENC_CODERANGE(str)) {
953 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
955 return true;
956 default:
957 return false;
958 }
959}
960
961static inline void
962str_mod_check(VALUE s, const char *p, long len)
963{
964 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
965 rb_raise(rb_eRuntimeError, "string modified");
966 }
967}
968
969static size_t
970str_capacity(VALUE str, const int termlen)
971{
972 if (STR_EMBED_P(str)) {
973 return str_embed_capa(str) - termlen;
974 }
975 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
976 return RSTRING(str)->len;
977 }
978 else {
979 return RSTRING(str)->as.heap.aux.capa;
980 }
981}
982
983size_t
985{
986 return str_capacity(str, TERM_LEN(str));
987}
988
989static inline void
990must_not_null(const char *ptr)
991{
992 if (!ptr) {
993 rb_raise(rb_eArgError, "NULL pointer given");
994 }
995}
996
/*
 * Allocate an empty embedded String of class `klass` with room for `capa`
 * bytes.  `capa` is passed to rb_str_embed_size() with a terminator length
 * of 0, so callers are expected to include the terminator in `capa`.
 * The new string has length 0 and a NUL first byte.
 *
 * NOTE(review): the NEWOBJ_OF invocation is truncated in this listing (its
 * remaining arguments are missing).
 */
997static inline VALUE
998str_alloc_embed(VALUE klass, size_t capa)
999{
1000 size_t size = rb_str_embed_size(capa, 0);
1001 RUBY_ASSERT(size > 0);
1002 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1003
1004 NEWOBJ_OF(str, struct RString, klass,
1006
1007 str->len = 0;
1008 str->as.embed.ary[0] = 0;
1009
1010 return (VALUE)str;
1011}
1012
1013static inline VALUE
1014str_alloc_heap(VALUE klass)
1015{
1016 NEWOBJ_OF(str, struct RString, klass,
1017 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1018
1019 str->len = 0;
1020 str->as.heap.aux.capa = 0;
1021 str->as.heap.ptr = NULL;
1022
1023 return (VALUE)str;
1024}
1025
/*
 * Allocate an empty embedded String of class `klass` and zero its whole
 * embed area.  The DTrace string-creation hook fires with length 0.
 *
 * NOTE(review): one line before the return is absent from this listing.
 */
1026static inline VALUE
1027empty_str_alloc(VALUE klass)
1028{
1029 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1030 VALUE str = str_alloc_embed(klass, 0);
1031 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1033 return str;
1034}
1035
1036static VALUE
1037str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1038{
1039 VALUE str;
1040
1041 if (len < 0) {
1042 rb_raise(rb_eArgError, "negative string size (or size too big)");
1043 }
1044
1045 if (enc == NULL) {
1046 enc = rb_ascii8bit_encoding();
1047 }
1048
1049 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1050
1051 int termlen = rb_enc_mbminlen(enc);
1052
1053 if (STR_EMBEDDABLE_P(len, termlen)) {
1054 str = str_alloc_embed(klass, len + termlen);
1055 if (len == 0) {
1056 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1057 }
1058 }
1059 else {
1060 str = str_alloc_heap(klass);
1061 RSTRING(str)->as.heap.aux.capa = len;
1062 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1063 * integer overflow. If we can STATIC_ASSERT that, the following
1064 * mul_add_mul can be reverted to a simple ALLOC_N. */
1065 RSTRING(str)->as.heap.ptr =
1066 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1067 }
1068
1069 rb_enc_raw_set(str, enc);
1070
1071 if (ptr) {
1072 memcpy(RSTRING_PTR(str), ptr, len);
1073 }
1074 else {
1075 memset(RSTRING_PTR(str), 0, len);
1076 }
1077
1078 STR_SET_LEN(str, len);
1079 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1080 return str;
1081}
1082
1083static VALUE
1084str_new(VALUE klass, const char *ptr, long len)
1085{
1086 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1087}
1088
1089VALUE
1090rb_str_new(const char *ptr, long len)
1091{
1092 return str_new(rb_cString, ptr, len);
1093}
1094
1095VALUE
1096rb_usascii_str_new(const char *ptr, long len)
1097{
1098 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1099}
1100
1101VALUE
1102rb_utf8_str_new(const char *ptr, long len)
1103{
1104 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1105}
1106
1107VALUE
1108rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1109{
1110 return str_enc_new(rb_cString, ptr, len, enc);
1111}
1112
1113VALUE
1115{
1116 must_not_null(ptr);
1117 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1118 * memory regions, and that cannot be detected by the MSAN. Just
1119 * trust the programmer that the argument passed here is a sane C
1120 * string. */
1121 __msan_unpoison_string(ptr);
1122 return rb_str_new(ptr, strlen(ptr));
1123}
1124
1125VALUE
1127{
1128 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1129}
1130
1131VALUE
1133{
1134 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1135}
1136
1137VALUE
1139{
1140 must_not_null(ptr);
1141 if (rb_enc_mbminlen(enc) != 1) {
1142 rb_raise(rb_eArgError, "wchar encoding given");
1143 }
1144 return rb_enc_str_new(ptr, strlen(ptr), enc);
1145}
1146
1147static VALUE
1148str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1149{
1150 VALUE str;
1151
1152 if (len < 0) {
1153 rb_raise(rb_eArgError, "negative string size (or size too big)");
1154 }
1155
1156 if (!ptr) {
1157 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1158 }
1159 else {
1160 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1161 str = str_alloc_heap(klass);
1162 RSTRING(str)->len = len;
1163 RSTRING(str)->as.heap.ptr = (char *)ptr;
1164 RSTRING(str)->as.heap.aux.capa = len;
1165 RBASIC(str)->flags |= STR_NOFREE;
1166 rb_enc_associate_index(str, encindex);
1167 }
1168 return str;
1169}
1170
1171VALUE
1172rb_str_new_static(const char *ptr, long len)
1173{
1174 return str_new_static(rb_cString, ptr, len, 0);
1175}
1176
1177VALUE
1179{
1180 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1181}
1182
1183VALUE
1185{
1186 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1187}
1188
1189VALUE
1191{
1192 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1193}
1194
1195static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1196 rb_encoding *from, rb_encoding *to,
1197 int ecflags, VALUE ecopts);
1198
1199static inline bool
1200is_enc_ascii_string(VALUE str, rb_encoding *enc)
1201{
1202 int encidx = rb_enc_to_index(enc);
1203 if (rb_enc_get_index(str) == encidx)
1204 return is_ascii_string(str);
1205 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1206}
1207
1208VALUE
1209rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1210{
1211 long len;
1212 const char *ptr;
1213 VALUE newstr;
1214
1215 if (!to) return str;
1216 if (!from) from = rb_enc_get(str);
1217 if (from == to) return str;
1218 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1219 rb_is_ascii8bit_enc(to)) {
1220 if (STR_ENC_GET(str) != to) {
1221 str = rb_str_dup(str);
1222 rb_enc_associate(str, to);
1223 }
1224 return str;
1225 }
1226
1227 RSTRING_GETMEM(str, ptr, len);
1228 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1229 from, to, ecflags, ecopts);
1230 if (NIL_P(newstr)) {
1231 /* some error, return original */
1232 return str;
1233 }
1234 return newstr;
1235}
1236
1237VALUE
1238rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1239 rb_encoding *from, int ecflags, VALUE ecopts)
1240{
1241 long olen;
1242
1243 olen = RSTRING_LEN(newstr);
1244 if (ofs < -olen || olen < ofs)
1245 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1246 if (ofs < 0) ofs += olen;
1247 if (!from) {
1248 STR_SET_LEN(newstr, ofs);
1249 return rb_str_cat(newstr, ptr, len);
1250 }
1251
1252 rb_str_modify(newstr);
1253 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1254 rb_enc_get(newstr),
1255 ecflags, ecopts);
1256}
1257
1258VALUE
1259rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1260{
1261 STR_SET_LEN(str, 0);
1262 rb_enc_associate(str, enc);
1263 rb_str_cat(str, ptr, len);
1264 return str;
1265}
1266
1267static VALUE
1268str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1269 rb_encoding *from, rb_encoding *to,
1270 int ecflags, VALUE ecopts)
1271{
1272 rb_econv_t *ec;
1274 long olen;
1275 VALUE econv_wrapper;
1276 const unsigned char *start, *sp;
1277 unsigned char *dest, *dp;
1278 size_t converted_output = (size_t)ofs;
1279
1280 olen = rb_str_capacity(newstr);
1281
1282 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1283 RBASIC_CLEAR_CLASS(econv_wrapper);
1284 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1285 if (!ec) return Qnil;
1286 DATA_PTR(econv_wrapper) = ec;
1287
1288 sp = (unsigned char*)ptr;
1289 start = sp;
1290 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1291 (dp = dest + converted_output),
1292 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1294 /* destination buffer short */
1295 size_t converted_input = sp - start;
1296 size_t rest = len - converted_input;
1297 converted_output = dp - dest;
1298 rb_str_set_len(newstr, converted_output);
1299 if (converted_input && converted_output &&
1300 rest < (LONG_MAX / converted_output)) {
1301 rest = (rest * converted_output) / converted_input;
1302 }
1303 else {
1304 rest = olen;
1305 }
1306 olen += rest < 2 ? 2 : rest;
1307 rb_str_resize(newstr, olen);
1308 }
1309 DATA_PTR(econv_wrapper) = 0;
1310 RB_GC_GUARD(econv_wrapper);
1311 rb_econv_close(ec);
1312 switch (ret) {
1313 case econv_finished:
1314 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1315 rb_str_set_len(newstr, len);
1316 rb_enc_associate(newstr, to);
1317 return newstr;
1318
1319 default:
1320 return Qnil;
1321 }
1322}
1323
1324VALUE
1326{
1327 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1328}
1329
1330VALUE
1332{
1333 rb_encoding *ienc;
1334 VALUE str;
1335 const int eidx = rb_enc_to_index(eenc);
1336
1337 if (!ptr) {
1338 return rb_enc_str_new(ptr, len, eenc);
1339 }
1340
1341 /* ASCII-8BIT case, no conversion */
1342 if ((eidx == rb_ascii8bit_encindex()) ||
1343 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1344 return rb_str_new(ptr, len);
1345 }
1346 /* no default_internal or same encoding, no conversion */
1347 ienc = rb_default_internal_encoding();
1348 if (!ienc || eenc == ienc) {
1349 return rb_enc_str_new(ptr, len, eenc);
1350 }
1351 /* ASCII compatible, and ASCII only string, no conversion in
1352 * default_internal */
1353 if ((eidx == rb_ascii8bit_encindex()) ||
1354 (eidx == rb_usascii_encindex()) ||
1355 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1356 return rb_enc_str_new(ptr, len, ienc);
1357 }
1358 /* convert from the given encoding to default_internal */
1359 str = rb_enc_str_new(NULL, 0, ienc);
1360 /* when the conversion failed for some reason, just ignore the
1361 * default_internal and result in the given encoding as-is. */
1362 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1363 rb_str_initialize(str, ptr, len, eenc);
1364 }
1365 return str;
1366}
1367
1368VALUE
1369rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1370{
1371 int eidx = rb_enc_to_index(eenc);
1372 if (eidx == rb_usascii_encindex() &&
1373 !is_ascii_string(str)) {
1374 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1375 return str;
1376 }
1377 rb_enc_associate_index(str, eidx);
1378 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1379}
1380
1381VALUE
1382rb_external_str_new(const char *ptr, long len)
1383{
1384 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1385}
1386
1387VALUE
1389{
1390 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1391}
1392
1393VALUE
1394rb_locale_str_new(const char *ptr, long len)
1395{
1396 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1397}
1398
1399VALUE
1401{
1402 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1403}
1404
1405VALUE
1407{
1408 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1409}
1410
1411VALUE
1413{
1414 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1415}
1416
1417VALUE
1419{
1420 return rb_str_export_to_enc(str, rb_default_external_encoding());
1421}
1422
1423VALUE
1425{
1426 return rb_str_export_to_enc(str, rb_locale_encoding());
1427}
1428
1429VALUE
1431{
1432 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1433}
1434
1435static VALUE
1436str_replace_shared_without_enc(VALUE str2, VALUE str)
1437{
1438 const int termlen = TERM_LEN(str);
1439 char *ptr;
1440 long len;
1441
1442 RSTRING_GETMEM(str, ptr, len);
1443 if (str_embed_capa(str2) >= len + termlen) {
1444 char *ptr2 = RSTRING(str2)->as.embed.ary;
1445 STR_SET_EMBED(str2);
1446 memcpy(ptr2, RSTRING_PTR(str), len);
1447 TERM_FILL(ptr2+len, termlen);
1448 }
1449 else {
1450 VALUE root;
1451 if (STR_SHARED_P(str)) {
1452 root = RSTRING(str)->as.heap.aux.shared;
1453 RSTRING_GETMEM(str, ptr, len);
1454 }
1455 else {
1456 root = rb_str_new_frozen(str);
1457 RSTRING_GETMEM(root, ptr, len);
1458 }
1459 RUBY_ASSERT(OBJ_FROZEN(root));
1460
1461 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1462 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1463 rb_fatal("about to free a possible shared root");
1464 }
1465 char *ptr2 = STR_HEAP_PTR(str2);
1466 if (ptr2 != ptr) {
1467 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1468 }
1469 }
1470 FL_SET(str2, STR_NOEMBED);
1471 RSTRING(str2)->as.heap.ptr = ptr;
1472 STR_SET_SHARED(str2, root);
1473 }
1474
1475 STR_SET_LEN(str2, len);
1476
1477 return str2;
1478}
1479
1480static VALUE
1481str_replace_shared(VALUE str2, VALUE str)
1482{
1483 str_replace_shared_without_enc(str2, str);
1484 rb_enc_cr_str_exact_copy(str2, str);
1485 return str2;
1486}
1487
1488static VALUE
1489str_new_shared(VALUE klass, VALUE str)
1490{
1491 return str_replace_shared(str_alloc_heap(klass), str);
1492}
1493
1494VALUE
1496{
1497 return str_new_shared(rb_obj_class(str), str);
1498}
1499
1500VALUE
1502{
1503 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1504 return str_new_frozen(rb_obj_class(orig), orig);
1505}
1506
1507static VALUE
1508rb_str_new_frozen_String(VALUE orig)
1509{
1510 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1511 return str_new_frozen(rb_cString, orig);
1512}
1513
1514
1515VALUE
1516rb_str_frozen_bare_string(VALUE orig)
1517{
1518 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1519 return str_new_frozen(rb_cString, orig);
1520}
1521
1522VALUE
1523rb_str_tmp_frozen_acquire(VALUE orig)
1524{
1525 if (OBJ_FROZEN_RAW(orig)) return orig;
1526 return str_new_frozen_buffer(0, orig, FALSE);
1527}
1528
1529VALUE
1530rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1531{
1532 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1533 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1534
1535 VALUE str = str_alloc_heap(0);
1536 OBJ_FREEZE(str);
1537 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1538 FL_SET(str, STR_SHARED_ROOT);
1539
1540 size_t capa = str_capacity(orig, TERM_LEN(orig));
1541
1542 /* If the string is embedded then we want to create a copy that is heap
1543 * allocated. If the string is shared then the shared root must be
1544 * embedded, so we want to create a copy. If the string is a shared root
1545 * then it must be embedded, so we want to create a copy. */
1546 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1547 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1548 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1549 }
1550 else {
1551 /* orig must be heap allocated and not shared, so we can safely transfer
1552 * the pointer to str. */
1553 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1554 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1555 RBASIC(orig)->flags &= ~STR_NOFREE;
1556 STR_SET_SHARED(orig, str);
1557 if (RB_OBJ_SHAREABLE_P(orig)) {
1558 RB_OBJ_SET_SHAREABLE(str);
1559 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1560 }
1561 }
1562
1563 RSTRING(str)->len = RSTRING(orig)->len;
1564 RSTRING(str)->as.heap.aux.capa = capa + (TERM_LEN(orig) - TERM_LEN(str));
1565
1566 return str;
1567}
1568
1569void
1570rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1571{
1572 if (RBASIC_CLASS(tmp) != 0)
1573 return;
1574
1575 if (STR_EMBED_P(tmp)) {
1577 }
1578 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1579 !OBJ_FROZEN_RAW(orig)) {
1580 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1581
1582 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1583 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1584 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1585
1586 /* Unshare orig since the root (tmp) only has this one child. */
1587 FL_UNSET_RAW(orig, STR_SHARED);
1588 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1589 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1591
1592 /* Make tmp embedded and empty so it is safe for sweeping. */
1593 STR_SET_EMBED(tmp);
1594 STR_SET_LEN(tmp, 0);
1595 }
1596 }
1597}
1598
1599static VALUE
1600str_new_frozen(VALUE klass, VALUE orig)
1601{
1602 return str_new_frozen_buffer(klass, orig, TRUE);
1603}
1604
1605static VALUE
1606heap_str_make_shared(VALUE klass, VALUE orig)
1607{
1608 RUBY_ASSERT(!STR_EMBED_P(orig));
1609 RUBY_ASSERT(!STR_SHARED_P(orig));
1611
1612 VALUE str = str_alloc_heap(klass);
1613 STR_SET_LEN(str, RSTRING_LEN(orig));
1614 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1615 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1616 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1617 RBASIC(orig)->flags &= ~STR_NOFREE;
1618 STR_SET_SHARED(orig, str);
1619 if (klass == 0)
1620 FL_UNSET_RAW(str, STR_BORROWED);
1621 return str;
1622}
1623
1624static VALUE
1625str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1626{
1627 VALUE str;
1628
1629 long len = RSTRING_LEN(orig);
1630 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1631 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1632
1633 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1634 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1635 RUBY_ASSERT(STR_EMBED_P(str));
1636 }
1637 else {
1638 if (FL_TEST_RAW(orig, STR_SHARED)) {
1639 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1640 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1641 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1642 RUBY_ASSERT(ofs >= 0);
1643 RUBY_ASSERT(rest >= 0);
1644 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1646
1647 if ((ofs > 0) || (rest > 0) ||
1648 (klass != RBASIC(shared)->klass) ||
1649 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1650 str = str_new_shared(klass, shared);
1651 RUBY_ASSERT(!STR_EMBED_P(str));
1652 RSTRING(str)->as.heap.ptr += ofs;
1653 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1654 }
1655 else {
1656 if (RBASIC_CLASS(shared) == 0)
1657 FL_SET_RAW(shared, STR_BORROWED);
1658 return shared;
1659 }
1660 }
1661 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1662 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1663 STR_SET_EMBED(str);
1664 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1665 STR_SET_LEN(str, RSTRING_LEN(orig));
1666 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1667 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1668 }
1669 else {
1670 if (RB_OBJ_SHAREABLE_P(orig)) {
1671 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1672 }
1673 else {
1674 str = heap_str_make_shared(klass, orig);
1675 }
1676 }
1677 }
1678
1679 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1680 OBJ_FREEZE(str);
1681 return str;
1682}
1683
1684VALUE
1685rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1686{
1687 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1688}
1689
1690static VALUE
1691str_new_empty_String(VALUE str)
1692{
1693 VALUE v = rb_str_new(0, 0);
1694 rb_enc_copy(v, str);
1695 return v;
1696}
1697
1698#define STR_BUF_MIN_SIZE 63
1699
1700VALUE
1702{
1703 if (STR_EMBEDDABLE_P(capa, 1)) {
1704 return str_alloc_embed(rb_cString, capa + 1);
1705 }
1706
1707 VALUE str = str_alloc_heap(rb_cString);
1708
1709 RSTRING(str)->as.heap.aux.capa = capa;
1710 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1711 RSTRING(str)->as.heap.ptr[0] = '\0';
1712
1713 return str;
1714}
1715
1716VALUE
1718{
1719 VALUE str;
1720 long len = strlen(ptr);
1721
1722 str = rb_str_buf_new(len);
1723 rb_str_buf_cat(str, ptr, len);
1724
1725 return str;
1726}
1727
1728VALUE
1730{
1731 return str_new(0, 0, len);
1732}
1733
1734void
1736{
1737 if (STR_EMBED_P(str)) {
1738 RB_DEBUG_COUNTER_INC(obj_str_embed);
1739 }
1740 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1741 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1742 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1743 }
1744 else {
1745 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1746 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1747 }
1748}
1749
1750size_t
1751rb_str_memsize(VALUE str)
1752{
1753 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1754 return STR_HEAP_SIZE(str);
1755 }
1756 else {
1757 return 0;
1758 }
1759}
1760
1761VALUE
1763{
1764 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1765}
1766
1767static inline void str_discard(VALUE str);
1768static void str_shared_replace(VALUE str, VALUE str2);
1769
1770void
1772{
1773 if (str != str2) str_shared_replace(str, str2);
1774}
1775
1776static void
1777str_shared_replace(VALUE str, VALUE str2)
1778{
1779 rb_encoding *enc;
1780 int cr;
1781 int termlen;
1782
1783 RUBY_ASSERT(str2 != str);
1784 enc = STR_ENC_GET(str2);
1785 cr = ENC_CODERANGE(str2);
1786 str_discard(str);
1787 termlen = rb_enc_mbminlen(enc);
1788
1789 STR_SET_LEN(str, RSTRING_LEN(str2));
1790
1791 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1792 STR_SET_EMBED(str);
1793 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1794 rb_enc_associate(str, enc);
1795 ENC_CODERANGE_SET(str, cr);
1796 }
1797 else {
1798 if (STR_EMBED_P(str2)) {
1799 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1800 long len = RSTRING_LEN(str2);
1801 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1802
1803 char *new_ptr = ALLOC_N(char, len + termlen);
1804 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1805 RSTRING(str2)->as.heap.ptr = new_ptr;
1806 STR_SET_LEN(str2, len);
1807 RSTRING(str2)->as.heap.aux.capa = len;
1808 STR_SET_NOEMBED(str2);
1809 }
1810
1811 STR_SET_NOEMBED(str);
1812 FL_UNSET(str, STR_SHARED);
1813 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1814
1815 if (FL_TEST(str2, STR_SHARED)) {
1816 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1817 STR_SET_SHARED(str, shared);
1818 }
1819 else {
1820 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1821 }
1822
1823 /* abandon str2 */
1824 STR_SET_EMBED(str2);
1825 RSTRING_PTR(str2)[0] = 0;
1826 STR_SET_LEN(str2, 0);
1827 rb_enc_associate(str, enc);
1828 ENC_CODERANGE_SET(str, cr);
1829 }
1830}
1831
1832VALUE
1834{
1835 VALUE str;
1836
1837 if (RB_TYPE_P(obj, T_STRING)) {
1838 return obj;
1839 }
1840 str = rb_funcall(obj, idTo_s, 0);
1841 return rb_obj_as_string_result(str, obj);
1842}
1843
1844VALUE
1845rb_obj_as_string_result(VALUE str, VALUE obj)
1846{
1847 if (!RB_TYPE_P(str, T_STRING))
1848 return rb_any_to_s(obj);
1849 return str;
1850}
1851
1852static VALUE
1853str_replace(VALUE str, VALUE str2)
1854{
1855 long len;
1856
1857 len = RSTRING_LEN(str2);
1858 if (STR_SHARED_P(str2)) {
1859 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1861 STR_SET_NOEMBED(str);
1862 STR_SET_LEN(str, len);
1863 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1864 STR_SET_SHARED(str, shared);
1865 rb_enc_cr_str_exact_copy(str, str2);
1866 }
1867 else {
1868 str_replace_shared(str, str2);
1869 }
1870
1871 return str;
1872}
1873
1874static inline VALUE
1875ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1876{
1877 size_t size = rb_str_embed_size(capa, 0);
1878 RUBY_ASSERT(size > 0);
1879 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1880
1881 NEWOBJ_OF(str, struct RString, klass,
1883
1884 str->len = 0;
1885
1886 return (VALUE)str;
1887}
1888
1889static inline VALUE
1890ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1891{
1892 NEWOBJ_OF(str, struct RString, klass,
1893 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1894
1895 str->as.heap.aux.capa = 0;
1896 str->as.heap.ptr = NULL;
1897
1898 return (VALUE)str;
1899}
1900
1901static inline VALUE
1902str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1903{
1904 int encidx = 0;
1905 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1906 encidx = rb_enc_get_index(str);
1907 flags &= ~ENCODING_MASK;
1908 }
1909 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1910 if (encidx) rb_enc_associate_index(dup, encidx);
1911 return dup;
1912}
1913
1914static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1915
1916static inline VALUE
1917str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1918{
1919 VALUE flags = FL_TEST_RAW(str, flag_mask);
1920 long len = RSTRING_LEN(str);
1921
1922 RUBY_ASSERT(STR_EMBED_P(dup));
1923 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1924 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1927}
1928
1929static inline VALUE
1930str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1931{
1932 VALUE flags = FL_TEST_RAW(str, flag_mask);
1933 VALUE root = str;
1934 if (FL_TEST_RAW(str, STR_SHARED)) {
1935 root = RSTRING(str)->as.heap.aux.shared;
1936 }
1937 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1938 root = str = str_new_frozen(klass, str);
1939 flags = FL_TEST_RAW(str, flag_mask);
1940 }
1941 RUBY_ASSERT(!STR_SHARED_P(root));
1943
1944 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1945 FL_SET_RAW(dup, RSTRING_NOEMBED);
1946 STR_SET_SHARED(dup, root);
1947 flags |= RSTRING_NOEMBED | STR_SHARED;
1948
1949 STR_SET_LEN(dup, RSTRING_LEN(str));
1950 return str_duplicate_setup_encoding(str, dup, flags);
1951}
1952
1953static inline VALUE
1954str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1955{
1956 if (STR_EMBED_P(str)) {
1957 return str_duplicate_setup_embed(klass, str, dup);
1958 }
1959 else {
1960 return str_duplicate_setup_heap(klass, str, dup);
1961 }
1962}
1963
1964static inline VALUE
1965str_duplicate(VALUE klass, VALUE str)
1966{
1967 VALUE dup;
1968 if (STR_EMBED_P(str)) {
1969 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1970 }
1971 else {
1972 dup = str_alloc_heap(klass);
1973 }
1974
1975 return str_duplicate_setup(klass, str, dup);
1976}
1977
1978VALUE
1980{
1981 return str_duplicate(rb_obj_class(str), str);
1982}
1983
1984/* :nodoc: */
1985VALUE
1986rb_str_dup_m(VALUE str)
1987{
1988 if (LIKELY(BARE_STRING_P(str))) {
1989 return str_duplicate(rb_cString, str);
1990 }
1991 else {
1992 return rb_obj_dup(str);
1993 }
1994}
1995
1996VALUE
1998{
1999 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2000 return str_duplicate(rb_cString, str);
2001}
2002
2003VALUE
2004rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2005{
2006 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2007 VALUE new_str, klass = rb_cString;
2008
2009 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2010 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2011 str_duplicate_setup_embed(klass, str, new_str);
2012 }
2013 else {
2014 new_str = ec_str_alloc_heap(ec, klass);
2015 str_duplicate_setup_heap(klass, str, new_str);
2016 }
2017 if (chilled) {
2018 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2019 }
2020 return new_str;
2021}
2022
2023VALUE
2024rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2025{
2026 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2027 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2028 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2029 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2030 return rb_str_freeze(str);
2031}
2032
2033/*
2034 * The documentation block below uses an include (instead of inline text)
2035 * because the included text has non-ASCII characters (which are not allowed in a C file).
2036 */
2037
2038/*
2039 *
2040 * call-seq:
2041 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2042 *
2043 * :include: doc/string/new.rdoc
2044 *
2045 */
2046
2047static VALUE
2048rb_str_init(int argc, VALUE *argv, VALUE str)
2049{
2050 static ID keyword_ids[2];
2051 VALUE orig, opt, venc, vcapa;
2052 VALUE kwargs[2];
2053 rb_encoding *enc = 0;
2054 int n;
2055
2056 if (!keyword_ids[0]) {
2057 keyword_ids[0] = rb_id_encoding();
2058 CONST_ID(keyword_ids[1], "capacity");
2059 }
2060
2061 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2062 if (!NIL_P(opt)) {
2063 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2064 venc = kwargs[0];
2065 vcapa = kwargs[1];
2066 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2067 enc = rb_to_encoding(venc);
2068 }
2069 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2070 long capa = NUM2LONG(vcapa);
2071 long len = 0;
2072 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2073
2074 if (capa < STR_BUF_MIN_SIZE) {
2075 capa = STR_BUF_MIN_SIZE;
2076 }
2077 if (n == 1) {
2078 StringValue(orig);
2079 len = RSTRING_LEN(orig);
2080 if (capa < len) {
2081 capa = len;
2082 }
2083 if (orig == str) n = 0;
2084 }
2085 str_modifiable(str);
2086 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2087 /* make noembed always */
2088 const size_t size = (size_t)capa + termlen;
2089 const char *const old_ptr = RSTRING_PTR(str);
2090 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2091 char *new_ptr = ALLOC_N(char, size);
2092 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2093 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2094 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2095 RSTRING(str)->as.heap.ptr = new_ptr;
2096 }
2097 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2098 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2099 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2100 }
2101 STR_SET_LEN(str, len);
2102 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2103 if (n == 1) {
2104 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2105 rb_enc_cr_str_exact_copy(str, orig);
2106 }
2107 FL_SET(str, STR_NOEMBED);
2108 RSTRING(str)->as.heap.aux.capa = capa;
2109 }
2110 else if (n == 1) {
2111 rb_str_replace(str, orig);
2112 }
2113 if (enc) {
2114 rb_enc_associate(str, enc);
2116 }
2117 }
2118 else if (n == 1) {
2119 rb_str_replace(str, orig);
2120 }
2121 return str;
2122}
2123
2124/* :nodoc: */
2125static VALUE
2126rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2127{
2128 if (klass != rb_cString) {
2129 return rb_class_new_instance_pass_kw(argc, argv, klass);
2130 }
2131
2132 static ID keyword_ids[2];
2133 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2134 VALUE kwargs[2];
2135 rb_encoding *enc = NULL;
2136
2137 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2138 if (NIL_P(opt)) {
2139 return rb_class_new_instance_pass_kw(argc, argv, klass);
2140 }
2141
2142 keyword_ids[0] = rb_id_encoding();
2143 CONST_ID(keyword_ids[1], "capacity");
2144 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2145 encoding = kwargs[0];
2146 capacity = kwargs[1];
2147
2148 if (n == 1) {
2149 orig = StringValue(orig);
2150 }
2151 else {
2152 orig = Qnil;
2153 }
2154
2155 if (UNDEF_P(encoding)) {
2156 if (!NIL_P(orig)) {
2157 encoding = rb_obj_encoding(orig);
2158 }
2159 }
2160
2161 if (!UNDEF_P(encoding)) {
2162 enc = rb_to_encoding(encoding);
2163 }
2164
2165 // If capacity is nil, we're basically just duping `orig`.
2166 if (UNDEF_P(capacity)) {
2167 if (NIL_P(orig)) {
2168 VALUE empty_str = str_new(klass, "", 0);
2169 if (enc) {
2170 rb_enc_associate(empty_str, enc);
2171 }
2172 return empty_str;
2173 }
2174 VALUE copy = str_duplicate(klass, orig);
2175 rb_enc_associate(copy, enc);
2176 ENC_CODERANGE_CLEAR(copy);
2177 return copy;
2178 }
2179
2180 long capa = 0;
2181 capa = NUM2LONG(capacity);
2182 if (capa < 0) {
2183 capa = 0;
2184 }
2185
2186 if (!NIL_P(orig)) {
2187 long orig_capa = rb_str_capacity(orig);
2188 if (orig_capa > capa) {
2189 capa = orig_capa;
2190 }
2191 }
2192
2193 VALUE str = str_enc_new(klass, NULL, capa, enc);
2194 STR_SET_LEN(str, 0);
2195 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2196
2197 if (!NIL_P(orig)) {
2198 rb_str_buf_append(str, orig);
2199 }
2200
2201 return str;
2202}
2203
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Counts the UTF-8 leading bytes contained in one machine word.
 *
 * A byte is a leading byte when it looks like 0xxxxxxx or 11xxxxxx, i.e.
 * when it is not a 10xxxxxx continuation byte.  Per byte that is
 *
 *     lead = bit6 | ~bit7
 *
 * The expression below evaluates this for every byte of the word at once,
 * leaving the answer in bit 0 of each byte lane, and then sums the lanes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lanes = *s;

    /* Bit 0 of each byte lane: 1 for a leading byte, 0 for a continuation. */
    lanes = (lanes>>6) | (~lanes>>7);
    lanes &= NONASCII_MASK >> 7;

    /* Sum the per-lane bits. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lanes);
#else
    lanes += (lanes>>8);
    lanes += (lanes>>16);
# if SIZEOF_VOIDP == 8
    lanes += (lanes>>32);
# endif
    return (lanes&0xF);
#endif
}
#endif
2243
2244static inline long
2245enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2246{
2247 long c;
2248 const char *q;
2249
2250 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2251 long diff = (long)(e - p);
2252 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2253 }
2254#ifdef NONASCII_MASK
2255 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2256 uintptr_t len = 0;
2257 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2258 const uintptr_t *s, *t;
2259 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2260 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2261 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2262 while (p < (const char *)s) {
2263 if (is_utf8_lead_byte(*p)) len++;
2264 p++;
2265 }
2266 while (s < t) {
2267 len += count_utf8_lead_bytes_with_word(s);
2268 s++;
2269 }
2270 p = (const char *)s;
2271 }
2272 while (p < e) {
2273 if (is_utf8_lead_byte(*p)) len++;
2274 p++;
2275 }
2276 return (long)len;
2277 }
2278#endif
2279 else if (rb_enc_asciicompat(enc)) {
2280 c = 0;
2281 if (ENC_CODERANGE_CLEAN_P(cr)) {
2282 while (p < e) {
2283 if (ISASCII(*p)) {
2284 q = search_nonascii(p, e);
2285 if (!q)
2286 return c + (e - p);
2287 c += q - p;
2288 p = q;
2289 }
2290 p += rb_enc_fast_mbclen(p, e, enc);
2291 c++;
2292 }
2293 }
2294 else {
2295 while (p < e) {
2296 if (ISASCII(*p)) {
2297 q = search_nonascii(p, e);
2298 if (!q)
2299 return c + (e - p);
2300 c += q - p;
2301 p = q;
2302 }
2303 p += rb_enc_mbclen(p, e, enc);
2304 c++;
2305 }
2306 }
2307 return c;
2308 }
2309
2310 for (c=0; p<e; c++) {
2311 p += rb_enc_mbclen(p, e, enc);
2312 }
2313 return c;
2314}
2315
2316long
2317rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2318{
2319 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2320}
2321
2322/* To get strlen with cr
2323 * Note that given cr is not used.
2324 */
2325long
2326rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2327{
2328 long c;
2329 const char *q;
2330 int ret;
2331
2332 *cr = 0;
2333 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2334 long diff = (long)(e - p);
2335 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2336 }
2337 else if (rb_enc_asciicompat(enc)) {
2338 c = 0;
2339 while (p < e) {
2340 if (ISASCII(*p)) {
2341 q = search_nonascii(p, e);
2342 if (!q) {
2343 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2344 return c + (e - p);
2345 }
2346 c += q - p;
2347 p = q;
2348 }
2349 ret = rb_enc_precise_mbclen(p, e, enc);
2350 if (MBCLEN_CHARFOUND_P(ret)) {
2351 *cr |= ENC_CODERANGE_VALID;
2352 p += MBCLEN_CHARFOUND_LEN(ret);
2353 }
2354 else {
2356 p++;
2357 }
2358 c++;
2359 }
2360 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2361 return c;
2362 }
2363
2364 for (c=0; p<e; c++) {
2365 ret = rb_enc_precise_mbclen(p, e, enc);
2366 if (MBCLEN_CHARFOUND_P(ret)) {
2367 *cr |= ENC_CODERANGE_VALID;
2368 p += MBCLEN_CHARFOUND_LEN(ret);
2369 }
2370 else {
2372 if (p + rb_enc_mbminlen(enc) <= e)
2373 p += rb_enc_mbminlen(enc);
2374 else
2375 p = e;
2376 }
2377 }
2378 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2379 return c;
2380}
2381
2382/* enc must be str's enc or rb_enc_check(str, str2) */
2383static long
2384str_strlen(VALUE str, rb_encoding *enc)
2385{
2386 const char *p, *e;
2387 int cr;
2388
2389 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2390 if (!enc) enc = STR_ENC_GET(str);
2391 p = RSTRING_PTR(str);
2392 e = RSTRING_END(str);
2393 cr = ENC_CODERANGE(str);
2394
2395 if (cr == ENC_CODERANGE_UNKNOWN) {
2396 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2397 if (cr) ENC_CODERANGE_SET(str, cr);
2398 return n;
2399 }
2400 else {
2401 return enc_strlen(p, e, enc, cr);
2402 }
2403}
2404
2405long
2407{
2408 return str_strlen(str, NULL);
2409}
2410
2411/*
2412 * call-seq:
2413 * length -> integer
2414 *
2415 * :include: doc/string/length.rdoc
2416 *
2417 */
2418
2419VALUE
2421{
2422 return LONG2NUM(str_strlen(str, NULL));
2423}
2424
2425/*
2426 * call-seq:
2427 * bytesize -> integer
2428 *
2429 * :include: doc/string/bytesize.rdoc
2430 *
2431 */
2432
2433VALUE
2434rb_str_bytesize(VALUE str)
2435{
2436 return LONG2NUM(RSTRING_LEN(str));
2437}
2438
2439/*
2440 * call-seq:
2441 * empty? -> true or false
2442 *
2443 * Returns whether the length of +self+ is zero:
2444 *
2445 * 'hello'.empty? # => false
2446 * ' '.empty? # => false
2447 * ''.empty? # => true
2448 *
2449 * Related: see {Querying}[rdoc-ref:String@Querying].
2450 */
2451
2452static VALUE
2453rb_str_empty(VALUE str)
2454{
2455 return RBOOL(RSTRING_LEN(str) == 0);
2456}
2457
2458/*
2459 * call-seq:
2460 * self + other_string -> new_string
2461 *
2462 * Returns a new string containing +other_string+ concatenated to +self+:
2463 *
2464 * 'Hello from ' + self.to_s # => "Hello from main"
2465 *
2466 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2467 */
2468
2469VALUE
2471{
2472 VALUE str3;
2473 rb_encoding *enc;
2474 char *ptr1, *ptr2, *ptr3;
2475 long len1, len2;
2476 int termlen;
2477
2478 StringValue(str2);
2479 enc = rb_enc_check_str(str1, str2);
2480 RSTRING_GETMEM(str1, ptr1, len1);
2481 RSTRING_GETMEM(str2, ptr2, len2);
2482 termlen = rb_enc_mbminlen(enc);
2483 if (len1 > LONG_MAX - len2) {
2484 rb_raise(rb_eArgError, "string size too big");
2485 }
2486 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2487 ptr3 = RSTRING_PTR(str3);
2488 memcpy(ptr3, ptr1, len1);
2489 memcpy(ptr3+len1, ptr2, len2);
2490 TERM_FILL(&ptr3[len1+len2], termlen);
2491
2492 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2494 RB_GC_GUARD(str1);
2495 RB_GC_GUARD(str2);
2496 return str3;
2497}
2498
2499/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2500VALUE
2501rb_str_opt_plus(VALUE str1, VALUE str2)
2502{
2505 long len1, len2;
2506 MAYBE_UNUSED(char) *ptr1, *ptr2;
2507 RSTRING_GETMEM(str1, ptr1, len1);
2508 RSTRING_GETMEM(str2, ptr2, len2);
2509 int enc1 = rb_enc_get_index(str1);
2510 int enc2 = rb_enc_get_index(str2);
2511
2512 if (enc1 < 0) {
2513 return Qundef;
2514 }
2515 else if (enc2 < 0) {
2516 return Qundef;
2517 }
2518 else if (enc1 != enc2) {
2519 return Qundef;
2520 }
2521 else if (len1 > LONG_MAX - len2) {
2522 return Qundef;
2523 }
2524 else {
2525 return rb_str_plus(str1, str2);
2526 }
2527
2528}
2529
2530/*
2531 * call-seq:
2532 * self * n -> new_string
2533 *
2534 * Returns a new string containing +n+ copies of +self+:
2535 *
2536 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2537 * 'No!' * 0 # => ""
2538 *
2539 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2540 */
2541
2542VALUE
2544{
2545 VALUE str2;
2546 long n, len;
2547 char *ptr2;
2548 int termlen;
2549
2550 if (times == INT2FIX(1)) {
2551 return str_duplicate(rb_cString, str);
2552 }
2553 if (times == INT2FIX(0)) {
2554 str2 = str_alloc_embed(rb_cString, 0);
2555 rb_enc_copy(str2, str);
2556 return str2;
2557 }
2558 len = NUM2LONG(times);
2559 if (len < 0) {
2560 rb_raise(rb_eArgError, "negative argument");
2561 }
2562 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2563 if (STR_EMBEDDABLE_P(len, 1)) {
2564 str2 = str_alloc_embed(rb_cString, len + 1);
2565 memset(RSTRING_PTR(str2), 0, len + 1);
2566 }
2567 else {
2568 str2 = str_alloc_heap(rb_cString);
2569 RSTRING(str2)->as.heap.aux.capa = len;
2570 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2571 }
2572 STR_SET_LEN(str2, len);
2573 rb_enc_copy(str2, str);
2574 return str2;
2575 }
2576 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2577 rb_raise(rb_eArgError, "argument too big");
2578 }
2579
2580 len *= RSTRING_LEN(str);
2581 termlen = TERM_LEN(str);
2582 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2583 ptr2 = RSTRING_PTR(str2);
2584 if (len) {
2585 n = RSTRING_LEN(str);
2586 memcpy(ptr2, RSTRING_PTR(str), n);
2587 while (n <= len/2) {
2588 memcpy(ptr2 + n, ptr2, n);
2589 n *= 2;
2590 }
2591 memcpy(ptr2 + n, ptr2, len-n);
2592 }
2593 STR_SET_LEN(str2, len);
2594 TERM_FILL(&ptr2[len], termlen);
2595 rb_enc_cr_str_copy_for_substr(str2, str);
2596
2597 return str2;
2598}
2599
2600/*
2601 * call-seq:
2602 * self % object -> new_string
2603 *
2604 * Returns the result of formatting +object+ into the format specifications
2605 * contained in +self+
2606 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2607 *
2608 * '%05d' % 123 # => "00123"
2609 *
2610 * If +self+ contains multiple format specifications,
2611 * +object+ must be an array or hash containing the objects to be formatted:
2612 *
2613 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2614 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2615 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2616 *
2617 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2618 */
2619
2620static VALUE
2621rb_str_format_m(VALUE str, VALUE arg)
2622{
2623 VALUE tmp = rb_check_array_type(arg);
2624
2625 if (!NIL_P(tmp)) {
2626 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2627 RB_GC_GUARD(tmp);
2628 return result;
2629 }
2630 return rb_str_format(1, &arg, str);
2631}
2632
2633static inline void
2634rb_check_lockedtmp(VALUE str)
2635{
2636 if (FL_TEST(str, STR_TMPLOCK)) {
2637 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2638 }
2639}
2640
2641// If none of these flags are set, we know we have an modifiable string.
2642// If any is set, we need to do more detailed checks.
2643#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2644static inline void
2645str_modifiable(VALUE str)
2646{
2647 RUBY_ASSERT(ruby_thread_has_gvl_p());
2648
2649 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2650 if (CHILLED_STRING_P(str)) {
2651 CHILLED_STRING_MUTATED(str);
2652 }
2653 rb_check_lockedtmp(str);
2654 rb_check_frozen(str);
2655 }
2656}
2657
2658static inline int
2659str_dependent_p(VALUE str)
2660{
2661 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2662 return FALSE;
2663 }
2664 else {
2665 return TRUE;
2666 }
2667}
2668
2669// If none of these flags are set, we know we have an independent string.
2670// If any is set, we need to do more detailed checks.
2671#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2672static inline int
2673str_independent(VALUE str)
2674{
2675 RUBY_ASSERT(ruby_thread_has_gvl_p());
2676
2677 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2678 str_modifiable(str);
2679 return !str_dependent_p(str);
2680 }
2681 return TRUE;
2682}
2683
2684static void
2685str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2686{
2687 RUBY_ASSERT(ruby_thread_has_gvl_p());
2688
2689 char *ptr;
2690 char *oldptr;
2691 long capa = len + expand;
2692
2693 if (len > capa) len = capa;
2694
2695 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2696 ptr = RSTRING(str)->as.heap.ptr;
2697 STR_SET_EMBED(str);
2698 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2699 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2700 STR_SET_LEN(str, len);
2701 return;
2702 }
2703
2704 ptr = ALLOC_N(char, (size_t)capa + termlen);
2705 oldptr = RSTRING_PTR(str);
2706 if (oldptr) {
2707 memcpy(ptr, oldptr, len);
2708 }
2709 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2710 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2711 }
2712 STR_SET_NOEMBED(str);
2713 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2714 TERM_FILL(ptr + len, termlen);
2715 RSTRING(str)->as.heap.ptr = ptr;
2716 STR_SET_LEN(str, len);
2717 RSTRING(str)->as.heap.aux.capa = capa;
2718}
2719
2720void
2721rb_str_modify(VALUE str)
2722{
2723 if (!str_independent(str))
2724 str_make_independent(str);
2726}
2727
2728void
2730{
2731 RUBY_ASSERT(ruby_thread_has_gvl_p());
2732
2733 int termlen = TERM_LEN(str);
2734 long len = RSTRING_LEN(str);
2735
2736 if (expand < 0) {
2737 rb_raise(rb_eArgError, "negative expanding string size");
2738 }
2739 if (expand >= LONG_MAX - len) {
2740 rb_raise(rb_eArgError, "string size too big");
2741 }
2742
2743 if (!str_independent(str)) {
2744 str_make_independent_expand(str, len, expand, termlen);
2745 }
2746 else if (expand > 0) {
2747 RESIZE_CAPA_TERM(str, len + expand, termlen);
2748 }
2750}
2751
2752/* As rb_str_modify(), but don't clear coderange */
2753static void
2754str_modify_keep_cr(VALUE str)
2755{
2756 if (!str_independent(str))
2757 str_make_independent(str);
2759 /* Force re-scan later */
2761}
2762
2763static inline void
2764str_discard(VALUE str)
2765{
2766 str_modifiable(str);
2767 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2768 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2769 RSTRING(str)->as.heap.ptr = 0;
2770 STR_SET_LEN(str, 0);
2771 }
2772}
2773
2774void
2776{
2777 int encindex = rb_enc_get_index(str);
2778
2779 if (RB_UNLIKELY(encindex == -1)) {
2780 rb_raise(rb_eTypeError, "not encoding capable object");
2781 }
2782
2783 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2784 return;
2785 }
2786
2787 rb_encoding *enc = rb_enc_from_index(encindex);
2788 if (!rb_enc_asciicompat(enc)) {
2789 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2790 }
2791}
2792
2793VALUE
2795{
2796 RUBY_ASSERT(ruby_thread_has_gvl_p());
2797
2798 VALUE s = *ptr;
2799 if (!RB_TYPE_P(s, T_STRING)) {
2800 s = rb_str_to_str(s);
2801 *ptr = s;
2802 }
2803 return s;
2804}
2805
2806char *
2808{
2809 VALUE str = rb_string_value(ptr);
2810 return RSTRING_PTR(str);
2811}
2812
/*
 * Returns 1 when the first n bytes at s are all zero, 0 otherwise.
 * A non-positive n inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2821
2822static const char *
2823str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2824{
2825 const char *e = s + len;
2826
2827 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2828 if (zero_filled(s, minlen)) return s;
2829 }
2830 return 0;
2831}
2832
2833static char *
2834str_fill_term(VALUE str, char *s, long len, int termlen)
2835{
2836 /* This function assumes that (capa + termlen) bytes of memory
2837 * is allocated, like many other functions in this file.
2838 */
2839 if (str_dependent_p(str)) {
2840 if (!zero_filled(s + len, termlen))
2841 str_make_independent_expand(str, len, 0L, termlen);
2842 }
2843 else {
2844 TERM_FILL(s + len, termlen);
2845 return s;
2846 }
2847 return RSTRING_PTR(str);
2848}
2849
2850void
2851rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2852{
2853 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2854 long len = RSTRING_LEN(str);
2855
2856 RUBY_ASSERT(capa >= len);
2857 if (capa - len < termlen) {
2858 rb_check_lockedtmp(str);
2859 str_make_independent_expand(str, len, 0L, termlen);
2860 }
2861 else if (str_dependent_p(str)) {
2862 if (termlen > oldtermlen)
2863 str_make_independent_expand(str, len, 0L, termlen);
2864 }
2865 else {
2866 if (!STR_EMBED_P(str)) {
2867 /* modify capa instead of realloc */
2868 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2869 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2870 }
2871 if (termlen > oldtermlen) {
2872 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2873 }
2874 }
2875
2876 return;
2877}
2878
2879static char *
2880str_null_check(VALUE str, int *w)
2881{
2882 char *s = RSTRING_PTR(str);
2883 long len = RSTRING_LEN(str);
2884 int minlen = 1;
2885
2886 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2887 rb_encoding *enc = rb_str_enc_get(str);
2888 minlen = rb_enc_mbminlen(enc);
2889
2890 if (minlen > 1) {
2891 *w = 1;
2892 if (str_null_char(s, len, minlen, enc)) {
2893 return NULL;
2894 }
2895 return str_fill_term(str, s, len, minlen);
2896 }
2897 }
2898
2899 *w = 0;
2900 if (!s || memchr(s, 0, len)) {
2901 return NULL;
2902 }
2903 if (s[len]) {
2904 s = str_fill_term(str, s, len, minlen);
2905 }
2906 return s;
2907}
2908
2909const char *
2910rb_str_null_check(VALUE str)
2911{
2913
2914 char *s;
2915 long len;
2916 RSTRING_GETMEM(str, s, len);
2917
2918 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2919 if (!s || memchr(s, 0, len)) {
2920 rb_raise(rb_eArgError, "string contains null byte");
2921 }
2922 }
2923 else {
2924 int w;
2925 const char *s = str_null_check(str, &w);
2926 if (!s) {
2927 if (w) {
2928 rb_raise(rb_eArgError, "string contains null char");
2929 }
2930 rb_raise(rb_eArgError, "string contains null byte");
2931 }
2932 }
2933
2934 return s;
2935}
2936
2937char *
2938rb_str_to_cstr(VALUE str)
2939{
2940 int w;
2941 return str_null_check(str, &w);
2942}
2943
2944char *
2946{
2947 VALUE str = rb_string_value(ptr);
2948 int w;
2949 char *s = str_null_check(str, &w);
2950 if (!s) {
2951 if (w) {
2952 rb_raise(rb_eArgError, "string contains null char");
2953 }
2954 rb_raise(rb_eArgError, "string contains null byte");
2955 }
2956 return s;
2957}
2958
2959char *
2960rb_str_fill_terminator(VALUE str, const int newminlen)
2961{
2962 char *s = RSTRING_PTR(str);
2963 long len = RSTRING_LEN(str);
2964 return str_fill_term(str, s, len, newminlen);
2965}
2966
2967VALUE
2969{
2970 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2971 return str;
2972}
2973
2974/*
2975 * call-seq:
2976 * String.try_convert(object) -> object, new_string, or nil
2977 *
2978 * Attempts to convert the given +object+ to a string.
2979 *
2980 * If +object+ is already a string, returns +object+, unmodified.
2981 *
2982 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2983 * calls <tt>object.to_str</tt> and returns the result.
2984 *
2985 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2986 *
2987 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2988 */
2989static VALUE
2990rb_str_s_try_convert(VALUE dummy, VALUE str)
2991{
2992 return rb_check_string_type(str);
2993}
2994
2995static char*
2996str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2997{
2998 long nth = *nthp;
2999 if (rb_enc_mbmaxlen(enc) == 1) {
3000 p += nth;
3001 }
3002 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3003 p += nth * rb_enc_mbmaxlen(enc);
3004 }
3005 else if (rb_enc_asciicompat(enc)) {
3006 const char *p2, *e2;
3007 int n;
3008
3009 while (p < e && 0 < nth) {
3010 e2 = p + nth;
3011 if (e < e2) {
3012 *nthp = nth;
3013 return (char *)e;
3014 }
3015 if (ISASCII(*p)) {
3016 p2 = search_nonascii(p, e2);
3017 if (!p2) {
3018 nth -= e2 - p;
3019 *nthp = nth;
3020 return (char *)e2;
3021 }
3022 nth -= p2 - p;
3023 p = p2;
3024 }
3025 n = rb_enc_mbclen(p, e, enc);
3026 p += n;
3027 nth--;
3028 }
3029 *nthp = nth;
3030 if (nth != 0) {
3031 return (char *)e;
3032 }
3033 return (char *)p;
3034 }
3035 else {
3036 while (p < e && nth--) {
3037 p += rb_enc_mbclen(p, e, enc);
3038 }
3039 }
3040 if (p > e) p = e;
3041 *nthp = nth;
3042 return (char*)p;
3043}
3044
3045char*
3046rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3047{
3048 return str_nth_len(p, e, &nth, enc);
3049}
3050
3051static char*
3052str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3053{
3054 if (singlebyte)
3055 p += nth;
3056 else {
3057 p = str_nth_len(p, e, &nth, enc);
3058 }
3059 if (!p) return 0;
3060 if (p > e) p = e;
3061 return (char *)p;
3062}
3063
3064/* char offset to byte offset */
3065static long
3066str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3067{
3068 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3069 if (!pp) return e - p;
3070 return pp - p;
3071}
3072
3073long
3074rb_str_offset(VALUE str, long pos)
3075{
3076 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3077 STR_ENC_GET(str), single_byte_optimizable(str));
3078}
3079
3080#ifdef NONASCII_MASK
3081static char *
3082str_utf8_nth(const char *p, const char *e, long *nthp)
3083{
3084 long nth = *nthp;
3085 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3086 const uintptr_t *s, *t;
3087 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3088 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3089 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3090 while (p < (const char *)s) {
3091 if (is_utf8_lead_byte(*p)) nth--;
3092 p++;
3093 }
3094 do {
3095 nth -= count_utf8_lead_bytes_with_word(s);
3096 s++;
3097 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3098 p = (char *)s;
3099 }
3100 while (p < e) {
3101 if (is_utf8_lead_byte(*p)) {
3102 if (nth == 0) break;
3103 nth--;
3104 }
3105 p++;
3106 }
3107 *nthp = nth;
3108 return (char *)p;
3109}
3110
3111static long
3112str_utf8_offset(const char *p, const char *e, long nth)
3113{
3114 const char *pp = str_utf8_nth(p, e, &nth);
3115 return pp - p;
3116}
3117#endif
3118
3119/* byte offset to char offset */
3120long
3121rb_str_sublen(VALUE str, long pos)
3122{
3123 if (single_byte_optimizable(str) || pos < 0)
3124 return pos;
3125 else {
3126 char *p = RSTRING_PTR(str);
3127 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3128 }
3129}
3130
3131static VALUE
3132str_subseq(VALUE str, long beg, long len)
3133{
3134 VALUE str2;
3135
3136 RUBY_ASSERT(beg >= 0);
3137 RUBY_ASSERT(len >= 0);
3138 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3139
3140 const int termlen = TERM_LEN(str);
3141 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3142 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg, len, rb_str_enc_get(str));
3143 RB_GC_GUARD(str);
3144 return str2;
3145 }
3146
3147 str2 = str_alloc_heap(rb_cString);
3148 if (str_embed_capa(str2) >= len + termlen) {
3149 char *ptr2 = RSTRING(str2)->as.embed.ary;
3150 STR_SET_EMBED(str2);
3151 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3152 TERM_FILL(ptr2+len, termlen);
3153
3154 STR_SET_LEN(str2, len);
3155 RB_GC_GUARD(str);
3156 }
3157 else {
3158 str_replace_shared(str2, str);
3159 RUBY_ASSERT(!STR_EMBED_P(str2));
3160 ENC_CODERANGE_CLEAR(str2);
3161 RSTRING(str2)->as.heap.ptr += beg;
3162 if (RSTRING_LEN(str2) > len) {
3163 STR_SET_LEN(str2, len);
3164 }
3165 }
3166
3167 return str2;
3168}
3169
3170VALUE
3171rb_str_subseq(VALUE str, long beg, long len)
3172{
3173 VALUE str2 = str_subseq(str, beg, len);
3174 rb_enc_cr_str_copy_for_substr(str2, str);
3175 return str2;
3176}
3177
3178char *
3179rb_str_subpos(VALUE str, long beg, long *lenp)
3180{
3181 long len = *lenp;
3182 long slen = -1L;
3183 const long blen = RSTRING_LEN(str);
3184 rb_encoding *enc = STR_ENC_GET(str);
3185 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3186
3187 if (len < 0) return 0;
3188 if (beg < 0 && -beg < 0) return 0;
3189 if (!blen) {
3190 len = 0;
3191 }
3192 if (single_byte_optimizable(str)) {
3193 if (beg > blen) return 0;
3194 if (beg < 0) {
3195 beg += blen;
3196 if (beg < 0) return 0;
3197 }
3198 if (len > blen - beg)
3199 len = blen - beg;
3200 if (len < 0) return 0;
3201 p = s + beg;
3202 goto end;
3203 }
3204 if (beg < 0) {
3205 if (len > -beg) len = -beg;
3206 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3207 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3208 beg = -beg;
3209 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3210 p = e;
3211 if (!p) return 0;
3212 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3213 if (!p) return 0;
3214 len = e - p;
3215 goto end;
3216 }
3217 else {
3218 slen = str_strlen(str, enc);
3219 beg += slen;
3220 if (beg < 0) return 0;
3221 p = s + beg;
3222 if (len == 0) goto end;
3223 }
3224 }
3225 else if (beg > 0 && beg > blen) {
3226 return 0;
3227 }
3228 if (len == 0) {
3229 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3230 p = s + beg;
3231 }
3232#ifdef NONASCII_MASK
3233 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3234 enc == rb_utf8_encoding()) {
3235 p = str_utf8_nth(s, e, &beg);
3236 if (beg > 0) return 0;
3237 len = str_utf8_offset(p, e, len);
3238 }
3239#endif
3240 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3241 int char_sz = rb_enc_mbmaxlen(enc);
3242
3243 p = s + beg * char_sz;
3244 if (p > e) {
3245 return 0;
3246 }
3247 else if (len * char_sz > e - p)
3248 len = e - p;
3249 else
3250 len *= char_sz;
3251 }
3252 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3253 if (beg > 0) return 0;
3254 len = 0;
3255 }
3256 else {
3257 len = str_offset(p, e, len, enc, 0);
3258 }
3259 end:
3260 *lenp = len;
3261 RB_GC_GUARD(str);
3262 return p;
3263}
3264
3265static VALUE str_substr(VALUE str, long beg, long len, int empty);
3266
3267VALUE
3268rb_str_substr(VALUE str, long beg, long len)
3269{
3270 return str_substr(str, beg, len, TRUE);
3271}
3272
3273VALUE
3274rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3275{
3276 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3277}
3278
3279static VALUE
3280str_substr(VALUE str, long beg, long len, int empty)
3281{
3282 char *p = rb_str_subpos(str, beg, &len);
3283
3284 if (!p) return Qnil;
3285 if (!len && !empty) return Qnil;
3286
3287 beg = p - RSTRING_PTR(str);
3288
3289 VALUE str2 = str_subseq(str, beg, len);
3290 rb_enc_cr_str_copy_for_substr(str2, str);
3291 return str2;
3292}
3293
3294/* :nodoc: */
3295VALUE
3297{
3298 if (CHILLED_STRING_P(str)) {
3299 FL_UNSET_RAW(str, STR_CHILLED);
3300 }
3301
3302 if (OBJ_FROZEN(str)) return str;
3303 rb_str_resize(str, RSTRING_LEN(str));
3304 return rb_obj_freeze(str);
3305}
3306
3307/*
3308 * call-seq:
3309 * +string -> new_string or self
3310 *
3311 * Returns +self+ if +self+ is not frozen and can be mutated
3312 * without warning issuance.
3313 *
3314 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3315 *
3316 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3317 */
3318static VALUE
3319str_uplus(VALUE str)
3320{
3321 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3322 return rb_str_dup(str);
3323 }
3324 else {
3325 return str;
3326 }
3327}
3328
3329/*
3330 * call-seq:
3331 * -self -> frozen_string
3332 *
3333 * Returns a frozen string equal to +self+.
3334 *
3335 * The returned string is +self+ if and only if all of the following are true:
3336 *
3337 * - +self+ is already frozen.
3338 * - +self+ is an instance of \String (rather than of a subclass of \String)
3339 * - +self+ has no instance variables set on it.
3340 *
3341 * Otherwise, the returned string is a frozen copy of +self+.
3342 *
3343 * Returning +self+, when possible, saves duplicating +self+;
3344 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3345 *
3346 * It may also save duplicating other, already-existing, strings:
3347 *
3348 * s0 = 'foo'
3349 * s1 = 'foo'
3350 * s0.object_id == s1.object_id # => false
3351 * (-s0).object_id == (-s1).object_id # => true
3352 *
3353 * Note that method #-@ is convenient for defining a constant:
3354 *
3355 * FileName = -'config/database.yml'
3356 *
3357 * While its alias #dedup is better suited for chaining:
3358 *
3359 * 'foo'.dedup.gsub!('o')
3360 *
3361 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3362 */
3363static VALUE
3364str_uminus(VALUE str)
3365{
3366 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3367 str = rb_str_dup(str);
3368 }
3369 return rb_fstring(str);
3370}
3371
3372RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3373#define rb_str_dup_frozen rb_str_new_frozen
3374
3375VALUE
3377{
3378 rb_check_frozen(str);
3379 if (FL_TEST(str, STR_TMPLOCK)) {
3380 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3381 }
3382 FL_SET(str, STR_TMPLOCK);
3383 return str;
3384}
3385
3386VALUE
3388{
3389 rb_check_frozen(str);
3390 if (!FL_TEST(str, STR_TMPLOCK)) {
3391 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3392 }
3393 FL_UNSET(str, STR_TMPLOCK);
3394 return str;
3395}
3396
3397VALUE
3398rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3399{
3400 rb_str_locktmp(str);
3401 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3402}
3403
3404void
3406{
3407 RUBY_ASSERT(ruby_thread_has_gvl_p());
3408
3409 long capa;
3410 const int termlen = TERM_LEN(str);
3411
3412 str_modifiable(str);
3413 if (STR_SHARED_P(str)) {
3414 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3415 }
3416 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3417 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3418 }
3419
3420 int cr = ENC_CODERANGE(str);
3421 if (len == 0) {
3422 /* Empty string does not contain non-ASCII */
3424 }
3425 else if (cr == ENC_CODERANGE_UNKNOWN) {
3426 /* Leave unknown. */
3427 }
3428 else if (len > RSTRING_LEN(str)) {
3429 if (ENC_CODERANGE_CLEAN_P(cr)) {
3430 /* Update the coderange regarding the extended part. */
3431 const char *const prev_end = RSTRING_END(str);
3432 const char *const new_end = RSTRING_PTR(str) + len;
3433 rb_encoding *enc = rb_enc_get(str);
3434 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3435 ENC_CODERANGE_SET(str, cr);
3436 }
3437 else if (cr == ENC_CODERANGE_BROKEN) {
3438 /* May be valid now, by appended part. */
3440 }
3441 }
3442 else if (len < RSTRING_LEN(str)) {
3443 if (cr != ENC_CODERANGE_7BIT) {
3444 /* ASCII-only string is keeping after truncated. Valid
3445 * and broken may be invalid or valid, leave unknown. */
3447 }
3448 }
3449
3450 STR_SET_LEN(str, len);
3451 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3452}
3453
3454VALUE
3455rb_str_resize(VALUE str, long len)
3456{
3457 if (len < 0) {
3458 rb_raise(rb_eArgError, "negative string size (or size too big)");
3459 }
3460
3461 int independent = str_independent(str);
3462 long slen = RSTRING_LEN(str);
3463 const int termlen = TERM_LEN(str);
3464
3465 if (slen > len || (termlen != 1 && slen < len)) {
3467 }
3468
3469 {
3470 long capa;
3471 if (STR_EMBED_P(str)) {
3472 if (len == slen) return str;
3473 if (str_embed_capa(str) >= len + termlen) {
3474 STR_SET_LEN(str, len);
3475 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3476 return str;
3477 }
3478 str_make_independent_expand(str, slen, len - slen, termlen);
3479 }
3480 else if (str_embed_capa(str) >= len + termlen) {
3481 capa = RSTRING(str)->as.heap.aux.capa;
3482 char *ptr = STR_HEAP_PTR(str);
3483 STR_SET_EMBED(str);
3484 if (slen > len) slen = len;
3485 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3486 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3487 STR_SET_LEN(str, len);
3488 if (independent) {
3489 SIZED_FREE_N(ptr, capa + termlen);
3490 }
3491 return str;
3492 }
3493 else if (!independent) {
3494 if (len == slen) return str;
3495 str_make_independent_expand(str, slen, len - slen, termlen);
3496 }
3497 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3498 (capa - len) > (len < 1024 ? len : 1024)) {
3499 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3500 (size_t)len + termlen, STR_HEAP_SIZE(str));
3501 RSTRING(str)->as.heap.aux.capa = len;
3502 }
3503 else if (len == slen) return str;
3504 STR_SET_LEN(str, len);
3505 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3506 }
3507 return str;
3508}
3509
3510static void
3511str_ensure_available_capa(VALUE str, long len)
3512{
3513 str_modify_keep_cr(str);
3514
3515 const int termlen = TERM_LEN(str);
3516 long olen = RSTRING_LEN(str);
3517
3518 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3519 rb_raise(rb_eArgError, "string sizes too big");
3520 }
3521
3522 long total = olen + len;
3523 long capa = str_capacity(str, termlen);
3524
3525 if (capa < total) {
3526 if (total >= LONG_MAX / 2) {
3527 capa = total;
3528 }
3529 while (total > capa) {
3530 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3531 }
3532 RESIZE_CAPA_TERM(str, capa, termlen);
3533 }
3534}
3535
3536static VALUE
3537str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3538{
3539 if (keep_cr) {
3540 str_modify_keep_cr(str);
3541 }
3542 else {
3543 rb_str_modify(str);
3544 }
3545 if (len == 0) return 0;
3546
3547 long total, olen, off = -1;
3548 char *sptr;
3549 const int termlen = TERM_LEN(str);
3550
3551 RSTRING_GETMEM(str, sptr, olen);
3552 if (ptr >= sptr && ptr <= sptr + olen) {
3553 off = ptr - sptr;
3554 }
3555
3556 long capa = str_capacity(str, termlen);
3557
3558 if (olen > LONG_MAX - len) {
3559 rb_raise(rb_eArgError, "string sizes too big");
3560 }
3561 total = olen + len;
3562 if (capa < total) {
3563 if (total >= LONG_MAX / 2) {
3564 capa = total;
3565 }
3566 while (total > capa) {
3567 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3568 }
3569 RESIZE_CAPA_TERM(str, capa, termlen);
3570 sptr = RSTRING_PTR(str);
3571 }
3572 if (off != -1) {
3573 ptr = sptr + off;
3574 }
3575 memcpy(sptr + olen, ptr, len);
3576 STR_SET_LEN(str, total);
3577 TERM_FILL(sptr + total, termlen); /* sentinel */
3578
3579 return str;
3580}
3581
3582#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3583#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3584
3585VALUE
3586rb_str_cat(VALUE str, const char *ptr, long len)
3587{
3588 if (len == 0) return str;
3589 if (len < 0) {
3590 rb_raise(rb_eArgError, "negative string size (or size too big)");
3591 }
3592 return str_buf_cat(str, ptr, len);
3593}
3594
3595VALUE
3596rb_str_cat_cstr(VALUE str, const char *ptr)
3597{
3598 must_not_null(ptr);
3599 return rb_str_buf_cat(str, ptr, strlen(ptr));
3600}
3601
3602static void
3603rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3604{
3605 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3606
3607 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3608 if (UNLIKELY(!str_independent(str))) {
3609 str_make_independent(str);
3610 }
3611
3612 long string_length = -1;
3613 const int null_terminator_length = 1;
3614 char *sptr;
3615 RSTRING_GETMEM(str, sptr, string_length);
3616
3617 // Ensure the resulting string wouldn't be too long.
3618 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3619 rb_raise(rb_eArgError, "string sizes too big");
3620 }
3621
3622 long string_capacity = str_capacity(str, null_terminator_length);
3623
3624 // Get the code range before any modifications since those might clear the code range.
3625 int cr = ENC_CODERANGE(str);
3626
3627 // Check if the string has spare string_capacity to write the new byte.
3628 if (LIKELY(string_capacity >= string_length + 1)) {
3629 // In fast path we can write the new byte and note the string's new length.
3630 sptr[string_length] = byte;
3631 STR_SET_LEN(str, string_length + 1);
3632 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3633 }
3634 else {
3635 // If there's not enough string_capacity, make a call into the general string concatenation function.
3636 str_buf_cat(str, (char *)&byte, 1);
3637 }
3638
3639 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3640 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3641 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3642 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3643 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3644 if (ISASCII(byte)) {
3646 }
3647 else {
3649
3650 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3651 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3652 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3653 }
3654 }
3655 }
3656}
3657
3658RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3659RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3660RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3661
3662static VALUE
3663rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3664 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3665{
3666 int str_encindex = ENCODING_GET(str);
3667 int res_encindex;
3668 int str_cr, res_cr;
3669 rb_encoding *str_enc, *ptr_enc;
3670
3671 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3672
3673 if (str_encindex == ptr_encindex) {
3674 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3675 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3676 }
3677 }
3678 else {
3679 str_enc = rb_enc_from_index(str_encindex);
3680 ptr_enc = rb_enc_from_index(ptr_encindex);
3681 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3682 if (len == 0)
3683 return str;
3684 if (RSTRING_LEN(str) == 0) {
3685 rb_str_buf_cat(str, ptr, len);
3686 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3687 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3688 return str;
3689 }
3690 goto incompatible;
3691 }
3692 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3693 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3694 }
3695 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3696 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3697 str_cr = rb_enc_str_coderange(str);
3698 }
3699 }
3700 }
3701 if (ptr_cr_ret)
3702 *ptr_cr_ret = ptr_cr;
3703
3704 if (str_encindex != ptr_encindex &&
3705 str_cr != ENC_CODERANGE_7BIT &&
3706 ptr_cr != ENC_CODERANGE_7BIT) {
3707 str_enc = rb_enc_from_index(str_encindex);
3708 ptr_enc = rb_enc_from_index(ptr_encindex);
3709 goto incompatible;
3710 }
3711
3712 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3713 res_encindex = str_encindex;
3714 res_cr = ENC_CODERANGE_UNKNOWN;
3715 }
3716 else if (str_cr == ENC_CODERANGE_7BIT) {
3717 if (ptr_cr == ENC_CODERANGE_7BIT) {
3718 res_encindex = str_encindex;
3719 res_cr = ENC_CODERANGE_7BIT;
3720 }
3721 else {
3722 res_encindex = ptr_encindex;
3723 res_cr = ptr_cr;
3724 }
3725 }
3726 else if (str_cr == ENC_CODERANGE_VALID) {
3727 res_encindex = str_encindex;
3728 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3729 res_cr = str_cr;
3730 else
3731 res_cr = ptr_cr;
3732 }
3733 else { /* str_cr == ENC_CODERANGE_BROKEN */
3734 res_encindex = str_encindex;
3735 res_cr = str_cr;
3736 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3737 }
3738
3739 if (len < 0) {
3740 rb_raise(rb_eArgError, "negative string size (or size too big)");
3741 }
3742 str_buf_cat(str, ptr, len);
3743 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3744 return str;
3745
3746 incompatible:
3747 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3748 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3750}
3751
3752VALUE
3753rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3754{
3755 return rb_enc_cr_str_buf_cat(str, ptr, len,
3756 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3757}
3758
3759VALUE
3761{
3762 /* ptr must reference NUL terminated ASCII string. */
3763 int encindex = ENCODING_GET(str);
3764 rb_encoding *enc = rb_enc_from_index(encindex);
3765 if (rb_enc_asciicompat(enc)) {
3766 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3767 encindex, ENC_CODERANGE_7BIT, 0);
3768 }
3769 else {
3770 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3771 while (*ptr) {
3772 unsigned int c = (unsigned char)*ptr;
3773 int len = rb_enc_codelen(c, enc);
3774 rb_enc_mbcput(c, buf, enc);
3775 rb_enc_cr_str_buf_cat(str, buf, len,
3776 encindex, ENC_CODERANGE_VALID, 0);
3777 ptr++;
3778 }
3779 return str;
3780 }
3781}
3782
3783VALUE
3785{
3786 int str2_cr = rb_enc_str_coderange(str2);
3787
3788 if (rb_str_enc_fastpath(str)) {
3789 switch (str2_cr) {
3790 case ENC_CODERANGE_7BIT:
3791 // If RHS is 7bit we can do simple concatenation
3792 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3793 RB_GC_GUARD(str2);
3794 return str;
3796 // If RHS is valid, we can do simple concatenation if encodings are the same
3797 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3798 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3799 int str_cr = ENC_CODERANGE(str);
3800 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3801 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3802 }
3803 RB_GC_GUARD(str2);
3804 return str;
3805 }
3806 }
3807 }
3808
3809 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3810 ENCODING_GET(str2), str2_cr, &str2_cr);
3811
3812 ENC_CODERANGE_SET(str2, str2_cr);
3813
3814 return str;
3815}
3816
3817VALUE
3819{
3820 StringValue(str2);
3821 return rb_str_buf_append(str, str2);
3822}
3823
3824VALUE
3825rb_str_concat_literals(size_t num, const VALUE *strary)
3826{
3827 VALUE str;
3828 size_t i, s = 0;
3829 unsigned long len = 1;
3830
3831 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3832 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3833
3834 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3835 str = rb_str_buf_new(len);
3836 str_enc_copy_direct(str, strary[0]);
3837
3838 for (i = s; i < num; ++i) {
3839 const VALUE v = strary[i];
3840 int encidx = ENCODING_GET(v);
3841
3842 rb_str_buf_append(str, v);
3843 if (encidx != ENCINDEX_US_ASCII) {
3844 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3845 rb_enc_set_index(str, encidx);
3846 }
3847 }
3848 return str;
3849}
3850
3851/*
3852 * call-seq:
3853 * concat(*objects) -> string
3854 *
3855 * :include: doc/string/concat.rdoc
3856 */
3857static VALUE
3858rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3859{
3860 str_modifiable(str);
3861
3862 if (argc == 1) {
3863 return rb_str_concat(str, argv[0]);
3864 }
3865 else if (argc > 1) {
3866 int i;
3867 VALUE arg_str = rb_str_tmp_new(0);
3868 rb_enc_copy(arg_str, str);
3869 for (i = 0; i < argc; i++) {
3870 rb_str_concat(arg_str, argv[i]);
3871 }
3872 rb_str_buf_append(str, arg_str);
3873 }
3874
3875 return str;
3876}
3877
3878/*
3879 * call-seq:
3880 * append_as_bytes(*objects) -> self
3881 *
3882 * Concatenates each object in +objects+ into +self+; returns +self+;
3883 * performs no encoding validation or conversion:
3884 *
3885 * s = 'foo'
3886 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3887 * s.valid_encoding? # => false
3888 * s.append_as_bytes("\xAC 12")
3889 * s.valid_encoding? # => true
3890 *
3891 * When a given object is an integer,
3892 * the value is considered an 8-bit byte;
3893 * if the integer occupies more than one byte (i.e., is greater than 255),
3894 * appends only the low-order byte (similar to String#setbyte):
3895 *
3896 * s = ""
3897 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3898 * s.bytesize # => 2
3899 *
3900 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3901 */
3902
3903VALUE
3904rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3905{
3906 long needed_capacity = 0;
3907 volatile VALUE t0;
3908 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3909
3910 for (int index = 0; index < argc; index++) {
3911 VALUE obj = argv[index];
3912 enum ruby_value_type type = types[index] = rb_type(obj);
3913 switch (type) {
3914 case T_FIXNUM:
3915 case T_BIGNUM:
3916 needed_capacity++;
3917 break;
3918 case T_STRING:
3919 needed_capacity += RSTRING_LEN(obj);
3920 break;
3921 default:
3922 rb_raise(
3924 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3925 rb_obj_class(obj)
3926 );
3927 break;
3928 }
3929 }
3930
3931 str_ensure_available_capa(str, needed_capacity);
3932 char *sptr = RSTRING_END(str);
3933
3934 for (int index = 0; index < argc; index++) {
3935 VALUE obj = argv[index];
3936 enum ruby_value_type type = types[index];
3937 switch (type) {
3938 case T_FIXNUM:
3939 case T_BIGNUM: {
3940 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3941 char byte = (char)(NUM2INT(obj) & 0xFF);
3942 *sptr = byte;
3943 sptr++;
3944 break;
3945 }
3946 case T_STRING: {
3947 const char *ptr;
3948 long len;
3949 RSTRING_GETMEM(obj, ptr, len);
3950 memcpy(sptr, ptr, len);
3951 sptr += len;
3952 break;
3953 }
3954 default:
3955 rb_bug("append_as_bytes arguments should have been validated");
3956 }
3957 }
3958
3959 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3960 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3961
3962 int cr = ENC_CODERANGE(str);
3963 switch (cr) {
3964 case ENC_CODERANGE_7BIT: {
3965 for (int index = 0; index < argc; index++) {
3966 VALUE obj = argv[index];
3967 enum ruby_value_type type = types[index];
3968 switch (type) {
3969 case T_FIXNUM:
3970 case T_BIGNUM: {
3971 if (!ISASCII(NUM2INT(obj))) {
3972 goto clear_cr;
3973 }
3974 break;
3975 }
3976 case T_STRING: {
3977 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3978 goto clear_cr;
3979 }
3980 break;
3981 }
3982 default:
3983 rb_bug("append_as_bytes arguments should have been validated");
3984 }
3985 }
3986 break;
3987 }
3989 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3990 goto keep_cr;
3991 }
3992 else {
3993 goto clear_cr;
3994 }
3995 break;
3996 default:
3997 goto clear_cr;
3998 break;
3999 }
4000
4001 RB_GC_GUARD(t0);
4002
4003 clear_cr:
4004 // If no fast path was hit, we clear the coderange.
4005 // append_as_bytes is predominantly meant to be used in
4006 // buffering situation, hence it's likely the coderange
4007 // will never be scanned, so it's not worth spending time
4008 // precomputing the coderange except for simple and common
4009 // situations.
4011 keep_cr:
4012 return str;
4013}
4014
4015/*
4016 * call-seq:
4017 * self << object -> self
4018 *
4019 * Appends a string representation of +object+ to +self+;
4020 * returns +self+.
4021 *
4022 * If +object+ is a string, appends it to +self+:
4023 *
4024 * s = 'foo'
4025 * s << 'bar' # => "foobar"
4026 * s # => "foobar"
4027 *
4028 * If +object+ is an integer,
4029 * its value is considered a codepoint;
4030 * converts the value to a character before concatenating:
4031 *
4032 * s = 'foo'
4033 * s << 33 # => "foo!"
4034 *
4035 * Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4036 * and the encoding of +self+ is Encoding::US_ASCII,
4037 * changes the encoding to Encoding::ASCII_8BIT:
4038 *
4039 * s = 'foo'.encode(Encoding::US_ASCII)
4040 * s.encoding # => #<Encoding:US-ASCII>
4041 * s << 0xff # => "foo\xFF"
4042 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4043 *
4044 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4045 *
4046 * s = 'foo'
4047 * s.encoding # => #<Encoding:UTF-8>
4048 * s << 0x00110000 # 1114112 out of char range (RangeError)
4049 * s = 'foo'.encode(Encoding::EUC_JP)
4050 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4051 *
4052 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4053 */
4054VALUE
4056{
4057 unsigned int code;
4058 rb_encoding *enc = STR_ENC_GET(str1);
4059 int encidx;
4060
4061 if (RB_INTEGER_TYPE_P(str2)) {
4062 if (rb_num_to_uint(str2, &code) == 0) {
4063 }
4064 else if (FIXNUM_P(str2)) {
4065 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4066 }
4067 else {
4068 rb_raise(rb_eRangeError, "bignum out of char range");
4069 }
4070 }
4071 else {
4072 return rb_str_append(str1, str2);
4073 }
4074
4075 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4076
4077 if (encidx >= 0) {
4078 rb_str_buf_cat_byte(str1, (unsigned char)code);
4079 }
4080 else {
4081 long pos = RSTRING_LEN(str1);
4082 int cr = ENC_CODERANGE(str1);
4083 int len;
4084 char *buf;
4085
4086 switch (len = rb_enc_codelen(code, enc)) {
4087 case ONIGERR_INVALID_CODE_POINT_VALUE:
4088 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4089 break;
4090 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4091 case 0:
4092 rb_raise(rb_eRangeError, "%u out of char range", code);
4093 break;
4094 }
4095 buf = ALLOCA_N(char, len + 1);
4096 rb_enc_mbcput(code, buf, enc);
4097 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4098 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4099 }
4100 rb_str_resize(str1, pos+len);
4101 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4102 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4104 }
4105 else if (cr == ENC_CODERANGE_BROKEN) {
4107 }
4108 ENC_CODERANGE_SET(str1, cr);
4109 }
4110 return str1;
4111}
4112
4113int
4114rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4115{
4116 int encidx = rb_enc_to_index(enc);
4117
4118 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4119 /* US-ASCII automatically extended to ASCII-8BIT */
4120 if (code > 0xFF) {
4121 rb_raise(rb_eRangeError, "%u out of char range", code);
4122 }
4123 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4124 return ENCINDEX_ASCII_8BIT;
4125 }
4126 return encidx;
4127 }
4128 else {
4129 return -1;
4130 }
4131}
4132
4133/*
4134 * call-seq:
4135 * prepend(*other_strings) -> self
4136 *
4137 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4138 *
4139 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4140 *
4141 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4142 *
4143 */
4144
4145static VALUE
4146rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4147{
4148 str_modifiable(str);
4149
4150 if (argc == 1) {
4151 rb_str_update(str, 0L, 0L, argv[0]);
4152 }
4153 else if (argc > 1) {
4154 int i;
4155 VALUE arg_str = rb_str_tmp_new(0);
4156 rb_enc_copy(arg_str, str);
4157 for (i = 0; i < argc; i++) {
4158 rb_str_append(arg_str, argv[i]);
4159 }
4160 rb_str_update(str, 0L, 0L, arg_str);
4161 }
4162
4163 return str;
4164}
4165
4166st_index_t
4168{
4169 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4170 st_index_t precomputed_hash;
4171 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4172
4173 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4174 return precomputed_hash;
4175 }
4176
4177 return str_do_hash(str);
4178}
4179
4180int
4182{
4183 long len1, len2;
4184 const char *ptr1, *ptr2;
4185 RSTRING_GETMEM(str1, ptr1, len1);
4186 RSTRING_GETMEM(str2, ptr2, len2);
4187 return (len1 != len2 ||
4188 !rb_str_comparable(str1, str2) ||
4189 memcmp(ptr1, ptr2, len1) != 0);
4190}
4191
4192/*
4193 * call-seq:
4194 * hash -> integer
4195 *
4196 * :include: doc/string/hash.rdoc
4197 *
4198 */
4199
4200static VALUE
4201rb_str_hash_m(VALUE str)
4202{
4203 st_index_t hval = rb_str_hash(str);
4204 return ST2FIX(hval);
4205}
4206
/* Smaller of two values; yields (a) when they are equal.  Arguments may be evaluated twice. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
4208
4209int
4211{
4212 int idx1, idx2;
4213 int rc1, rc2;
4214
4215 if (RSTRING_LEN(str1) == 0) return TRUE;
4216 if (RSTRING_LEN(str2) == 0) return TRUE;
4217 idx1 = ENCODING_GET(str1);
4218 idx2 = ENCODING_GET(str2);
4219 if (idx1 == idx2) return TRUE;
4220 rc1 = rb_enc_str_coderange(str1);
4221 rc2 = rb_enc_str_coderange(str2);
4222 if (rc1 == ENC_CODERANGE_7BIT) {
4223 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4224 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4225 return TRUE;
4226 }
4227 if (rc2 == ENC_CODERANGE_7BIT) {
4228 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4229 return TRUE;
4230 }
4231 return FALSE;
4232}
4233
4234int
4236{
4237 long len1, len2;
4238 const char *ptr1, *ptr2;
4239 int retval;
4240
4241 if (str1 == str2) return 0;
4242 RSTRING_GETMEM(str1, ptr1, len1);
4243 RSTRING_GETMEM(str2, ptr2, len2);
4244 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4245 if (len1 == len2) {
4246 if (!rb_str_comparable(str1, str2)) {
4247 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4248 return 1;
4249 return -1;
4250 }
4251 return 0;
4252 }
4253 if (len1 > len2) return 1;
4254 return -1;
4255 }
4256 if (retval > 0) return 1;
4257 return -1;
4258}
4259
4260/*
4261 * call-seq:
4262 * self == other -> true or false
4263 *
4264 * Returns whether +other+ is equal to +self+.
4265 *
4266 * When +other+ is a string, returns whether +other+ has the same length and content as +self+:
4267 *
4268 * s = 'foo'
4269 * s == 'foo' # => true
4270 * s == 'food' # => false
4271 * s == 'FOO' # => false
4272 *
4273 * Returns +false+ if the two strings' encodings are not compatible:
4274 *
4275 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4276 *
4277 * When +other+ is not a string:
4278 *
4279 * - If +other+ responds to method <tt>to_str</tt>,
4280 * <tt>other == self</tt> is called and its return value is returned.
4281 * - If +other+ does not respond to <tt>to_str</tt>,
4282 * +false+ is returned.
4283 *
4284 * Related: {Comparing}[rdoc-ref:String@Comparing].
4285 */
4286
4287VALUE
4289{
4290 if (str1 == str2) return Qtrue;
4291 if (!RB_TYPE_P(str2, T_STRING)) {
4292 if (!rb_respond_to(str2, idTo_str)) {
4293 return Qfalse;
4294 }
4295 return rb_equal(str2, str1);
4296 }
4297 return rb_str_eql_internal(str1, str2);
4298}
4299
4300/*
4301 * call-seq:
4302 * eql?(object) -> true or false
4303 *
4304 * :include: doc/string/eql_p.rdoc
4305 *
4306 */
4307
4308VALUE
4309rb_str_eql(VALUE str1, VALUE str2)
4310{
4311 if (str1 == str2) return Qtrue;
4312 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4313 return rb_str_eql_internal(str1, str2);
4314}
4315
4316/*
4317 * call-seq:
4318 * self <=> other -> -1, 0, 1, or nil
4319 *
4320 * Compares +self+ and +other+,
4321 * evaluating their _contents_, not their _lengths_.
4322 *
4323 * Returns:
4324 *
4325 * - +-1+, if +self+ is smaller.
4326 * - +0+, if the two are equal.
4327 * - +1+, if +self+ is larger.
4328 * - +nil+, if the two are incomparable.
4329 *
4330 * Examples:
4331 *
4332 * 'a' <=> 'b' # => -1
4333 * 'a' <=> 'ab' # => -1
4334 * 'a' <=> 'a' # => 0
4335 * 'b' <=> 'a' # => 1
4336 * 'ab' <=> 'a' # => 1
4337 * 'a' <=> :a # => nil
4338 *
4339 * \Class \String includes module Comparable,
4340 * each of whose methods uses String#<=> for comparison.
4341 *
4342 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4343 */
4344
4345static VALUE
4346rb_str_cmp_m(VALUE str1, VALUE str2)
4347{
4348 int result;
4349 VALUE s = rb_check_string_type(str2);
4350 if (NIL_P(s)) {
4351 return rb_invcmp(str1, str2);
4352 }
4353 result = rb_str_cmp(str1, s);
4354 return INT2FIX(result);
4355}
4356
/* Forward declarations: both helpers are defined after the method functions that call them. */
4357static VALUE str_casecmp(VALUE str1, VALUE str2);
4358static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4359
4360/*
4361 * call-seq:
4362 * casecmp(other_string) -> -1, 0, 1, or nil
4363 *
4364 * Ignoring case, compares +self+ and +other_string+; returns:
4365 *
4366 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4367 * - 0 if the two are equal.
4368 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4369 * - +nil+ if the two are incomparable.
4370 *
4371 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4372 *
4373 * Examples:
4374 *
4375 * 'foo'.casecmp('goo') # => -1
4376 * 'goo'.casecmp('foo') # => 1
4377 * 'foo'.casecmp('food') # => -1
4378 * 'food'.casecmp('foo') # => 1
4379 * 'FOO'.casecmp('foo') # => 0
4380 * 'foo'.casecmp('FOO') # => 0
4381 * 'foo'.casecmp(1) # => nil
4382 *
4383 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4384 */
4385
4386static VALUE
4387rb_str_casecmp(VALUE str1, VALUE str2)
4388{
4389 VALUE s = rb_check_string_type(str2);
4390 if (NIL_P(s)) {
4391 return Qnil;
4392 }
4393 return str_casecmp(str1, s);
4394}
4395
4396static VALUE
4397str_casecmp(VALUE str1, VALUE str2)
4398{
4399 long len;
4400 rb_encoding *enc;
4401 const char *p1, *p1end, *p2, *p2end;
4402
4403 enc = rb_enc_compatible(str1, str2);
4404 if (!enc) {
4405 return Qnil;
4406 }
4407
4408 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4409 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4410 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4411 while (p1 < p1end && p2 < p2end) {
4412 if (*p1 != *p2) {
4413 unsigned int c1 = TOLOWER(*p1 & 0xff);
4414 unsigned int c2 = TOLOWER(*p2 & 0xff);
4415 if (c1 != c2)
4416 return INT2FIX(c1 < c2 ? -1 : 1);
4417 }
4418 p1++;
4419 p2++;
4420 }
4421 }
4422 else {
4423 while (p1 < p1end && p2 < p2end) {
4424 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4425 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4426
4427 if (0 <= c1 && 0 <= c2) {
4428 c1 = TOLOWER(c1);
4429 c2 = TOLOWER(c2);
4430 if (c1 != c2)
4431 return INT2FIX(c1 < c2 ? -1 : 1);
4432 }
4433 else {
4434 int r;
4435 l1 = rb_enc_mbclen(p1, p1end, enc);
4436 l2 = rb_enc_mbclen(p2, p2end, enc);
4437 len = l1 < l2 ? l1 : l2;
4438 r = memcmp(p1, p2, len);
4439 if (r != 0)
4440 return INT2FIX(r < 0 ? -1 : 1);
4441 if (l1 != l2)
4442 return INT2FIX(l1 < l2 ? -1 : 1);
4443 }
4444 p1 += l1;
4445 p2 += l2;
4446 }
4447 }
4448 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4449 if (p1 == p1end) return INT2FIX(-1);
4450 return INT2FIX(1);
4451}
4452
4453/*
4454 * call-seq:
4455 * casecmp?(other_string) -> true, false, or nil
4456 *
4457 * Returns +true+ if +self+ and +other_string+ are equal after
4458 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4459 *
4460 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4461 *
4462 * Examples:
4463 *
4464 * 'foo'.casecmp?('goo') # => false
4465 * 'goo'.casecmp?('foo') # => false
4466 * 'foo'.casecmp?('food') # => false
4467 * 'food'.casecmp?('foo') # => false
4468 * 'FOO'.casecmp?('foo') # => true
4469 * 'foo'.casecmp?('FOO') # => true
4470 * 'foo'.casecmp?(1) # => nil
4471 *
4472 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4473 */
4474
4475static VALUE
4476rb_str_casecmp_p(VALUE str1, VALUE str2)
4477{
4478 VALUE s = rb_check_string_type(str2);
4479 if (NIL_P(s)) {
4480 return Qnil;
4481 }
4482 return str_casecmp_p(str1, s);
4483}
4484
4485static VALUE
4486str_casecmp_p(VALUE str1, VALUE str2)
4487{
4488 rb_encoding *enc;
4489 VALUE folded_str1, folded_str2;
4490 VALUE fold_opt = sym_fold;
4491
4492 enc = rb_enc_compatible(str1, str2);
4493 if (!enc) {
4494 return Qnil;
4495 }
4496
4497 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4498 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4499
4500 return rb_str_eql(folded_str1, folded_str2);
4501}
4502
4503static long
4504strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4505 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4506{
4507 const char *search_start = str_ptr;
4508 long pos, search_len = str_len - offset;
4509
4510 for (;;) {
4511 const char *t;
4512 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4513 if (pos < 0) return pos;
4514 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4515 if (t == search_start + pos) break;
4516 search_len -= t - search_start;
4517 if (search_len <= 0) return -1;
4518 offset += t - search_start;
4519 search_start = t;
4520 }
4521 return pos + offset;
4522}
4523
4524/* found index in byte */
/*
 * Both wrappers call rb_strseq_index(), which returns a byte index or a
 * negative value.  rb_str_index passes in_byte = 0, so +offset+ counts
 * characters; rb_str_byteindex passes in_byte = 1, so +offset+ counts bytes.
 */
4525#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4526#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4527
4528static long
4529rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4530{
4531 const char *str_ptr, *str_ptr_end, *sub_ptr;
4532 long str_len, sub_len;
4533 rb_encoding *enc;
4534
4535 enc = rb_enc_check(str, sub);
4536 if (is_broken_string(sub)) return -1;
4537
4538 str_ptr = RSTRING_PTR(str);
4539 str_ptr_end = RSTRING_END(str);
4540 str_len = RSTRING_LEN(str);
4541 sub_ptr = RSTRING_PTR(sub);
4542 sub_len = RSTRING_LEN(sub);
4543
4544 if (str_len < sub_len) return -1;
4545
4546 if (offset != 0) {
4547 long str_len_char, sub_len_char;
4548 int single_byte = single_byte_optimizable(str);
4549 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4550 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4551 if (offset < 0) {
4552 offset += str_len_char;
4553 if (offset < 0) return -1;
4554 }
4555 if (str_len_char - offset < sub_len_char) return -1;
4556 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4557 str_ptr += offset;
4558 }
4559 if (sub_len == 0) return offset;
4560
4561 /* need proceed one character at a time */
4562 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4563}
4564
4565
4566/*
4567 * call-seq:
4568 * index(pattern, offset = 0) -> integer or nil
4569 *
4570 * :include: doc/string/index.rdoc
4571 *
4572 */
4573
4574static VALUE
4575rb_str_index_m(int argc, VALUE *argv, VALUE str)
4576{
4577 VALUE sub;
4578 VALUE initpos;
4579 rb_encoding *enc = STR_ENC_GET(str);
4580 long pos;
4581
4582 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4583 long slen = str_strlen(str, enc); /* str's enc */
4584 pos = NUM2LONG(initpos);
4585 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4586 if (RB_TYPE_P(sub, T_REGEXP)) {
4588 }
4589 return Qnil;
4590 }
4591 }
4592 else {
4593 pos = 0;
4594 }
4595
4596 if (RB_TYPE_P(sub, T_REGEXP)) {
4597 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4598 enc, single_byte_optimizable(str));
4599
4600 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4601 VALUE match = rb_backref_get();
4602 struct re_registers *regs = RMATCH_REGS(match);
4603 pos = rb_str_sublen(str, BEG(0));
4604 return LONG2NUM(pos);
4605 }
4606 }
4607 else {
4608 StringValue(sub);
4609 pos = rb_str_index(str, sub, pos);
4610 if (pos >= 0) {
4611 pos = rb_str_sublen(str, pos);
4612 return LONG2NUM(pos);
4613 }
4614 }
4615 return Qnil;
4616}
4617
4618/* Ensure that the given pos is a valid character boundary.
4619 * Note that in this function, "character" means a code point
4620 * (Unicode scalar value), not a grapheme cluster.
4621 */
4622static void
4623str_ensure_byte_pos(VALUE str, long pos)
4624{
4625 if (!single_byte_optimizable(str)) {
4626 const char *s = RSTRING_PTR(str);
4627 const char *e = RSTRING_END(str);
4628 const char *p = s + pos;
4629 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4630 rb_raise(rb_eIndexError,
4631 "offset %ld does not land on character boundary", pos);
4632 }
4633 }
4634}
4635
4636/*
4637 * call-seq:
4638 * byteindex(object, offset = 0) -> integer or nil
4639 *
4640 * Returns the 0-based integer index of a substring of +self+
4641 * specified by +object+ (a string or Regexp) and +offset+,
4642 * or +nil+ if there is no such substring;
4643 * the returned index is the count of _bytes_ (not characters).
4644 *
4645 * When +object+ is a string,
4646 * returns the index of the first found substring equal to +object+:
4647 *
4648 * s = 'foo' # => "foo"
4649 * s.size # => 3 # Three 1-byte characters.
4650 * s.bytesize # => 3 # Three bytes.
4651 * s.byteindex('f') # => 0
4652 * s.byteindex('o') # => 1
4653 * s.byteindex('oo') # => 1
4654 * s.byteindex('ooo') # => nil
4655 *
4656 * When +object+ is a Regexp,
4657 * returns the index of the first found substring matching +object+;
4658 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4659 *
4660 * s = 'foo'
4661 * s.byteindex(/f/) # => 0
4662 * $~ # => #<MatchData "f">
4663 * s.byteindex(/o/) # => 1
4664 * s.byteindex(/oo/) # => 1
4665 * s.byteindex(/ooo/) # => nil
4666 * $~ # => nil
4667 *
4668 * \Integer argument +offset+, if given, specifies the 0-based index
4669 * of the byte where searching is to begin.
4670 *
4671 * When +offset+ is non-negative,
4672 * searching begins at byte position +offset+:
4673 *
4674 * s = 'foo'
4675 * s.byteindex('o', 1) # => 1
4676 * s.byteindex('o', 2) # => 2
4677 * s.byteindex('o', 3) # => nil
4678 *
4679 * When +offset+ is negative, counts backward from the end of +self+:
4680 *
4681 * s = 'foo'
4682 * s.byteindex('o', -1) # => 2
4683 * s.byteindex('o', -2) # => 1
4684 * s.byteindex('o', -3) # => 1
4685 * s.byteindex('o', -4) # => nil
4686 *
4687 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4688 *
4689 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4690 * s.size # => 2 # Two 3-byte characters.
4691 * s.bytesize # => 6 # Six bytes.
4692 * s.byteindex("\uFFFF") # => 0
4693 * s.byteindex("\uFFFF", 1) # Raises IndexError
4694 * s.byteindex("\uFFFF", 2) # Raises IndexError
4695 * s.byteindex("\uFFFF", 3) # => 3
4696 * s.byteindex("\uFFFF", 4) # Raises IndexError
4697 * s.byteindex("\uFFFF", 5) # Raises IndexError
4698 * s.byteindex("\uFFFF", 6) # => nil
4699 *
4700 * Related: see {Querying}[rdoc-ref:String@Querying].
4701 */
4702
4703static VALUE
4704rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4705{
4706 VALUE sub;
4707 VALUE initpos;
4708 long pos;
4709
4710 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4711 long slen = RSTRING_LEN(str);
4712 pos = NUM2LONG(initpos);
4713 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4714 if (RB_TYPE_P(sub, T_REGEXP)) {
4716 }
4717 return Qnil;
4718 }
4719 }
4720 else {
4721 pos = 0;
4722 }
4723
4724 str_ensure_byte_pos(str, pos);
4725
4726 if (RB_TYPE_P(sub, T_REGEXP)) {
4727 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4728 VALUE match = rb_backref_get();
4729 struct re_registers *regs = RMATCH_REGS(match);
4730 pos = BEG(0);
4731 return LONG2NUM(pos);
4732 }
4733 }
4734 else {
4735 StringValue(sub);
4736 pos = rb_str_byteindex(str, sub, pos);
4737 if (pos >= 0) return LONG2NUM(pos);
4738 }
4739 return Qnil;
4740}
4741
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(3).
 *
 * Scans the +search_len+ bytes at +search_str+ backwards and returns a
 * pointer to the last byte equal to +chr+, or NULL when there is none or
 * +search_len+ is not positive.  As with the libc function, +chr+ is
 * converted to unsigned char before comparing.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;

    /* Also avoids forming a pointer in front of the buffer. */
    if (search_len <= 0) return NULL;

    const char *ptr = search_str + search_len;
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4754
4755static long
4756str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4757{
4758 char *hit, *adjusted;
4759 int c;
4760 long slen, searchlen;
4761 char *sbeg, *e, *t;
4762
4763 sbeg = RSTRING_PTR(str);
4764 slen = RSTRING_LEN(sub);
4765 if (slen == 0) return s - sbeg;
4766 e = RSTRING_END(str);
4767 t = RSTRING_PTR(sub);
4768 c = *t & 0xff;
4769 searchlen = s - sbeg + 1;
4770
4771 if (memcmp(s, t, slen) == 0) {
4772 return s - sbeg;
4773 }
4774
4775 do {
4776 hit = memrchr(sbeg, c, searchlen);
4777 if (!hit) break;
4778 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4779 if (hit != adjusted) {
4780 searchlen = adjusted - sbeg;
4781 continue;
4782 }
4783 if (memcmp(hit, t, slen) == 0)
4784 return hit - sbeg;
4785 searchlen = adjusted - sbeg;
4786 } while (searchlen > 0);
4787
4788 return -1;
4789}
4790
4791/* found index in byte */
4792static long
4793rb_str_rindex(VALUE str, VALUE sub, long pos)
4794{
4795 long len, slen;
4796 char *sbeg, *s;
4797 rb_encoding *enc;
4798 int singlebyte;
4799
4800 enc = rb_enc_check(str, sub);
4801 if (is_broken_string(sub)) return -1;
4802 singlebyte = single_byte_optimizable(str);
4803 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4804 slen = str_strlen(sub, enc); /* rb_enc_check */
4805
4806 /* substring longer than string */
4807 if (len < slen) return -1;
4808 if (len - pos < slen) pos = len - slen;
4809 if (len == 0) return pos;
4810
4811 sbeg = RSTRING_PTR(str);
4812
4813 if (pos == 0) {
4814 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4815 return 0;
4816 else
4817 return -1;
4818 }
4819
4820 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4821 return str_rindex(str, sub, s, enc);
4822}
4823
4824/*
4825 * call-seq:
4826 * rindex(pattern, offset = self.length) -> integer or nil
4827 *
4828 * :include:doc/string/rindex.rdoc
4829 *
4830 */
4831
4832static VALUE
4833rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4834{
4835 VALUE sub;
4836 VALUE initpos;
4837 rb_encoding *enc = STR_ENC_GET(str);
4838 long pos, len = str_strlen(str, enc); /* str's enc */
4839
4840 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4841 pos = NUM2LONG(initpos);
4842 if (pos < 0 && (pos += len) < 0) {
4843 if (RB_TYPE_P(sub, T_REGEXP)) {
4845 }
4846 return Qnil;
4847 }
4848 if (pos > len) pos = len;
4849 }
4850 else {
4851 pos = len;
4852 }
4853
4854 if (RB_TYPE_P(sub, T_REGEXP)) {
4855 /* enc = rb_enc_check(str, sub); */
4856 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4857 enc, single_byte_optimizable(str));
4858
4859 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4860 VALUE match = rb_backref_get();
4861 struct re_registers *regs = RMATCH_REGS(match);
4862 pos = rb_str_sublen(str, BEG(0));
4863 return LONG2NUM(pos);
4864 }
4865 }
4866 else {
4867 StringValue(sub);
4868 pos = rb_str_rindex(str, sub, pos);
4869 if (pos >= 0) {
4870 pos = rb_str_sublen(str, pos);
4871 return LONG2NUM(pos);
4872 }
4873 }
4874 return Qnil;
4875}
4876
4877static long
4878rb_str_byterindex(VALUE str, VALUE sub, long pos)
4879{
4880 long len, slen;
4881 char *sbeg, *s;
4882 rb_encoding *enc;
4883
4884 enc = rb_enc_check(str, sub);
4885 if (is_broken_string(sub)) return -1;
4886 len = RSTRING_LEN(str);
4887 slen = RSTRING_LEN(sub);
4888
4889 /* substring longer than string */
4890 if (len < slen) return -1;
4891 if (len - pos < slen) pos = len - slen;
4892 if (len == 0) return pos;
4893
4894 sbeg = RSTRING_PTR(str);
4895
4896 if (pos == 0) {
4897 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4898 return 0;
4899 else
4900 return -1;
4901 }
4902
4903 s = sbeg + pos;
4904 return str_rindex(str, sub, s, enc);
4905}
4906
4907/*
4908 * call-seq:
4909 * byterindex(object, offset = self.bytesize) -> integer or nil
4910 *
4911 * Returns the 0-based integer index of a substring of +self+
4912 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4913 * or +nil+ if there is no such substring;
4914 * the returned index is the count of _bytes_ (not characters).
4915 *
4916 * When +object+ is a string,
4917 * returns the index of the _last_ found substring equal to +object+:
4918 *
4919 * s = 'foo' # => "foo"
4920 * s.size # => 3 # Three 1-byte characters.
4921 * s.bytesize # => 3 # Three bytes.
4922 * s.byterindex('f') # => 0
 *    s.byterindex('o')   # => 2
 *    s.byterindex('oo')  # => 1
 *    s.byterindex('ooo') # => nil
4926 *
4927 * When +object+ is a Regexp,
4928 * returns the index of the last found substring matching +object+;
4929 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4930 *
4931 * s = 'foo'
4932 * s.byterindex(/f/) # => 0
4933 * $~ # => #<MatchData "f">
4934 * s.byterindex(/o/) # => 2
4935 * s.byterindex(/oo/) # => 1
4936 * s.byterindex(/ooo/) # => nil
4937 * $~ # => nil
4938 *
4939 * The last match means starting at the possible last position,
4940 * not the last of the longest matches:
4941 *
4942 * s = 'foo'
4943 * s.byterindex(/o+/) # => 2
4944 * $~ #=> #<MatchData "o">
4945 *
4946 * To get the last longest match, use a negative lookbehind:
4947 *
4948 * s = 'foo'
4949 * s.byterindex(/(?<!o)o+/) # => 1
4950 * $~ # => #<MatchData "oo">
4951 *
4952 * Or use method #byteindex with negative lookahead:
4953 *
4954 * s = 'foo'
4955 * s.byteindex(/o+(?!.*o)/) # => 1
4956 * $~ #=> #<MatchData "oo">
4957 *
4958 * \Integer argument +offset+, if given, specifies the 0-based index
4959 * of the byte where searching is to end.
4960 *
4961 * When +offset+ is non-negative,
4962 * searching ends at byte position +offset+:
4963 *
4964 * s = 'foo'
4965 * s.byterindex('o', 0) # => nil
4966 * s.byterindex('o', 1) # => 1
4967 * s.byterindex('o', 2) # => 2
4968 * s.byterindex('o', 3) # => 2
4969 *
4970 * When +offset+ is negative, counts backward from the end of +self+:
4971 *
4972 * s = 'foo'
4973 * s.byterindex('o', -1) # => 2
4974 * s.byterindex('o', -2) # => 1
4975 * s.byterindex('o', -3) # => nil
4976 *
4977 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4978 *
4979 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4980 * s.size # => 2 # Two 3-byte characters.
4981 * s.bytesize # => 6 # Six bytes.
4982 * s.byterindex("\uFFFF") # => 3
4983 * s.byterindex("\uFFFF", 1) # Raises IndexError
4984 * s.byterindex("\uFFFF", 2) # Raises IndexError
4985 * s.byterindex("\uFFFF", 3) # => 3
4986 * s.byterindex("\uFFFF", 4) # Raises IndexError
4987 * s.byterindex("\uFFFF", 5) # Raises IndexError
4988 * s.byterindex("\uFFFF", 6) # => nil
4989 *
4990 * Related: see {Querying}[rdoc-ref:String@Querying].
4991 */
4992
4993static VALUE
4994rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4995{
4996 VALUE sub;
4997 VALUE initpos;
4998 long pos, len = RSTRING_LEN(str);
4999
5000 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5001 pos = NUM2LONG(initpos);
5002 if (pos < 0 && (pos += len) < 0) {
5003 if (RB_TYPE_P(sub, T_REGEXP)) {
5005 }
5006 return Qnil;
5007 }
5008 if (pos > len) pos = len;
5009 }
5010 else {
5011 pos = len;
5012 }
5013
5014 str_ensure_byte_pos(str, pos);
5015
5016 if (RB_TYPE_P(sub, T_REGEXP)) {
5017 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5018 VALUE match = rb_backref_get();
5019 struct re_registers *regs = RMATCH_REGS(match);
5020 pos = BEG(0);
5021 return LONG2NUM(pos);
5022 }
5023 }
5024 else {
5025 StringValue(sub);
5026 pos = rb_str_byterindex(str, sub, pos);
5027 if (pos >= 0) return LONG2NUM(pos);
5028 }
5029 return Qnil;
5030}
5031
5032/*
5033 * call-seq:
5034 * self =~ other -> integer or nil
5035 *
5036 * When +other+ is a Regexp:
5037 *
5038 * - Returns the integer index (in characters) of the first match
5039 * for +self+ and +other+, or +nil+ if none;
5040 * - Updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables].
5041 *
5042 * Examples:
5043 *
5044 * 'foo' =~ /f/ # => 0
5045 * $~ # => #<MatchData "f">
5046 * 'foo' =~ /o/ # => 1
5047 * $~ # => #<MatchData "o">
5048 * 'foo' =~ /x/ # => nil
5049 * $~ # => nil
5050 *
5051 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5052 * (see Regexp#=~):
5053 *
5054 * number = nil
5055 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5056 * number # => nil # Not assigned.
5057 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5058 * number # => "9" # Assigned.
5059 *
5060 * When +other+ is not a Regexp, returns the value
5061 * returned by <tt>other =~ self</tt>.
5062 *
5063 * Related: see {Querying}[rdoc-ref:String@Querying].
5064 */
5065
5066static VALUE
5067rb_str_match(VALUE x, VALUE y)
5068{
5069 switch (OBJ_BUILTIN_TYPE(y)) {
5070 case T_STRING:
5071 rb_raise(rb_eTypeError, "type mismatch: String given");
5072
5073 case T_REGEXP:
5074 return rb_reg_match(y, x);
5075
5076 default:
5077 return rb_funcall(y, idEqTilde, 1, x);
5078 }
5079}
5080
5081
5082static VALUE get_pat(VALUE);
5083
5084
5085/*
5086 * call-seq:
5087 * match(pattern, offset = 0) -> matchdata or nil
5088 * match(pattern, offset = 0) {|matchdata| ... } -> object
5089 *
5090 * Creates a MatchData object based on +self+ and the given arguments;
5091 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5092 *
5093 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5094 *
5095 * regexp = Regexp.new(pattern)
5096 *
5097 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5098 * (see Regexp#match):
5099 *
5100 * matchdata = regexp.match(self[offset..])
5101 *
5102 * With no block given, returns the computed +matchdata+ or +nil+:
5103 *
5104 * 'foo'.match('f') # => #<MatchData "f">
5105 * 'foo'.match('o') # => #<MatchData "o">
5106 * 'foo'.match('x') # => nil
5107 * 'foo'.match('f', 1) # => nil
5108 * 'foo'.match('o', 1) # => #<MatchData "o">
5109 *
5110 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5111 * returns the block's return value:
5112 *
5113 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5114 *
5115 * With a block given and +nil+ +matchdata+, does not call the block:
5116 *
5117 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5118 *
5119 * Related: see {Querying}[rdoc-ref:String@Querying].
5120 */
5121
5122static VALUE
5123rb_str_match_m(int argc, VALUE *argv, VALUE str)
5124{
5125 VALUE re, result;
5126 if (argc < 1)
5127 rb_check_arity(argc, 1, 2);
5128 re = argv[0];
5129 argv[0] = str;
5130 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5131 if (!NIL_P(result) && rb_block_given_p()) {
5132 return rb_yield(result);
5133 }
5134 return result;
5135}
5136
5137/*
5138 * call-seq:
5139 * match?(pattern, offset = 0) -> true or false
5140 *
5141 * Returns whether a match is found for +self+ and the given arguments;
5142 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5143 *
5144 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5145 *
5146 * regexp = Regexp.new(pattern)
5147 *
5148 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5149 * +false+ otherwise:
5150 *
5151 * 'foo'.match?(/o/) # => true
5152 * 'foo'.match?('o') # => true
5153 * 'foo'.match?(/x/) # => false
5154 * 'foo'.match?('f', 1) # => false
5155 * 'foo'.match?('o', 1) # => true
5156 *
5157 * Related: see {Querying}[rdoc-ref:String@Querying].
5158 */
5159
5160static VALUE
5161rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5162{
5163 VALUE re;
5164 rb_check_arity(argc, 1, 2);
5165 re = get_pat(argv[0]);
5166 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5167}
5168
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no usable neighbor could be produced */
    NEIGHBOR_FOUND    = 1,  /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range */
};
5174
5175static enum neighbor_char
5176enc_succ_char(char *p, long len, rb_encoding *enc)
5177{
5178 long i;
5179 int l;
5180
5181 if (rb_enc_mbminlen(enc) > 1) {
5182 /* wchar, trivial case */
5183 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5184 if (!MBCLEN_CHARFOUND_P(r)) {
5185 return NEIGHBOR_NOT_CHAR;
5186 }
5187 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5188 l = rb_enc_code_to_mbclen(c, enc);
5189 if (!l) return NEIGHBOR_NOT_CHAR;
5190 if (l != len) return NEIGHBOR_WRAPPED;
5191 rb_enc_mbcput(c, p, enc);
5192 r = rb_enc_precise_mbclen(p, p + len, enc);
5193 if (!MBCLEN_CHARFOUND_P(r)) {
5194 return NEIGHBOR_NOT_CHAR;
5195 }
5196 return NEIGHBOR_FOUND;
5197 }
5198 while (1) {
5199 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5200 p[i] = '\0';
5201 if (i < 0)
5202 return NEIGHBOR_WRAPPED;
5203 ++((unsigned char*)p)[i];
5204 l = rb_enc_precise_mbclen(p, p+len, enc);
5205 if (MBCLEN_CHARFOUND_P(l)) {
5206 l = MBCLEN_CHARFOUND_LEN(l);
5207 if (l == len) {
5208 return NEIGHBOR_FOUND;
5209 }
5210 else {
5211 memset(p+l, 0xff, len-l);
5212 }
5213 }
5214 if (MBCLEN_INVALID_P(l) && i < len-1) {
5215 long len2;
5216 int l2;
5217 for (len2 = len-1; 0 < len2; len2--) {
5218 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5219 if (!MBCLEN_INVALID_P(l2))
5220 break;
5221 }
5222 memset(p+len2+1, 0xff, len-(len2+1));
5223 }
5224 }
5225}
5226
5227static enum neighbor_char
5228enc_pred_char(char *p, long len, rb_encoding *enc)
5229{
5230 long i;
5231 int l;
5232 if (rb_enc_mbminlen(enc) > 1) {
5233 /* wchar, trivial case */
5234 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5235 if (!MBCLEN_CHARFOUND_P(r)) {
5236 return NEIGHBOR_NOT_CHAR;
5237 }
5238 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5239 if (!c) return NEIGHBOR_NOT_CHAR;
5240 --c;
5241 l = rb_enc_code_to_mbclen(c, enc);
5242 if (!l) return NEIGHBOR_NOT_CHAR;
5243 if (l != len) return NEIGHBOR_WRAPPED;
5244 rb_enc_mbcput(c, p, enc);
5245 r = rb_enc_precise_mbclen(p, p + len, enc);
5246 if (!MBCLEN_CHARFOUND_P(r)) {
5247 return NEIGHBOR_NOT_CHAR;
5248 }
5249 return NEIGHBOR_FOUND;
5250 }
5251 while (1) {
5252 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5253 p[i] = '\xff';
5254 if (i < 0)
5255 return NEIGHBOR_WRAPPED;
5256 --((unsigned char*)p)[i];
5257 l = rb_enc_precise_mbclen(p, p+len, enc);
5258 if (MBCLEN_CHARFOUND_P(l)) {
5259 l = MBCLEN_CHARFOUND_LEN(l);
5260 if (l == len) {
5261 return NEIGHBOR_FOUND;
5262 }
5263 else {
5264 memset(p+l, 0, len-l);
5265 }
5266 }
5267 if (MBCLEN_INVALID_P(l) && i < len-1) {
5268 long len2;
5269 int l2;
5270 for (len2 = len-1; 0 < len2; len2--) {
5271 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5272 if (!MBCLEN_INVALID_P(l2))
5273 break;
5274 }
5275 memset(p+len2+1, 0, len-(len2+1));
5276 }
5277 }
5278}
5279
5280/*
5281 overwrite +p+ by succeeding letter in +enc+ and returns
5282 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5283 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5284 assuming each ranges are successive, and mbclen
5285 never change in each ranges.
5286 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5287 character.
5288 */
5289static enum neighbor_char
5290enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5291{
5292 enum neighbor_char ret;
5293 unsigned int c;
5294 int ctype;
5295 int range;
5296 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5297
5298 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5299 int try;
5300 const int max_gaps = 1;
5301
5302 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5303 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5304 ctype = ONIGENC_CTYPE_DIGIT;
5305 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5306 ctype = ONIGENC_CTYPE_ALPHA;
5307 else
5308 return NEIGHBOR_NOT_CHAR;
5309
5310 MEMCPY(save, p, char, len);
5311 for (try = 0; try <= max_gaps; ++try) {
5312 ret = enc_succ_char(p, len, enc);
5313 if (ret == NEIGHBOR_FOUND) {
5314 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5315 if (rb_enc_isctype(c, ctype, enc))
5316 return NEIGHBOR_FOUND;
5317 }
5318 }
5319 MEMCPY(p, save, char, len);
5320 range = 1;
5321 while (1) {
5322 MEMCPY(save, p, char, len);
5323 ret = enc_pred_char(p, len, enc);
5324 if (ret == NEIGHBOR_FOUND) {
5325 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5326 if (!rb_enc_isctype(c, ctype, enc)) {
5327 MEMCPY(p, save, char, len);
5328 break;
5329 }
5330 }
5331 else {
5332 MEMCPY(p, save, char, len);
5333 break;
5334 }
5335 range++;
5336 }
5337 if (range == 1) {
5338 return NEIGHBOR_NOT_CHAR;
5339 }
5340
5341 if (ctype != ONIGENC_CTYPE_DIGIT) {
5342 MEMCPY(carry, p, char, len);
5343 return NEIGHBOR_WRAPPED;
5344 }
5345
5346 MEMCPY(carry, p, char, len);
5347 enc_succ_char(carry, len, enc);
5348 return NEIGHBOR_WRAPPED;
5349}
5350
5351
5352static VALUE str_succ(VALUE str);
5353
5354/*
5355 * call-seq:
5356 * succ -> new_str
5357 *
5358 * :include: doc/string/succ.rdoc
5359 *
5360 */
5361
5362VALUE
5364{
5365 VALUE str;
5366 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5367 rb_enc_cr_str_copy_for_substr(str, orig);
5368 return str_succ(str);
5369}
5370
5371static VALUE
5372str_succ(VALUE str)
5373{
5374 rb_encoding *enc;
5375 char *sbeg, *s, *e, *last_alnum = 0;
5376 int found_alnum = 0;
5377 long l, slen;
5378 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5379 long carry_pos = 0, carry_len = 1;
5380 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5381
5382 slen = RSTRING_LEN(str);
5383 if (slen == 0) return str;
5384
5385 enc = STR_ENC_GET(str);
5386 sbeg = RSTRING_PTR(str);
5387 s = e = sbeg + slen;
5388
5389 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5390 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5391 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5392 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5393 break;
5394 }
5395 }
5396 l = rb_enc_precise_mbclen(s, e, enc);
5397 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5398 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5399 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5400 switch (neighbor) {
5401 case NEIGHBOR_NOT_CHAR:
5402 continue;
5403 case NEIGHBOR_FOUND:
5404 return str;
5405 case NEIGHBOR_WRAPPED:
5406 last_alnum = s;
5407 break;
5408 }
5409 found_alnum = 1;
5410 carry_pos = s - sbeg;
5411 carry_len = l;
5412 }
5413 if (!found_alnum) { /* str contains no alnum */
5414 s = e;
5415 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5416 enum neighbor_char neighbor;
5417 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5418 l = rb_enc_precise_mbclen(s, e, enc);
5419 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5420 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5421 MEMCPY(tmp, s, char, l);
5422 neighbor = enc_succ_char(tmp, l, enc);
5423 switch (neighbor) {
5424 case NEIGHBOR_FOUND:
5425 MEMCPY(s, tmp, char, l);
5426 return str;
5427 break;
5428 case NEIGHBOR_WRAPPED:
5429 MEMCPY(s, tmp, char, l);
5430 break;
5431 case NEIGHBOR_NOT_CHAR:
5432 break;
5433 }
5434 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5435 /* wrapped to \0...\0. search next valid char. */
5436 enc_succ_char(s, l, enc);
5437 }
5438 if (!rb_enc_asciicompat(enc)) {
5439 MEMCPY(carry, s, char, l);
5440 carry_len = l;
5441 }
5442 carry_pos = s - sbeg;
5443 }
5445 }
5446 RESIZE_CAPA(str, slen + carry_len);
5447 sbeg = RSTRING_PTR(str);
5448 s = sbeg + carry_pos;
5449 memmove(s + carry_len, s, slen - carry_pos);
5450 memmove(s, carry, carry_len);
5451 slen += carry_len;
5452 STR_SET_LEN(str, slen);
5453 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5455 return str;
5456}
5457
5458
5459/*
5460 * call-seq:
5461 * succ! -> self
5462 *
5463 * Like String#succ, but modifies +self+ in place; returns +self+.
5464 *
5465 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5466 */
5467
5468static VALUE
5469rb_str_succ_bang(VALUE str)
5470{
5471 rb_str_modify(str);
5472 str_succ(str);
5473 return str;
5474}
5475
/* Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (vacuously true for len <= 0), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5485
5486static int
5487str_upto_i(VALUE str, VALUE arg)
5488{
5489 rb_yield(str);
5490 return 0;
5491}
5492
5493/*
5494 * call-seq:
5495 * upto(other_string, exclusive = false) {|string| ... } -> self
5496 * upto(other_string, exclusive = false) -> new_enumerator
5497 *
5498 * :include: doc/string/upto.rdoc
5499 *
5500 */
5501
5502static VALUE
5503rb_str_upto(int argc, VALUE *argv, VALUE beg)
5504{
5505 VALUE end, exclusive;
5506
5507 rb_scan_args(argc, argv, "11", &end, &exclusive);
5508 RETURN_ENUMERATOR(beg, argc, argv);
5509 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5510}
5511
5512VALUE
5513rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5514{
5515 VALUE current, after_end;
5516 ID succ;
5517 int n, ascii;
5518 rb_encoding *enc;
5519
5520 CONST_ID(succ, "succ");
5521 StringValue(end);
5522 enc = rb_enc_check(beg, end);
5523 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5524 /* single character */
5525 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5526 char c = RSTRING_PTR(beg)[0];
5527 char e = RSTRING_PTR(end)[0];
5528
5529 if (c > e || (excl && c == e)) return beg;
5530 for (;;) {
5531 VALUE str = rb_enc_str_new(&c, 1, enc);
5533 if ((*each)(str, arg)) break;
5534 if (!excl && c == e) break;
5535 c++;
5536 if (excl && c == e) break;
5537 }
5538 return beg;
5539 }
5540 /* both edges are all digits */
5541 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5542 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5543 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5544 VALUE b, e;
5545 int width;
5546
5547 width = RSTRING_LENINT(beg);
5548 b = rb_str_to_inum(beg, 10, FALSE);
5549 e = rb_str_to_inum(end, 10, FALSE);
5550 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5551 long bi = FIX2LONG(b);
5552 long ei = FIX2LONG(e);
5553 rb_encoding *usascii = rb_usascii_encoding();
5554
5555 while (bi <= ei) {
5556 if (excl && bi == ei) break;
5557 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5558 bi++;
5559 }
5560 }
5561 else {
5562 ID op = excl ? '<' : idLE;
5563 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5564
5565 args[0] = INT2FIX(width);
5566 while (rb_funcall(b, op, 1, e)) {
5567 args[1] = b;
5568 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5569 b = rb_funcallv(b, succ, 0, 0);
5570 }
5571 }
5572 return beg;
5573 }
5574 /* normal case */
5575 n = rb_str_cmp(beg, end);
5576 if (n > 0 || (excl && n == 0)) return beg;
5577
5578 after_end = rb_funcallv(end, succ, 0, 0);
5579 current = str_duplicate(rb_cString, beg);
5580 while (!rb_str_equal(current, after_end)) {
5581 VALUE next = Qnil;
5582 if (excl || !rb_str_equal(current, end))
5583 next = rb_funcallv(current, succ, 0, 0);
5584 if ((*each)(current, arg)) break;
5585 if (NIL_P(next)) break;
5586 current = next;
5587 StringValue(current);
5588 if (excl && rb_str_equal(current, end)) break;
5589 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5590 break;
5591 }
5592
5593 return beg;
5594}
5595
5596VALUE
5597rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5598{
5599 VALUE current;
5600 ID succ;
5601
5602 CONST_ID(succ, "succ");
5603 /* both edges are all digits */
5604 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5605 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5606 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5607 int width = RSTRING_LENINT(beg);
5608 b = rb_str_to_inum(beg, 10, FALSE);
5609 if (FIXNUM_P(b)) {
5610 long bi = FIX2LONG(b);
5611 rb_encoding *usascii = rb_usascii_encoding();
5612
5613 while (FIXABLE(bi)) {
5614 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5615 bi++;
5616 }
5617 b = LONG2NUM(bi);
5618 }
5619 args[0] = INT2FIX(width);
5620 while (1) {
5621 args[1] = b;
5622 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5623 b = rb_funcallv(b, succ, 0, 0);
5624 }
5625 }
5626 /* normal case */
5627 current = str_duplicate(rb_cString, beg);
5628 while (1) {
5629 VALUE next = rb_funcallv(current, succ, 0, 0);
5630 if ((*each)(current, arg)) break;
5631 current = next;
5632 StringValue(current);
5633 if (RSTRING_LEN(current) == 0)
5634 break;
5635 }
5636
5637 return beg;
5638}
5639
5640static int
5641include_range_i(VALUE str, VALUE arg)
5642{
5643 VALUE *argp = (VALUE *)arg;
5644 if (!rb_equal(str, *argp)) return 0;
5645 *argp = Qnil;
5646 return 1;
5647}
5648
5649VALUE
5650rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5651{
5652 beg = rb_str_new_frozen(beg);
5653 StringValue(end);
5654 end = rb_str_new_frozen(end);
5655 if (NIL_P(val)) return Qfalse;
5656 val = rb_check_string_type(val);
5657 if (NIL_P(val)) return Qfalse;
5658 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5659 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5660 rb_enc_asciicompat(STR_ENC_GET(val))) {
5661 const char *bp = RSTRING_PTR(beg);
5662 const char *ep = RSTRING_PTR(end);
5663 const char *vp = RSTRING_PTR(val);
5664 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5665 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5666 return Qfalse;
5667 else {
5668 char b = *bp;
5669 char e = *ep;
5670 char v = *vp;
5671
5672 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5673 if (b <= v && v < e) return Qtrue;
5674 return RBOOL(!RTEST(exclusive) && v == e);
5675 }
5676 }
5677 }
5678#if 0
5679 /* both edges are all digits */
5680 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5681 all_digits_p(bp, RSTRING_LEN(beg)) &&
5682 all_digits_p(ep, RSTRING_LEN(end))) {
5683 /* TODO */
5684 }
5685#endif
5686 }
5687 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5688
5689 return RBOOL(NIL_P(val));
5690}
5691
5692static VALUE
5693rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5694{
5695 if (rb_reg_search(re, str, 0, 0) >= 0) {
5696 VALUE match = rb_backref_get();
5697 int nth = rb_reg_backref_number(match, backref);
5698 return rb_reg_nth_match(nth, match);
5699 }
5700 return Qnil;
5701}
5702
5703static VALUE
5704rb_str_aref(VALUE str, VALUE indx)
5705{
5706 long idx;
5707
5708 if (FIXNUM_P(indx)) {
5709 idx = FIX2LONG(indx);
5710 }
5711 else if (RB_TYPE_P(indx, T_REGEXP)) {
5712 return rb_str_subpat(str, indx, INT2FIX(0));
5713 }
5714 else if (RB_TYPE_P(indx, T_STRING)) {
5715 if (rb_str_index(str, indx, 0) != -1)
5716 return str_duplicate(rb_cString, indx);
5717 return Qnil;
5718 }
5719 else {
5720 /* check if indx is Range */
5721 long beg, len = str_strlen(str, NULL);
5722 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5723 case Qfalse:
5724 break;
5725 case Qnil:
5726 return Qnil;
5727 default:
5728 return rb_str_substr(str, beg, len);
5729 }
5730 idx = NUM2LONG(indx);
5731 }
5732
5733 return str_substr(str, idx, 1, FALSE);
5734}
5735
5736
5737/*
5738 * call-seq:
5739 * self[offset] -> new_string or nil
5740 * self[offset, size] -> new_string or nil
5741 * self[range] -> new_string or nil
5742 * self[regexp, capture = 0] -> new_string or nil
5743 * self[substring] -> new_string or nil
5744 *
5745 * :include: doc/string/aref.rdoc
5746 *
5747 */
5748
5749static VALUE
5750rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5751{
5752 if (argc == 2) {
5753 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5754 return rb_str_subpat(str, argv[0], argv[1]);
5755 }
5756 else {
5757 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5758 }
5759 }
5760 rb_check_arity(argc, 1, 2);
5761 return rb_str_aref(str, argv[0]);
5762}
5763
5764VALUE
5766{
5767 char *ptr = RSTRING_PTR(str);
5768 long olen = RSTRING_LEN(str), nlen;
5769
5770 str_modifiable(str);
5771 if (len > olen) len = olen;
5772 nlen = olen - len;
5773 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5774 char *oldptr = ptr;
5775 size_t old_capa = RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5776 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5777 STR_SET_EMBED(str);
5778 ptr = RSTRING(str)->as.embed.ary;
5779 memmove(ptr, oldptr + len, nlen);
5780 if (fl == STR_NOEMBED) {
5781 SIZED_FREE_N(oldptr, old_capa);
5782 }
5783 }
5784 else {
5785 if (!STR_SHARED_P(str)) {
5786 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5787 rb_enc_cr_str_exact_copy(shared, str);
5788 OBJ_FREEZE(shared);
5789 }
5790 ptr = RSTRING(str)->as.heap.ptr += len;
5791 }
5792 STR_SET_LEN(str, nlen);
5793
5794 if (!SHARABLE_MIDDLE_SUBSTRING) {
5795 TERM_FILL(ptr + nlen, TERM_LEN(str));
5796 }
5798 return str;
5799}
5800
5801static void
5802rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5803{
5804 char *sptr;
5805 long slen;
5806 int cr;
5807
5808 if (beg == 0 && vlen == 0) {
5809 rb_str_drop_bytes(str, len);
5810 return;
5811 }
5812
5813 str_modify_keep_cr(str);
5814 RSTRING_GETMEM(str, sptr, slen);
5815 if (len < vlen) {
5816 /* expand string */
5817 RESIZE_CAPA(str, slen + vlen - len);
5818 sptr = RSTRING_PTR(str);
5819 }
5820
5822 cr = rb_enc_str_coderange(val);
5823 else
5825
5826 if (vlen != len) {
5827 memmove(sptr + beg + vlen,
5828 sptr + beg + len,
5829 slen - (beg + len));
5830 }
5831 if (vlen < beg && len < 0) {
5832 MEMZERO(sptr + slen, char, -len);
5833 }
5834 if (vlen > 0) {
5835 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5836 }
5837 slen += vlen - len;
5838 STR_SET_LEN(str, slen);
5839 TERM_FILL(&sptr[slen], TERM_LEN(str));
5840 ENC_CODERANGE_SET(str, cr);
5841}
5842
5843static inline void
5844rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5845{
5846 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5847}
5848
5849void
5850rb_str_update(VALUE str, long beg, long len, VALUE val)
5851{
5852 long slen;
5853 char *p, *e;
5854 rb_encoding *enc;
5855 int singlebyte = single_byte_optimizable(str);
5856 int cr;
5857
5858 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5859
5860 StringValue(val);
5861 enc = rb_enc_check(str, val);
5862 slen = str_strlen(str, enc); /* rb_enc_check */
5863
5864 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5865 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5866 }
5867 if (beg < 0) {
5868 beg += slen;
5869 }
5870 RUBY_ASSERT(beg >= 0);
5871 RUBY_ASSERT(beg <= slen);
5872
5873 if (len > slen - beg) {
5874 len = slen - beg;
5875 }
5876 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5877 if (!p) p = RSTRING_END(str);
5878 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5879 if (!e) e = RSTRING_END(str);
5880 /* error check */
5881 beg = p - RSTRING_PTR(str); /* physical position */
5882 len = e - p; /* physical length */
5883 rb_str_update_0(str, beg, len, val);
5884 rb_enc_associate(str, enc);
5886 if (cr != ENC_CODERANGE_BROKEN)
5887 ENC_CODERANGE_SET(str, cr);
5888}
5889
5890static void
5891rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5892{
5893 int nth;
5894 VALUE match;
5895 long start, end, len;
5896 rb_encoding *enc;
5897 struct re_registers *regs;
5898
5899 if (rb_reg_search(re, str, 0, 0) < 0) {
5900 rb_raise(rb_eIndexError, "regexp not matched");
5901 }
5902 match = rb_backref_get();
5903 nth = rb_reg_backref_number(match, backref);
5904 regs = RMATCH_REGS(match);
5905 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5906 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5907 }
5908 if (nth < 0) {
5909 nth += regs->num_regs;
5910 }
5911
5912 start = BEG(nth);
5913 if (start == -1) {
5914 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5915 }
5916 end = END(nth);
5917 len = end - start;
5918 StringValue(val);
5919 enc = rb_enc_check_str(str, val);
5920 rb_str_update_0(str, start, len, val);
5921 rb_enc_associate(str, enc);
5922}
5923
5924static VALUE
5925rb_str_aset(VALUE str, VALUE indx, VALUE val)
5926{
5927 long idx, beg;
5928
5929 switch (TYPE(indx)) {
5930 case T_REGEXP:
5931 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5932 return val;
5933
5934 case T_STRING:
5935 beg = rb_str_index(str, indx, 0);
5936 if (beg < 0) {
5937 rb_raise(rb_eIndexError, "string not matched");
5938 }
5939 beg = rb_str_sublen(str, beg);
5940 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5941 return val;
5942
5943 default:
5944 /* check if indx is Range */
5945 {
5946 long beg, len;
5947 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5948 rb_str_update(str, beg, len, val);
5949 return val;
5950 }
5951 }
5952 /* FALLTHROUGH */
5953
5954 case T_FIXNUM:
5955 idx = NUM2LONG(indx);
5956 rb_str_update(str, idx, 1, val);
5957 return val;
5958 }
5959}
5960
5961/*
5962 * call-seq:
5963 * self[index] = other_string -> new_string
5964 * self[start, length] = other_string -> new_string
5965 * self[range] = other_string -> new_string
5966 * self[regexp, capture = 0] = other_string -> new_string
5967 * self[substring] = other_string -> new_string
5968 *
5969 * :include: doc/string/aset.rdoc
5970 *
5971 */
5972
5973static VALUE
5974rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5975{
5976 if (argc == 3) {
5977 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5978 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5979 }
5980 else {
5981 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5982 }
5983 return argv[2];
5984 }
5985 rb_check_arity(argc, 2, 3);
5986 return rb_str_aset(str, argv[0], argv[1]);
5987}
5988
5989/*
5990 * call-seq:
5991 * insert(offset, other_string) -> self
5992 *
5993 * :include: doc/string/insert.rdoc
5994 *
5995 */
5996
5997static VALUE
5998rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5999{
6000 long pos = NUM2LONG(idx);
6001
6002 if (pos == -1) {
6003 return rb_str_append(str, str2);
6004 }
6005 else if (pos < 0) {
6006 pos++;
6007 }
6008 rb_str_update(str, pos, 0, str2);
6009 return str;
6010}
6011
6012
6013/*
6014 * call-seq:
6015 * slice!(index) -> new_string or nil
6016 * slice!(start, length) -> new_string or nil
6017 * slice!(range) -> new_string or nil
6018 * slice!(regexp, capture = 0) -> new_string or nil
6019 * slice!(substring) -> new_string or nil
6020 *
6021 * Like String#[] (and its alias String#slice), except that:
6022 *
6023 * - Performs substitutions in +self+ (not in a copy of +self+).
6024 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6025 *
6026 * A few examples:
6027 *
6028 * s = 'hello'
6029 * s.slice!('e') # => "e"
6030 * s # => "hllo"
6031 * s.slice!('e') # => nil
6032 * s # => "hllo"
6033 *
6034 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6035 */
6036
6037static VALUE
6038rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6039{
/*
 * Overview: the type of argv[0] selects how the span to remove is located.
 * Every path ends up with `beg`/`len`, which the `squash` section uses as a
 * byte offset and byte count into str's buffer, and with `result`, the
 * value returned to the caller.  Qnil is returned when nothing is removed.
 */
6040 VALUE result = Qnil;
6041 VALUE indx;
6042 long beg, len = 1;
6043 char *p;
6044
6045 rb_check_arity(argc, 1, 2);
6046 str_modify_keep_cr(str);
6047 indx = argv[0];
/* Regexp index: search from offset 0 and take the span from the backref. */
6048 if (RB_TYPE_P(indx, T_REGEXP)) {
6049 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6050 VALUE match = rb_backref_get();
6051 struct re_registers *regs = RMATCH_REGS(match);
6052 int nth = 0;
/* Optional second argument picks a capture; a negative number is offset
 * by regs->num_regs, and out-of-range numbers yield Qnil. */
6053 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6054 if ((nth += regs->num_regs) <= 0) return Qnil;
6055 }
6056 else if (nth >= regs->num_regs) return Qnil;
6057 beg = BEG(nth);
6058 len = END(nth) - beg;
6059 goto subseq;
6060 }
/* (start, length) form: both arguments are converted with NUM2LONG. */
6061 else if (argc == 2) {
6062 beg = NUM2LONG(indx);
6063 len = NUM2LONG(argv[1]);
6064 goto num_index;
6065 }
/* Single Fixnum index: len still holds its initial value of 1. */
6066 else if (FIXNUM_P(indx)) {
6067 beg = FIX2LONG(indx);
6068 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6069 if (!len) return Qnil;
6070 beg = p - RSTRING_PTR(str);
6071 goto subseq;
6072 }
/* String index: the result is a duplicate of the argument itself, so the
 * `subseq` copy step is skipped. */
6073 else if (RB_TYPE_P(indx, T_STRING)) {
6074 beg = rb_str_index(str, indx, 0);
6075 if (beg == -1) return Qnil;
6076 len = RSTRING_LEN(indx);
6077 result = str_duplicate(rb_cString, indx);
6078 goto squash;
6079 }
/* Anything else: try it as a Range first.  Qfalse from rb_range_beg_len()
 * is treated as "not a range" and falls back to integer conversion. */
6080 else {
6081 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6082 case Qnil:
6083 return Qnil;
6084 case Qfalse:
6085 beg = NUM2LONG(indx);
6086 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6087 if (!len) return Qnil;
6088 beg = p - RSTRING_PTR(str);
6089 goto subseq;
6090 default:
6091 goto num_index;
6092 }
6093 }
6094
/* Resolve (beg, len) through rb_str_subpos() and turn the returned pointer
 * into an offset from the start of str's buffer. */
6095 num_index:
6096 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6097 beg = p - RSTRING_PTR(str);
6098
/* Copy the bytes about to be removed into a new string; the helper name
 * suggests it carries over str's encoding/code range -- TODO confirm. */
6099 subseq:
6100 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6101 rb_enc_cr_str_copy_for_substr(result, str);
6102
/* Close the gap in str: drop leading bytes when the span starts at 0,
 * otherwise shift the tail down, shrink the length and re-terminate. */
6103 squash:
6104 if (len > 0) {
6105 if (beg == 0) {
6106 rb_str_drop_bytes(str, len);
6107 }
6108 else {
6109 char *sptr = RSTRING_PTR(str);
6110 long slen = RSTRING_LEN(str);
6111 if (beg + len > slen) /* pathological check */
6112 len = slen - beg;
6113 memmove(sptr + beg,
6114 sptr + beg + len,
6115 slen - (beg + len));
6116 slen -= len;
6117 STR_SET_LEN(str, slen);
6118 TERM_FILL(&sptr[slen], TERM_LEN(str));
6119 }
6120 }
6121 return result;
6122}
6123
6124static VALUE
6125get_pat(VALUE pat)
6126{
6127 VALUE val;
6128
6129 switch (OBJ_BUILTIN_TYPE(pat)) {
6130 case T_REGEXP:
6131 return pat;
6132
6133 case T_STRING:
6134 break;
6135
6136 default:
6137 val = rb_check_string_type(pat);
6138 if (NIL_P(val)) {
6139 Check_Type(pat, T_REGEXP);
6140 }
6141 pat = val;
6142 }
6143
6144 return rb_reg_regcomp(pat);
6145}
6146
6147static VALUE
6148get_pat_quoted(VALUE pat, int check)
6149{
6150 VALUE val;
6151
6152 switch (OBJ_BUILTIN_TYPE(pat)) {
6153 case T_REGEXP:
6154 return pat;
6155
6156 case T_STRING:
6157 break;
6158
6159 default:
6160 val = rb_check_string_type(pat);
6161 if (NIL_P(val)) {
6162 Check_Type(pat, T_REGEXP);
6163 }
6164 pat = val;
6165 }
6166 if (check && is_broken_string(pat)) {
6167 rb_exc_raise(rb_reg_check_preprocess(pat));
6168 }
6169 return pat;
6170}
6171
6172static long
6173rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6174{
6175 if (BUILTIN_TYPE(pat) == T_STRING) {
6176 pos = rb_str_byteindex(str, pat, pos);
6177 if (set_backref_str) {
6178 if (pos >= 0) {
6179 str = rb_str_new_frozen_String(str);
6180 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6181 if (match) {
6182 *match = match_data;
6183 }
6184 }
6185 else {
6187 }
6188 }
6189 return pos;
6190 }
6191 else {
6192 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6193 }
6194}
6195
6196static long
6197rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6198{
6199 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6200}
6201
6202
6203/*
6204 * call-seq:
6205 * sub!(pattern, replacement) -> self or nil
6206 * sub!(pattern) {|match| ... } -> self or nil
6207 *
6208 * Like String#sub, except that:
6209 *
6210 * - Changes are made to +self+, not to a copy of +self+.
6211 * - Returns +self+ if any changes are made, +nil+ otherwise.
6212 *
6213 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6214 */
6215
6216static VALUE
6217rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6218{
6219 VALUE pat, repl, hash = Qnil;
6220 int iter = 0;
6221 long plen;
6222 int min_arity = rb_block_given_p() ? 1 : 2;
6223 long beg;
6224
6225 rb_check_arity(argc, min_arity, 2);
6226 if (argc == 1) {
6227 iter = 1;
6228 }
6229 else {
6230 repl = argv[1];
6231 if (!RB_TYPE_P(repl, T_STRING)) {
6232 hash = rb_check_hash_type(repl);
6233 if (NIL_P(hash)) {
6234 StringValue(repl);
6235 }
6236 }
6237 }
6238
6239 pat = get_pat_quoted(argv[0], 1);
6240
6241 str_modifiable(str);
6242 beg = rb_pat_search(pat, str, 0, 1);
6243 if (beg >= 0) {
6244 rb_encoding *enc;
6245 int cr = ENC_CODERANGE(str);
6246 long beg0, end0;
6247 VALUE match, match0 = Qnil;
6248 struct re_registers *regs;
6249 char *p, *rp;
6250 long len, rlen;
6251
6252 match = rb_backref_get();
6253 regs = RMATCH_REGS(match);
6254 if (RB_TYPE_P(pat, T_STRING)) {
6255 beg0 = beg;
6256 end0 = beg0 + RSTRING_LEN(pat);
6257 match0 = pat;
6258 }
6259 else {
6260 beg0 = BEG(0);
6261 end0 = END(0);
6262 if (iter) match0 = rb_reg_nth_match(0, match);
6263 }
6264
6265 if (iter || !NIL_P(hash)) {
6266 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6267
6268 if (iter) {
6269 repl = rb_obj_as_string(rb_yield(match0));
6270 }
6271 else {
6272 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6273 repl = rb_obj_as_string(repl);
6274 }
6275 str_mod_check(str, p, len);
6276 rb_check_frozen(str);
6277 }
6278 else {
6279 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6280 }
6281
6282 enc = rb_enc_compatible(str, repl);
6283 if (!enc) {
6284 rb_encoding *str_enc = STR_ENC_GET(str);
6285 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6286 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6287 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6288 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6289 rb_enc_inspect_name(str_enc),
6290 rb_enc_inspect_name(STR_ENC_GET(repl)));
6291 }
6292 enc = STR_ENC_GET(repl);
6293 }
6294 rb_str_modify(str);
6295 rb_enc_associate(str, enc);
6297 int cr2 = ENC_CODERANGE(repl);
6298 if (cr2 == ENC_CODERANGE_BROKEN ||
6299 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6301 else
6302 cr = cr2;
6303 }
6304 plen = end0 - beg0;
6305 rlen = RSTRING_LEN(repl);
6306 len = RSTRING_LEN(str);
6307 if (rlen > plen) {
6308 RESIZE_CAPA(str, len + rlen - plen);
6309 }
6310 p = RSTRING_PTR(str);
6311 if (rlen != plen) {
6312 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6313 }
6314 rp = RSTRING_PTR(repl);
6315 memmove(p + beg0, rp, rlen);
6316 len += rlen - plen;
6317 STR_SET_LEN(str, len);
6318 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6319 ENC_CODERANGE_SET(str, cr);
6320
6321 RB_GC_GUARD(match);
6322
6323 return str;
6324 }
6325 return Qnil;
6326}
6327
6328
6329/*
6330 * call-seq:
6331 * sub(pattern, replacement) -> new_string
6332 * sub(pattern) {|match| ... } -> new_string
6333 *
6334 * :include: doc/string/sub.rdoc
6335 */
6336
6337static VALUE
6338rb_str_sub(int argc, VALUE *argv, VALUE str)
6339{
6340 str = str_duplicate(rb_cString, str);
6341 rb_str_sub_bang(argc, argv, str);
6342 return str;
6343}
6344
/*
 * Shared implementation behind rb_str_gsub() (bang == 0) and
 * rb_str_gsub_bang() (bang != 0).
 *
 * The result is assembled in a fresh buffer `dest` by alternating the
 * unmatched stretches of str with the replacement for each match.
 * With no match at all, Qnil is returned when bang is set and a duplicate
 * of str otherwise.  On success, bang replaces str's contents with dest
 * and returns str; otherwise dest itself is returned.
 *
 * With a single argument and no block, RETURN_ENUMERATOR hands back an
 * enumerator instead.
 */
6345static VALUE
6346str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6347{
6348 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6349 long beg, beg0, end0;
6350 long offset, blen, slen, len, last;
/* Replacement source: STR = string template, ITER = block,
 * FAST_MAP / MAP = hash lookup (see the argc == 2 case below). */
6351 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6352 char *sp, *cp;
/* Tri-state: -1 until the first rb_reg_regsub() call decides it, then
 * nonzero only if that call returned something other than repl itself. */
6353 int need_backref_str = -1;
6354 rb_encoding *str_enc;
6355
6356 switch (argc) {
6357 case 1:
6358 RETURN_ENUMERATOR(str, argc, argv);
6359 mode = ITER;
6360 break;
6361 case 2:
6362 repl = argv[1];
6363 if (!RB_TYPE_P(repl, T_STRING)) {
6364 hash = rb_check_hash_type(repl);
6365 if (NIL_P(hash)) {
6366 StringValue(repl);
6367 }
/* FAST_MAP requires rb_hash_default_unredefined() to hold and the hash to
 * have no default proc; every other hash uses the general MAP mode. */
6368 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6369 mode = FAST_MAP;
6370 }
6371 else {
6372 mode = MAP;
6373 }
6374 }
6375 break;
6376 default:
6377 rb_error_arity(argc, 1, 2);
6378 }
6379
6380 pat = get_pat_quoted(argv[0], 1);
6381 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6382
6383 if (beg < 0) {
6384 if (bang) return Qnil; /* no match, no substitution */
6385 return str_duplicate(rb_cString, str);
6386 }
6387
/* `offset` is how far into str the output has consumed so far; `cp`
 * points at that position.  sp/slen snapshot the buffer for
 * str_mod_check(). */
6388 offset = 0;
6389 blen = RSTRING_LEN(str) + 30; /* len + margin */
6390 dest = rb_str_buf_new(blen);
6391 sp = RSTRING_PTR(str);
6392 slen = RSTRING_LEN(str);
6393 cp = sp;
6394 str_enc = STR_ENC_GET(str);
6395 rb_enc_associate(dest, str_enc);
6396 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6397
/* One iteration per match. */
6398 do {
6399 struct re_registers *regs = RMATCH_REGS(match);
/* [beg0, end0) is the span of the current match. */
6400 if (RB_TYPE_P(pat, T_STRING)) {
6401 beg0 = beg;
6402 end0 = beg0 + RSTRING_LEN(pat);
6403 match0 = pat;
6404 }
6405 else {
6406 beg0 = BEG(0);
6407 end0 = END(0);
6408 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6409 }
6410
/* Compute `val`, the replacement for this match. */
6411 if (mode != STR) {
6412 if (mode == ITER) {
6413 val = rb_obj_as_string(rb_yield(match0));
6414 }
6415 else {
6416 struct RString fake_str = {RBASIC_INIT};
6417 VALUE key;
6418 if (mode == FAST_MAP) {
6419 // It is safe to use a fake_str here because we established that it won't escape,
6420 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6421 // default proc.
6422 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6423 }
6424 else {
6425 key = rb_str_subseq(str, beg0, end0 - beg0);
6426 }
6427 val = rb_hash_aref(hash, key);
6428 val = rb_obj_as_string(val);
6429 }
/* The block / hash lookup ran arbitrary code: verify str was not changed. */
6430 str_mod_check(str, sp, slen);
6431 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6432 rb_raise(rb_eRuntimeError, "block should not cheat");
6433 }
6434 }
6435 else if (need_backref_str) {
6436 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6437 if (need_backref_str < 0) {
6438 need_backref_str = val != repl;
6439 }
6440 }
6441 else {
/* The first expansion returned repl itself, so it is reused unchanged. */
6442 val = repl;
6443 }
6444
6445 len = beg0 - offset; /* copy pre-match substr */
6446 if (len) {
6447 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6448 }
6449
6450 rb_str_buf_append(dest, val);
6451
/* `last` keeps the offset this iteration started from; it is used for the
 * final search after the loop. */
6452 last = offset;
6453 offset = end0;
6454 if (beg0 == end0) {
6455 /*
6456 * Always consume at least one character of the input string
6457 * in order to prevent infinite loops.
6458 */
6459 if (RSTRING_LEN(str) <= end0) break;
6460 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6461 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6462 offset = end0 + len;
6463 }
6464 cp = RSTRING_PTR(str) + offset;
6465 if (offset > RSTRING_LEN(str)) break;
6466
6467 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6468 if (mode != FAST_MAP && mode != STR) {
6469 match = Qnil;
6470 }
6471 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6472
6473 RB_GC_GUARD(match);
6474 } while (beg >= 0);
6475
/* Append whatever follows the last match. */
6476 if (RSTRING_LEN(str) > offset) {
6477 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6478 }
/* Search once more from `last` with set_backref_str = 1; presumably this
 * leaves the backref describing the final successful match -- TODO confirm. */
6479 rb_pat_search0(pat, str, last, 1, &match);
6480 if (bang) {
6481 str_shared_replace(str, dest);
6482 }
6483 else {
6484 str = dest;
6485 }
6486
6487 return str;
6488}
6489
6490
6491/*
6492 * call-seq:
6493 * gsub!(pattern, replacement) -> self or nil
6494 * gsub!(pattern) {|match| ... } -> self or nil
6495 * gsub!(pattern) -> an_enumerator
6496 *
6497 * Like String#gsub, except that:
6498 *
6499 * - Performs substitutions in +self+ (not in a copy of +self+).
6500 * - Returns +self+ if any substitutions were made, +nil+ otherwise.
6501 *
6502 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6503 */
6504
6505static VALUE
6506rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6507{
6508 str_modify_keep_cr(str);
6509 return str_gsub(argc, argv, str, 1);
6510}
6511
6512
6513/*
6514 * call-seq:
6515 * gsub(pattern, replacement) -> new_string
6516 * gsub(pattern) {|match| ... } -> new_string
6517 * gsub(pattern) -> enumerator
6518 *
6519 * Returns a copy of +self+ with zero or more substrings replaced.
6520 *
6521 * Argument +pattern+ may be a string or a Regexp;
6522 * argument +replacement+ may be a string or a Hash.
6523 * Varying types for the argument values makes this method very versatile.
6524 *
6525 * Below are some simple examples;
6526 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6527 *
6528 * With arguments +pattern+ and string +replacement+ given,
6529 * replaces each matching substring with the given +replacement+ string:
6530 *
6531 * s = 'abracadabra'
6532 * s.gsub('ab', 'AB') # => "ABracadABra"
6533 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6534 *
6535 * With arguments +pattern+ and hash +replacement+ given,
6536 * replaces each matching substring with a value from the given +replacement+ hash,
6537 * or removes it:
6538 *
6539 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6540 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6541 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6542 *
6543 * With argument +pattern+ and a block given,
6544 * calls the block with each matching substring;
6545 * replaces that substring with the block's return value:
6546 *
6547 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6548 * # => "ABrACADABrA"
6549 *
6550 * With argument +pattern+ and no block given,
6551 * returns a new Enumerator.
6552 *
6553 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6554 */
6555
6556static VALUE
6557rb_str_gsub(int argc, VALUE *argv, VALUE str)
6558{
6559 return str_gsub(argc, argv, str, 0);
6560}
6561
6562
6563/*
6564 * call-seq:
6565 * replace(other_string) -> self
6566 *
6567 * Replaces the contents of +self+ with the contents of +other_string+;
6568 * returns +self+:
6569 *
6570 * s = 'foo' # => "foo"
6571 * s.replace('bar') # => "bar"
6572 *
6573 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6574 */
6575
6576VALUE
6578{
6579 str_modifiable(str);
6580 if (str == str2) return str;
6581
6582 StringValue(str2);
6583 str_discard(str);
6584 return str_replace(str, str2);
6585}
6586
6587/*
6588 * call-seq:
6589 * clear -> self
6590 *
6591 * Removes the contents of +self+:
6592 *
6593 * s = 'foo'
6594 * s.clear # => ""
6595 * s # => ""
6596 *
6597 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6598 */
6599
6600static VALUE
6601rb_str_clear(VALUE str)
6602{
6603 str_discard(str);
6604 STR_SET_EMBED(str);
6605 STR_SET_LEN(str, 0);
6606 RSTRING_PTR(str)[0] = 0;
6607 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6609 else
6611 return str;
6612}
6613
6614/*
6615 * call-seq:
6616 * chr -> string
6617 *
6618 * :include: doc/string/chr.rdoc
6619 *
6620 */
6621
6622static VALUE
6623rb_str_chr(VALUE str)
6624{
6625 return rb_str_substr(str, 0, 1);
6626}
6627
6628/*
6629 * call-seq:
6630 * getbyte(index) -> integer or nil
6631 *
6632 * :include: doc/string/getbyte.rdoc
6633 *
6634 */
6635VALUE
6636rb_str_getbyte(VALUE str, VALUE index)
6637{
6638 long pos = NUM2LONG(index);
6639
6640 if (pos < 0)
6641 pos += RSTRING_LEN(str);
6642 if (pos < 0 || RSTRING_LEN(str) <= pos)
6643 return Qnil;
6644
6645 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6646}
6647
6648/*
6649 * call-seq:
6650 * setbyte(index, integer) -> integer
6651 *
6652 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6653 * returns +integer+:
6654 *
6655 * s = 'xyzzy'
6656 * s.setbyte(2, 129) # => 129
6657 * s # => "xy\x81zy"
6658 *
6659 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6660 */
6661VALUE
6662rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6663{
6664 long pos = NUM2LONG(index);
6665 long len = RSTRING_LEN(str);
6666 char *ptr, *head, *left = 0;
6667 rb_encoding *enc;
6668 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6669
6670 if (pos < -len || len <= pos)
6671 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6672 if (pos < 0)
6673 pos += len;
6674
6675 VALUE v = rb_to_int(value);
6676 VALUE w = rb_int_and(v, INT2FIX(0xff));
6677 char byte = (char)(NUM2INT(w) & 0xFF);
6678
6679 if (!str_independent(str))
6680 str_make_independent(str);
6681 enc = STR_ENC_GET(str);
6682 head = RSTRING_PTR(str);
6683 ptr = &head[pos];
6684 if (!STR_EMBED_P(str)) {
6685 cr = ENC_CODERANGE(str);
6686 switch (cr) {
6687 case ENC_CODERANGE_7BIT:
6688 left = ptr;
6689 *ptr = byte;
6690 if (ISASCII(byte)) goto end;
6691 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6692 if (!MBCLEN_CHARFOUND_P(nlen))
6694 else
6696 goto end;
6698 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6699 width = rb_enc_precise_mbclen(left, head+len, enc);
6700 *ptr = byte;
6701 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6702 if (!MBCLEN_CHARFOUND_P(nlen))
6704 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6706 goto end;
6707 }
6708 }
6710 *ptr = byte;
6711
6712 end:
6713 return value;
6714}
6715
6716static VALUE
6717str_byte_substr(VALUE str, long beg, long len, int empty)
6718{
6719 long n = RSTRING_LEN(str);
6720
6721 if (beg > n || len < 0) return Qnil;
6722 if (beg < 0) {
6723 beg += n;
6724 if (beg < 0) return Qnil;
6725 }
6726 if (len > n - beg)
6727 len = n - beg;
6728 if (len <= 0) {
6729 if (!empty) return Qnil;
6730 len = 0;
6731 }
6732
6733 VALUE str2 = str_subseq(str, beg, len);
6734
6735 str_enc_copy_direct(str2, str);
6736
6737 if (RSTRING_LEN(str2) == 0) {
6738 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6740 else
6742 }
6743 else {
6744 switch (ENC_CODERANGE(str)) {
6745 case ENC_CODERANGE_7BIT:
6747 break;
6748 default:
6750 break;
6751 }
6752 }
6753
6754 return str2;
6755}
6756
6757VALUE
6758rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6759{
6760 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6761}
6762
6763static VALUE
6764str_byte_aref(VALUE str, VALUE indx)
6765{
6766 long idx;
6767 if (FIXNUM_P(indx)) {
6768 idx = FIX2LONG(indx);
6769 }
6770 else {
6771 /* check if indx is Range */
6772 long beg, len = RSTRING_LEN(str);
6773
6774 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6775 case Qfalse:
6776 break;
6777 case Qnil:
6778 return Qnil;
6779 default:
6780 return str_byte_substr(str, beg, len, TRUE);
6781 }
6782
6783 idx = NUM2LONG(indx);
6784 }
6785 return str_byte_substr(str, idx, 1, FALSE);
6786}
6787
6788/*
6789 * call-seq:
6790 * byteslice(offset, length = 1) -> string or nil
6791 * byteslice(range) -> string or nil
6792 *
6793 * :include: doc/string/byteslice.rdoc
6794 */
6795
6796static VALUE
6797rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6798{
6799 if (argc == 2) {
6800 long beg = NUM2LONG(argv[0]);
6801 long len = NUM2LONG(argv[1]);
6802 return str_byte_substr(str, beg, len, TRUE);
6803 }
6804 rb_check_arity(argc, 1, 2);
6805 return str_byte_aref(str, argv[0]);
6806}
6807
6808static void
6809str_check_beg_len(VALUE str, long *beg, long *len)
6810{
6811 long end, slen = RSTRING_LEN(str);
6812
6813 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6814 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6815 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6816 }
6817 if (*beg < 0) {
6818 *beg += slen;
6819 }
6820 RUBY_ASSERT(*beg >= 0);
6821 RUBY_ASSERT(*beg <= slen);
6822
6823 if (*len > slen - *beg) {
6824 *len = slen - *beg;
6825 }
6826 end = *beg + *len;
6827 str_ensure_byte_pos(str, *beg);
6828 str_ensure_byte_pos(str, end);
6829}
6830
6831/*
6832 * call-seq:
6833 * bytesplice(offset, length, str) -> self
6834 * bytesplice(offset, length, str, str_offset, str_length) -> self
6835 * bytesplice(range, str) -> self
6836 * bytesplice(range, str, str_range) -> self
6837 *
6838 * :include: doc/string/bytesplice.rdoc
6839 */
6840
6841static VALUE
6842rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6843{
6844 long beg, len, vbeg, vlen;
6845 VALUE val;
6846 int cr;
6847
6848 rb_check_arity(argc, 2, 5);
6849 if (!(argc == 2 || argc == 3 || argc == 5)) {
6850 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6851 }
6852 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6853 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6854 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6855 rb_builtin_class_name(argv[0]));
6856 }
6857 val = argv[1];
6858 StringValue(val);
6859 if (argc == 2) {
6860 /* bytesplice(range, str) */
6861 vbeg = 0;
6862 vlen = RSTRING_LEN(val);
6863 }
6864 else {
6865 /* bytesplice(range, str, str_range) */
6866 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6867 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6868 rb_builtin_class_name(argv[2]));
6869 }
6870 }
6871 }
6872 else {
6873 beg = NUM2LONG(argv[0]);
6874 len = NUM2LONG(argv[1]);
6875 val = argv[2];
6876 StringValue(val);
6877 if (argc == 3) {
6878 /* bytesplice(index, length, str) */
6879 vbeg = 0;
6880 vlen = RSTRING_LEN(val);
6881 }
6882 else {
6883 /* bytesplice(index, length, str, str_index, str_length) */
6884 vbeg = NUM2LONG(argv[3]);
6885 vlen = NUM2LONG(argv[4]);
6886 }
6887 }
6888 str_check_beg_len(str, &beg, &len);
6889 str_check_beg_len(val, &vbeg, &vlen);
6890 str_modify_keep_cr(str);
6891
6892 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6893 rb_enc_associate(str, rb_enc_check(str, val));
6894 }
6895
6896 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6898 if (cr != ENC_CODERANGE_BROKEN)
6899 ENC_CODERANGE_SET(str, cr);
6900 return str;
6901}
6902
6903/*
6904 * call-seq:
6905 * reverse -> new_string
6906 *
6907 * Returns a new string with the characters from +self+ in reverse order.
6908 *
6909 * 'drawer'.reverse # => "reward"
6910 * 'reviled'.reverse # => "deliver"
6911 * 'stressed'.reverse # => "desserts"
6912 * 'semordnilaps'.reverse # => "spalindromes"
6913 *
6914 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6915 */
6916
6917static VALUE
6918rb_str_reverse(VALUE str)
6919{
6920 rb_encoding *enc;
6921 VALUE rev;
6922 char *s, *e, *p;
6923 int cr;
6924
6925 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6926 enc = STR_ENC_GET(str);
6927 rev = rb_str_new(0, RSTRING_LEN(str));
6928 s = RSTRING_PTR(str); e = RSTRING_END(str);
6929 p = RSTRING_END(rev);
6930 cr = ENC_CODERANGE(str);
6931
6932 if (RSTRING_LEN(str) > 1) {
6933 if (single_byte_optimizable(str)) {
6934 while (s < e) {
6935 *--p = *s++;
6936 }
6937 }
6938 else if (cr == ENC_CODERANGE_VALID) {
6939 while (s < e) {
6940 int clen = rb_enc_fast_mbclen(s, e, enc);
6941
6942 p -= clen;
6943 memcpy(p, s, clen);
6944 s += clen;
6945 }
6946 }
6947 else {
6948 cr = rb_enc_asciicompat(enc) ?
6950 while (s < e) {
6951 int clen = rb_enc_mbclen(s, e, enc);
6952
6953 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6954 p -= clen;
6955 memcpy(p, s, clen);
6956 s += clen;
6957 }
6958 }
6959 }
6960 STR_SET_LEN(rev, RSTRING_LEN(str));
6961 str_enc_copy_direct(rev, str);
6962 ENC_CODERANGE_SET(rev, cr);
6963
6964 return rev;
6965}
6966
6967
6968/*
6969 * call-seq:
6970 * reverse! -> self
6971 *
6972 * Returns +self+ with its characters reversed:
6973 *
6974 * 'drawer'.reverse! # => "reward"
6975 * 'reviled'.reverse! # => "deliver"
6976 * 'stressed'.reverse! # => "desserts"
6977 * 'semordnilaps'.reverse! # => "spalindromes"
6978 *
6979 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6980 */
6981
6982static VALUE
6983rb_str_reverse_bang(VALUE str)
6984{
6985 if (RSTRING_LEN(str) > 1) {
6986 if (single_byte_optimizable(str)) {
6987 char *s, *e, c;
6988
6989 str_modify_keep_cr(str);
6990 s = RSTRING_PTR(str);
6991 e = RSTRING_END(str) - 1;
6992 while (s < e) {
6993 c = *s;
6994 *s++ = *e;
6995 *e-- = c;
6996 }
6997 }
6998 else {
6999 str_shared_replace(str, rb_str_reverse(str));
7000 }
7001 }
7002 else {
7003 str_modify_keep_cr(str);
7004 }
7005 return str;
7006}
7007
7008
7009/*
7010 * call-seq:
7011 * include?(other_string) -> true or false
7012 *
7013 * Returns whether +self+ contains +other_string+:
7014 *
7015 * s = 'bar'
7016 * s.include?('ba') # => true
7017 * s.include?('ar') # => true
7018 * s.include?('bar') # => true
7019 * s.include?('a') # => true
7020 * s.include?('') # => true
7021 * s.include?('foo') # => false
7022 *
7023 * Related: see {Querying}[rdoc-ref:String@Querying].
7024 */
7025
7026VALUE
7027rb_str_include(VALUE str, VALUE arg)
7028{
7029 long i;
7030
7031 StringValue(arg);
7032 i = rb_str_index(str, arg, 0);
7033
7034 return RBOOL(i != -1);
7035}
7036
7037
7038/*
7039 * call-seq:
7040 * to_i(base = 10) -> integer
7041 *
7042 * Returns the result of interpreting leading characters in +self+
7043 * as an integer in the given +base+;
7044 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7045 *
7046 * '123456'.to_i # => 123456
7047 * '123def'.to_i(16) # => 1195503
7048 *
7049 * With +base+ zero given, +self+ may contain leading characters
7050 * to specify the actual base:
7051 *
7052 * '123def'.to_i(0) # => 123
7053 * '0123def'.to_i(0) # => 83
7054 * '0b123def'.to_i(0) # => 1
7055 * '0o123def'.to_i(0) # => 83
7056 * '0d123def'.to_i(0) # => 123
7057 * '0x123def'.to_i(0) # => 1195503
7058 *
7059 * Characters past a leading valid number (in the given +base+) are ignored:
7060 *
7061 * '12.345'.to_i # => 12
7062 * '12345'.to_i(2) # => 1
7063 *
7064 * Returns zero if there is no leading valid number:
7065 *
7066 * 'abcdef'.to_i # => 0
7067 * '2'.to_i(2) # => 0
7068 *
7069 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7070 */
7071
7072static VALUE
7073rb_str_to_i(int argc, VALUE *argv, VALUE str)
7074{
7075 int base = 10;
7076
7077 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7078 rb_raise(rb_eArgError, "invalid radix %d", base);
7079 }
7080 return rb_str_to_inum(str, base, FALSE);
7081}
7082
7083
7084/*
7085 * call-seq:
7086 * to_f -> float
7087 *
7088 * Returns the result of interpreting leading characters in +self+ as a Float:
7089 *
7090 * '3.14159'.to_f # => 3.14159
7091 * '1.234e-2'.to_f # => 0.01234
7092 *
7093 * Characters past a leading valid number are ignored:
7094 *
7095 * '3.14 (pi to two places)'.to_f # => 3.14
7096 *
7097 * Returns zero if there is no leading valid number:
7098 *
7099 * 'abcdef'.to_f # => 0.0
7100 *
7101 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7102 */
7103
7104static VALUE
7105rb_str_to_f(VALUE str)
7106{
7107 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7108}
7109
7110
7111/*
7112 * call-seq:
7113 * to_s -> self or new_string
7114 *
7115 * Returns +self+ if +self+ is a +String+,
7116 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7117 *
7118 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7119 */
7120
7121static VALUE
7122rb_str_to_s(VALUE str)
7123{
7124 if (rb_obj_class(str) != rb_cString) {
7125 return str_duplicate(rb_cString, str);
7126 }
7127 return str;
7128}
7129
#if 0
/* Compiled out.  Encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7141
/* Size passed to snprintf() when formatting one escaped character.  The
 * longest form produced in this file is "\x{" or "\u{" followed by up to
 * eight hex digits of a 32-bit value and "}", i.e. 12 characters plus the
 * terminating NUL. */
7142#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7143
7144int
7145rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7146{
7147 char buf[CHAR_ESC_LEN + 1];
7148 int l;
7149
7150#if SIZEOF_INT > 4
7151 c &= 0xffffffff;
7152#endif
7153 if (unicode_p) {
7154 if (c < 0x7F && ISPRINT(c)) {
7155 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7156 }
7157 else if (c < 0x10000) {
7158 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7159 }
7160 else {
7161 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7162 }
7163 }
7164 else {
7165 if (c < 0x100) {
7166 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7167 }
7168 else {
7169 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7170 }
7171 }
7172 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7173 rb_str_buf_cat(result, buf, l);
7174 return l;
7175}
7176
/*
 * Returns the backslash spelling for the character codes that have a
 * dedicated short escape, or NULL for every other value.
 */
const char *
ruby_escaped_char(int c)
{
    /* Each entry pairs a character code with its escaped text. */
    static const struct {
        int code;
        const char *text;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].text;
    }
    return NULL;
}
7194
/*
 * Returns a new string holding an escaped rendering of str, tagged as
 * US-ASCII with a 7BIT code range.
 *
 * str is scanned character by character.  `prev` marks the start of the
 * pending run of bytes that can be copied through unchanged; the run is
 * flushed to `result` whenever something has to be escaped:
 *   - bytes that do not form a valid character are written as \xXX;
 *   - characters known to ruby_escaped_char() use its short escape;
 *   - printable ASCII characters (ASCII-compatible encodings only) stay
 *     in the pending run;
 *   - everything else goes through rb_str_buf_cat_escaped_char().
 */
7195VALUE
7196rb_str_escape(VALUE str)
7197{
7198 int encidx = ENCODING_GET(str);
7199 rb_encoding *enc = rb_enc_from_index(encidx);
7200 const char *p = RSTRING_PTR(str);
7201 const char *pend = RSTRING_END(str);
7202 const char *prev = p;
7203 char buf[CHAR_ESC_LEN + 1];
7204 VALUE result = rb_str_buf_new(0);
7205 int unicode_p = rb_enc_unicode_p(enc);
7206 int asciicompat = rb_enc_asciicompat(enc);
7207
7208 while (p < pend) {
7209 unsigned int c;
7210 const char *cc;
7211 int n = rb_enc_precise_mbclen(p, pend, enc);
/* Invalid sequence: flush the pending run, then hex-escape up to
 * rb_enc_mbminlen(enc) bytes (fewer if the string ends first). */
7212 if (!MBCLEN_CHARFOUND_P(n)) {
7213 if (p > prev) str_buf_cat(result, prev, p - prev);
7214 n = rb_enc_mbminlen(enc);
7215 if (pend < p + n)
7216 n = (int)(pend - p);
7217 while (n--) {
7218 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7219 str_buf_cat(result, buf, strlen(buf));
7220 prev = ++p;
7221 }
7222 continue;
7223 }
/* Valid character of n bytes; p is advanced past it, so p - n is its start. */
7224 n = MBCLEN_CHARFOUND_LEN(n);
7225 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7226 p += n;
7227 cc = ruby_escaped_char(c);
7228 if (cc) {
7229 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7230 str_buf_cat(result, cc, strlen(cc));
7231 prev = p;
7232 }
7233 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
/* Printable ASCII: leave it in the pending run. */
7234 }
7235 else {
7236 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7237 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7238 prev = p;
7239 }
7240 }
/* Flush whatever is left of the pending run. */
7241 if (p > prev) str_buf_cat(result, prev, p - prev);
7242 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7243
7244 return result;
7245}
7246
7247/*
7248 * call-seq:
7249 * inspect -> string
7250 *
7251 * :include: doc/string/inspect.rdoc
7252 *
7253 */
7254
7255VALUE
7257{
7258 int encidx = ENCODING_GET(str);
7259 rb_encoding *enc = rb_enc_from_index(encidx);
7260 const char *p, *pend, *prev;
7261 char buf[CHAR_ESC_LEN + 1];
7262 VALUE result = rb_str_buf_new(0);
7263 rb_encoding *resenc = rb_default_internal_encoding();
7264 int unicode_p = rb_enc_unicode_p(enc);
7265 int asciicompat = rb_enc_asciicompat(enc);
7266
7267 if (resenc == NULL) resenc = rb_default_external_encoding();
7268 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7269 rb_enc_associate(result, resenc);
7270 str_buf_cat2(result, "\"");
7271
7272 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7273 prev = p;
7274 while (p < pend) {
7275 unsigned int c, cc;
7276 int n;
7277
7278 n = rb_enc_precise_mbclen(p, pend, enc);
7279 if (!MBCLEN_CHARFOUND_P(n)) {
7280 if (p > prev) str_buf_cat(result, prev, p - prev);
7281 n = rb_enc_mbminlen(enc);
7282 if (pend < p + n)
7283 n = (int)(pend - p);
7284 while (n--) {
7285 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7286 str_buf_cat(result, buf, strlen(buf));
7287 prev = ++p;
7288 }
7289 continue;
7290 }
7291 n = MBCLEN_CHARFOUND_LEN(n);
7292 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7293 p += n;
7294 if ((asciicompat || unicode_p) &&
7295 (c == '"'|| c == '\\' ||
7296 (c == '#' &&
7297 p < pend &&
7298 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7299 (cc = rb_enc_codepoint(p,pend,enc),
7300 (cc == '$' || cc == '@' || cc == '{'))))) {
7301 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7302 str_buf_cat2(result, "\\");
7303 if (asciicompat || enc == resenc) {
7304 prev = p - n;
7305 continue;
7306 }
7307 }
7308 switch (c) {
7309 case '\n': cc = 'n'; break;
7310 case '\r': cc = 'r'; break;
7311 case '\t': cc = 't'; break;
7312 case '\f': cc = 'f'; break;
7313 case '\013': cc = 'v'; break;
7314 case '\010': cc = 'b'; break;
7315 case '\007': cc = 'a'; break;
7316 case 033: cc = 'e'; break;
7317 default: cc = 0; break;
7318 }
7319 if (cc) {
7320 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7321 buf[0] = '\\';
7322 buf[1] = (char)cc;
7323 str_buf_cat(result, buf, 2);
7324 prev = p;
7325 continue;
7326 }
7327 /* The special casing of 0x85 (NEXT_LINE) here is because
7328 * Oniguruma historically treats it as printable, but it
7329 * doesn't match the print POSIX bracket class or character
7330 * property in regexps.
7331 *
7332 * See Ruby Bug #16842 for details:
7333 * https://bugs.ruby-lang.org/issues/16842
7334 */
7335 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7336 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7337 continue;
7338 }
7339 else {
7340 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7341 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7342 prev = p;
7343 continue;
7344 }
7345 }
7346 if (p > prev) str_buf_cat(result, prev, p - prev);
7347 str_buf_cat2(result, "\"");
7348
7349 return result;
7350}
7351
/* True when p is still before e and *p is '$', '@' or '{'.  The dump code
 * below tests it on the byte following a '#' to decide whether that '#'
 * gets a backslash.  Both arguments are evaluated more than once. */
7352 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7353
7354/*
7355 * call-seq:
7356 * dump -> new_string
7357 *
7358 * :include: doc/string/dump.rdoc
7359 *
7360 */
7361
7362VALUE
7364{
7365 int encidx = rb_enc_get_index(str);
7366 rb_encoding *enc = rb_enc_from_index(encidx);
7367 long len;
7368 const char *p, *pend;
7369 char *q, *qend;
7370 VALUE result;
7371 int u8 = (encidx == rb_utf8_encindex());
7372 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7373
7374 len = 2; /* "" */
7375 if (!rb_enc_asciicompat(enc)) {
7376 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7377 len += strlen(enc->name);
7378 }
7379
7380 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7381 while (p < pend) {
7382 int clen;
7383 unsigned char c = *p++;
7384
7385 switch (c) {
7386 case '"': case '\\':
7387 case '\n': case '\r':
7388 case '\t': case '\f':
7389 case '\013': case '\010': case '\007': case '\033':
7390 clen = 2;
7391 break;
7392
7393 case '#':
7394 clen = IS_EVSTR(p, pend) ? 2 : 1;
7395 break;
7396
7397 default:
7398 if (ISPRINT(c)) {
7399 clen = 1;
7400 }
7401 else {
7402 if (u8 && c > 0x7F) { /* \u notation */
7403 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7404 if (MBCLEN_CHARFOUND_P(n)) {
7405 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7406 if (cc <= 0xFFFF)
7407 clen = 6; /* \uXXXX */
7408 else if (cc <= 0xFFFFF)
7409 clen = 9; /* \u{XXXXX} */
7410 else
7411 clen = 10; /* \u{XXXXXX} */
7412 p += MBCLEN_CHARFOUND_LEN(n)-1;
7413 break;
7414 }
7415 }
7416 clen = 4; /* \xNN */
7417 }
7418 break;
7419 }
7420
7421 if (clen > LONG_MAX - len) {
7422 rb_raise(rb_eRuntimeError, "string size too big");
7423 }
7424 len += clen;
7425 }
7426
7427 result = rb_str_new(0, len);
7428 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7429 q = RSTRING_PTR(result); qend = q + len + 1;
7430
7431 *q++ = '"';
7432 while (p < pend) {
7433 unsigned char c = *p++;
7434
7435 if (c == '"' || c == '\\') {
7436 *q++ = '\\';
7437 *q++ = c;
7438 }
7439 else if (c == '#') {
7440 if (IS_EVSTR(p, pend)) *q++ = '\\';
7441 *q++ = '#';
7442 }
7443 else if (c == '\n') {
7444 *q++ = '\\';
7445 *q++ = 'n';
7446 }
7447 else if (c == '\r') {
7448 *q++ = '\\';
7449 *q++ = 'r';
7450 }
7451 else if (c == '\t') {
7452 *q++ = '\\';
7453 *q++ = 't';
7454 }
7455 else if (c == '\f') {
7456 *q++ = '\\';
7457 *q++ = 'f';
7458 }
7459 else if (c == '\013') {
7460 *q++ = '\\';
7461 *q++ = 'v';
7462 }
7463 else if (c == '\010') {
7464 *q++ = '\\';
7465 *q++ = 'b';
7466 }
7467 else if (c == '\007') {
7468 *q++ = '\\';
7469 *q++ = 'a';
7470 }
7471 else if (c == '\033') {
7472 *q++ = '\\';
7473 *q++ = 'e';
7474 }
7475 else if (ISPRINT(c)) {
7476 *q++ = c;
7477 }
7478 else {
7479 *q++ = '\\';
7480 if (u8) {
7481 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7482 if (MBCLEN_CHARFOUND_P(n)) {
7483 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7484 p += n;
7485 if (cc <= 0xFFFF)
7486 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7487 else
7488 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7489 q += strlen(q);
7490 continue;
7491 }
7492 }
7493 snprintf(q, qend-q, "x%02X", c);
7494 q += 3;
7495 }
7496 }
7497 *q++ = '"';
7498 *q = '\0';
7499 if (!rb_enc_asciicompat(enc)) {
7500 snprintf(q, qend-q, nonascii_suffix, enc->name);
7501 encidx = rb_ascii8bit_encindex();
7502 }
7503 /* result from dump is ASCII */
7504 rb_enc_associate_index(result, encidx);
7506 return result;
7507}
7508
/*
 * Maps the letter of a single-character backslash escape (the character
 * that follows the backslash) to the byte it stands for:
 *
 *   n -> LF, r -> CR, t -> TAB, f -> FF, v -> VT, b -> BS, a -> BEL, e -> ESC
 *
 * Returns -1 for any other character, so that every path through the
 * function returns a defined value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1;
}
7532
/*
 * Decodes one backslash escape of a dumped string and appends the decoded
 * bytes to undumped.
 *
 *   undumped - string being built; bytes are appended with rb_str_cat
 *   ss       - in/out cursor: on entry *ss points at the character right
 *              after the backslash, on return it points past the escape
 *   s_end    - end of the dumped input
 *   penc     - in/out encoding of undumped; switched to UTF-8 (and
 *              undumped re-associated) when a \u escape is seen
 *   utf8     - set to true once a \u escape has been seen
 *   binary   - set to true once a \xNN escape with a non-ASCII value has
 *              been seen
 *
 * Raises RuntimeError for malformed \u or \x escapes, for surrogate or too
 * large code points, and when \u escapes are mixed with non-ASCII \x
 * escapes.  An unrecognized escape letter is not an error: the backslash
 * and the letter are copied through unchanged.
 */
7533 static void
7534 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7535 {
7536 const char *s = *ss;
7537 unsigned int c;
7538 int codelen;
7539 size_t hexlen;
7540 unsigned char buf[6];
/* looked up on first use and kept across calls */
7541 static rb_encoding *enc_utf8 = NULL;
7542
7543 switch (*s) {
7544 case '\\':
7545 case '"':
7546 case '#':
7547 rb_str_cat(undumped, s, 1); /* cat itself */
7548 s++;
7549 break;
7550 case 'n':
7551 case 'r':
7552 case 't':
7553 case 'f':
7554 case 'v':
7555 case 'b':
7556 case 'a':
7557 case 'e':
/* single-letter control escapes */
7558 *buf = unescape_ascii(*s);
7559 rb_str_cat(undumped, (char *)buf, 1);
7560 s++;
7561 break;
7562 case 'u':
/* a \u escape may not follow a non-ASCII \x escape */
7563 if (*binary) {
7564 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7565 }
7566 *utf8 = true;
7567 if (++s >= s_end) {
7568 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7569 }
7570 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7571 if (*penc != enc_utf8) {
7572 *penc = enc_utf8;
7573 rb_enc_associate(undumped, enc_utf8);
7574 }
7575 if (*s == '{') { /* handle \u{...} form */
7576 s++;
/* the braces may hold several code points; whitespace between them is skipped */
7577 for (;;) {
7578 if (s >= s_end) {
7579 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7580 }
7581 if (*s == '}') {
7582 s++;
7583 break;
7584 }
7585 if (ISSPACE(*s)) {
7586 s++;
7587 continue;
7588 }
7589 c = scan_hex(s, s_end-s, &hexlen);
7590 if (hexlen == 0 || hexlen > 6) {
7591 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7592 }
7593 if (c > 0x10ffff) {
7594 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7595 }
/* surrogate code points are rejected */
7596 if (0xd800 <= c && c <= 0xdfff) {
7597 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7598 }
7599 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7600 rb_str_cat(undumped, (char *)buf, codelen);
7601 s += hexlen;
7602 }
7603 }
7604 else { /* handle \uXXXX form */
/* exactly four hex digits are required */
7605 c = scan_hex(s, 4, &hexlen);
7606 if (hexlen != 4) {
7607 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7608 }
7609 if (0xd800 <= c && c <= 0xdfff) {
7610 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7611 }
7612 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7613 rb_str_cat(undumped, (char *)buf, codelen);
7614 s += hexlen;
7615 }
7616 break;
7617 case 'x':
7618 if (++s >= s_end) {
7619 rb_raise(rb_eRuntimeError, "invalid hex escape");
7620 }
/* exactly two hex digits are required */
7621 *buf = scan_hex(s, 2, &hexlen);
7622 if (hexlen != 2) {
7623 rb_raise(rb_eRuntimeError, "invalid hex escape");
7624 }
/* only a non-ASCII byte value marks the string as binary */
7625 if (!ISASCII(*buf)) {
7626 if (*utf8) {
7627 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7628 }
7629 *binary = true;
7630 }
7631 rb_str_cat(undumped, (char *)buf, 1);
7632 s += hexlen;
7633 break;
7634 default:
/* unknown escape: s-1 is the backslash, so both characters are kept */
7635 rb_str_cat(undumped, s-1, 2);
7636 s++;
7637 }
7638
7639 *ss = s;
7640 }
7641
7642static VALUE rb_str_is_ascii_only_p(VALUE str);
7643
7644/*
7645 * call-seq:
7646 * undump -> new_string
7647 *
7648 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7649 *
7650 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7651 */
7652
7653static VALUE
7654str_undump(VALUE str)
7655{
7656 const char *s = RSTRING_PTR(str);
7657 const char *s_end = RSTRING_END(str);
7658 rb_encoding *enc = rb_enc_get(str);
7659 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7660 bool utf8 = false;
7661 bool binary = false;
7662 int w;
7663
7665 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7666 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7667 }
7668 if (!str_null_check(str, &w)) {
7669 rb_raise(rb_eRuntimeError, "string contains null byte");
7670 }
7671 if (RSTRING_LEN(str) < 2) goto invalid_format;
7672 if (*s != '"') goto invalid_format;
7673
7674 /* strip '"' at the start */
7675 s++;
7676
7677 for (;;) {
7678 if (s >= s_end) {
7679 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7680 }
7681
7682 if (*s == '"') {
7683 /* epilogue */
7684 s++;
7685 if (s == s_end) {
7686 /* ascii compatible dumped string */
7687 break;
7688 }
7689 else {
7690 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7691 static const char dup_suffix[] = ".dup";
7692 const char *encname;
7693 int encidx;
7694 ptrdiff_t size;
7695
7696 /* check separately for strings dumped by older versions */
7697 size = sizeof(dup_suffix) - 1;
7698 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7699
7700 size = sizeof(force_encoding_suffix) - 1;
7701 if (s_end - s <= size) goto invalid_format;
7702 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7703 s += size;
7704
7705 if (utf8) {
7706 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7707 }
7708
7709 encname = s;
7710 s = memchr(s, '"', s_end-s);
7711 size = s - encname;
7712 if (!s) goto invalid_format;
7713 if (s_end - s != 2) goto invalid_format;
7714 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7715
7716 encidx = rb_enc_find_index2(encname, (long)size);
7717 if (encidx < 0) {
7718 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7719 }
7720 rb_enc_associate_index(undumped, encidx);
7721 }
7722 break;
7723 }
7724
7725 if (*s == '\\') {
7726 s++;
7727 if (s >= s_end) {
7728 rb_raise(rb_eRuntimeError, "invalid escape");
7729 }
7730 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7731 }
7732 else {
7733 rb_str_cat(undumped, s++, 1);
7734 }
7735 }
7736
7737 RB_GC_GUARD(str);
7738
7739 return undumped;
7740invalid_format:
7741 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7742}
7743
7744static void
7745rb_str_check_dummy_enc(rb_encoding *enc)
7746{
7747 if (rb_enc_dummy_p(enc)) {
7748 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7749 rb_enc_name(enc));
7750 }
7751}
7752
7753static rb_encoding *
7754str_true_enc(VALUE str)
7755{
7756 rb_encoding *enc = STR_ENC_GET(str);
7757 rb_str_check_dummy_enc(enc);
7758 return enc;
7759}
7760
7761static OnigCaseFoldType
7762check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7763{
7764 if (argc==0)
7765 return flags;
7766 if (argc>2)
7767 rb_raise(rb_eArgError, "too many options");
7768 if (argv[0]==sym_turkic) {
7769 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7770 if (argc==2) {
7771 if (argv[1]==sym_lithuanian)
7772 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7773 else
7774 rb_raise(rb_eArgError, "invalid second option");
7775 }
7776 }
7777 else if (argv[0]==sym_lithuanian) {
7778 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7779 if (argc==2) {
7780 if (argv[1]==sym_turkic)
7781 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7782 else
7783 rb_raise(rb_eArgError, "invalid second option");
7784 }
7785 }
7786 else if (argc>1)
7787 rb_raise(rb_eArgError, "too many options");
7788 else if (argv[0]==sym_ascii)
7789 flags |= ONIGENC_CASE_ASCII_ONLY;
7790 else if (argv[0]==sym_fold) {
7791 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7792 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7793 else
7794 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7795 }
7796 else
7797 rb_raise(rb_eArgError, "invalid option");
7798 return flags;
7799}
7800
7801static inline bool
7802case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7803{
7804 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7805 return true;
7806 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7807}
7808
7809 /* 16 should be long enough to absorb any kind of single character length increase; the value used below is 20 */
7810#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7811#ifndef CASEMAP_DEBUG
7812# define CASEMAP_DEBUG 0
7813#endif
7814
7815struct mapping_buffer;
7816typedef struct mapping_buffer {
7817 size_t capa;
7818 size_t used;
7819 struct mapping_buffer *next;
7820 OnigUChar space[FLEX_ARY_LEN];
7822
7823static void
7824mapping_buffer_free(void *p)
7825{
7826 mapping_buffer *previous_buffer;
7827 mapping_buffer *current_buffer = p;
7828 while (current_buffer) {
7829 previous_buffer = current_buffer;
7830 current_buffer = current_buffer->next;
7831 ruby_sized_xfree(previous_buffer, offsetof(mapping_buffer, space) + previous_buffer->capa);
7832 }
7833}
7834
/* Typed-data descriptor for the object that owns a mapping_buffer chain
 * (rb_str_casemap wraps the chain with TypedData_Wrap_Struct using this
 * type). */
7835 static const rb_data_type_t mapping_buffer_type = {
7836 "mapping_buffer",
/* NOTE(review): presumably {mark, free}: no mark function, and
 * mapping_buffer_free as the free function -- confirm against the
 * rb_data_type_t definition */
7837 {0, mapping_buffer_free,},
7838 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7839 };
7840
/*
 * Returns a new string holding source case-mapped by enc->case_map.
 *
 *   source - string to map; an empty source is simply duplicated
 *   flags  - in/out mapping flags, handed to case_map by pointer (callers
 *            in this file test ONIGENC_CASE_MODIFIED in it afterwards)
 *   enc    - encoding whose case_map function is used
 *
 * The output is collected in a chain of heap buffers and copied into the
 * result string at the end.  Raises ArgumentError ("input string invalid")
 * when case_map returns a negative value; the chain is freed first.
 */
7841 static VALUE
7842 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7843 {
7844 VALUE target;
7845
7846 const OnigUChar *source_current, *source_end;
7847 int target_length = 0;
7848 VALUE buffer_anchor;
7849 mapping_buffer *current_buffer = 0;
7850 mapping_buffer **pre_buffer;
7851 size_t buffer_count = 0;
7852 int buffer_length_or_invalid;
7853
7854 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7855
7856 source_current = (OnigUChar*)RSTRING_PTR(source);
7857 source_end = (OnigUChar*)RSTRING_END(source);
7858
/* the chain hangs off a typed-data object whose free function is
 * mapping_buffer_free; pre_buffer always points at the link to fill next */
7859 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7860 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
/* one buffer per iteration, until case_map has consumed all of source
 * (source_current is passed by address; presumably case_map advances it) */
7861 while (source_current < source_end) {
7862 /* increase multiplier using buffer count to converge quickly */
7863 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7864 if (CASEMAP_DEBUG) {
7865 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7866 }
7867 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
/* link the new buffer in before it is used */
7868 *pre_buffer = current_buffer;
7869 pre_buffer = &current_buffer->next;
7870 current_buffer->next = NULL;
7871 current_buffer->capa = capa;
7872 buffer_length_or_invalid = enc->case_map(flags,
7873 &source_current, source_end,
7874 current_buffer->space,
7875 current_buffer->space+current_buffer->capa,
7876 enc);
7877 if (buffer_length_or_invalid < 0) {
/* detach the chain from the anchor, free it here, then raise */
7878 current_buffer = DATA_PTR(buffer_anchor);
7879 DATA_PTR(buffer_anchor) = 0;
7880 mapping_buffer_free(current_buffer);
7881 rb_raise(rb_eArgError, "input string invalid");
7882 }
7883 target_length += current_buffer->used = buffer_length_or_invalid;
7884 }
7885 if (CASEMAP_DEBUG) {
7886 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7887 }
7888
/* a single buffer is copied directly; several are concatenated */
7889 if (buffer_count==1) {
7890 target = rb_str_new((const char*)current_buffer->space, target_length);
7891 }
7892 else {
7893 char *target_current;
7894
7895 target = rb_str_new(0, target_length);
7896 target_current = RSTRING_PTR(target);
7897 current_buffer = DATA_PTR(buffer_anchor);
7898 while (current_buffer) {
7899 memcpy(target_current, current_buffer->space, current_buffer->used);
7900 target_current += current_buffer->used;
7901 current_buffer = current_buffer->next;
7902 }
7903 }
/* the output has been copied; detach and free the chain */
7904 current_buffer = DATA_PTR(buffer_anchor);
7905 DATA_PTR(buffer_anchor) = 0;
7906 mapping_buffer_free(current_buffer);
7907
7908 RB_GC_GUARD(buffer_anchor);
7909
7910 /* TODO: check about string terminator character */
7911 str_enc_copy_direct(target, source);
7912 /*ENC_CODERANGE_SET(mapped, cr);*/
7913
7914 return target;
7915 }
7916
7917static VALUE
7918rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7919{
7920 const OnigUChar *source_current, *source_end;
7921 OnigUChar *target_current, *target_end;
7922 long old_length = RSTRING_LEN(source);
7923 int length_or_invalid;
7924
7925 if (old_length == 0) return Qnil;
7926
7927 source_current = (OnigUChar*)RSTRING_PTR(source);
7928 source_end = (OnigUChar*)RSTRING_END(source);
7929 if (source == target) {
7930 target_current = (OnigUChar*)source_current;
7931 target_end = (OnigUChar*)source_end;
7932 }
7933 else {
7934 target_current = (OnigUChar*)RSTRING_PTR(target);
7935 target_end = (OnigUChar*)RSTRING_END(target);
7936 }
7937
7938 length_or_invalid = onigenc_ascii_only_case_map(flags,
7939 &source_current, source_end,
7940 target_current, target_end, enc);
7941 if (length_or_invalid < 0)
7942 rb_raise(rb_eArgError, "input string invalid");
7943 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7944 fprintf(stderr, "problem with rb_str_ascii_casemap"
7945 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7946 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7947 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7948 }
7949
7950 str_enc_copy(target, source);
7951
7952 return target;
7953}
7954
7955static bool
7956upcase_single(VALUE str)
7957{
7958 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7959 bool modified = false;
7960
7961 while (s < send) {
7962 unsigned int c = *(unsigned char*)s;
7963
7964 if ('a' <= c && c <= 'z') {
7965 *s = 'A' + (c - 'a');
7966 modified = true;
7967 }
7968 s++;
7969 }
7970 return modified;
7971}
7972
7973/*
7974 * call-seq:
7975 * upcase!(mapping) -> self or nil
7976 *
7977 * Like String#upcase, except that:
7978 *
7979 * - Changes character casings in +self+ (not in a copy of +self+).
7980 * - Returns +self+ if any changes are made, +nil+ otherwise.
7981 *
7982 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7983 */
7984
7985static VALUE
7986rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7987{
7988 rb_encoding *enc;
7989 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7990
7991 flags = check_case_options(argc, argv, flags);
7992 str_modify_keep_cr(str);
7993 enc = str_true_enc(str);
7994 if (case_option_single_p(flags, enc, str)) {
7995 if (upcase_single(str))
7996 flags |= ONIGENC_CASE_MODIFIED;
7997 }
7998 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7999 rb_str_ascii_casemap(str, str, &flags, enc);
8000 else
8001 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8002
8003 if (ONIGENC_CASE_MODIFIED&flags) return str;
8004 return Qnil;
8005}
8006
8007
8008/*
8009 * call-seq:
8010 * upcase(mapping = :ascii) -> new_string
8011 *
8012 * :include: doc/string/upcase.rdoc
8013 */
8014
8015static VALUE
8016rb_str_upcase(int argc, VALUE *argv, VALUE str)
8017{
8018 rb_encoding *enc;
8019 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8020 VALUE ret;
8021
8022 flags = check_case_options(argc, argv, flags);
8023 enc = str_true_enc(str);
8024 if (case_option_single_p(flags, enc, str)) {
8025 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8026 str_enc_copy_direct(ret, str);
8027 upcase_single(ret);
8028 }
8029 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8030 ret = rb_str_new(0, RSTRING_LEN(str));
8031 rb_str_ascii_casemap(str, ret, &flags, enc);
8032 }
8033 else {
8034 ret = rb_str_casemap(str, &flags, enc);
8035 }
8036
8037 return ret;
8038}
8039
8040static bool
8041downcase_single(VALUE str)
8042{
8043 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8044 bool modified = false;
8045
8046 while (s < send) {
8047 unsigned int c = *(unsigned char*)s;
8048
8049 if ('A' <= c && c <= 'Z') {
8050 *s = 'a' + (c - 'A');
8051 modified = true;
8052 }
8053 s++;
8054 }
8055
8056 return modified;
8057}
8058
8059/*
8060 * call-seq:
8061 * downcase!(mapping) -> self or nil
8062 *
8063 * Like String#downcase, except that:
8064 *
8065 * - Changes character casings in +self+ (not in a copy of +self+).
8066 * - Returns +self+ if any changes are made, +nil+ otherwise.
8067 *
8068 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8069 */
8070
8071static VALUE
8072rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8073{
8074 rb_encoding *enc;
8075 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8076
8077 flags = check_case_options(argc, argv, flags);
8078 str_modify_keep_cr(str);
8079 enc = str_true_enc(str);
8080 if (case_option_single_p(flags, enc, str)) {
8081 if (downcase_single(str))
8082 flags |= ONIGENC_CASE_MODIFIED;
8083 }
8084 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8085 rb_str_ascii_casemap(str, str, &flags, enc);
8086 else
8087 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8088
8089 if (ONIGENC_CASE_MODIFIED&flags) return str;
8090 return Qnil;
8091}
8092
8093
8094/*
8095 * call-seq:
8096 * downcase(mapping = :ascii) -> new_string
8097 *
8098 * :include: doc/string/downcase.rdoc
8099 *
8100 */
8101
8102static VALUE
8103rb_str_downcase(int argc, VALUE *argv, VALUE str)
8104{
8105 rb_encoding *enc;
8106 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8107 VALUE ret;
8108
8109 flags = check_case_options(argc, argv, flags);
8110 enc = str_true_enc(str);
8111 if (case_option_single_p(flags, enc, str)) {
8112 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8113 str_enc_copy_direct(ret, str);
8114 downcase_single(ret);
8115 }
8116 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8117 ret = rb_str_new(0, RSTRING_LEN(str));
8118 rb_str_ascii_casemap(str, ret, &flags, enc);
8119 }
8120 else {
8121 ret = rb_str_casemap(str, &flags, enc);
8122 }
8123
8124 return ret;
8125}
8126
8127
8128/*
8129 * call-seq:
8130 * capitalize!(mapping = :ascii) -> self or nil
8131 *
8132 * Like String#capitalize, except that:
8133 *
8134 * - Changes character casings in +self+ (not in a copy of +self+).
8135 * - Returns +self+ if any changes are made, +nil+ otherwise.
8136 *
8137 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8138 */
8139
8140static VALUE
8141rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8142{
8143 rb_encoding *enc;
8144 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8145
8146 flags = check_case_options(argc, argv, flags);
8147 str_modify_keep_cr(str);
8148 enc = str_true_enc(str);
8149 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8150 if (flags&ONIGENC_CASE_ASCII_ONLY)
8151 rb_str_ascii_casemap(str, str, &flags, enc);
8152 else
8153 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8154
8155 if (ONIGENC_CASE_MODIFIED&flags) return str;
8156 return Qnil;
8157}
8158
8159
8160/*
8161 * call-seq:
8162 * capitalize(mapping = :ascii) -> new_string
8163 *
8164 * :include: doc/string/capitalize.rdoc
8165 *
8166 */
8167
8168static VALUE
8169rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8170{
8171 rb_encoding *enc;
8172 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8173 VALUE ret;
8174
8175 flags = check_case_options(argc, argv, flags);
8176 enc = str_true_enc(str);
8177 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8178 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8179 ret = rb_str_new(0, RSTRING_LEN(str));
8180 rb_str_ascii_casemap(str, ret, &flags, enc);
8181 }
8182 else {
8183 ret = rb_str_casemap(str, &flags, enc);
8184 }
8185 return ret;
8186}
8187
8188
8189/*
8190 * call-seq:
8191 * swapcase!(mapping) -> self or nil
8192 *
8193 * Like String#swapcase, except that:
8194 *
8195 * - Changes are made to +self+, not to a copy of +self+.
8196 * - Returns +self+ if any changes are made, +nil+ otherwise.
8197 *
8198 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8199 */
8200
8201static VALUE
8202rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8203{
8204 rb_encoding *enc;
8205 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8206
8207 flags = check_case_options(argc, argv, flags);
8208 str_modify_keep_cr(str);
8209 enc = str_true_enc(str);
8210 if (flags&ONIGENC_CASE_ASCII_ONLY)
8211 rb_str_ascii_casemap(str, str, &flags, enc);
8212 else
8213 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8214
8215 if (ONIGENC_CASE_MODIFIED&flags) return str;
8216 return Qnil;
8217}
8218
8219
8220/*
8221 * call-seq:
8222 * swapcase(mapping = :ascii) -> new_string
8223 *
8224 * :include: doc/string/swapcase.rdoc
8225 *
8226 */
8227
8228static VALUE
8229rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8230{
8231 rb_encoding *enc;
8232 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8233 VALUE ret;
8234
8235 flags = check_case_options(argc, argv, flags);
8236 enc = str_true_enc(str);
8237 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8238 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8239 ret = rb_str_new(0, RSTRING_LEN(str));
8240 rb_str_ascii_casemap(str, ret, &flags, enc);
8241 }
8242 else {
8243 ret = rb_str_casemap(str, &flags, enc);
8244 }
8245 return ret;
8246}
8247
8248typedef unsigned char *USTR;
8249
/* Iteration state of trnext() over one tr specification string. */
8250 struct tr {
/* non-zero while the code points of a range such as "a-z" are being generated */
8251 int gen;
/* now: code point most recently produced; max: last code point of the range being generated */
8252 unsigned int now, max;
/* p: read position in the specification; pend: its end */
8253 char *p, *pend;
8254 };
8255
/*
 * Returns the next code point described by the tr specification in t,
 * expanding ranges written as FIRST-LAST one code point per call.
 * Returns (unsigned int)-1 when the specification is exhausted.
 *
 * Raises ArgumentError when a range has its first code point greater than
 * its last.
 */
8256 static unsigned int
8257 trnext(struct tr *t, rb_encoding *enc)
8258 {
8259 int n;
8260
8261 for (;;) {
8262 nextpart:
8263 if (!t->gen) {
/* not inside a range: read the next item of the specification */
8264 if (t->p == t->pend) return -1;
/* a backslash that is not the last character is skipped, so the
 * character after it is taken literally */
8265 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8266 t->p += n;
8267 }
8268 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8269 t->p += n;
/* a '-' that is not the last character introduces a range */
8270 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8271 t->p += n;
8272 if (t->p < t->pend) {
8273 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8274 t->p += n;
8275 if (t->now > c) {
/* both ends are shown in the message only when they are ASCII */
8276 if (t->now < 0x80 && c < 0x80) {
8277 rb_raise(rb_eArgError,
8278 "invalid range \"%c-%c\" in string transliteration",
8279 t->now, c);
8280 }
8281 else {
8282 rb_raise(rb_eArgError, "invalid range in string transliteration");
8283 }
8284 continue; /* not reached */
8285 }
8286 else if (t->now < c) {
/* start generating; the first code point is returned just below */
8287 t->gen = 1;
8288 t->max = c;
8289 }
8290 }
8291 }
8292 return t->now;
8293 }
8294 else {
/* inside a range: advance, skipping code points for which
 * ONIGENC_CODE_TO_MBCLEN reports a non-positive length */
8295 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8296 if (t->now == t->max) {
8297 t->gen = 0;
8298 goto nextpart;
8299 }
8300 }
8301 if (t->now < t->max) {
8302 return t->now;
8303 }
8304 else {
/* the end of the range has been reached */
8305 t->gen = 0;
8306 return t->max;
8307 }
8308 }
8309 }
8310 }
8311
8312static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8313
/*
 * Translates, in place, the characters of str selected by src into the
 * characters given by repl.
 *
 *   str   - string to modify
 *   src   - selector specification; a leading '^' (when src has more than
 *           that one character) selects every character NOT listed
 *   repl  - replacement specification; when it is empty the call is
 *           delegated to rb_str_delete_bang(src)
 *   sflag - when non-zero, a translated character equal to the
 *           translated character emitted just before it is dropped
 *
 * Returns str when it was modified and Qnil otherwise (also Qnil when str
 * is empty).  Raises ArgumentError on an invalid byte sequence in str on
 * the paths that decode characters; rb_enc_check and trnext may raise as
 * well.
 */
8314 static VALUE
8315 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8316 {
/* errc: "no translation" marker, also trnext's end-of-specification value */
8317 const unsigned int errc = -1;
/* translation table for code points below 256; larger ones go into hash */
8318 unsigned int trans[256];
8319 rb_encoding *enc, *e1, *e2;
8320 struct tr trsrc, trrepl;
8321 int cflag = 0;
8322 unsigned int c, c0, last = 0;
8323 int modify = 0, i, l;
8324 unsigned char *s, *send;
8325 VALUE hash = 0;
8326 int singlebyte = single_byte_optimizable(str);
8327 int termlen;
8328 int cr;
8329
/* downgrades cr from 7BIT to VALID as soon as a non-ASCII code point is emitted */
8330 #define CHECK_IF_ASCII(c) \
8331 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8332 (cr = ENC_CODERANGE_VALID) : 0)
8333
8334 StringValue(src);
8335 StringValue(repl);
8336 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8337 if (RSTRING_LEN(repl) == 0) {
8338 return rb_str_delete_bang(1, &src, str);
8339 }
8340
8341 cr = ENC_CODERANGE(str);
/* e1 is used to decode str below; enc is used to encode the result */
8342 e1 = rb_enc_check(str, src);
8343 e2 = rb_enc_check(str, repl);
8344 if (e1 == e2) {
8345 enc = e1;
8346 }
8347 else {
8348 enc = rb_enc_check(src, repl);
8349 }
8350 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
/* a leading '^' followed by at least one more character negates the selector */
8351 if (RSTRING_LEN(src) > 1 &&
8352 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8353 trsrc.p + l < trsrc.pend) {
8354 cflag = 1;
8355 trsrc.p += l;
8356 }
8357 trrepl.p = RSTRING_PTR(repl);
8358 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8359 trsrc.gen = trrepl.gen = 0;
8360 trsrc.now = trrepl.now = 0;
8361 trsrc.max = trrepl.max = 0;
8362
8363 if (cflag) {
/* negated selector: listed characters are left alone (errc), all others
 * map to the last character of repl */
8364 for (i=0; i<256; i++) {
8365 trans[i] = 1;
8366 }
8367 while ((c = trnext(&trsrc, enc)) != errc) {
8368 if (c < 256) {
8369 trans[c] = errc;
8370 }
8371 else {
8372 if (!hash) hash = rb_hash_new();
8373 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8374 }
8375 }
8376 while ((c = trnext(&trrepl, enc)) != errc)
8377 /* retrieve last replacer */;
8378 last = trrepl.now;
8379 for (i=0; i<256; i++) {
8380 if (trans[i] != errc) {
8381 trans[i] = last;
8382 }
8383 }
8384 }
8385 else {
/* positional mapping: the n-th selected character maps to the n-th
 * replacement; once repl is exhausted its last character is reused */
8386 unsigned int r;
8387
8388 for (i=0; i<256; i++) {
8389 trans[i] = errc;
8390 }
8391 while ((c = trnext(&trsrc, enc)) != errc) {
8392 r = trnext(&trrepl, enc);
8393 if (r == errc) r = trrepl.now;
8394 if (c < 256) {
8395 trans[c] = r;
/* a multi-byte replacement rules out the byte-wise path below */
8396 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8397 }
8398 else {
8399 if (!hash) hash = rb_hash_new();
8400 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8401 }
8402 }
8403 }
8404
/* start from 7BIT again; CHECK_IF_ASCII raises it back to VALID when a
 * non-ASCII character is written */
8405 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8406 cr = ENC_CODERANGE_7BIT;
8407 str_modify_keep_cr(str);
8408 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8409 termlen = rb_enc_mbminlen(enc);
8410 if (sflag) {
/* path 1 (sflag set): build the result in a new buffer, dropping a
 * translated character that repeats the previous translated one */
8411 int clen, tlen;
8412 long offset, max = RSTRING_LEN(str);
/* save: last translated code point emitted, or -1 after an untranslated one */
8413 unsigned int save = -1;
8414 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8415
8416 while (s < send) {
8417 int may_modify = 0;
8418
8419 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8420 if (!MBCLEN_CHARFOUND_P(r)) {
8421 SIZED_FREE_N(buf, max + termlen);
8422 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8423 }
8424 clen = MBCLEN_CHARFOUND_LEN(r);
8425 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8426
8427 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8428
8429 s += clen;
/* look up the translation: table below 256, hash above */
8430 if (c < 256) {
8431 c = trans[c];
8432 }
8433 else if (hash) {
8434 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8435 if (NIL_P(tmp)) {
8436 if (cflag) c = last;
8437 else c = errc;
8438 }
8439 else if (cflag) c = errc;
8440 else c = NUM2INT(tmp);
8441 }
8442 else {
8443 c = errc;
8444 }
8445 if (c != (unsigned int)-1) {
8446 if (save == c) {
8447 CHECK_IF_ASCII(c);
8448 continue;
8449 }
8450 save = c;
8451 tlen = rb_enc_codelen(c, enc);
8452 modify = 1;
8453 }
8454 else {
/* untranslated: keep the original character */
8455 save = -1;
8456 c = c0;
8457 if (enc != e1) may_modify = 1;
8458 }
/* grow the buffer when the next character would not fit */
8459 if ((offset = t - buf) + tlen > max) {
8460 size_t MAYBE_UNUSED(old) = max + termlen;
8461 max = offset + tlen + (send - s);
8462 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8463 t = buf + offset;
8464 }
8465 rb_enc_mbcput(c, t, enc);
8466 if (may_modify && memcmp(s, t, tlen) != 0) {
8467 modify = 1;
8468 }
8469 CHECK_IF_ASCII(c);
8470 t += tlen;
8471 }
/* swap the new buffer in as the heap storage of str */
8472 if (!STR_EMBED_P(str)) {
8473 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8474 }
8475 TERM_FILL((char *)t, termlen);
8476 RSTRING(str)->as.heap.ptr = (char *)buf;
8477 STR_SET_LEN(str, t - buf);
8478 STR_SET_NOEMBED(str);
8479 RSTRING(str)->as.heap.aux.capa = max;
8480 }
8481 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
/* path 2: every character and replacement is one byte; rewrite in place */
8482 while (s < send) {
8483 c = (unsigned char)*s;
8484 if (trans[c] != errc) {
8485 if (!cflag) {
8486 c = trans[c];
8487 *s = c;
8488 modify = 1;
8489 }
8490 else {
8491 *s = last;
8492 modify = 1;
8493 }
8494 }
8495 CHECK_IF_ASCII(c);
8496 s++;
8497 }
8498 }
8499 else {
/* path 3: general multi-byte case; build the result in a new buffer that
 * starts at 1.2 times the input length */
8500 int clen, tlen;
8501 long offset, max = (long)((send - s) * 1.2);
8502 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8503
8504 while (s < send) {
8505 int may_modify = 0;
8506
8507 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8508 if (!MBCLEN_CHARFOUND_P(r)) {
8509 SIZED_FREE_N(buf, max + termlen);
8510 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8511 }
8512 clen = MBCLEN_CHARFOUND_LEN(r);
8513 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8514
8515 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8516
/* look up the translation: table below 256, hash above */
8517 if (c < 256) {
8518 c = trans[c];
8519 }
8520 else if (hash) {
8521 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8522 if (NIL_P(tmp)) {
8523 if (cflag) c = last;
8524 else c = errc;
8525 }
8526 else if (cflag) c = errc;
8527 else c = NUM2INT(tmp);
8528 }
8529 else {
8530 c = cflag ? last : errc;
8531 }
8532 if (c != errc) {
8533 tlen = rb_enc_codelen(c, enc);
8534 modify = 1;
8535 }
8536 else {
/* untranslated: keep the original character */
8537 c = c0;
8538 if (enc != e1) may_modify = 1;
8539 }
/* grow the buffer when the next character would not fit */
8540 if ((offset = t - buf) + tlen > max) {
8541 size_t MAYBE_UNUSED(old) = max + termlen;
8542 max = offset + tlen + (long)((send - s) * 1.2);
8543 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8544 t = buf + offset;
8545 }
8546 if (s != t) {
8547 rb_enc_mbcput(c, t, enc);
8548 if (may_modify && memcmp(s, t, tlen) != 0) {
8549 modify = 1;
8550 }
8551 }
8552 CHECK_IF_ASCII(c);
8553 s += clen;
8554 t += tlen;
8555 }
/* swap the new buffer in as the heap storage of str */
8556 if (!STR_EMBED_P(str)) {
8557 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8558 }
8559 TERM_FILL((char *)t, termlen);
8560 RSTRING(str)->as.heap.ptr = (char *)buf;
8561 STR_SET_LEN(str, t - buf);
8562 STR_SET_NOEMBED(str);
8563 RSTRING(str)->as.heap.aux.capa = max;
8564 }
8565
8566 if (modify) {
/* record the recomputed code range unless the string was broken */
8567 if (cr != ENC_CODERANGE_BROKEN)
8568 ENC_CODERANGE_SET(str, cr);
8569 rb_enc_associate(str, enc);
8570 return str;
8571 }
8572 return Qnil;
8573 }
8574
8575
8576/*
8577 * call-seq:
8578 * tr!(selector, replacements) -> self or nil
8579 *
8580 * Like String#tr, except:
8581 *
8582 * - Performs substitutions in +self+ (not in a copy of +self+).
8583 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8584 *
8585 * Related: {Modifying}[rdoc-ref:String@Modifying].
8586 */
8587
8588static VALUE
8589rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8590{
8591 return tr_trans(str, src, repl, 0);
8592}
8593
8594
8595/*
8596 * call-seq:
8597 * tr(selector, replacements) -> new_string
8598 *
8599 * Returns a copy of +self+ with each character specified by string +selector+
8600 * translated to the corresponding character in string +replacements+.
8601 * The correspondence is _positional_:
8602 *
8603 * - Each occurrence of the first character specified by +selector+
8604 * is translated to the first character in +replacements+.
8605 * - Each occurrence of the second character specified by +selector+
8606 * is translated to the second character in +replacements+.
8607 * - And so on.
8608 *
8609 * Example:
8610 *
8611 * 'hello'.tr('el', 'ip') #=> "hippo"
8612 *
8613 * If +replacements+ is shorter than +selector+,
8614 * it is implicitly padded with its own last character:
8615 *
8616 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8617 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8618 *
8619 * Arguments +selector+ and +replacements+ must be valid character selectors
8620 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8621 * and may use any of its valid forms, including negation, ranges, and escapes:
8622 *
8623 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8624 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8625 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8626 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8627 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8628 *
8629 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8630 */
8631
8632static VALUE
8633rb_str_tr(VALUE str, VALUE src, VALUE repl)
8634{
8635 str = str_duplicate(rb_cString, str);
8636 tr_trans(str, src, repl, 0);
8637 return str;
8638}
8639
8640#define TR_TABLE_MAX (UCHAR_MAX+1)
8641#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8642static void
8643tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8644 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8645{
8646 const unsigned int errc = -1;
8647 char buf[TR_TABLE_MAX];
8648 struct tr tr;
8649 unsigned int c;
8650 VALUE table = 0, ptable = 0;
8651 int i, l, cflag = 0;
8652
8653 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8654 tr.gen = tr.now = tr.max = 0;
8655
8656 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8657 cflag = 1;
8658 tr.p += l;
8659 }
8660 if (first) {
8661 for (i=0; i<TR_TABLE_MAX; i++) {
8662 stable[i] = 1;
8663 }
8664 stable[TR_TABLE_MAX] = cflag;
8665 }
8666 else if (stable[TR_TABLE_MAX] && !cflag) {
8667 stable[TR_TABLE_MAX] = 0;
8668 }
8669 for (i=0; i<TR_TABLE_MAX; i++) {
8670 buf[i] = cflag;
8671 }
8672
8673 while ((c = trnext(&tr, enc)) != errc) {
8674 if (c < TR_TABLE_MAX) {
8675 buf[(unsigned char)c] = !cflag;
8676 }
8677 else {
8678 VALUE key = UINT2NUM(c);
8679
8680 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8681 if (cflag) {
8682 ptable = *ctablep;
8683 table = ptable ? ptable : rb_hash_new();
8684 *ctablep = table;
8685 }
8686 else {
8687 table = rb_hash_new();
8688 ptable = *tablep;
8689 *tablep = table;
8690 }
8691 }
8692 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8693 rb_hash_aset(table, key, Qtrue);
8694 }
8695 }
8696 }
8697 for (i=0; i<TR_TABLE_MAX; i++) {
8698 stable[i] = stable[i] && buf[i];
8699 }
8700 if (!table && !cflag) {
8701 *tablep = 0;
8702 }
8703}
8704
8705
8706static int
8707tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8708{
8709 if (c < TR_TABLE_MAX) {
8710 return table[c] != 0;
8711 }
8712 else {
8713 VALUE v = UINT2NUM(c);
8714
8715 if (del) {
8716 if (!NIL_P(rb_hash_lookup(del, v)) &&
8717 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8718 return TRUE;
8719 }
8720 }
8721 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8722 return FALSE;
8723 }
8724 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8725 }
8726}
8727
8728/*
8729 * call-seq:
8730 * delete!(*selectors) -> self or nil
8731 *
8732 * Like String#delete, but modifies +self+ in place;
8733 * returns +self+ if any characters were deleted, +nil+ otherwise.
8734 *
8735 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8736 */
8737
8738static VALUE
8739rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8740{
8741 char squeez[TR_TABLE_SIZE];
8742 rb_encoding *enc = 0;
8743 char *s, *send, *t;
8744 VALUE del = 0, nodel = 0;
8745 int modify = 0;
8746 int i, ascompat, cr;
8747
8748 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8750 for (i=0; i<argc; i++) {
8751 VALUE s = argv[i];
8752
8753 StringValue(s);
8754 enc = rb_enc_check(str, s);
8755 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8756 }
8757
8758 str_modify_keep_cr(str);
8759 ascompat = rb_enc_asciicompat(enc);
8760 s = t = RSTRING_PTR(str);
8761 send = RSTRING_END(str);
8762 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8763 while (s < send) {
8764 unsigned int c;
8765 int clen;
8766
8767 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8768 if (squeez[c]) {
8769 modify = 1;
8770 }
8771 else {
8772 if (t != s) *t = c;
8773 t++;
8774 }
8775 s++;
8776 }
8777 else {
8778 c = rb_enc_codepoint_len(s, send, &clen, enc);
8779
8780 if (tr_find(c, squeez, del, nodel)) {
8781 modify = 1;
8782 }
8783 else {
8784 if (t != s) rb_enc_mbcput(c, t, enc);
8785 t += clen;
8787 }
8788 s += clen;
8789 }
8790 }
8791 TERM_FILL(t, TERM_LEN(str));
8792 STR_SET_LEN(str, t - RSTRING_PTR(str));
8793 ENC_CODERANGE_SET(str, cr);
8794
8795 if (modify) return str;
8796 return Qnil;
8797}
8798
8799
8800/*
8801 * call-seq:
8802 * delete(*selectors) -> new_string
8803 *
8804 * :include: doc/string/delete.rdoc
8805 *
8806 */
8807
8808static VALUE
8809rb_str_delete(int argc, VALUE *argv, VALUE str)
8810{
8811 str = str_duplicate(rb_cString, str);
8812 rb_str_delete_bang(argc, argv, str);
8813 return str;
8814}
8815
8816
8817/*
8818 * call-seq:
8819 * squeeze!(*selectors) -> self or nil
8820 *
8821 * Like String#squeeze, except that:
8822 *
8823 * - Characters are squeezed in +self+ (not in a copy of +self+).
8824 * - Returns +self+ if any changes are made, +nil+ otherwise.
8825 *
8826 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8827 */
8828
8829static VALUE
8830rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8831{
8832 char squeez[TR_TABLE_SIZE];
8833 rb_encoding *enc = 0;
8834 VALUE del = 0, nodel = 0;
8835 unsigned char *s, *send, *t;
8836 int i, modify = 0;
8837 int ascompat, singlebyte = single_byte_optimizable(str);
8838 unsigned int save;
8839
8840 if (argc == 0) {
8841 enc = STR_ENC_GET(str);
8842 }
8843 else {
8844 for (i=0; i<argc; i++) {
8845 VALUE s = argv[i];
8846
8847 StringValue(s);
8848 enc = rb_enc_check(str, s);
8849 if (singlebyte && !single_byte_optimizable(s))
8850 singlebyte = 0;
8851 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8852 }
8853 }
8854
8855 str_modify_keep_cr(str);
8856 s = t = (unsigned char *)RSTRING_PTR(str);
8857 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8858 send = (unsigned char *)RSTRING_END(str);
8859 save = -1;
8860 ascompat = rb_enc_asciicompat(enc);
8861
8862 if (singlebyte) {
8863 while (s < send) {
8864 unsigned int c = *s++;
8865 if (c != save || (argc > 0 && !squeez[c])) {
8866 *t++ = save = c;
8867 }
8868 }
8869 }
8870 else {
8871 while (s < send) {
8872 unsigned int c;
8873 int clen;
8874
8875 if (ascompat && (c = *s) < 0x80) {
8876 if (c != save || (argc > 0 && !squeez[c])) {
8877 *t++ = save = c;
8878 }
8879 s++;
8880 }
8881 else {
8882 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8883
8884 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8885 if (t != s) rb_enc_mbcput(c, t, enc);
8886 save = c;
8887 t += clen;
8888 }
8889 s += clen;
8890 }
8891 }
8892 }
8893
8894 TERM_FILL((char *)t, TERM_LEN(str));
8895 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8896 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8897 modify = 1;
8898 }
8899
8900 if (modify) return str;
8901 return Qnil;
8902}
8903
8904
8905/*
8906 * call-seq:
8907 * squeeze(*selectors) -> new_string
8908 *
8909 * :include: doc/string/squeeze.rdoc
8910 *
8911 */
8912
8913static VALUE
8914rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8915{
8916 str = str_duplicate(rb_cString, str);
8917 rb_str_squeeze_bang(argc, argv, str);
8918 return str;
8919}
8920
8921
8922/*
8923 * call-seq:
8924 * tr_s!(selector, replacements) -> self or nil
8925 *
8926 * Like String#tr_s, except:
8927 *
8928 * - Modifies +self+ in place (not a copy of +self+).
8929 * - Returns +self+ if any changes were made, +nil+ otherwise.
8930 *
8931 * Related: {Modifying}[rdoc-ref:String@Modifying].
8932 */
8933
8934static VALUE
8935rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8936{
8937 return tr_trans(str, src, repl, 1);
8938}
8939
8940
8941/*
8942 * call-seq:
8943 * tr_s(selector, replacements) -> new_string
8944 *
8945 * Like String#tr, except:
8946 *
8947 * - Also squeezes the modified portions of the translated string;
8948 * see String#squeeze.
8949 * - Returns the translated and squeezed string.
8950 *
8951 * Examples:
8952 *
8953 * 'hello'.tr_s('l', 'r') #=> "hero"
8954 * 'hello'.tr_s('el', '-') #=> "h-o"
8955 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8956 *
8957 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8958 *
8959 */
8960
8961static VALUE
8962rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8963{
8964 str = str_duplicate(rb_cString, str);
8965 tr_trans(str, src, repl, 1);
8966 return str;
8967}
8968
8969
8970/*
8971 * call-seq:
8972 * count(*selectors) -> integer
8973 *
8974 * :include: doc/string/count.rdoc
8975 */
8976
8977static VALUE
8978rb_str_count(int argc, VALUE *argv, VALUE str)
8979{
8980 char table[TR_TABLE_SIZE];
8981 rb_encoding *enc = 0;
8982 VALUE del = 0, nodel = 0, tstr;
8983 char *s, *send;
8984 int i;
8985 int ascompat;
8986 size_t n = 0;
8987
8989
8990 tstr = argv[0];
8991 StringValue(tstr);
8992 enc = rb_enc_check(str, tstr);
8993 if (argc == 1) {
8994 const char *ptstr;
8995 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8996 (ptstr = RSTRING_PTR(tstr),
8997 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8998 !is_broken_string(str)) {
8999 int clen;
9000 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9001
9002 s = RSTRING_PTR(str);
9003 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9004 send = RSTRING_END(str);
9005 while (s < send) {
9006 if (*(unsigned char*)s++ == c) n++;
9007 }
9008 return SIZET2NUM(n);
9009 }
9010 }
9011
9012 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9013 for (i=1; i<argc; i++) {
9014 tstr = argv[i];
9015 StringValue(tstr);
9016 enc = rb_enc_check(str, tstr);
9017 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9018 }
9019
9020 s = RSTRING_PTR(str);
9021 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9022 send = RSTRING_END(str);
9023 ascompat = rb_enc_asciicompat(enc);
9024 while (s < send) {
9025 unsigned int c;
9026
9027 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9028 if (table[c]) {
9029 n++;
9030 }
9031 s++;
9032 }
9033 else {
9034 int clen;
9035 c = rb_enc_codepoint_len(s, send, &clen, enc);
9036 if (tr_find(c, table, del, nodel)) {
9037 n++;
9038 }
9039 s += clen;
9040 }
9041 }
9042
9043 return SIZET2NUM(n);
9044}
9045
9046static VALUE
9047rb_fs_check(VALUE val)
9048{
9049 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9050 val = rb_check_string_type(val);
9051 if (NIL_P(val)) return 0;
9052 }
9053 return val;
9054}
9055
/*
 * Byte-indexed whitespace table: 1 for TAB, LF, VT, FF, CR and SPACE,
 * 0 for every other byte value (unlisted slots are zero-initialized).
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

/* Table lookup; the argument is reduced to an unsigned char first. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9076
9077static long
9078split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9079{
9080 if (empty_count >= 0 && len == 0) {
9081 return empty_count + 1;
9082 }
9083 if (empty_count > 0) {
9084 /* make different substrings */
9085 if (result) {
9086 do {
9087 rb_ary_push(result, str_new_empty_String(str));
9088 } while (--empty_count > 0);
9089 }
9090 else {
9091 do {
9092 rb_yield(str_new_empty_String(str));
9093 } while (--empty_count > 0);
9094 }
9095 }
9096 str = rb_str_subseq(str, beg, len);
9097 if (result) {
9098 rb_ary_push(result, str);
9099 }
9100 else {
9101 rb_yield(str);
9102 }
9103 return empty_count;
9104}
9105
/* Splitting strategies chosen by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9109
9110static split_type_t
9111literal_split_pattern(VALUE spat, split_type_t default_type)
9112{
9113 rb_encoding *enc = STR_ENC_GET(spat);
9114 const char *ptr;
9115 long len;
9116 RSTRING_GETMEM(spat, ptr, len);
9117 if (len == 0) {
9118 /* Special case - split into chars */
9119 return SPLIT_TYPE_CHARS;
9120 }
9121 else if (rb_enc_asciicompat(enc)) {
9122 if (len == 1 && ptr[0] == ' ') {
9123 return SPLIT_TYPE_AWK;
9124 }
9125 }
9126 else {
9127 int l;
9128 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9129 return SPLIT_TYPE_AWK;
9130 }
9131 }
9132 return default_type;
9133}
9134
9135/*
9136 * call-seq:
9137 * split(field_sep = $;, limit = 0) -> array_of_substrings
9138 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9139 *
9140 * :include: doc/string/split.rdoc
9141 *
9142 */
9143
9144static VALUE
9145rb_str_split_m(int argc, VALUE *argv, VALUE str)
9146{
9147 rb_encoding *enc;
9148 VALUE spat;
9149 VALUE limit;
9150 split_type_t split_type;
9151 long beg, end, i = 0, empty_count = -1;
9152 int lim = 0;
9153 VALUE result, tmp;
9154
9155 result = rb_block_given_p() ? Qfalse : Qnil;
9156 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9157 lim = NUM2INT(limit);
9158 if (lim <= 0) limit = Qnil;
9159 else if (lim == 1) {
9160 if (RSTRING_LEN(str) == 0)
9161 return result ? rb_ary_new2(0) : str;
9162 tmp = str_duplicate(rb_cString, str);
9163 if (!result) {
9164 rb_yield(tmp);
9165 return str;
9166 }
9167 return rb_ary_new3(1, tmp);
9168 }
9169 i = 1;
9170 }
9171 if (NIL_P(limit) && !lim) empty_count = 0;
9172
9173 enc = STR_ENC_GET(str);
9174 split_type = SPLIT_TYPE_REGEXP;
9175 if (!NIL_P(spat)) {
9176 spat = get_pat_quoted(spat, 0);
9177 }
9178 else if (NIL_P(spat = rb_fs)) {
9179 split_type = SPLIT_TYPE_AWK;
9180 }
9181 else if (!(spat = rb_fs_check(spat))) {
9182 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9183 }
9184 else {
9185 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9186 }
9187 if (split_type != SPLIT_TYPE_AWK) {
9188 switch (BUILTIN_TYPE(spat)) {
9189 case T_REGEXP:
9190 rb_reg_options(spat); /* check if uninitialized */
9191 tmp = RREGEXP_SRC(spat);
9192 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9193 if (split_type == SPLIT_TYPE_AWK) {
9194 spat = tmp;
9195 split_type = SPLIT_TYPE_STRING;
9196 }
9197 break;
9198
9199 case T_STRING:
9200 mustnot_broken(spat);
9201 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9202 break;
9203
9204 default:
9206 }
9207 }
9208
9209#define SPLIT_STR(beg, len) ( \
9210 empty_count = split_string(result, str, beg, len, empty_count), \
9211 str_mod_check(str, str_start, str_len))
9212
9213 beg = 0;
9214 char *ptr = RSTRING_PTR(str);
9215 char *const str_start = ptr;
9216 const long str_len = RSTRING_LEN(str);
9217 char *const eptr = str_start + str_len;
9218 if (split_type == SPLIT_TYPE_AWK) {
9219 char *bptr = ptr;
9220 int skip = 1;
9221 unsigned int c;
9222
9223 if (result) result = rb_ary_new();
9224 end = beg;
9225 if (is_ascii_string(str)) {
9226 while (ptr < eptr) {
9227 c = (unsigned char)*ptr++;
9228 if (skip) {
9229 if (ascii_isspace(c)) {
9230 beg = ptr - bptr;
9231 }
9232 else {
9233 end = ptr - bptr;
9234 skip = 0;
9235 if (!NIL_P(limit) && lim <= i) break;
9236 }
9237 }
9238 else if (ascii_isspace(c)) {
9239 SPLIT_STR(beg, end-beg);
9240 skip = 1;
9241 beg = ptr - bptr;
9242 if (!NIL_P(limit)) ++i;
9243 }
9244 else {
9245 end = ptr - bptr;
9246 }
9247 }
9248 }
9249 else {
9250 while (ptr < eptr) {
9251 int n;
9252
9253 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9254 ptr += n;
9255 if (skip) {
9256 if (rb_isspace(c)) {
9257 beg = ptr - bptr;
9258 }
9259 else {
9260 end = ptr - bptr;
9261 skip = 0;
9262 if (!NIL_P(limit) && lim <= i) break;
9263 }
9264 }
9265 else if (rb_isspace(c)) {
9266 SPLIT_STR(beg, end-beg);
9267 skip = 1;
9268 beg = ptr - bptr;
9269 if (!NIL_P(limit)) ++i;
9270 }
9271 else {
9272 end = ptr - bptr;
9273 }
9274 }
9275 }
9276 }
9277 else if (split_type == SPLIT_TYPE_STRING) {
9278 char *substr_start = ptr;
9279 char *sptr = RSTRING_PTR(spat);
9280 long slen = RSTRING_LEN(spat);
9281
9282 if (result) result = rb_ary_new();
9283 mustnot_broken(str);
9284 enc = rb_enc_check(str, spat);
9285 while (ptr < eptr &&
9286 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9287 /* Check we are at the start of a char */
9288 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9289 if (t != ptr + end) {
9290 ptr = t;
9291 continue;
9292 }
9293 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9294 str_mod_check(spat, sptr, slen);
9295 ptr += end + slen;
9296 substr_start = ptr;
9297 if (!NIL_P(limit) && lim <= ++i) break;
9298 }
9299 beg = ptr - str_start;
9300 }
9301 else if (split_type == SPLIT_TYPE_CHARS) {
9302 int n;
9303
9304 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9305 mustnot_broken(str);
9306 enc = rb_enc_get(str);
9307 while (ptr < eptr &&
9308 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9309 SPLIT_STR(ptr - str_start, n);
9310 ptr += n;
9311 if (!NIL_P(limit) && lim <= ++i) break;
9312 }
9313 beg = ptr - str_start;
9314 }
9315 else {
9316 if (result) result = rb_ary_new();
9317 long len = RSTRING_LEN(str);
9318 long start = beg;
9319 long idx;
9320 int last_null = 0;
9321 struct re_registers *regs;
9322 VALUE match = 0;
9323
9324 for (; rb_reg_search(spat, str, start, 0) >= 0;
9325 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9326 match = rb_backref_get();
9327 if (!result) rb_match_busy(match);
9328 regs = RMATCH_REGS(match);
9329 end = BEG(0);
9330 if (start == end && BEG(0) == END(0)) {
9331 if (!ptr) {
9332 SPLIT_STR(0, 0);
9333 break;
9334 }
9335 else if (last_null == 1) {
9336 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9337 beg = start;
9338 }
9339 else {
9340 if (start == len)
9341 start++;
9342 else
9343 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9344 last_null = 1;
9345 continue;
9346 }
9347 }
9348 else {
9349 SPLIT_STR(beg, end-beg);
9350 beg = start = END(0);
9351 }
9352 last_null = 0;
9353
9354 for (idx=1; idx < regs->num_regs; idx++) {
9355 if (BEG(idx) == -1) continue;
9356 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9357 }
9358 if (!NIL_P(limit) && lim <= ++i) break;
9359 }
9360 if (match) rb_match_unbusy(match);
9361 }
9362 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9363 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9364 }
9365
9366 return result ? result : str;
9367}
9368
9369VALUE
9370rb_str_split(VALUE str, const char *sep0)
9371{
9372 VALUE sep;
9373
9374 StringValue(str);
9375 sep = rb_str_new_cstr(sep0);
9376 return rb_str_split_m(1, &sep, str);
9377}
9378
9379#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9380
9381static inline int
9382enumerator_element(VALUE ary, VALUE e)
9383{
9384 if (ary) {
9385 rb_ary_push(ary, e);
9386 return 0;
9387 }
9388 else {
9389 rb_yield(e);
9390 return 1;
9391 }
9392}
9393
9394#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9395
9396static const char *
9397chomp_newline(const char *p, const char *e, rb_encoding *enc)
9398{
9399 const char *prev = rb_enc_prev_char(p, e, e, enc);
9400 if (rb_enc_is_newline(prev, e, enc)) {
9401 e = prev;
9402 prev = rb_enc_prev_char(p, e, e, enc);
9403 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9404 e = prev;
9405 }
9406 return e;
9407}
9408
9409static VALUE
9410get_rs(void)
9411{
9412 VALUE rs = rb_rs;
9413 if (!NIL_P(rs) &&
9414 (!RB_TYPE_P(rs, T_STRING) ||
9415 RSTRING_LEN(rs) != 1 ||
9416 RSTRING_PTR(rs)[0] != '\n')) {
9417 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9418 }
9419 return rs;
9420}
9421
9422#define rb_rs get_rs()
9423
9424static VALUE
9425rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9426{
9427 rb_encoding *enc;
9428 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9429 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9430 long pos, len, rslen;
9431 int rsnewline = 0;
9432
9433 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9434 rs = rb_rs;
9435 if (!NIL_P(opts)) {
9436 static ID keywords[1];
9437 if (!keywords[0]) {
9438 keywords[0] = rb_intern_const("chomp");
9439 }
9440 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9441 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9442 }
9443
9444 if (NIL_P(rs)) {
9445 if (!ENUM_ELEM(ary, str)) {
9446 return ary;
9447 }
9448 else {
9449 return orig;
9450 }
9451 }
9452
9453 if (!RSTRING_LEN(str)) goto end;
9454 str = rb_str_new_frozen(str);
9455 ptr = subptr = RSTRING_PTR(str);
9456 pend = RSTRING_END(str);
9457 len = RSTRING_LEN(str);
9458 StringValue(rs);
9459 rslen = RSTRING_LEN(rs);
9460
9461 if (rs == rb_default_rs)
9462 enc = rb_enc_get(str);
9463 else
9464 enc = rb_enc_check(str, rs);
9465
9466 if (rslen == 0) {
9467 /* paragraph mode */
9468 int n;
9469 const char *eol = NULL;
9470 subend = subptr;
9471 while (subend < pend) {
9472 long chomp_rslen = 0;
9473 do {
9474 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9475 n = 0;
9476 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9477 if (rb_enc_is_newline(subend + n, pend, enc)) {
9478 if (eol == subend) break;
9479 subend += rslen;
9480 if (subptr) {
9481 eol = subend;
9482 chomp_rslen = -rslen;
9483 }
9484 }
9485 else {
9486 if (!subptr) subptr = subend;
9487 subend += rslen;
9488 }
9489 rslen = 0;
9490 } while (subend < pend);
9491 if (!subptr) break;
9492 if (rslen == 0) chomp_rslen = 0;
9493 line = rb_str_subseq(str, subptr - ptr,
9494 subend - subptr + (chomp ? chomp_rslen : rslen));
9495 if (ENUM_ELEM(ary, line)) {
9496 str_mod_check(str, ptr, len);
9497 }
9498 subptr = eol = NULL;
9499 }
9500 goto end;
9501 }
9502 else {
9503 rsptr = RSTRING_PTR(rs);
9504 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9505 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9506 rsnewline = 1;
9507 }
9508 }
9509
9510 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9511 rs = rb_str_new(rsptr, rslen);
9512 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9513 rsptr = RSTRING_PTR(rs);
9514 rslen = RSTRING_LEN(rs);
9515 }
9516
9517 while (subptr < pend) {
9518 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9519 if (pos < 0) break;
9520 hit = subptr + pos;
9521 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9522 if (hit != adjusted) {
9523 subptr = adjusted;
9524 continue;
9525 }
9526 subend = hit += rslen;
9527 if (chomp) {
9528 if (rsnewline) {
9529 subend = chomp_newline(subptr, subend, enc);
9530 }
9531 else {
9532 subend -= rslen;
9533 }
9534 }
9535 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9536 if (ENUM_ELEM(ary, line)) {
9537 str_mod_check(str, ptr, len);
9538 }
9539 subptr = hit;
9540 }
9541
9542 if (subptr != pend) {
9543 if (chomp) {
9544 if (rsnewline) {
9545 pend = chomp_newline(subptr, pend, enc);
9546 }
9547 else if (pend - subptr >= rslen &&
9548 memcmp(pend - rslen, rsptr, rslen) == 0) {
9549 pend -= rslen;
9550 }
9551 }
9552 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9553 ENUM_ELEM(ary, line);
9554 RB_GC_GUARD(str);
9555 }
9556
9557 end:
9558 if (ary)
9559 return ary;
9560 else
9561 return orig;
9562}
9563
9564/*
9565 * call-seq:
9566 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9567 * each_line(record_separator = $/, chomp: false) -> enumerator
9568 *
9569 * :include: doc/string/each_line.rdoc
9570 *
9571 */
9572
9573static VALUE
9574rb_str_each_line(int argc, VALUE *argv, VALUE str)
9575{
9576 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9577 return rb_str_enumerate_lines(argc, argv, str, 0);
9578}
9579
9580/*
9581 * call-seq:
9582 * lines(record_separator = $/, chomp: false) -> array_of_strings
9583 *
9584 * Returns substrings ("lines") of +self+
9585 * according to the given arguments:
9586 *
9587 * s = <<~EOT
9588 * This is the first line.
9589 * This is line two.
9590 *
9591 * This is line four.
9592 * This is line five.
9593 * EOT
9594 *
9595 * With the default argument values:
9596 *
9597 * $/ # => "\n"
9598 * s.lines
9599 * # =>
9600 * ["This is the first line.\n",
9601 * "This is line two.\n",
9602 * "\n",
9603 * "This is line four.\n",
9604 * "This is line five.\n"]
9605 *
9606 * With a different +record_separator+:
9607 *
9608 * record_separator = ' is '
9609 * s.lines(record_separator)
9610 * # =>
9611 * ["This is ",
9612 * "the first line.\nThis is ",
9613 * "line two.\n\nThis is ",
9614 * "line four.\nThis is ",
9615 * "line five.\n"]
9616 *
9617 * With keyword argument +chomp+ as +true+,
9618 * removes the trailing newline from each line:
9619 *
9620 * s.lines(chomp: true)
9621 * # =>
9622 * ["This is the first line.",
9623 * "This is line two.",
9624 * "",
9625 * "This is line four.",
9626 * "This is line five."]
9627 *
9628 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
9629 */
9630
9631static VALUE
9632rb_str_lines(int argc, VALUE *argv, VALUE str)
9633{
9634 VALUE ary = WANTARRAY("lines", 0);
9635 return rb_str_enumerate_lines(argc, argv, str, ary);
9636}
9637
9638static VALUE
9639rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9640{
9641 return LONG2FIX(RSTRING_LEN(str));
9642}
9643
9644static VALUE
9645rb_str_enumerate_bytes(VALUE str, VALUE ary)
9646{
9647 long i;
9648
9649 for (i=0; i<RSTRING_LEN(str); i++) {
9650 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9651 }
9652 if (ary)
9653 return ary;
9654 else
9655 return str;
9656}
9657
9658/*
9659 * call-seq:
9660 * each_byte {|byte| ... } -> self
9661 * each_byte -> enumerator
9662 *
9663 * :include: doc/string/each_byte.rdoc
9664 *
9665 */
9666
9667static VALUE
9668rb_str_each_byte(VALUE str)
9669{
9670 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9671 return rb_str_enumerate_bytes(str, 0);
9672}
9673
9674/*
9675 * call-seq:
9676 * bytes -> array_of_bytes
9677 *
9678 * :include: doc/string/bytes.rdoc
9679 *
9680 */
9681
9682static VALUE
9683rb_str_bytes(VALUE str)
9684{
9685 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9686 return rb_str_enumerate_bytes(str, ary);
9687}
9688
9689static VALUE
9690rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9691{
9692 return rb_str_length(str);
9693}
9694
9695static VALUE
9696rb_str_enumerate_chars(VALUE str, VALUE ary)
9697{
9698 VALUE orig = str;
9699 long i, len, n;
9700 const char *ptr;
9701 rb_encoding *enc;
9702
9703 str = rb_str_new_frozen(str);
9704 ptr = RSTRING_PTR(str);
9705 len = RSTRING_LEN(str);
9706 enc = rb_enc_get(str);
9707
9709 for (i = 0; i < len; i += n) {
9710 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9711 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9712 }
9713 }
9714 else {
9715 for (i = 0; i < len; i += n) {
9716 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9717 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9718 }
9719 }
9720 RB_GC_GUARD(str);
9721 if (ary)
9722 return ary;
9723 else
9724 return orig;
9725}
9726
9727/*
9728 * call-seq:
9729 * each_char {|char| ... } -> self
9730 * each_char -> enumerator
9731 *
9732 * :include: doc/string/each_char.rdoc
9733 *
9734 */
9735
9736static VALUE
9737rb_str_each_char(VALUE str)
9738{
9739 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9740 return rb_str_enumerate_chars(str, 0);
9741}
9742
9743/*
9744 * call-seq:
9745 * chars -> array_of_characters
9746 *
9747 * :include: doc/string/chars.rdoc
9748 *
9749 */
9750
9751static VALUE
9752rb_str_chars(VALUE str)
9753{
9754 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9755 return rb_str_enumerate_chars(str, ary);
9756}
9757
9758static VALUE
9759rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9760{
9761 VALUE orig = str;
9762 int n;
9763 unsigned int c;
9764 const char *ptr, *end;
9765 rb_encoding *enc;
9766
9767 if (single_byte_optimizable(str))
9768 return rb_str_enumerate_bytes(str, ary);
9769
9770 str = rb_str_new_frozen(str);
9771 ptr = RSTRING_PTR(str);
9772 end = RSTRING_END(str);
9773 enc = STR_ENC_GET(str);
9774
9775 while (ptr < end) {
9776 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9777 ENUM_ELEM(ary, UINT2NUM(c));
9778 ptr += n;
9779 }
9780 RB_GC_GUARD(str);
9781 if (ary)
9782 return ary;
9783 else
9784 return orig;
9785}
9786
9787/*
9788 * call-seq:
9789 * each_codepoint {|codepoint| ... } -> self
9790 * each_codepoint -> enumerator
9791 *
9792 * :include: doc/string/each_codepoint.rdoc
9793 *
9794 */
9795
9796static VALUE
9797rb_str_each_codepoint(VALUE str)
9798{
9799 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9800 return rb_str_enumerate_codepoints(str, 0);
9801}
9802
9803/*
9804 * call-seq:
9805 * codepoints -> array_of_integers
9806 *
9807 * :include: doc/string/codepoints.rdoc
9808 *
9809 */
9810
9811static VALUE
9812rb_str_codepoints(VALUE str)
9813{
9814 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9815 return rb_str_enumerate_codepoints(str, ary);
9816}
9817
9818static regex_t *
9819get_reg_grapheme_cluster(rb_encoding *enc)
9820{
9821 int encidx = rb_enc_to_index(enc);
9822
9823 const OnigUChar source_ascii[] = "\\X";
9824 const OnigUChar *source = source_ascii;
9825 size_t source_len = sizeof(source_ascii) - 1;
9826
9827 switch (encidx) {
9828#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9829#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9830#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9831#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9832#define CASE_UTF(e) \
9833 case ENCINDEX_UTF_##e: { \
9834 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9835 source = source_UTF_##e; \
9836 source_len = sizeof(source_UTF_##e); \
9837 break; \
9838 }
9839 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9840#undef CASE_UTF
9841#undef CHARS_16BE
9842#undef CHARS_16LE
9843#undef CHARS_32BE
9844#undef CHARS_32LE
9845 }
9846
9847 regex_t *reg_grapheme_cluster;
9848 OnigErrorInfo einfo;
9849 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9850 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9851 if (r) {
9852 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9853 onig_error_code_to_str(message, r, &einfo);
9854 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9855 }
9856
9857 return reg_grapheme_cluster;
9858}
9859
9860static regex_t *
9861get_cached_reg_grapheme_cluster(rb_encoding *enc)
9862{
9863 int encidx = rb_enc_to_index(enc);
9864 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9865
9866 if (encidx == rb_utf8_encindex()) {
9867 if (!reg_grapheme_cluster_utf8) {
9868 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9869 }
9870
9871 return reg_grapheme_cluster_utf8;
9872 }
9873
9874 return NULL;
9875}
9876
9877static VALUE
9878rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9879{
9880 size_t grapheme_cluster_count = 0;
9881 rb_encoding *enc = get_encoding(str);
9882 const char *ptr, *end;
9883
9884 if (!rb_enc_unicode_p(enc)) {
9885 return rb_str_length(str);
9886 }
9887
9888 bool cached_reg_grapheme_cluster = true;
9889 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9890 if (!reg_grapheme_cluster) {
9891 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9892 cached_reg_grapheme_cluster = false;
9893 }
9894
9895 ptr = RSTRING_PTR(str);
9896 end = RSTRING_END(str);
9897
9898 while (ptr < end) {
9899 OnigPosition len = onig_match(reg_grapheme_cluster,
9900 (const OnigUChar *)ptr, (const OnigUChar *)end,
9901 (const OnigUChar *)ptr, NULL, 0);
9902 if (len <= 0) break;
9903 grapheme_cluster_count++;
9904 ptr += len;
9905 }
9906
9907 if (!cached_reg_grapheme_cluster) {
9908 onig_free(reg_grapheme_cluster);
9909 }
9910
9911 return SIZET2NUM(grapheme_cluster_count);
9912}
9913
9914static VALUE
9915rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9916{
9917 VALUE orig = str;
9918 rb_encoding *enc = get_encoding(str);
9919 const char *ptr0, *ptr, *end;
9920
9921 if (!rb_enc_unicode_p(enc)) {
9922 return rb_str_enumerate_chars(str, ary);
9923 }
9924
9925 if (!ary) str = rb_str_new_frozen(str);
9926
9927 bool cached_reg_grapheme_cluster = true;
9928 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9929 if (!reg_grapheme_cluster) {
9930 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9931 cached_reg_grapheme_cluster = false;
9932 }
9933
9934 ptr0 = ptr = RSTRING_PTR(str);
9935 end = RSTRING_END(str);
9936
9937 while (ptr < end) {
9938 OnigPosition len = onig_match(reg_grapheme_cluster,
9939 (const OnigUChar *)ptr, (const OnigUChar *)end,
9940 (const OnigUChar *)ptr, NULL, 0);
9941 if (len <= 0) break;
9942 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9943 ptr += len;
9944 }
9945
9946 if (!cached_reg_grapheme_cluster) {
9947 onig_free(reg_grapheme_cluster);
9948 }
9949
9950 RB_GC_GUARD(str);
9951 if (ary)
9952 return ary;
9953 else
9954 return orig;
9955}
9956
9957/*
9958 * call-seq:
9959 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9960 * each_grapheme_cluster -> enumerator
9961 *
9962 * :include: doc/string/each_grapheme_cluster.rdoc
9963 *
9964 */
9965
9966static VALUE
9967rb_str_each_grapheme_cluster(VALUE str)
9968{
9969 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9970 return rb_str_enumerate_grapheme_clusters(str, 0);
9971}
9972
9973/*
9974 * call-seq:
9975 * grapheme_clusters -> array_of_grapheme_clusters
9976 *
9977 * :include: doc/string/grapheme_clusters.rdoc
9978 *
9979 */
9980
9981static VALUE
9982rb_str_grapheme_clusters(VALUE str)
9983{
9984 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9985 return rb_str_enumerate_grapheme_clusters(str, ary);
9986}
9987
9988static long
9989chopped_length(VALUE str)
9990{
9991 rb_encoding *enc = STR_ENC_GET(str);
9992 const char *p, *p2, *beg, *end;
9993
9994 beg = RSTRING_PTR(str);
9995 end = beg + RSTRING_LEN(str);
9996 if (beg >= end) return 0;
9997 p = rb_enc_prev_char(beg, end, end, enc);
9998 if (!p) return 0;
9999 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10000 p2 = rb_enc_prev_char(beg, p, end, enc);
10001 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10002 }
10003 return p - beg;
10004}
10005
10006/*
10007 * call-seq:
10008 * chop! -> self or nil
10009 *
10010 * Like String#chop, except that:
10011 *
10012 * - Removes trailing characters from +self+ (not from a copy of +self+).
10013 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10014 *
10015 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10016 */
10017
10018static VALUE
10019rb_str_chop_bang(VALUE str)
10020{
10021 str_modify_keep_cr(str);
10022 if (RSTRING_LEN(str) > 0) {
10023 long len;
10024 len = chopped_length(str);
10025 STR_SET_LEN(str, len);
10026 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10027 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10029 }
10030 return str;
10031 }
10032 return Qnil;
10033}
10034
10035
10036/*
10037 * call-seq:
10038 * chop -> new_string
10039 *
10040 * :include: doc/string/chop.rdoc
10041 *
10042 */
10043
10044static VALUE
10045rb_str_chop(VALUE str)
10046{
10047 return rb_str_subseq(str, 0, chopped_length(str));
10048}
10049
10050static long
10051smart_chomp(VALUE str, const char *e, const char *p)
10052{
10053 rb_encoding *enc = rb_enc_get(str);
10054 if (rb_enc_mbminlen(enc) > 1) {
10055 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10056 if (rb_enc_is_newline(pp, e, enc)) {
10057 e = pp;
10058 }
10059 pp = e - rb_enc_mbminlen(enc);
10060 if (pp >= p) {
10061 pp = rb_enc_left_char_head(p, pp, e, enc);
10062 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10063 e = pp;
10064 }
10065 }
10066 }
10067 else {
10068 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10069 case '\n':
10070 if (--e > p && *(e-1) == '\r') {
10071 --e;
10072 }
10073 break;
10074 case '\r':
10075 --e;
10076 break;
10077 }
10078 }
10079 return e - p;
10080}
10081
10082static long
10083chompped_length(VALUE str, VALUE rs)
10084{
10085 rb_encoding *enc;
10086 int newline;
10087 char *pp, *e, *rsptr;
10088 long rslen;
10089 char *const p = RSTRING_PTR(str);
10090 long len = RSTRING_LEN(str);
10091
10092 if (len == 0) return 0;
10093 e = p + len;
10094 if (rs == rb_default_rs) {
10095 return smart_chomp(str, e, p);
10096 }
10097
10098 enc = rb_enc_get(str);
10099 RSTRING_GETMEM(rs, rsptr, rslen);
10100 if (rslen == 0) {
10101 if (rb_enc_mbminlen(enc) > 1) {
10102 while (e > p) {
10103 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10104 if (!rb_enc_is_newline(pp, e, enc)) break;
10105 e = pp;
10106 pp -= rb_enc_mbminlen(enc);
10107 if (pp >= p) {
10108 pp = rb_enc_left_char_head(p, pp, e, enc);
10109 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10110 e = pp;
10111 }
10112 }
10113 }
10114 }
10115 else {
10116 while (e > p && *(e-1) == '\n') {
10117 --e;
10118 if (e > p && *(e-1) == '\r')
10119 --e;
10120 }
10121 }
10122 return e - p;
10123 }
10124 if (rslen > len) return len;
10125
10126 enc = rb_enc_get(rs);
10127 newline = rsptr[rslen-1];
10128 if (rslen == rb_enc_mbminlen(enc)) {
10129 if (rslen == 1) {
10130 if (newline == '\n')
10131 return smart_chomp(str, e, p);
10132 }
10133 else {
10134 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10135 return smart_chomp(str, e, p);
10136 }
10137 }
10138
10139 enc = rb_enc_check(str, rs);
10140 if (is_broken_string(rs)) {
10141 return len;
10142 }
10143 pp = e - rslen;
10144 if (p[len-1] == newline &&
10145 (rslen <= 1 ||
10146 memcmp(rsptr, pp, rslen) == 0)) {
10147 if (at_char_boundary(p, pp, e, enc))
10148 return len - rslen;
10149 RB_GC_GUARD(rs);
10150 }
10151 return len;
10152}
10153
10159static VALUE
10160chomp_rs(int argc, const VALUE *argv)
10161{
10162 rb_check_arity(argc, 0, 1);
10163 if (argc > 0) {
10164 VALUE rs = argv[0];
10165 if (!NIL_P(rs)) StringValue(rs);
10166 return rs;
10167 }
10168 else {
10169 return rb_rs;
10170 }
10171}
10172
10173VALUE
10174rb_str_chomp_string(VALUE str, VALUE rs)
10175{
10176 long olen = RSTRING_LEN(str);
10177 long len = chompped_length(str, rs);
10178 if (len >= olen) return Qnil;
10179 str_modify_keep_cr(str);
10180 STR_SET_LEN(str, len);
10181 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10182 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10184 }
10185 return str;
10186}
10187
10188/*
10189 * call-seq:
10190 * chomp!(line_sep = $/) -> self or nil
10191 *
10192 * Like String#chomp, except that:
10193 *
10194 * - Removes trailing characters from +self+ (not from a copy of +self+).
10195 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10196 *
10197 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10198 */
10199
10200static VALUE
10201rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10202{
10203 VALUE rs;
10204 str_modifiable(str);
10205 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10206 rs = chomp_rs(argc, argv);
10207 if (NIL_P(rs)) return Qnil;
10208 return rb_str_chomp_string(str, rs);
10209}
10210
10211
10212/*
10213 * call-seq:
10214 * chomp(line_sep = $/) -> new_string
10215 *
10216 * :include: doc/string/chomp.rdoc
10217 *
10218 */
10219
10220static VALUE
10221rb_str_chomp(int argc, VALUE *argv, VALUE str)
10222{
10223 VALUE rs = chomp_rs(argc, argv);
10224 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10225 return rb_str_subseq(str, 0, chompped_length(str, rs));
10226}
10227
10228static void
10229tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10230 VALUE str, int num_selectors, VALUE *selectors)
10231{
10232 int i;
10233
10234 for (i=0; i<num_selectors; i++) {
10235 VALUE selector = selectors[i];
10236 rb_encoding *enc;
10237
10238 StringValue(selector);
10239 enc = rb_enc_check(str, selector);
10240 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10241 }
10242}
10243
10244static long
10245lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10246{
10247 const char *const start = s;
10248
10249 if (!s || s >= e) return 0;
10250
10251 /* remove spaces at head */
10252 if (single_byte_optimizable(str)) {
10253 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10254 }
10255 else {
10256 while (s < e) {
10257 int n;
10258 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10259
10260 if (cc && !rb_isspace(cc)) break;
10261 s += n;
10262 }
10263 }
10264 return s - start;
10265}
10266
10267static long
10268lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10269 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10270{
10271 const char *const start = s;
10272
10273 if (!s || s >= e) return 0;
10274
10275 /* remove leading characters in the table */
10276 while (s < e) {
10277 int n;
10278 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10279
10280 if (!tr_find(cc, table, del, nodel)) break;
10281 s += n;
10282 }
10283 return s - start;
10284}
10285
10286/*
10287 * call-seq:
10288 * lstrip!(*selectors) -> self or nil
10289 *
10290 * Like String#lstrip, except that:
10291 *
10292 * - Performs stripping in +self+ (not in a copy of +self+).
10293 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10294 *
10295 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10296 */
10297
10298static VALUE
10299rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10300{
10301 rb_encoding *enc;
10302 char *start, *s;
10303 long olen, loffset;
10304
10305 str_modify_keep_cr(str);
10306 enc = STR_ENC_GET(str);
10307 RSTRING_GETMEM(str, start, olen);
10308 if (argc > 0) {
10309 char table[TR_TABLE_SIZE];
10310 VALUE del = 0, nodel = 0;
10311
10312 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10313 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10314 }
10315 else {
10316 loffset = lstrip_offset(str, start, start+olen, enc);
10317 }
10318
10319 if (loffset > 0) {
10320 long len = olen-loffset;
10321 s = start + loffset;
10322 memmove(start, s, len);
10323 STR_SET_LEN(str, len);
10324 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10325 return str;
10326 }
10327 return Qnil;
10328}
10329
10330
10331/*
10332 * call-seq:
10333 * lstrip(*selectors) -> new_string
10334 *
10335 * Returns a copy of +self+ with leading whitespace removed;
10336 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10337 *
10338 * whitespace = "\x00\t\n\v\f\r "
10339 * s = whitespace + 'abc' + whitespace
10340 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10341 * s.lstrip
10342 * # => "abc\u0000\t\n\v\f\r "
10343 *
10344 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10345 *
10346 * s = "---abc+++"
10347 * s.lstrip("-") # => "abc+++"
10348 *
10349 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10350 * and may use any of its valid forms, including negation, ranges, and escapes:
10351 *
10352 * "01234abc56789".lstrip("0-9") # "abc56789"
10353 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10354 *
10355 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10356 */
10357
10358static VALUE
10359rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10360{
10361 char *start;
10362 long len, loffset;
10363
10364 RSTRING_GETMEM(str, start, len);
10365 if (argc > 0) {
10366 char table[TR_TABLE_SIZE];
10367 VALUE del = 0, nodel = 0;
10368
10369 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10370 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10371 }
10372 else {
10373 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10374 }
10375 if (loffset <= 0) return str_duplicate(rb_cString, str);
10376 return rb_str_subseq(str, loffset, len - loffset);
10377}
10378
10379static long
10380rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10381{
10382 const char *t;
10383
10384 rb_str_check_dummy_enc(enc);
10386 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10387 }
10388 if (!s || s >= e) return 0;
10389 t = e;
10390
10391 /* remove trailing spaces or '\0's */
10392 if (single_byte_optimizable(str)) {
10393 unsigned char c;
10394 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10395 }
10396 else {
10397 char *tp;
10398
10399 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10400 unsigned int c = rb_enc_codepoint(tp, e, enc);
10401 if (c && !rb_isspace(c)) break;
10402 t = tp;
10403 }
10404 }
10405 return e - t;
10406}
10407
10408static long
10409rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10410 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10411{
10412 const char *t;
10413 char *tp;
10414
10415 rb_str_check_dummy_enc(enc);
10417 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10418 }
10419 if (!s || s >= e) return 0;
10420 t = e;
10421
10422 /* remove trailing characters in the table */
10423 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10424 unsigned int c = rb_enc_codepoint(tp, e, enc);
10425 if (!tr_find(c, table, del, nodel)) break;
10426 t = tp;
10427 }
10428
10429 return e - t;
10430}
10431
10432/*
10433 * call-seq:
10434 * rstrip!(*selectors) -> self or nil
10435 *
10436 * Like String#rstrip, except that:
10437 *
10438 * - Performs stripping in +self+ (not in a copy of +self+).
10439 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10440 *
10441 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10442 */
10443
10444static VALUE
10445rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10446{
10447 rb_encoding *enc;
10448 char *start;
10449 long olen, roffset;
10450
10451 str_modify_keep_cr(str);
10452 enc = STR_ENC_GET(str);
10453 RSTRING_GETMEM(str, start, olen);
10454 if (argc > 0) {
10455 char table[TR_TABLE_SIZE];
10456 VALUE del = 0, nodel = 0;
10457
10458 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10459 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10460 }
10461 else {
10462 roffset = rstrip_offset(str, start, start+olen, enc);
10463 }
10464 if (roffset > 0) {
10465 long len = olen - roffset;
10466
10467 STR_SET_LEN(str, len);
10468 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10469 return str;
10470 }
10471 return Qnil;
10472}
10473
10474
10475/*
10476 * call-seq:
10477 * rstrip(*selectors) -> new_string
10478 *
10479 * Returns a copy of +self+ with trailing whitespace removed;
10480 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10481 *
10482 * whitespace = "\x00\t\n\v\f\r "
10483 * s = whitespace + 'abc' + whitespace
10484 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10485 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10486 *
10487 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10488 *
10489 * s = "---abc+++"
10490 * s.rstrip("+") # => "---abc"
10491 *
10492 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10493 * and may use any of its valid forms, including negation, ranges, and escapes:
10494 *
10495 * "01234abc56789".rstrip("0-9") # "01234abc"
10496 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10497 *
10498 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10499 */
10500
10501static VALUE
10502rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10503{
10504 rb_encoding *enc;
10505 char *start;
10506 long olen, roffset;
10507
10508 enc = STR_ENC_GET(str);
10509 RSTRING_GETMEM(str, start, olen);
10510 if (argc > 0) {
10511 char table[TR_TABLE_SIZE];
10512 VALUE del = 0, nodel = 0;
10513
10514 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10515 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10516 }
10517 else {
10518 roffset = rstrip_offset(str, start, start+olen, enc);
10519 }
10520 if (roffset <= 0) return str_duplicate(rb_cString, str);
10521 return rb_str_subseq(str, 0, olen-roffset);
10522}
10523
10524
10525/*
10526 * call-seq:
10527 * strip!(*selectors) -> self or nil
10528 *
10529 * Like String#strip, except that:
10530 *
10531 * - Any modifications are made to +self+.
10532 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10533 *
10534 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10535 */
10536
10537static VALUE
10538rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10539{
10540 char *start;
10541 long olen, loffset, roffset;
10542 rb_encoding *enc;
10543
10544 str_modify_keep_cr(str);
10545 enc = STR_ENC_GET(str);
10546 RSTRING_GETMEM(str, start, olen);
10547
10548 if (argc > 0) {
10549 char table[TR_TABLE_SIZE];
10550 VALUE del = 0, nodel = 0;
10551
10552 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10553 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10554 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10555 }
10556 else {
10557 loffset = lstrip_offset(str, start, start+olen, enc);
10558 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10559 }
10560
10561 if (loffset > 0 || roffset > 0) {
10562 long len = olen-roffset;
10563 if (loffset > 0) {
10564 len -= loffset;
10565 memmove(start, start + loffset, len);
10566 }
10567 STR_SET_LEN(str, len);
10568 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10569 return str;
10570 }
10571 return Qnil;
10572}
10573
10574
10575/*
10576 * call-seq:
10577 * strip(*selectors) -> new_string
10578 *
10579 * Returns a copy of +self+ with leading and trailing whitespace removed;
10580 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10581 *
10582 * whitespace = "\x00\t\n\v\f\r "
10583 * s = whitespace + 'abc' + whitespace
10584 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10585 * s.strip # => "abc"
10586 *
10587 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10588 *
10589 * s = "---abc+++"
10590 * s.strip("-+") # => "abc"
10591 * s.strip("+-") # => "abc"
10592 *
10593 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10594 * and may use any of its valid forms, including negation, ranges, and escapes:
10595 *
10596 * "01234abc56789".strip("0-9") # "abc"
10597 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10598 *
10599 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10600 */
10601
10602static VALUE
10603rb_str_strip(int argc, VALUE *argv, VALUE str)
10604{
10605 char *start;
10606 long olen, loffset, roffset;
10607 rb_encoding *enc = STR_ENC_GET(str);
10608
10609 RSTRING_GETMEM(str, start, olen);
10610
10611 if (argc > 0) {
10612 char table[TR_TABLE_SIZE];
10613 VALUE del = 0, nodel = 0;
10614
10615 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10616 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10617 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10618 }
10619 else {
10620 loffset = lstrip_offset(str, start, start+olen, enc);
10621 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10622 }
10623
10624 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10625 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10626}
10627
10628static VALUE
10629scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10630{
10631 VALUE result = Qnil;
10632 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10633 if (pos >= 0) {
10634 VALUE match;
10635 struct re_registers *regs;
10636 if (BUILTIN_TYPE(pat) == T_STRING) {
10637 regs = NULL;
10638 end = pos + RSTRING_LEN(pat);
10639 }
10640 else {
10641 match = rb_backref_get();
10642 regs = RMATCH_REGS(match);
10643 pos = BEG(0);
10644 end = END(0);
10645 }
10646
10647 if (pos == end) {
10648 rb_encoding *enc = STR_ENC_GET(str);
10649 /*
10650 * Always consume at least one character of the input string
10651 */
10652 if (RSTRING_LEN(str) > end)
10653 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10654 RSTRING_END(str), enc);
10655 else
10656 *start = end + 1;
10657 }
10658 else {
10659 *start = end;
10660 }
10661
10662 if (!regs || regs->num_regs == 1) {
10663 result = rb_str_subseq(str, pos, end - pos);
10664 return result;
10665 }
10666 else {
10667 result = rb_ary_new2(regs->num_regs);
10668 for (int i = 1; i < regs->num_regs; i++) {
10669 VALUE s = Qnil;
10670 if (BEG(i) >= 0) {
10671 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10672 }
10673
10674 rb_ary_push(result, s);
10675 }
10676 }
10677
10678 RB_GC_GUARD(match);
10679 }
10680
10681 return result;
10682}
10683
10684
10685/*
10686 * call-seq:
10687 * scan(pattern) -> array_of_results
10688 * scan(pattern) {|result| ... } -> self
10689 *
10690 * :include: doc/string/scan.rdoc
10691 *
10692 */
10693
10694static VALUE
10695rb_str_scan(VALUE str, VALUE pat)
10696{
10697 VALUE result;
10698 long start = 0;
10699 long last = -1, prev = 0;
10700 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10701
10702 pat = get_pat_quoted(pat, 1);
10703 mustnot_broken(str);
10704 if (!rb_block_given_p()) {
10705 VALUE ary = rb_ary_new();
10706
10707 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10708 last = prev;
10709 prev = start;
10710 rb_ary_push(ary, result);
10711 }
10712 if (last >= 0) rb_pat_search(pat, str, last, 1);
10713 else rb_backref_set(Qnil);
10714 return ary;
10715 }
10716
10717 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10718 last = prev;
10719 prev = start;
10720 rb_yield(result);
10721 str_mod_check(str, p, len);
10722 }
10723 if (last >= 0) rb_pat_search(pat, str, last, 1);
10724 return str;
10725}
10726
10727
10728/*
10729 * call-seq:
10730 * hex -> integer
10731 *
10732 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10733 * returns its value as an integer.
10734 *
10735 * The leading substring is interpreted as hexadecimal when it begins with:
10736 *
10737 * - One or more characters representing hexadecimal digits
10738 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10739 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10740 *
10741 * 'f'.hex # => 15
10742 * '11'.hex # => 17
10743 * 'FFF'.hex # => 4095
10744 * 'fffg'.hex # => 4095
10745 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10746 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10747 * 'deadbeef'.hex # => 3735928559
10748 *
10749 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10750 *
10751 * '0xfff'.hex # => 4095
10752 * '0xfffg'.hex # => 4095
10753 *
10754 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10755 *
10756 * '-fff'.hex # => -4095
10757 * '-0xFFF'.hex # => -4095
10758 *
10759 * For any substring not described above, returns zero:
10760 *
10761 * 'xxx'.hex # => 0
10762 * ''.hex # => 0
10763 *
10764 * Note that, unlike #oct, this method interprets only hexadecimal,
10765 * and not binary, octal, or decimal notations:
10766 *
10767 * '0b111'.hex # => 45329
10768 * '0o777'.hex # => 0
10769 * '0d999'.hex # => 55705
10770 *
10771 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10772 */
10773
10774static VALUE
10775rb_str_hex(VALUE str)
10776{
10777 return rb_str_to_inum(str, 16, FALSE);
10778}
10779
10780
10781/*
10782 * call-seq:
10783 * oct -> integer
10784 *
10785 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10786 * returns their value as an integer.
10787 *
10788 * In brief:
10789 *
10790 * # Interpreted as octal.
10791 * '777'.oct # => 511
10792 * '777x'.oct # => 511
10793 * '0777'.oct # => 511
10794 * '0o777'.oct # => 511
10795 * '-777'.oct # => -511
10796 * # Not interpreted as octal.
10797 * '0b111'.oct # => 7 # Interpreted as binary.
10798 * '0d999'.oct # => 999 # Interpreted as decimal.
10799 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10800 *
10801 * The leading substring is interpreted as octal when it begins with:
10802 *
10803 * - One or more characters representing octal digits
10804 * (each in the range <tt>'0'..'7'</tt>);
10805 * the string to be interpreted ends at the first character that does not represent an octal digit:
10806 *
10807 * '7'.oct # => 7
10808 * '11'.oct # => 9
10809 * '777'.oct # => 511
10810 * '0777'.oct # => 511
10811 * '7778'.oct # => 511
10812 * '777x'.oct # => 511
10813 *
10814 * - <tt>'0o'</tt>, followed by one or more octal digits:
10815 *
10816 * '0o777'.oct # => 511
10817 * '0o7778'.oct # => 511
10818 *
10819 * The leading substring is _not_ interpreted as octal when it begins with:
10820 *
10821 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10822 * (each in the range <tt>'0'..'1'</tt>);
10823 * the string to be interpreted ends at the first character that does not represent a binary digit.
10824 * the string is interpreted as binary digits (base 2):
10825 *
10826 * '0b111'.oct # => 7
10827 * '0b1112'.oct # => 7
10828 *
10829 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10830 * (each in the range <tt>'0'..'9'</tt>);
10831 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10832 * the string is interpreted as decimal digits (base 10):
10833 *
10834 * '0d999'.oct # => 999
10835 * '0d999x'.oct # => 999
10836 *
10837 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10838 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10839 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10840 * the string is interpreted as hexadecimal digits (base 16):
10841 *
10842 * '0xfff'.oct # => 4095
10843 * '0xfffg'.oct # => 4095
10844 *
10845 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10846 *
10847 * '-777'.oct # => -511
10848 * '-0777'.oct # => -511
10849 * '-0b111'.oct # => -7
10850 * '-0xfff'.oct # => -4095
10851 *
10852 * For any substring not described above, returns zero:
10853 *
10854 * 'foo'.oct # => 0
10855 * ''.oct # => 0
10856 *
10857 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10858 */
10859
10860static VALUE
10861rb_str_oct(VALUE str)
10862{
10863 return rb_str_to_inum(str, -8, FALSE);
10864}
10865
10866#ifndef HAVE_CRYPT_R
10867# include "ruby/thread_native.h"
10868# include "ruby/atomic.h"
10869
10870static struct {
10871 rb_nativethread_lock_t lock;
10872} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10873#endif
10874
10875/*
10876 * call-seq:
10877 * crypt(salt_str) -> new_string
10878 *
10879 * Returns the string generated by calling <code>crypt(3)</code>
10880 * standard library function with <code>str</code> and
10881 * <code>salt_str</code>, in this order, as its arguments. Please do
10882 * not use this method any longer. It is legacy; provided only for
10883 * backward compatibility with ruby scripts in earlier days. It is
10884 * bad to use in contemporary programs for several reasons:
10885 *
10886 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10887 * run. The generated string lacks data portability.
10888 *
10889 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10890 * (i.e. silently ends up in unexpected results).
10891 *
10892 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10893 * thread safe.
10894 *
10895 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10896 * very very weak. According to its manpage, Linux's traditional
10897 * <code>crypt(3)</code> output has only 2**56 variations; too
10898 * easy to brute force today. And this is the default behaviour.
10899 *
10900 * * In order to make things robust some OSes implement so-called
10901 * "modular" usage. To go through, you have to do a complex
10902 * build-up of the <code>salt_str</code> parameter, by hand.
10903 * Failure in generation of a proper salt string tends not to
10904 * yield any errors; typos in parameters are normally not
10905 * detectable.
10906 *
10907 * * For instance, in the following example, the second invocation
10908 * of String#crypt is wrong; it has a typo in "round=" (lacks
10909 * "s"). However the call does not fail and something unexpected
10910 * is generated.
10911 *
10912 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10913 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10914 *
10915 * * Even in the "modular" mode, some hash functions are considered
10916 * archaic and no longer recommended at all; for instance module
10917 * <code>$1$</code> is officially abandoned by its author: see
10918 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10919 * instance module <code>$3$</code> is considered completely
10920 * broken: see the manpage of FreeBSD.
10921 *
10922 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
10923 * written above, <code>crypt(3)</code> on Mac OS never fails.
10924 * This means even if you build up a proper salt string it
10925 * generates a traditional DES hash anyway, and there is no way
10926 * for you to be aware of it.
10927 *
10928 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10929 *
10930 * If for some reason you cannot migrate to other secure contemporary
10931 * password hashing algorithms, install the string-crypt gem and
10932 * <code>require 'string/crypt'</code> to continue using it.
10933 */
10934
10935static VALUE
10936rb_str_crypt(VALUE str, VALUE salt)
10937{
10938#ifdef HAVE_CRYPT_R
10939 VALUE databuf;
10940 struct crypt_data *data;
10941# define CRYPT_END() ALLOCV_END(databuf)
10942#else
10943 char *tmp_buf;
10944 extern char *crypt(const char *, const char *);
10945# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10946#endif
10947 VALUE result;
10948 const char *s, *saltp;
10949 char *res;
10950#ifdef BROKEN_CRYPT
10951 char salt_8bit_clean[3];
10952#endif
10953
10954 StringValue(salt);
10955 mustnot_wchar(str);
10956 mustnot_wchar(salt);
10957 s = StringValueCStr(str);
10958 saltp = RSTRING_PTR(salt);
10959 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10960 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10961 }
10962
10963#ifdef BROKEN_CRYPT
10964 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10965 salt_8bit_clean[0] = saltp[0] & 0x7f;
10966 salt_8bit_clean[1] = saltp[1] & 0x7f;
10967 salt_8bit_clean[2] = '\0';
10968 saltp = salt_8bit_clean;
10969 }
10970#endif
10971#ifdef HAVE_CRYPT_R
10972 data = ALLOCV(databuf, sizeof(struct crypt_data));
10973# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10974 data->initialized = 0;
10975# endif
10976 res = crypt_r(s, saltp, data);
10977#else
10978 rb_nativethread_lock_lock(&crypt_mutex.lock);
10979 res = crypt(s, saltp);
10980#endif
10981 if (!res) {
10982 int err = errno;
10983 CRYPT_END();
10984 rb_syserr_fail(err, "crypt");
10985 }
10986#ifdef HAVE_CRYPT_R
10987 result = rb_str_new_cstr(res);
10988 CRYPT_END();
10989#else
10990 // We need to copy this buffer because it's static and we need to unlock the mutex
10991 // before allocating a new object (the string to be returned). If we allocate while
10992 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10993 // if other ractors are waiting on this lock.
10994 size_t res_size = strlen(res)+1;
10995 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10996 memcpy(tmp_buf, res, res_size);
10997 res = tmp_buf;
10998 CRYPT_END();
10999 result = rb_str_new_cstr(res);
11000#endif
11001 return result;
11002}
11003
11004
11005/*
11006 * call-seq:
11007 * ord -> integer
11008 *
11009 * :include: doc/string/ord.rdoc
11010 *
11011 */
11012
11013static VALUE
11014rb_str_ord(VALUE s)
11015{
11016 unsigned int c;
11017
11018 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11019 return UINT2NUM(c);
11020}
11021/*
11022 * call-seq:
11023 * sum(n = 16) -> integer
11024 *
11025 * :include: doc/string/sum.rdoc
11026 *
11027 */
11028
11029static VALUE
11030rb_str_sum(int argc, VALUE *argv, VALUE str)
11031{
11032 int bits = 16;
11033 char *ptr, *p, *pend;
11034 long len;
11035 VALUE sum = INT2FIX(0);
11036 unsigned long sum0 = 0;
11037
11038 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11039 bits = 0;
11040 }
11041 ptr = p = RSTRING_PTR(str);
11042 len = RSTRING_LEN(str);
11043 pend = p + len;
11044
11045 while (p < pend) {
11046 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11047 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11048 str_mod_check(str, ptr, len);
11049 sum0 = 0;
11050 }
11051 sum0 += (unsigned char)*p;
11052 p++;
11053 }
11054
11055 if (bits == 0) {
11056 if (sum0) {
11057 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11058 }
11059 }
11060 else {
11061 if (sum == INT2FIX(0)) {
11062 if (bits < (int)sizeof(long)*CHAR_BIT) {
11063 sum0 &= (((unsigned long)1)<<bits)-1;
11064 }
11065 sum = LONG2FIX(sum0);
11066 }
11067 else {
11068 VALUE mod;
11069
11070 if (sum0) {
11071 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11072 }
11073
11074 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11075 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11076 sum = rb_funcall(sum, '&', 1, mod);
11077 }
11078 }
11079 return sum;
11080}
11081
11082static VALUE
11083rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11084{
11085 rb_encoding *enc;
11086 VALUE w;
11087 long width, len, flen = 1, fclen = 1;
11088 VALUE res;
11089 char *p;
11090 const char *f = " ";
11091 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11092 VALUE pad;
11093 int singlebyte = 1, cr;
11094 int termlen;
11095
11096 rb_scan_args(argc, argv, "11", &w, &pad);
11097 enc = STR_ENC_GET(str);
11098 termlen = rb_enc_mbminlen(enc);
11099 width = NUM2LONG(w);
11100 if (argc == 2) {
11101 StringValue(pad);
11102 enc = rb_enc_check(str, pad);
11103 f = RSTRING_PTR(pad);
11104 flen = RSTRING_LEN(pad);
11105 fclen = str_strlen(pad, enc); /* rb_enc_check */
11106 singlebyte = single_byte_optimizable(pad);
11107 if (flen == 0 || fclen == 0) {
11108 rb_raise(rb_eArgError, "zero width padding");
11109 }
11110 }
11111 len = str_strlen(str, enc); /* rb_enc_check */
11112 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11113 n = width - len;
11114 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11115 rlen = n - llen;
11116 cr = ENC_CODERANGE(str);
11117 if (flen > 1) {
11118 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11119 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11120 }
11121 size = RSTRING_LEN(str);
11122 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11123 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11124 (len += llen2 + rlen2) >= LONG_MAX - size) {
11125 rb_raise(rb_eArgError, "argument too big");
11126 }
11127 len += size;
11128 res = str_enc_new(rb_cString, 0, len, enc);
11129 p = RSTRING_PTR(res);
11130 if (flen <= 1) {
11131 memset(p, *f, llen);
11132 p += llen;
11133 }
11134 else {
11135 while (llen >= fclen) {
11136 memcpy(p,f,flen);
11137 p += flen;
11138 llen -= fclen;
11139 }
11140 if (llen > 0) {
11141 memcpy(p, f, llen2);
11142 p += llen2;
11143 }
11144 }
11145 memcpy(p, RSTRING_PTR(str), size);
11146 p += size;
11147 if (flen <= 1) {
11148 memset(p, *f, rlen);
11149 p += rlen;
11150 }
11151 else {
11152 while (rlen >= fclen) {
11153 memcpy(p,f,flen);
11154 p += flen;
11155 rlen -= fclen;
11156 }
11157 if (rlen > 0) {
11158 memcpy(p, f, rlen2);
11159 p += rlen2;
11160 }
11161 }
11162 TERM_FILL(p, termlen);
11163 STR_SET_LEN(res, p-RSTRING_PTR(res));
11164
11165 if (argc == 2)
11166 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11167 if (cr != ENC_CODERANGE_BROKEN)
11168 ENC_CODERANGE_SET(res, cr);
11169
11170 RB_GC_GUARD(pad);
11171 return res;
11172}
11173
11174
11175/*
11176 * call-seq:
11177 * ljust(width, pad_string = ' ') -> new_string
11178 *
11179 * :include: doc/string/ljust.rdoc
11180 *
11181 */
11182
11183static VALUE
11184rb_str_ljust(int argc, VALUE *argv, VALUE str)
11185{
11186 return rb_str_justify(argc, argv, str, 'l');
11187}
11188
11189/*
11190 * call-seq:
11191 * rjust(width, pad_string = ' ') -> new_string
11192 *
11193 * :include: doc/string/rjust.rdoc
11194 *
11195 */
11196
11197static VALUE
11198rb_str_rjust(int argc, VALUE *argv, VALUE str)
11199{
11200 return rb_str_justify(argc, argv, str, 'r');
11201}
11202
11203
11204/*
11205 * call-seq:
11206 * center(size, pad_string = ' ') -> new_string
11207 *
11208 * :include: doc/string/center.rdoc
11209 *
11210 */
11211
11212static VALUE
11213rb_str_center(int argc, VALUE *argv, VALUE str)
11214{
11215 return rb_str_justify(argc, argv, str, 'c');
11216}
11217
11218/*
11219 * call-seq:
11220 * partition(pattern) -> [pre_match, first_match, post_match]
11221 *
11222 * :include: doc/string/partition.rdoc
11223 *
11224 */
11225
11226static VALUE
11227rb_str_partition(VALUE str, VALUE sep)
11228{
11229 long pos;
11230
11231 sep = get_pat_quoted(sep, 0);
11232 if (RB_TYPE_P(sep, T_REGEXP)) {
11233 if (rb_reg_search(sep, str, 0, 0) < 0) {
11234 goto failed;
11235 }
11236 VALUE match = rb_backref_get();
11237 struct re_registers *regs = RMATCH_REGS(match);
11238
11239 pos = BEG(0);
11240 sep = rb_str_subseq(str, pos, END(0) - pos);
11241 }
11242 else {
11243 pos = rb_str_index(str, sep, 0);
11244 if (pos < 0) goto failed;
11245 }
11246 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11247 sep,
11248 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11249 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11250
11251 failed:
11252 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11253}
11254
11255/*
11256 * call-seq:
11257 * rpartition(pattern) -> [pre_match, last_match, post_match]
11258 *
11259 * :include: doc/string/rpartition.rdoc
11260 *
11261 */
11262
11263static VALUE
11264rb_str_rpartition(VALUE str, VALUE sep)
11265{
11266 long pos = RSTRING_LEN(str);
11267
11268 sep = get_pat_quoted(sep, 0);
11269 if (RB_TYPE_P(sep, T_REGEXP)) {
11270 if (rb_reg_search(sep, str, pos, 1) < 0) {
11271 goto failed;
11272 }
11273 VALUE match = rb_backref_get();
11274 struct re_registers *regs = RMATCH_REGS(match);
11275
11276 pos = BEG(0);
11277 sep = rb_str_subseq(str, pos, END(0) - pos);
11278 }
11279 else {
11280 pos = rb_str_sublen(str, pos);
11281 pos = rb_str_rindex(str, sep, pos);
11282 if (pos < 0) {
11283 goto failed;
11284 }
11285 }
11286
11287 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11288 sep,
11289 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11290 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11291 failed:
11292 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11293}
11294
11295/*
11296 * call-seq:
11297 * start_with?(*patterns) -> true or false
11298 *
11299 * :include: doc/string/start_with_p.rdoc
11300 *
11301 */
11302
11303static VALUE
11304rb_str_start_with(int argc, VALUE *argv, VALUE str)
11305{
11306 int i;
11307
11308 for (i=0; i<argc; i++) {
11309 VALUE tmp = argv[i];
11310 if (RB_TYPE_P(tmp, T_REGEXP)) {
11311 if (rb_reg_start_with_p(tmp, str))
11312 return Qtrue;
11313 }
11314 else {
11315 const char *p, *s, *e;
11316 long slen, tlen;
11317 rb_encoding *enc;
11318
11319 StringValue(tmp);
11320 enc = rb_enc_check(str, tmp);
11321 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11322 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11323 p = RSTRING_PTR(str);
11324 e = p + slen;
11325 s = p + tlen;
11326 if (!at_char_right_boundary(p, s, e, enc))
11327 continue;
11328 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11329 return Qtrue;
11330 }
11331 }
11332 return Qfalse;
11333}
11334
11335/*
11336 * call-seq:
11337 * end_with?(*strings) -> true or false
11338 *
11339 * :include: doc/string/end_with_p.rdoc
11340 *
11341 */
11342
11343static VALUE
11344rb_str_end_with(int argc, VALUE *argv, VALUE str)
11345{
11346 int i;
11347
11348 for (i=0; i<argc; i++) {
11349 VALUE tmp = argv[i];
11350 const char *p, *s, *e;
11351 long slen, tlen;
11352 rb_encoding *enc;
11353
11354 StringValue(tmp);
11355 enc = rb_enc_check(str, tmp);
11356 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11357 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11358 p = RSTRING_PTR(str);
11359 e = p + slen;
11360 s = e - tlen;
11361 if (!at_char_boundary(p, s, e, enc))
11362 continue;
11363 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11364 return Qtrue;
11365 }
11366 return Qfalse;
11367}
11368
11378static long
11379deleted_prefix_length(VALUE str, VALUE prefix)
11380{
11381 const char *strptr, *prefixptr;
11382 long olen, prefixlen;
11383 rb_encoding *enc = rb_enc_get(str);
11384
11385 StringValue(prefix);
11386
11387 if (!is_broken_string(prefix) ||
11388 !rb_enc_asciicompat(enc) ||
11389 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11390 enc = rb_enc_check(str, prefix);
11391 }
11392
11393 /* return 0 if not start with prefix */
11394 prefixlen = RSTRING_LEN(prefix);
11395 if (prefixlen <= 0) return 0;
11396 olen = RSTRING_LEN(str);
11397 if (olen < prefixlen) return 0;
11398 strptr = RSTRING_PTR(str);
11399 prefixptr = RSTRING_PTR(prefix);
11400 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11401 if (is_broken_string(prefix)) {
11402 if (!is_broken_string(str)) {
11403 /* prefix in a valid string cannot be broken */
11404 return 0;
11405 }
11406 const char *strend = strptr + olen;
11407 const char *after_prefix = strptr + prefixlen;
11408 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11409 /* prefix does not end at char-boundary */
11410 return 0;
11411 }
11412 }
11413 /* prefix part in `str` also should be valid. */
11414
11415 return prefixlen;
11416}
11417
11418/*
11419 * call-seq:
11420 * delete_prefix!(prefix) -> self or nil
11421 *
11422 * Like String#delete_prefix, except that +self+ is modified in place;
11423 * returns +self+ if the prefix is removed, +nil+ otherwise.
11424 *
11425 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11426 */
11427
11428static VALUE
11429rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11430{
11431 long prefixlen;
11432 str_modify_keep_cr(str);
11433
11434 prefixlen = deleted_prefix_length(str, prefix);
11435 if (prefixlen <= 0) return Qnil;
11436
11437 return rb_str_drop_bytes(str, prefixlen);
11438}
11439
11440/*
11441 * call-seq:
11442 * delete_prefix(prefix) -> new_string
11443 *
11444 * :include: doc/string/delete_prefix.rdoc
11445 *
11446 */
11447
11448static VALUE
11449rb_str_delete_prefix(VALUE str, VALUE prefix)
11450{
11451 long prefixlen;
11452
11453 prefixlen = deleted_prefix_length(str, prefix);
11454 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11455
11456 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11457}
11458
11468static long
11469deleted_suffix_length(VALUE str, VALUE suffix)
11470{
11471 const char *strptr, *suffixptr;
11472 long olen, suffixlen;
11473 rb_encoding *enc;
11474
11475 StringValue(suffix);
11476 if (is_broken_string(suffix)) return 0;
11477 enc = rb_enc_check(str, suffix);
11478
11479 /* return 0 if not start with suffix */
11480 suffixlen = RSTRING_LEN(suffix);
11481 if (suffixlen <= 0) return 0;
11482 olen = RSTRING_LEN(str);
11483 if (olen < suffixlen) return 0;
11484 strptr = RSTRING_PTR(str);
11485 suffixptr = RSTRING_PTR(suffix);
11486 const char *strend = strptr + olen;
11487 const char *before_suffix = strend - suffixlen;
11488 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11489 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11490
11491 return suffixlen;
11492}
11493
11494/*
11495 * call-seq:
11496 * delete_suffix!(suffix) -> self or nil
11497 *
11498 * Like String#delete_suffix, except that +self+ is modified in place;
11499 * returns +self+ if the suffix is removed, +nil+ otherwise.
11500 *
11501 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11502 */
11503
11504static VALUE
11505rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11506{
11507 long olen, suffixlen, len;
11508 str_modifiable(str);
11509
11510 suffixlen = deleted_suffix_length(str, suffix);
11511 if (suffixlen <= 0) return Qnil;
11512
11513 olen = RSTRING_LEN(str);
11514 str_modify_keep_cr(str);
11515 len = olen - suffixlen;
11516 STR_SET_LEN(str, len);
11517 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11518 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11520 }
11521 return str;
11522}
11523
11524/*
11525 * call-seq:
11526 * delete_suffix(suffix) -> new_string
11527 *
11528 * :include: doc/string/delete_suffix.rdoc
11529 *
11530 */
11531
11532static VALUE
11533rb_str_delete_suffix(VALUE str, VALUE suffix)
11534{
11535 long suffixlen;
11536
11537 suffixlen = deleted_suffix_length(str, suffix);
11538 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11539
11540 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11541}
11542
11543void
11544rb_str_setter(VALUE val, ID id, VALUE *var)
11545{
11546 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11547 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11548 }
11549 *var = val;
11550}
11551
11552static void
11553nil_setter_warning(ID id)
11554{
11555 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11556}
11557
11558void
11559rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11560{
11561 rb_str_setter(val, id, var);
11562 if (!NIL_P(*var)) {
11563 nil_setter_warning(id);
11564 }
11565}
11566
11567static void
11568rb_fs_setter(VALUE val, ID id, VALUE *var)
11569{
11570 val = rb_fs_check(val);
11571 if (!val) {
11572 rb_raise(rb_eTypeError,
11573 "value of %"PRIsVALUE" must be String or Regexp",
11574 rb_id2str(id));
11575 }
11576 if (!NIL_P(val)) {
11577 nil_setter_warning(id);
11578 }
11579 *var = val;
11580}
11581
11582
11583/*
11584 * call-seq:
11585 * force_encoding(encoding) -> self
11586 *
11587 * :include: doc/string/force_encoding.rdoc
11588 *
11589 */
11590
11591static VALUE
11592rb_str_force_encoding(VALUE str, VALUE enc)
11593{
11594 str_modifiable(str);
11595
11596 rb_encoding *encoding = rb_to_encoding(enc);
11597 int idx = rb_enc_to_index(encoding);
11598
11599 // If the encoding is unchanged, we do nothing.
11600 if (ENCODING_GET(str) == idx) {
11601 return str;
11602 }
11603
11604 rb_enc_associate_index(str, idx);
11605
11606 // If the coderange was 7bit and the new encoding is ASCII-compatible
11607 // we can keep the coderange.
11608 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11609 return str;
11610 }
11611
11613 return str;
11614}
11615
11616/*
11617 * call-seq:
11618 * b -> new_string
11619 *
11620 * :include: doc/string/b.rdoc
11621 *
11622 */
11623
11624static VALUE
11625rb_str_b(VALUE str)
11626{
11627 VALUE str2;
11628 if (STR_EMBED_P(str)) {
11629 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11630 }
11631 else {
11632 str2 = str_alloc_heap(rb_cString);
11633 }
11634 str_replace_shared_without_enc(str2, str);
11635
11636 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11637 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11638 // If we know the receiver's code range then we know the result's code range.
11639 int cr = ENC_CODERANGE(str);
11640 switch (cr) {
11641 case ENC_CODERANGE_7BIT:
11643 break;
11647 break;
11648 default:
11649 ENC_CODERANGE_CLEAR(str2);
11650 break;
11651 }
11652 }
11653
11654 return str2;
11655}
11656
11657/*
11658 * call-seq:
11659 * valid_encoding? -> true or false
11660 *
11661 * :include: doc/string/valid_encoding_p.rdoc
11662 *
11663 */
11664
11665static VALUE
11666rb_str_valid_encoding_p(VALUE str)
11667{
11668 int cr = rb_enc_str_coderange(str);
11669
11670 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11671}
11672
11673/*
11674 * call-seq:
11675 * ascii_only? -> true or false
11676 *
11677 * Returns whether +self+ contains only ASCII characters:
11678 *
11679 * 'abc'.ascii_only? # => true
11680 * "abc\u{6666}".ascii_only? # => false
11681 *
11682 * Related: see {Querying}[rdoc-ref:String@Querying].
11683 */
11684
11685static VALUE
11686rb_str_is_ascii_only_p(VALUE str)
11687{
11688 int cr = rb_enc_str_coderange(str);
11689
11690 return RBOOL(cr == ENC_CODERANGE_7BIT);
11691}
11692
11693VALUE
11695{
11696 static const char ellipsis[] = "...";
11697 const long ellipsislen = sizeof(ellipsis) - 1;
11698 rb_encoding *const enc = rb_enc_get(str);
11699 const long blen = RSTRING_LEN(str);
11700 const char *const p = RSTRING_PTR(str), *e = p + blen;
11701 VALUE estr, ret = 0;
11702
11703 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11704 if (len * rb_enc_mbminlen(enc) >= blen ||
11705 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11706 ret = str;
11707 }
11708 else if (len <= ellipsislen ||
11709 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11710 if (rb_enc_asciicompat(enc)) {
11711 ret = rb_str_new(ellipsis, len);
11712 rb_enc_associate(ret, enc);
11713 }
11714 else {
11715 estr = rb_usascii_str_new(ellipsis, len);
11716 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11717 }
11718 }
11719 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11720 rb_str_cat(ret, ellipsis, ellipsislen);
11721 }
11722 else {
11723 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11724 rb_enc_from_encoding(enc), 0, Qnil);
11725 rb_str_append(ret, estr);
11726 }
11727 return ret;
11728}
11729
11730static VALUE
11731str_compat_and_valid(VALUE str, rb_encoding *enc)
11732{
11733 int cr;
11734 str = StringValue(str);
11735 cr = rb_enc_str_coderange(str);
11736 if (cr == ENC_CODERANGE_BROKEN) {
11737 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11738 }
11739 else {
11740 rb_encoding *e = STR_ENC_GET(str);
11741 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11742 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11743 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11744 }
11745 }
11746 return str;
11747}
11748
11749static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11750
11751VALUE
11753{
11754 rb_encoding *enc = STR_ENC_GET(str);
11755 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11756}
11757
11758VALUE
11759rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11760{
11761 int cr = ENC_CODERANGE_UNKNOWN;
11762 if (enc == STR_ENC_GET(str)) {
11763 /* cached coderange makes sense only when enc equals the
11764 * actual encoding of str */
11765 cr = ENC_CODERANGE(str);
11766 }
11767 return enc_str_scrub(enc, str, repl, cr);
11768}
11769
11770static VALUE
11771enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11772{
11773 int encidx;
11774 VALUE buf = Qnil;
11775 const char *rep, *p, *e, *p1, *sp;
11776 long replen = -1;
11777 long slen;
11778
11779 if (rb_block_given_p()) {
11780 if (!NIL_P(repl))
11781 rb_raise(rb_eArgError, "both of block and replacement given");
11782 replen = 0;
11783 }
11784
11785 if (ENC_CODERANGE_CLEAN_P(cr))
11786 return Qnil;
11787
11788 if (!NIL_P(repl)) {
11789 repl = str_compat_and_valid(repl, enc);
11790 }
11791
11792 if (rb_enc_dummy_p(enc)) {
11793 return Qnil;
11794 }
11795 encidx = rb_enc_to_index(enc);
11796
11797#define DEFAULT_REPLACE_CHAR(str) do { \
11798 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11799 rep = replace; replen = (int)sizeof(replace); \
11800 } while (0)
11801
11802 slen = RSTRING_LEN(str);
11803 p = RSTRING_PTR(str);
11804 e = RSTRING_END(str);
11805 p1 = p;
11806 sp = p;
11807
11808 if (rb_enc_asciicompat(enc)) {
11809 int rep7bit_p;
11810 if (!replen) {
11811 rep = NULL;
11812 rep7bit_p = FALSE;
11813 }
11814 else if (!NIL_P(repl)) {
11815 rep = RSTRING_PTR(repl);
11816 replen = RSTRING_LEN(repl);
11817 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11818 }
11819 else if (encidx == rb_utf8_encindex()) {
11820 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11821 rep7bit_p = FALSE;
11822 }
11823 else {
11824 DEFAULT_REPLACE_CHAR("?");
11825 rep7bit_p = TRUE;
11826 }
11827 cr = ENC_CODERANGE_7BIT;
11828
11829 p = search_nonascii(p, e);
11830 if (!p) {
11831 p = e;
11832 }
11833 while (p < e) {
11834 int ret = rb_enc_precise_mbclen(p, e, enc);
11835 if (MBCLEN_NEEDMORE_P(ret)) {
11836 break;
11837 }
11838 else if (MBCLEN_CHARFOUND_P(ret)) {
11840 p += MBCLEN_CHARFOUND_LEN(ret);
11841 }
11842 else if (MBCLEN_INVALID_P(ret)) {
11843 /*
11844 * p1~p: valid ascii/multibyte chars
11845 * p ~e: invalid bytes + unknown bytes
11846 */
11847 long clen = rb_enc_mbmaxlen(enc);
11848 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11849 if (p > p1) {
11850 rb_str_buf_cat(buf, p1, p - p1);
11851 }
11852
11853 if (e - p < clen) clen = e - p;
11854 if (clen <= 2) {
11855 clen = 1;
11856 }
11857 else {
11858 const char *q = p;
11859 clen--;
11860 for (; clen > 1; clen--) {
11861 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11862 if (MBCLEN_NEEDMORE_P(ret)) break;
11863 if (MBCLEN_INVALID_P(ret)) continue;
11865 }
11866 }
11867 if (rep) {
11868 rb_str_buf_cat(buf, rep, replen);
11869 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11870 }
11871 else {
11872 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11873 str_mod_check(str, sp, slen);
11874 repl = str_compat_and_valid(repl, enc);
11875 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11878 }
11879 p += clen;
11880 p1 = p;
11881 p = search_nonascii(p, e);
11882 if (!p) {
11883 p = e;
11884 break;
11885 }
11886 }
11887 else {
11889 }
11890 }
11891 if (NIL_P(buf)) {
11892 if (p == e) {
11893 ENC_CODERANGE_SET(str, cr);
11894 return Qnil;
11895 }
11896 buf = rb_str_buf_new(RSTRING_LEN(str));
11897 }
11898 if (p1 < p) {
11899 rb_str_buf_cat(buf, p1, p - p1);
11900 }
11901 if (p < e) {
11902 if (rep) {
11903 rb_str_buf_cat(buf, rep, replen);
11904 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11905 }
11906 else {
11907 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11908 str_mod_check(str, sp, slen);
11909 repl = str_compat_and_valid(repl, enc);
11910 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11913 }
11914 }
11915 }
11916 else {
11917 /* ASCII incompatible */
11918 long mbminlen = rb_enc_mbminlen(enc);
11919 if (!replen) {
11920 rep = NULL;
11921 }
11922 else if (!NIL_P(repl)) {
11923 rep = RSTRING_PTR(repl);
11924 replen = RSTRING_LEN(repl);
11925 }
11926 else if (encidx == ENCINDEX_UTF_16BE) {
11927 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11928 }
11929 else if (encidx == ENCINDEX_UTF_16LE) {
11930 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11931 }
11932 else if (encidx == ENCINDEX_UTF_32BE) {
11933 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11934 }
11935 else if (encidx == ENCINDEX_UTF_32LE) {
11936 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11937 }
11938 else {
11939 DEFAULT_REPLACE_CHAR("?");
11940 }
11941
11942 while (p < e) {
11943 int ret = rb_enc_precise_mbclen(p, e, enc);
11944 if (MBCLEN_NEEDMORE_P(ret)) {
11945 break;
11946 }
11947 else if (MBCLEN_CHARFOUND_P(ret)) {
11948 p += MBCLEN_CHARFOUND_LEN(ret);
11949 }
11950 else if (MBCLEN_INVALID_P(ret)) {
11951 const char *q = p;
11952 long clen = rb_enc_mbmaxlen(enc);
11953 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11954 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11955
11956 if (e - p < clen) clen = e - p;
11957 if (clen <= mbminlen * 2) {
11958 clen = mbminlen;
11959 }
11960 else {
11961 clen -= mbminlen;
11962 for (; clen > mbminlen; clen-=mbminlen) {
11963 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11964 if (MBCLEN_NEEDMORE_P(ret)) break;
11965 if (MBCLEN_INVALID_P(ret)) continue;
11967 }
11968 }
11969 if (rep) {
11970 rb_str_buf_cat(buf, rep, replen);
11971 }
11972 else {
11973 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11974 str_mod_check(str, sp, slen);
11975 repl = str_compat_and_valid(repl, enc);
11976 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11977 }
11978 p += clen;
11979 p1 = p;
11980 }
11981 else {
11983 }
11984 }
11985 if (NIL_P(buf)) {
11986 if (p == e) {
11988 return Qnil;
11989 }
11990 buf = rb_str_buf_new(RSTRING_LEN(str));
11991 }
11992 if (p1 < p) {
11993 rb_str_buf_cat(buf, p1, p - p1);
11994 }
11995 if (p < e) {
11996 if (rep) {
11997 rb_str_buf_cat(buf, rep, replen);
11998 }
11999 else {
12000 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12001 str_mod_check(str, sp, slen);
12002 repl = str_compat_and_valid(repl, enc);
12003 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12004 }
12005 }
12007 }
12008 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12009 return buf;
12010}
12011
12012/*
12013 * call-seq:
12014 * scrub(replacement_string = default_replacement_string) -> new_string
12015 * scrub{|sequence| ... } -> new_string
12016 *
12017 * :include: doc/string/scrub.rdoc
12018 *
12019 */
12020static VALUE
12021str_scrub(int argc, VALUE *argv, VALUE str)
12022{
12023 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12024 VALUE new = rb_str_scrub(str, repl);
12025 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12026}
12027
12028/*
12029 * call-seq:
12030 * scrub!(replacement_string = default_replacement_string) -> self
12031 * scrub!{|sequence| ... } -> self
12032 *
12033 * Like String#scrub, except that:
12034 *
12035 * - Any replacements are made in +self+.
12036 * - Returns +self+.
12037 *
12038 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12039 *
12040 */
12041static VALUE
12042str_scrub_bang(int argc, VALUE *argv, VALUE str)
12043{
12044 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12045 VALUE new = rb_str_scrub(str, repl);
12046 if (!NIL_P(new)) rb_str_replace(str, new);
12047 return str;
12048}
12049
12050static ID id_normalize;
12051static ID id_normalized_p;
12052static VALUE mUnicodeNormalize;
12053
12054static VALUE
12055unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12056{
12057 static int UnicodeNormalizeRequired = 0;
12058 VALUE argv2[2];
12059
12060 if (!UnicodeNormalizeRequired) {
12061 rb_require("unicode_normalize/normalize.rb");
12062 UnicodeNormalizeRequired = 1;
12063 }
12064 argv2[0] = str;
12065 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12066 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12067}
12068
12069/*
12070 * call-seq:
12071 * unicode_normalize(form = :nfc) -> string
12072 *
12073 * :include: doc/string/unicode_normalize.rdoc
12074 *
12075 */
12076static VALUE
12077rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12078{
12079 return unicode_normalize_common(argc, argv, str, id_normalize);
12080}
12081
12082/*
12083 * call-seq:
12084 * unicode_normalize!(form = :nfc) -> self
12085 *
12086 * Like String#unicode_normalize, except that the normalization
12087 * is performed on +self+ (not on a copy of +self+).
12088 *
12089 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12090 *
12091 */
12092static VALUE
12093rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12094{
12095 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12096}
12097
12098/* call-seq:
12099 * unicode_normalized?(form = :nfc) -> true or false
12100 *
12101 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12102 * see String#unicode_normalize.
12103 *
12104 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12105 *
12106 * Examples:
12107 *
12108 * "a\u0300".unicode_normalized? # => false
12109 * "a\u0300".unicode_normalized?(:nfd) # => true
12110 * "\u00E0".unicode_normalized? # => true
12111 * "\u00E0".unicode_normalized?(:nfd) # => false
12112 *
12113 *
12114 * Raises an exception if +self+ is not in a Unicode encoding:
12115 *
12116 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12117 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12118 *
12119 * Related: see {Querying}[rdoc-ref:String@Querying].
12120 */
12121static VALUE
12122rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12123{
12124 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12125}
12126
12127/**********************************************************************
12128 * Document-class: Symbol
12129 *
12130 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12131 *
12132 * You can create a +Symbol+ object explicitly with:
12133 *
12134 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12135 *
12136 * The same +Symbol+ object will be
12137 * created for a given name or string for the duration of a program's
12138 * execution, regardless of the context or meaning of that name. Thus
12139 * if <code>Fred</code> is a constant in one context, a method in
12140 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12141 * will be the same object in all three contexts.
12142 *
12143 * module One
12144 * class Fred
12145 * end
12146 * $f1 = :Fred
12147 * end
12148 * module Two
12149 * Fred = 1
12150 * $f2 = :Fred
12151 * end
12152 * def Fred()
12153 * end
12154 * $f3 = :Fred
12155 * $f1.object_id #=> 2514190
12156 * $f2.object_id #=> 2514190
12157 * $f3.object_id #=> 2514190
12158 *
12159 * Constant, method, and variable names are returned as symbols:
12160 *
12161 * module One
12162 * Two = 2
12163 * def three; 3 end
12164 * @four = 4
12165 * @@five = 5
12166 * $six = 6
12167 * end
12168 * seven = 7
12169 *
12170 * One.constants
12171 * # => [:Two]
12172 * One.instance_methods(true)
12173 * # => [:three]
12174 * One.instance_variables
12175 * # => [:@four]
12176 * One.class_variables
12177 * # => [:@@five]
12178 * global_variables.grep(/six/)
12179 * # => [:$six]
12180 * local_variables
12181 * # => [:seven]
12182 *
12183 * A +Symbol+ object differs from a String object in that
12184 * a +Symbol+ object represents an identifier, while a String object
12185 * represents text or data.
12186 *
12187 * == What's Here
12188 *
12189 * First, what's elsewhere. Class +Symbol+:
12190 *
12191 * - Inherits from {class Object}[rdoc-ref:Object@Whats+Here].
12192 * - Includes {module Comparable}[rdoc-ref:Comparable@Whats+Here].
12193 *
12194 * Here, class +Symbol+ provides methods that are useful for:
12195 *
12196 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12197 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12198 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12199 *
12200 * === Methods for Querying
12201 *
12202 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12203 * - #=~: Returns the index of the first substring in symbol that matches a
12204 * given Regexp or other object; returns +nil+ if no match is found.
12205 * - #[], #slice : Returns a substring of symbol
12206 * determined by a given index, start/length, or range, or string.
12207 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12208 * - #encoding: Returns the Encoding object that represents the encoding
12209 * of symbol.
12210 * - #end_with?: Returns +true+ if symbol ends with
12211 * any of the given strings.
12212 * - #match: Returns a MatchData object if symbol
12213 * matches a given Regexp; +nil+ otherwise.
12214 * - #match?: Returns +true+ if symbol
12215 * matches a given Regexp; +false+ otherwise.
12216 * - #length, #size: Returns the number of characters in symbol.
12217 * - #start_with?: Returns +true+ if symbol starts with
12218 * any of the given strings.
12219 *
12220 * === Methods for Comparing
12221 *
12222 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12223 * or larger than a given symbol.
12224 * - #==, #===: Returns +true+ if a given object is the same object
12225 * as symbol.
12226 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12227 * is smaller than, equal to, or larger than a given symbol.
12228 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12229 * after Unicode case folding; +false+ otherwise.
12230 *
12231 * === Methods for Converting
12232 *
12233 * - #capitalize: Returns symbol with the first character upcased
12234 * and all other characters downcased.
12235 * - #downcase: Returns symbol with all characters downcased.
12236 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12237 * - #name: Returns the frozen string corresponding to symbol.
12238 * - #succ, #next: Returns the symbol that is the successor to symbol.
12239 * - #swapcase: Returns symbol with all upcase characters downcased
12240 * and all downcase characters upcased.
12241 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12242 * - #to_s, #id2name: Returns the string corresponding to +self+.
12243 * - #to_sym, #intern: Returns +self+.
12244 * - #upcase: Returns symbol with all characters upcased.
12245 *
12246 */
12247
12248
12249/*
12250 * call-seq:
12251 * self == other -> true or false
12252 *
12253 * Returns whether +other+ is the same object as +self+.
12254 */
12255
12256#define sym_equal rb_obj_equal
12257
12258static int
12259sym_printable(const char *s, const char *send, rb_encoding *enc)
12260{
12261 while (s < send) {
12262 int n;
12263 int c = rb_enc_precise_mbclen(s, send, enc);
12264
12265 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12266 n = MBCLEN_CHARFOUND_LEN(c);
12267 c = rb_enc_mbc_to_codepoint(s, send, enc);
12268 if (!rb_enc_isprint(c, enc)) return FALSE;
12269 s += n;
12270 }
12271 return TRUE;
12272}
12273
12274int
12275rb_str_symname_p(VALUE sym)
12276{
12277 rb_encoding *enc;
12278 const char *ptr;
12279 long len;
12280 rb_encoding *resenc = rb_default_internal_encoding();
12281
12282 if (resenc == NULL) resenc = rb_default_external_encoding();
12283 enc = STR_ENC_GET(sym);
12284 ptr = RSTRING_PTR(sym);
12285 len = RSTRING_LEN(sym);
12286 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12287 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12288 return FALSE;
12289 }
12290 return TRUE;
12291}
12292
12293VALUE
12294rb_str_quote_unprintable(VALUE str)
12295{
12296 rb_encoding *enc;
12297 const char *ptr;
12298 long len;
12299 rb_encoding *resenc;
12300
12301 Check_Type(str, T_STRING);
12302 resenc = rb_default_internal_encoding();
12303 if (resenc == NULL) resenc = rb_default_external_encoding();
12304 enc = STR_ENC_GET(str);
12305 ptr = RSTRING_PTR(str);
12306 len = RSTRING_LEN(str);
12307 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12308 !sym_printable(ptr, ptr + len, enc)) {
12309 return rb_str_escape(str);
12310 }
12311 return str;
12312}
12313
12314VALUE
12315rb_id_quote_unprintable(ID id)
12316{
12317 VALUE str = rb_id2str(id);
12318 if (!rb_str_symname_p(str)) {
12319 return rb_str_escape(str);
12320 }
12321 return str;
12322}
12323
12324/*
12325 * call-seq:
12326 * inspect -> string
12327 *
12328 * Returns a string representation of +self+ (including the leading colon):
12329 *
12330 * :foo.inspect # => ":foo"
12331 *
12332 * Related: Symbol#to_s, Symbol#name.
12333 *
12334 */
12335
12336static VALUE
12337sym_inspect(VALUE sym)
12338{
12339 VALUE str = rb_sym2str(sym);
12340 const char *ptr;
12341 long len;
12342 char *dest;
12343
12344 if (!rb_str_symname_p(str)) {
12345 str = rb_str_inspect(str);
12346 len = RSTRING_LEN(str);
12347 rb_str_resize(str, len + 1);
12348 dest = RSTRING_PTR(str);
12349 memmove(dest + 1, dest, len);
12350 }
12351 else {
12352 rb_encoding *enc = STR_ENC_GET(str);
12353 VALUE orig_str = str;
12354
12355 len = RSTRING_LEN(orig_str);
12356 str = rb_enc_str_new(0, len + 1, enc);
12357
12358 // Get data pointer after allocation
12359 ptr = RSTRING_PTR(orig_str);
12360 dest = RSTRING_PTR(str);
12361 memcpy(dest + 1, ptr, len);
12362
12363 RB_GC_GUARD(orig_str);
12364 }
12365 dest[0] = ':';
12366
12368
12369 return str;
12370}
12371
12372VALUE
12374{
12375 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12376 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12377 return str;
12378}
12379
12380VALUE
12381rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12382{
12383 VALUE obj;
12384
12385 if (argc < 1) {
12386 rb_raise(rb_eArgError, "no receiver given");
12387 }
12388 obj = argv[0];
12389 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12390}
12391
12392/*
12393 * call-seq:
12394 * succ
12395 *
12396 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12397 *
12398 * :foo.succ # => :fop
12399 *
12400 * Related: String#succ.
12401 */
12402
12403static VALUE
12404sym_succ(VALUE sym)
12405{
12406 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12407}
12408
12409/*
12410 * call-seq:
12411 * self <=> other -> -1, 0, 1, or nil
12412 *
12413 * Compares +self+ and +other+, using String#<=>.
12414 *
12415 * Returns:
12416 *
12417 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12418 * - +nil+, otherwise.
12419 *
12420 * Examples:
12421 *
12422 * :bar <=> :foo # => -1
12423 * :foo <=> :foo # => 0
12424 * :foo <=> :bar # => 1
12425 * :foo <=> 'bar' # => nil
12426 *
12427 * \Class \Symbol includes module Comparable,
12428 * each of whose methods uses Symbol#<=> for comparison.
12429 *
12430 * Related: String#<=>.
12431 */
12432
12433static VALUE
12434sym_cmp(VALUE sym, VALUE other)
12435{
12436 if (!SYMBOL_P(other)) {
12437 return Qnil;
12438 }
12439 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12440}
12441
12442/*
12443 * call-seq:
12444 * casecmp(object) -> -1, 0, 1, or nil
12445 *
12446 * :include: doc/symbol/casecmp.rdoc
12447 *
12448 */
12449
12450static VALUE
12451sym_casecmp(VALUE sym, VALUE other)
12452{
12453 if (!SYMBOL_P(other)) {
12454 return Qnil;
12455 }
12456 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12457}
12458
12459/*
12460 * call-seq:
12461 * casecmp?(object) -> true, false, or nil
12462 *
12463 * :include: doc/symbol/casecmp_p.rdoc
12464 *
12465 */
12466
12467static VALUE
12468sym_casecmp_p(VALUE sym, VALUE other)
12469{
12470 if (!SYMBOL_P(other)) {
12471 return Qnil;
12472 }
12473 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12474}
12475
12476/*
12477 * call-seq:
12478 * self =~ other -> integer or nil
12479 *
12480 * Equivalent to <tt>self.to_s =~ other</tt>,
12481 * including possible updates to global variables;
12482 * see String#=~.
12483 *
12484 */
12485
12486static VALUE
12487sym_match(VALUE sym, VALUE other)
12488{
12489 return rb_str_match(rb_sym2str(sym), other);
12490}
12491
12492/*
12493 * call-seq:
12494 * match(pattern, offset = 0) -> matchdata or nil
12495 * match(pattern, offset = 0) {|matchdata| } -> object
12496 *
12497 * Equivalent to <tt>self.to_s.match</tt>,
12498 * including possible updates to global variables;
12499 * see String#match.
12500 *
12501 */
12502
12503static VALUE
12504sym_match_m(int argc, VALUE *argv, VALUE sym)
12505{
12506 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12507}
12508
12509/*
12510 * call-seq:
12511 * match?(pattern, offset) -> true or false
12512 *
12513 * Equivalent to <tt>sym.to_s.match?</tt>;
12514 * see String#match?.
12515 *
12516 */
12517
12518static VALUE
12519sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12520{
12521 return rb_str_match_m_p(argc, argv, sym);
12522}
12523
12524/*
12525 * call-seq:
12526 * self[offset] -> string or nil
12527 * self[offset, size] -> string or nil
12528 * self[range] -> string or nil
12529 * self[regexp, capture = 0] -> string or nil
12530 * self[substring] -> string or nil
12531 *
12532 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12533 *
12534 */
12535
12536static VALUE
12537sym_aref(int argc, VALUE *argv, VALUE sym)
12538{
12539 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12540}
12541
12542/*
12543 * call-seq:
12544 * length -> integer
12545 *
12546 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12547 */
12548
12549static VALUE
12550sym_length(VALUE sym)
12551{
12552 return rb_str_length(rb_sym2str(sym));
12553}
12554
12555/*
12556 * call-seq:
12557 * empty? -> true or false
12558 *
12559 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12560 *
12561 */
12562
12563static VALUE
12564sym_empty(VALUE sym)
12565{
12566 return rb_str_empty(rb_sym2str(sym));
12567}
12568
12569/*
12570 * call-seq:
12571 * upcase(mapping) -> symbol
12572 *
12573 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12574 *
12575 * See String#upcase.
12576 *
12577 */
12578
12579static VALUE
12580sym_upcase(int argc, VALUE *argv, VALUE sym)
12581{
12582 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12583}
12584
12585/*
12586 * call-seq:
12587 * downcase(mapping) -> symbol
12588 *
12589 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12590 *
12591 * See String#downcase.
12592 *
12593 * Related: Symbol#upcase.
12594 *
12595 */
12596
12597static VALUE
12598sym_downcase(int argc, VALUE *argv, VALUE sym)
12599{
12600 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12601}
12602
12603/*
12604 * call-seq:
12605 * capitalize(mapping) -> symbol
12606 *
12607 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12608 *
12609 * See String#capitalize.
12610 *
12611 */
12612
12613static VALUE
12614sym_capitalize(int argc, VALUE *argv, VALUE sym)
12615{
12616 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12617}
12618
12619/*
12620 * call-seq:
12621 * swapcase(mapping) -> symbol
12622 *
12623 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12624 *
12625 * See String#swapcase.
12626 *
12627 */
12628
12629static VALUE
12630sym_swapcase(int argc, VALUE *argv, VALUE sym)
12631{
12632 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12633}
12634
12635/*
12636 * call-seq:
12637 * start_with?(*string_or_regexp) -> true or false
12638 *
12639 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12640 *
12641 */
12642
12643static VALUE
12644sym_start_with(int argc, VALUE *argv, VALUE sym)
12645{
12646 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12647}
12648
12649/*
12650 * call-seq:
12651 * end_with?(*strings) -> true or false
12652 *
12653 *
12654 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12655 *
12656 */
12657
12658static VALUE
12659sym_end_with(int argc, VALUE *argv, VALUE sym)
12660{
12661 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12662}
12663
12664/*
12665 * call-seq:
12666 * encoding -> encoding
12667 *
12668 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12669 *
12670 */
12671
12672static VALUE
12673sym_encoding(VALUE sym)
12674{
12675 return rb_obj_encoding(rb_sym2str(sym));
12676}
12677
12678static VALUE
12679string_for_symbol(VALUE name)
12680{
12681 if (!RB_TYPE_P(name, T_STRING)) {
12682 VALUE tmp = rb_check_string_type(name);
12683 if (NIL_P(tmp)) {
12684 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12685 name);
12686 }
12687 name = tmp;
12688 }
12689 return name;
12690}
12691
12692ID
12694{
12695 if (SYMBOL_P(name)) {
12696 return SYM2ID(name);
12697 }
12698 name = string_for_symbol(name);
12699 return rb_intern_str(name);
12700}
12701
12702VALUE
12704{
12705 if (SYMBOL_P(name)) {
12706 return name;
12707 }
12708 name = string_for_symbol(name);
12709 return rb_str_intern(name);
12710}
12711
12712/*
12713 * call-seq:
12714 * Symbol.all_symbols -> array_of_symbols
12715 *
12716 * Returns an array of all symbols currently in Ruby's symbol table:
12717 *
12718 * Symbol.all_symbols.size # => 9334
12719 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12720 *
12721 */
12722
12723static VALUE
12724sym_all_symbols(VALUE _)
12725{
12726 return rb_sym_all_symbols();
12727}
12728
12729VALUE
12730rb_str_to_interned_str(VALUE str)
12731{
12732 return rb_fstring(str);
12733}
12734
12735VALUE
12736rb_interned_str(const char *ptr, long len)
12737{
12738 struct RString fake_str = {RBASIC_INIT};
12739 int encidx = ENCINDEX_US_ASCII;
12740 int coderange = ENC_CODERANGE_7BIT;
12741 if (len > 0 && search_nonascii(ptr, ptr + len)) {
12742 encidx = ENCINDEX_ASCII_8BIT;
12743 coderange = ENC_CODERANGE_VALID;
12744 }
12745 VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
12746 ENC_CODERANGE_SET(str, coderange);
12747 return register_fstring(str, true, false);
12748}
12749
12750VALUE
12752{
12753 return rb_interned_str(ptr, strlen(ptr));
12754}
12755
12756VALUE
12757rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12758{
12759 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12760 rb_enc_autoload(enc);
12761 }
12762
12763 struct RString fake_str = {RBASIC_INIT};
12764 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12765}
12766
12767VALUE
12768rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12769{
12770 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12771 rb_enc_autoload(enc);
12772 }
12773
12774 struct RString fake_str = {RBASIC_INIT};
12775 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12776 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12777 return str;
12778}
12779
12780VALUE
12782{
12783 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12784}
12785
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is ASCII-8BIT and the code is
 * in 0...0xff, the byte is appended directly with rb_str_buf_cat_byte();
 * every other case (including 0xff itself) is left to rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        /* The integer conversion only happens on the binary-string path. */
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int single_byte = (code >= 0 && code < 0xff);

        if (RB_LIKELY(single_byte)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12802
12803static int
12804fstring_set_class_i(VALUE *str, void *data)
12805{
12806 RBASIC_SET_CLASS(*str, rb_cString);
12807
12808 return ST_CONTINUE;
12809}
12810
/*
 * Init_String
 *
 * Registers the methods of String (on rb_cString) and of Symbol (on
 * rb_cSymbol).  Along the way it also:
 *
 * - runs fstring_set_class_i over fstring_table_obj, which sets the class
 *   of each visited entry to rb_cString;
 * - caches the option symbols :ascii, :turkic, :lithuanian and :fold;
 * - defines the UnicodeNormalize module and caches the IDs "normalize"
 *   and "normalized?";
 * - initializes rb_fs to nil and hooks the global variables "$;" and "$-F"
 *   to it.
 *
 * NOTE(review): the embedded listing numbers skip in several places
 * (e.g. 12814, 12818, 12823, 12826-12827, 12985-12988) and nothing visible
 * here assigns rb_cString or rb_cSymbol, so some statements appear to be
 * missing from this copy -- confirm against the upstream file.
 */
12811void
12812Init_String(void)
12813{
12815
    /* Set the class of the entries in fstring_table_obj (see fstring_set_class_i). */
12816 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12817
    /* String: allocator, singleton methods and instance methods. */
12819 rb_define_alloc_func(rb_cString, empty_str_alloc);
12820 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12821 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12822 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12824 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12825 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12828 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12829 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12830 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12831 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12834 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12835 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12836 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12837 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12840 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12841 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12842 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12843 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12844 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12846 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12848 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12849 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12850 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12851 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12852 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12853 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12854 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12855 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12856 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12857 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12858 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12859 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12860 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12861 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12863 rb_define_method(rb_cString, "+@", str_uplus, 0);
12864 rb_define_method(rb_cString, "-@", str_uminus, 0);
12865 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12866 rb_define_alias(rb_cString, "dedup", "-@");
12867
12868 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12869 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12870 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12871 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12874 rb_define_method(rb_cString, "undump", str_undump, 0);
12875
    /* Cache the option symbols :ascii, :turkic, :lithuanian and :fold in file-level variables. */
12876 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12877 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12878 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12879 sym_fold = ID2SYM(rb_intern_const("fold"));
12880
12881 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12882 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12883 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12884 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12885
12886 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12887 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12888 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12889 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12890
12891 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12892 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12893 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12894 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12895 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12896 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12897 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12898 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12899 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12900 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12901 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12902 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12904 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12905 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12906 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12907 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12908 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12909
12910 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12911 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12912 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12913
12914 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12915
12916 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12917 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12918 rb_define_method(rb_cString, "center", rb_str_center, -1);
12919
12920 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12921 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12922 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12923 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12924 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12925 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12926 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12927 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12928 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12929
12930 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12931 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12932 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12933 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12934 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12935 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12936 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12937 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12938 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12939
12940 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12941 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12942 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12943 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12944 rb_define_method(rb_cString, "count", rb_str_count, -1);
12945
12946 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12947 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12948 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12949 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12950
12951 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12952 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12953 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12954 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12955 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12956
12957 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12958
    /* "slice" is registered with the same function as "[]" above. */
12959 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12960 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12961
12962 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12963 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12964
12965 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12966 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12967 rb_define_method(rb_cString, "b", rb_str_b, 0);
12968 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12969 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12970
12971 /* define UnicodeNormalize module here so that we don't have to look it up */
12972 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12973 id_normalize = rb_intern_const("normalize");
12974 id_normalized_p = rb_intern_const("normalized?");
12975
12976 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12977 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12978 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12979
    /*
     * rb_fs starts out as nil; "$;" and "$-F" are both hooked to it with
     * rb_fs_setter, and its address is registered with the GC.
     */
12980 rb_fs = Qnil;
12981 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12982 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12983 rb_gc_register_address(&rb_fs);
12984
    /* Symbol: singleton method and instance methods. */
12989 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12990
12991 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12992 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12993 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12994 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12995 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12996 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12997
12998 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12999 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13000 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13001 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13002
13003 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13004 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13005 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13006 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13007 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13008 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13009 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13010
13011 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13012 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13013 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13014 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13015
13016 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13017 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13018
13019 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13020}
13021
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:696
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:404
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1803
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1596
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1709
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2965
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2775
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3255
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1017
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:3044
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:130
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:133
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:131
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:128
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:125
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:122
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:127
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:129
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:126
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:134
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:660
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3958
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1422
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1418
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1425
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1416
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1420
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:675
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2285
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2303
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1354
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3681
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:582
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:176
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1342
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3365
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of the character pointed to by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1325
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:930
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1190
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3046
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1209
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition string.c:12757
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:254
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2317
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3753
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1138
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1430
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1331
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:949
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition string.c:12781
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:814
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:762
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2714
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2977
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1742
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:697
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2030
VALUE rb_sym_all_symbols(void)
Collects every single one of the symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1098
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2036
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1950
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4227
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3725
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1926
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1735
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1495
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2470
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1584
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:946
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:940
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3818
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1406
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12373
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2543
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1382
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1729
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3074
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5363
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4181
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3171
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11694
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1783
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1771
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1682
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1172
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1533
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:984
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1501
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1979
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4167
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3586
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2406
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:1997
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1640
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1568
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6577
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3179
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1147
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12751
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1412
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1605
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3784
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3121
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4288
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3405
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7256
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2775
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12736
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4235
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4055
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4210
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1693
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3760
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3296
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5850
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11752
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1626
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1685
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:632
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2968
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argument String#slice.
Definition string.c:3268
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1657
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3387
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1184
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1550
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2729
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7363
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1394
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1701
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2420
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5765
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9370
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1178
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:975
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1833
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2024
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2103
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3457
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1705
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1031
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12703
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12693
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1865
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3505
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4471
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1424
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2945
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2794
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1418
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2807
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1762
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:119
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:514
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
union RString::@52::@53::@55 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:211
Definition string.c:8250
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:307
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113