Ruby 4.1.0dev (2026-04-07 revision fcd210086c82c4e4f2835561d7f7ce81e9edf1c5)
string.c (fcd210086c82c4e4f2835561d7f7ce81e9edf1c5)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Shorthand for entry `no` of the `beg` / `end` arrays reached through a
 * variable named `regs`; that variable must be in scope wherever these
 * macros are used (it is not a macro parameter). */
63#define BEG(no) (regs->beg[(no)])
64#define END(no) (regs->end[(no)])
65
/* Drop any macro definitions of the following names before this file uses
 * them as ordinary identifiers.
 * NOTE(review): presumably the public headers provide macro versions of
 * these functions -- confirm against the headers. */
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
94 * follow the header. When a string is not embedded, the contents are
95 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* NOTE(review): presumably an upper bound, in bytes, for a single character;
 * its users are not visible in this part of the file -- confirm. */
131#define RUBY_MAX_CHAR_LEN 16
/* RString flag bits; their meaning is described in the "Flags of RString"
 * comment above (bits 4, 5, 6, 7, 18 and 19 respectively). */
132#define STR_PRECOMPUTED_HASH FL_USER4
133#define STR_SHARED_ROOT FL_USER5
134#define STR_BORROWED FL_USER6
135#define STR_TMPLOCK FL_USER7
136#define STR_NOFREE FL_USER18
137#define STR_FAKESTR FL_USER19
138
/* STR_SET_NOEMBED: set the STR_NOEMBED flag on `str` and clear the
 * STR_SHARED, STR_SHARED_ROOT and STR_BORROWED flags.  Only flags are
 * touched; the heap fields must be filled in by the caller. */
139#define STR_SET_NOEMBED(str) do {\
140 FL_SET((str), STR_NOEMBED);\
141 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
142} while (0)
/* STR_SET_EMBED: clear STR_NOEMBED, STR_SHARED and STR_NOFREE on `str`.
 * Only flags are touched; no bytes are copied. */
143#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` into the `len` field of `str`.  No bounds check, no terminator
 * write and no flag change is performed here. */
145#define STR_SET_LEN(str, n) do { \
146 RSTRING(str)->len = (n); \
147} while (0)
148
/* TERM_LEN: terminator length for `str`: 1 when rb_str_enc_fastpath(str)
 * holds, otherwise rb_enc_mbminlen() of the string's encoding.
 * Note that `str` is evaluated more than once. */
149#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* TERM_FILL: write a terminator of `termlen` bytes at `ptr`.  The first
 * byte is always zeroed; when `termlen` is greater than 1 the whole
 * `termlen`-byte span is zero-filled.  Both arguments are evaluated
 * exactly once. */
150#define TERM_FILL(ptr, termlen) do {\
151 char *const term_fill_ptr = (ptr);\
152 const int term_fill_len = (termlen);\
153 *term_fill_ptr = '\0';\
154 if (UNLIKELY(term_fill_len > 1))\
155 memset(term_fill_ptr, 0, term_fill_len);\
156} while (0)
157
/* RESIZE_CAPA: resize the buffer of `str` to hold `capacity` bytes plus the
 * terminator of its current encoding (see RESIZE_CAPA_TERM). */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* RESIZE_CAPA_TERM: make room for `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise a heap buffer is allocated, the whole embedded area
 *   is copied into it, and the string is switched to the non-embedded
 *   representation with aux.capa = capacity.
 * - Heap string: the buffer is reallocated in place; the string must not
 *   be shared (asserted).
 *
 * Every use of `capacity` and `termlen` is parenthesized so that callers
 * may pass arbitrary expressions.  The arguments are evaluated more than
 * once. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
181
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Nothing is done when `str` carries STR_FAKESTR.  Otherwise:
 *  - (debug) assert that str's pointer lies inside shared_str's buffer,
 *  - store shared_str into str's aux.shared field through RB_OBJ_WRITE,
 *  - flag str as STR_SHARED and register it via rb_gc_register_pinning_obj,
 *  - flag shared_str as STR_SHARED_ROOT,
 *  - additionally flag shared_str as STR_BORROWED when its class is 0.
 * Both arguments are evaluated several times. */
182#define STR_SET_SHARED(str, shared_str) do { \
183 if (!FL_TEST(str, STR_FAKESTR)) { \
184 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
185 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
186 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
187 FL_SET((str), STR_SHARED); \
188 rb_gc_register_pinning_obj(str); \
189 FL_SET((shared_str), STR_SHARED_ROOT); \
190 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
191 FL_SET_RAW((shared_str), STR_BORROWED); \
192 } \
193} while (0)
194
/* Heap buffer pointer of a non-embedded string. */
195#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: aux.capa plus the terminator length,
 * because capa itself does not count the terminator. */
196#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
197/* TODO: include the terminator size in capa. */
198
/* Encoding object of `str`, obtained through get_encoding(). */
199#define STR_ENC_GET(str) get_encoding(str)
200
/* Return true when the first `n` bytes at `s` are all zero.
 * A non-positive `n` inspects nothing and yields true. */
static inline bool
zero_filled(const char *s, int n)
{
    int i = 0;

    while (i < n) {
        if (s[i] != '\0') {
            return false;
        }
        i++;
    }
    return true;
}
209
210#if !defined SHARABLE_MIDDLE_SUBSTRING
211# define SHARABLE_MIDDLE_SUBSTRING 0
212#endif
213
214static inline bool
215SHARABLE_SUBSTRING_P(VALUE str, long beg, long len)
216{
217#if SHARABLE_MIDDLE_SUBSTRING
218 return true;
219#else
220 long end = beg + len;
221 long source_len = RSTRING_LEN(str);
222 return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
223#endif
224}
225
226static inline long
227str_embed_capa(VALUE str)
228{
229 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
230}
231
232bool
233rb_str_reembeddable_p(VALUE str)
234{
235 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
236}
237
238static inline size_t
239rb_str_embed_size(long capa, long termlen)
240{
241 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
242 if (size < sizeof(struct RString)) size = sizeof(struct RString);
243 return size;
244}
245
246size_t
247rb_str_size_as_embedded(VALUE str)
248{
249 size_t real_size;
250 if (STR_EMBED_P(str)) {
251 size_t capa = RSTRING(str)->len;
252 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
253
254 real_size = rb_str_embed_size(capa, TERM_LEN(str));
255 }
256 /* if the string is not currently embedded, but it can be embedded, how
257 * much space would it require */
258 else if (rb_str_reembeddable_p(str)) {
259 size_t capa = RSTRING(str)->as.heap.aux.capa;
260 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
261
262 real_size = rb_str_embed_size(capa, TERM_LEN(str));
263 }
264 else {
265 real_size = sizeof(struct RString);
266 }
267
268 return real_size;
269}
270
/* True when an embedded string holding `len` bytes plus a `termlen`-byte
 * terminator has a size the GC reports as allocatable. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_bytes = rb_str_embed_size(len, termlen);

    return rb_gc_size_allocatable_p(object_bytes);
}
276
/* Forward declarations of file-local helpers that are used before their
 * definitions appear. */
277static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
278static VALUE str_new_frozen(VALUE klass, VALUE orig);
279static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
280static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
281static VALUE str_new(VALUE klass, const char *ptr, long len);
282static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
283static inline void str_modifiable(VALUE str);
284static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
285static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
286
287static inline void
288str_make_independent(VALUE str)
289{
290 long len = RSTRING_LEN(str);
291 int termlen = TERM_LEN(str);
292 str_make_independent_expand((str), len, 0L, termlen);
293}
294
295static inline int str_dependent_p(VALUE str);
296
297void
298rb_str_make_independent(VALUE str)
299{
300 if (str_dependent_p(str)) {
301 str_make_independent(str);
302 }
303}
304
305void
306rb_str_make_embedded(VALUE str)
307{
308 RUBY_ASSERT(rb_str_reembeddable_p(str));
309 RUBY_ASSERT(!STR_EMBED_P(str));
310
311 int termlen = TERM_LEN(str);
312 char *buf = RSTRING(str)->as.heap.ptr;
313 long old_capa = RSTRING(str)->as.heap.aux.capa + termlen;
314 long len = RSTRING(str)->len;
315
316 STR_SET_EMBED(str);
317 STR_SET_LEN(str, len);
318
319 if (len > 0) {
320 memcpy(RSTRING_PTR(str), buf, len);
321 SIZED_FREE_N(buf, old_capa);
322 }
323
324 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
325}
326
/* Debug aid: print a warning to stderr saying that the function named
 * `func` is about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
336
337/* symbols for [up|down|swap]case/capitalize options */
338static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
339
340static rb_encoding *
341get_encoding(VALUE str)
342{
343 return rb_enc_from_index(ENCODING_GET(str));
344}
345
346static void
347mustnot_broken(VALUE str)
348{
349 if (is_broken_string(str)) {
350 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
351 }
352}
353
354static void
355mustnot_wchar(VALUE str)
356{
357 rb_encoding *enc = STR_ENC_GET(str);
358 if (rb_enc_mbminlen(enc) > 1) {
359 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
360 }
361}
362
363static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
364
/* PRECOMPUTED_FAKESTR_HASH is defined only when `long` and pointers have
 * the same size.  Code guarded by it stores a precomputed hash value in
 * the `capa` field (a long) of fake strings.  The #else branch is
 * intentionally empty. */
365#if SIZEOF_LONG == SIZEOF_VOIDP
366#define PRECOMPUTED_FAKESTR_HASH 1
367#else
368#endif
369
370static inline bool
371BARE_STRING_P(VALUE str)
372{
373 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
374}
375
376static inline st_index_t
377str_do_hash(VALUE str)
378{
379 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
380 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
381 if (e && !is_ascii_string(str)) {
382 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
383 }
384 return h;
385}
386
387static VALUE
388str_store_precomputed_hash(VALUE str, st_index_t hash)
389{
390 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
391 RUBY_ASSERT(STR_EMBED_P(str));
392
393#if RUBY_DEBUG
394 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
395 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
397#endif
398
399 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
400
401 FL_SET(str, STR_PRECOMPUTED_HASH);
402
403 return str;
404}
405
406VALUE
407rb_fstring(VALUE str)
408{
409 VALUE fstr;
410 int bare;
411
412 Check_Type(str, T_STRING);
413
414 if (FL_TEST(str, RSTRING_FSTR))
415 return str;
416
417 bare = BARE_STRING_P(str);
418 if (!bare) {
419 if (STR_EMBED_P(str)) {
420 OBJ_FREEZE(str);
421 return str;
422 }
423
424 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
426 return str;
427 }
428 }
429
430 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
431 rb_str_resize(str, RSTRING_LEN(str));
432
433 fstr = register_fstring(str, false, false);
434
435 if (!bare) {
436 str_replace_shared_without_enc(str, fstr);
437 OBJ_FREEZE(str);
438 return str;
439 }
440 return fstr;
441}
442
443static VALUE fstring_table_obj;
444
445static VALUE
446fstring_concurrent_set_hash(VALUE str)
447{
448#ifdef PRECOMPUTED_FAKESTR_HASH
449 st_index_t h;
450 if (FL_TEST_RAW(str, STR_FAKESTR)) {
451 // register_fstring precomputes the hash and stores it in capa for fake strings
452 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
453 }
454 else {
455 h = rb_str_hash(str);
456 }
457 // rb_str_hash doesn't include the encoding for ascii only strings, so
458 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
459 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
460#else
461 return (VALUE)rb_str_hash(str);
462#endif
463}
464
465static bool
466fstring_concurrent_set_cmp(VALUE a, VALUE b)
467{
468 long alen, blen;
469 const char *aptr, *bptr;
470
473
474 RSTRING_GETMEM(a, aptr, alen);
475 RSTRING_GETMEM(b, bptr, blen);
476 return (alen == blen &&
477 ENCODING_GET(a) == ENCODING_GET(b) &&
478 memcmp(aptr, bptr, alen) == 0);
479}
480
/* Options passed from register_fstring() to the table's create callback
 * (fstring_concurrent_set_create). */
struct fstr_create_arg {
    /* for fake strings: copy the bytes instead of pointing at them */
    bool copy;
    /* for copied fake strings: prefer an embedded allocation with room
     * for the precomputed hash */
    bool force_precompute_hash;
};
485
/* Create callback of the fstring table: build the object that gets stored
 * in the table for the lookup key `str`.
 *
 * `data` points to a struct fstr_create_arg.
 *  - Fake string key with arg->copy: a new String holding a copy of the
 *    bytes is created (embedded, with the hash stored behind the
 *    terminator, when force_precompute_hash is set and the size is
 *    embeddable).
 *  - Fake string key without arg->copy: a static string pointing at the
 *    same bytes is created.
 *  - Real string key: a frozen, unshared, bare copy is produced as needed.
 * The result gets the coderange of the key, the RSTRING_FSTR flag, and is
 * marked shareable. */
486static VALUE
487fstring_concurrent_set_create(VALUE str, void *data)
488{
489 struct fstr_create_arg *arg = data;
490
491 // Unless the string is empty or binary, its coderange has been precomputed.
492 int coderange = ENC_CODERANGE(str);
493
494 if (FL_TEST_RAW(str, STR_FAKESTR)) {
495 if (arg->copy) {
496 VALUE new_str;
497 long len = RSTRING_LEN(str);
// room for the contents plus a precomputed hash
498 long capa = len + sizeof(st_index_t);
499 int term_len = TERM_LEN(str);
500
501 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
502 new_str = str_alloc_embed(rb_cString, capa + term_len);
503 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
504 STR_SET_LEN(new_str, RSTRING_LEN(str));
505 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
506 rb_enc_copy(new_str, str);
507 str_store_precomputed_hash(new_str, str_do_hash(str));
508 }
509 else {
510 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
511 rb_enc_copy(new_str, str);
512#ifdef PRECOMPUTED_FAKESTR_HASH
// store the hash only if the new string happens to have room for it;
// for fake strings the hash was placed in capa by register_fstring
513 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
514 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
515 }
516#endif
517 }
518 str = new_str;
519 }
520 else {
// no copy requested: point a static string at the caller's bytes
521 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
522 RSTRING(str)->len,
523 ENCODING_GET(str));
524 }
525 OBJ_FREEZE(str);
526 }
527 else {
528 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
529 str = str_new_frozen(rb_cString, str);
530 }
531 if (STR_SHARED_P(str)) { /* str should not be shared */
532 /* shared substring */
533 str_make_independent(str);
535 }
536 if (!BARE_STRING_P(str)) {
537 str = str_new_frozen(rb_cString, str);
538 }
539 }
540
// carry over the coderange read from the key at the top
541 ENC_CODERANGE_SET(str, coderange);
542 RBASIC(str)->flags |= RSTRING_FSTR;
543 if (!RB_OBJ_SHAREABLE_P(str)) {
544 RB_OBJ_SET_SHAREABLE(str);
545 }
546 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
549 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
550 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
552 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
553
554 return str;
555}
556
/* Callback table handed to rb_concurrent_set_new() for the fstring table:
 * hashing, equality and entry creation are defined above; no free
 * callback is installed. */
557static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
558 .hash = fstring_concurrent_set_hash,
559 .cmp = fstring_concurrent_set_cmp,
560 .create = fstring_concurrent_set_create,
561 .free = NULL,
562};
563
564void
565Init_fstring_table(void)
566{
567 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
568 rb_gc_register_address(&fstring_table_obj);
569}
570
571static VALUE
572register_fstring(VALUE str, bool copy, bool force_precompute_hash)
573{
574 struct fstr_create_arg args = {
575 .copy = copy,
576 .force_precompute_hash = force_precompute_hash
577 };
578
579#if SIZEOF_VOIDP == SIZEOF_LONG
580 if (FL_TEST_RAW(str, STR_FAKESTR)) {
581 // if the string hasn't been interned, we'll need the hash twice, so we
582 // compute it once and store it in capa
583 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
584 }
585#endif
586
587 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
588
589 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
591 RUBY_ASSERT(OBJ_FROZEN(result));
593 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
594 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
596
597 return result;
598}
599
600bool
601rb_obj_is_fstring_table(VALUE obj)
602{
603 ASSERT_vm_locking();
604
605 return obj == fstring_table_obj;
606}
607
/* Remove the fstring `obj` from the fstring table and clear its
 * RSTRING_FSTR flag.
 * NOTE(review): by its name this is called by the GC when the string is
 * reclaimed -- confirm against the callers. */
608void
609rb_gc_free_fstring(VALUE obj)
610{
611 ASSERT_vm_locking_with_barrier();
612
/* only unshared fstrings are expected here */
613 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
615 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
616
/* delete by object identity, not by string contents */
617 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
618
619 RB_DEBUG_COUNTER_INC(obj_str_fstr);
620
621 FL_UNSET(obj, RSTRING_FSTR);
622}
623
624void
625rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
626{
627 if (fstring_table_obj) {
628 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
629 }
630}
631
632static VALUE
633setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
634{
635 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
636 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
637
638 if (!name) {
640 name = "";
641 }
642
643 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
644
645 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
646 fake_str->len = len;
647 fake_str->as.heap.ptr = (char *)name;
648 fake_str->as.heap.aux.capa = len;
649 return (VALUE)fake_str;
650}
651
652/*
653 * set up a fake string which refers a static string literal.
654 */
655VALUE
656rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
657{
658 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
659}
660
661/*
662 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
663 * shared string which refers a static string literal. `ptr` must
664 * point a constant string.
665 */
666VALUE
667rb_fstring_new(const char *ptr, long len)
668{
669 struct RString fake_str = {RBASIC_INIT};
670 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
671}
672
673VALUE
674rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
675{
676 struct RString fake_str = {RBASIC_INIT};
677 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
678}
679
680VALUE
681rb_fstring_cstr(const char *ptr)
682{
683 return rb_fstring_new(ptr, strlen(ptr));
684}
685
686static inline bool
687single_byte_optimizable(VALUE str)
688{
689 int encindex = ENCODING_GET(str);
690 switch (encindex) {
691 case ENCINDEX_ASCII_8BIT:
692 case ENCINDEX_US_ASCII:
693 return true;
694 case ENCINDEX_UTF_8:
695 // For UTF-8 it's worth scanning the string coderange when unknown.
697 }
698 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
699 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
700 return true;
701 }
702
703 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
704 return true;
705 }
706
707 /* Conservative. Possibly single byte.
708 * "\xa1" in Shift_JIS for example. */
709 return false;
710}
711
713
/* Find the first byte in [p, e) that has its high bit (0x80) set.
 *
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * Strategy: (1) when word access must be aligned, test the leading bytes
 * one by one until `p` is word aligned; (2) test one machine word at a
 * time against NONASCII_MASK; (3) test the remaining tail bytes one by
 * one.  The `case` labels in both switches fall through on purpose. */
714static inline const char *
715search_nonascii(const char *p, const char *e)
716{
717 const char *s, *t;
718
/* NONASCII_MASK: a word with the high bit of every byte set */
719#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
720# if SIZEOF_UINTPTR_T == 8
721# define NONASCII_MASK UINT64_C(0x8080808080808080)
722# elif SIZEOF_UINTPTR_T == 4
723# define NONASCII_MASK UINT32_C(0x80808080)
724# else
725# error "don't know what to do."
726# endif
727#else
728# if SIZEOF_UINTPTR_T == 8
729# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
730# elif SIZEOF_UINTPTR_T == 4
731# define NONASCII_MASK 0x80808080UL /* or...? */
732# else
733# error "don't know what to do."
734# endif
735#endif
736
/* word-wise scan only when unaligned access is allowed or at least one
 * word's worth of bytes is present */
737 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
738#if !UNALIGNED_WORD_ACCESS
739 if ((uintptr_t)p % SIZEOF_VOIDP) {
/* l = number of bytes up to the next word boundary; p is advanced
 * first and the skipped bytes are then tested via negative offsets */
740 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
741 p += l;
742 switch (l) {
743 default: UNREACHABLE;
744#if SIZEOF_VOIDP > 4
745 case 7: if (p[-7]&0x80) return p-7;
746 case 6: if (p[-6]&0x80) return p-6;
747 case 5: if (p[-5]&0x80) return p-5;
748 case 4: if (p[-4]&0x80) return p-4;
749#endif
750 case 3: if (p[-3]&0x80) return p-3;
751 case 2: if (p[-2]&0x80) return p-2;
752 case 1: if (p[-1]&0x80) return p-1;
753 case 0: break;
754 }
755 }
756#endif
757#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
758#define aligned_ptr(value) \
759 __builtin_assume_aligned((value), sizeof(uintptr_t))
760#else
761#define aligned_ptr(value) (value)
762#endif
763 s = aligned_ptr(p);
/* t: the loop below runs while a whole word still fits before e */
764 t = (e - (SIZEOF_VOIDP-1));
765#undef aligned_ptr
766 for (;s < t; s += sizeof(uintptr_t)) {
767 uintptr_t word;
/* memcpy instead of a pointer cast to load the word */
768 memcpy(&word, s, sizeof(word));
769 if (word & NONASCII_MASK) {
/* bit position of the first flagged byte, divided by 8, gives its
 * byte offset within the word (which end counts as "first" depends
 * on endianness) */
770#ifdef WORDS_BIGENDIAN
771 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
772#else
773 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
774#endif
775 }
776 }
777 p = (const char *)s;
778 }
779
/* tail: fewer than one word of bytes remains; test them via negative
 * offsets from e */
780 switch (e - p) {
781 default: UNREACHABLE;
782#if SIZEOF_VOIDP > 4
783 case 7: if (e[-7]&0x80) return e-7;
784 case 6: if (e[-6]&0x80) return e-6;
785 case 5: if (e[-5]&0x80) return e-5;
786 case 4: if (e[-4]&0x80) return e-4;
787#endif
788 case 3: if (e[-3]&0x80) return e-3;
789 case 2: if (e[-2]&0x80) return e-2;
790 case 1: if (e[-1]&0x80) return e-1;
791 case 0: return NULL;
792 }
793}
794
795static int
796coderange_scan(const char *p, long len, rb_encoding *enc)
797{
798 const char *e = p + len;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 p = search_nonascii(p, e);
804 }
805
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
808 if (!p) return ENC_CODERANGE_7BIT;
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p += MBCLEN_CHARFOUND_LEN(ret);
813 if (p == e) break;
814 p = search_nonascii(p, e);
815 if (!p) break;
816 }
817 }
818 else {
819 while (p < e) {
820 int ret = rb_enc_precise_mbclen(p, e, enc);
822 p += MBCLEN_CHARFOUND_LEN(ret);
823 }
824 }
825 return ENC_CODERANGE_VALID;
826}
827
828long
829rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
830{
831 const char *p = s;
832
833 if (*cr == ENC_CODERANGE_BROKEN)
834 return e - s;
835
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
837 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
838 if (*cr == ENC_CODERANGE_VALID) return e - s;
839 p = search_nonascii(p, e);
841 return e - s;
842 }
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
845 if (!p) {
846 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
847 return e - s;
848 }
849 for (;;) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 if (p == e) break;
857 p = search_nonascii(p, e);
858 if (!p) break;
859 }
860 }
861 else {
862 while (p < e) {
863 int ret = rb_enc_precise_mbclen(p, e, enc);
864 if (!MBCLEN_CHARFOUND_P(ret)) {
866 return p - s;
867 }
868 p += MBCLEN_CHARFOUND_LEN(ret);
869 }
870 }
872 return e - s;
873}
874
875static inline void
876str_enc_copy(VALUE str1, VALUE str2)
877{
878 rb_enc_set_index(str1, ENCODING_GET(str2));
879}
880
881/* Like str_enc_copy, but does not check frozen status of str1.
882 * You should use this only if you're certain that str1 is not frozen. */
883static inline void
884str_enc_copy_direct(VALUE str1, VALUE str2)
885{
886 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
887 if (inlined_encoding == ENCODING_INLINE_MAX) {
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
889 }
890 else {
891 ENCODING_SET_INLINED(str1, inlined_encoding);
892 }
893}
894
895static void
896rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
897{
898 /* this function is designed for copying encoding and coderange
899 * from src to new string "dest" which is made from the part of src.
900 */
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
905 else
907 return;
908 }
909 switch (ENC_CODERANGE(src)) {
912 break;
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
917 else
919 break;
920 default:
921 break;
922 }
923}
924
925static void
926rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
927{
928 str_enc_copy(dest, src);
930}
931
932static int
933enc_coderange_scan(VALUE str, rb_encoding *enc)
934{
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
936}
937
938int
939rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
940{
941 return enc_coderange_scan(str, enc);
942}
943
944int
946{
947 int cr = ENC_CODERANGE(str);
948
949 if (cr == ENC_CODERANGE_UNKNOWN) {
950 cr = enc_coderange_scan(str, get_encoding(str));
951 ENC_CODERANGE_SET(str, cr);
952 }
953 return cr;
954}
955
956static inline bool
957rb_enc_str_asciicompat(VALUE str)
958{
959 int encindex = ENCODING_GET_INLINED(str);
960 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
961}
962
963int
965{
966 switch(ENC_CODERANGE(str)) {
968 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
970 return true;
971 default:
972 return false;
973 }
974}
975
976static inline void
977str_mod_check(VALUE s, const char *p, long len)
978{
979 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
980 rb_raise(rb_eRuntimeError, "string modified");
981 }
982}
983
984static size_t
985str_capacity(VALUE str, const int termlen)
986{
987 if (STR_EMBED_P(str)) {
988 return str_embed_capa(str) - termlen;
989 }
990 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
991 return RSTRING(str)->len;
992 }
993 else {
994 return RSTRING(str)->as.heap.aux.capa;
995 }
996}
997
998size_t
1000{
1001 return str_capacity(str, TERM_LEN(str));
1002}
1003
1004static inline void
1005must_not_null(const char *ptr)
1006{
1007 if (!ptr) {
1008 rb_raise(rb_eArgError, "NULL pointer given");
1009 }
1010}
1011
1012static inline VALUE
1013str_alloc_embed(VALUE klass, size_t capa)
1014{
1015 size_t size = rb_str_embed_size(capa, 0);
1016 RUBY_ASSERT(size > 0);
1017 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1018
1019 NEWOBJ_OF(str, struct RString, klass,
1021
1022 str->len = 0;
1023 str->as.embed.ary[0] = 0;
1024
1025 return (VALUE)str;
1026}
1027
1028static inline VALUE
1029str_alloc_heap(VALUE klass)
1030{
1031 NEWOBJ_OF(str, struct RString, klass,
1032 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1033
1034 str->len = 0;
1035 str->as.heap.aux.capa = 0;
1036 str->as.heap.ptr = NULL;
1037
1038 return (VALUE)str;
1039}
1040
1041static inline VALUE
1042empty_str_alloc(VALUE klass)
1043{
1044 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1045 VALUE str = str_alloc_embed(klass, 0);
1046 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1048 return str;
1049}
1050
1051static VALUE
1052str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1053{
1054 VALUE str;
1055
1056 if (len < 0) {
1057 rb_raise(rb_eArgError, "negative string size (or size too big)");
1058 }
1059
1060 if (enc == NULL) {
1061 enc = rb_ascii8bit_encoding();
1062 }
1063
1064 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1065
1066 int termlen = rb_enc_mbminlen(enc);
1067
1068 if (STR_EMBEDDABLE_P(len, termlen)) {
1069 str = str_alloc_embed(klass, len + termlen);
1070 if (len == 0) {
1071 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1072 }
1073 }
1074 else {
1075 str = str_alloc_heap(klass);
1076 RSTRING(str)->as.heap.aux.capa = len;
1077 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1078 * integer overflow. If we can STATIC_ASSERT that, the following
1079 * mul_add_mul can be reverted to a simple ALLOC_N. */
1080 RSTRING(str)->as.heap.ptr =
1081 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1082 }
1083
1084 rb_enc_raw_set(str, enc);
1085
1086 if (ptr) {
1087 memcpy(RSTRING_PTR(str), ptr, len);
1088 }
1089 else {
1090 memset(RSTRING_PTR(str), 0, len);
1091 }
1092
1093 STR_SET_LEN(str, len);
1094 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1095 return str;
1096}
1097
1098static VALUE
1099str_new(VALUE klass, const char *ptr, long len)
1100{
1101 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1102}
1103
1104VALUE
1105rb_str_new(const char *ptr, long len)
1106{
1107 return str_new(rb_cString, ptr, len);
1108}
1109
1110VALUE
1111rb_usascii_str_new(const char *ptr, long len)
1112{
1113 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1114}
1115
1116VALUE
1117rb_utf8_str_new(const char *ptr, long len)
1118{
1119 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1120}
1121
1122VALUE
1123rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1124{
1125 return str_enc_new(rb_cString, ptr, len, enc);
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1133 * memory regions, and that cannot be detected by the MSAN. Just
1134 * trust the programmer that the argument passed here is a sane C
1135 * string. */
1136 __msan_unpoison_string(ptr);
1137 return rb_str_new(ptr, strlen(ptr));
1138}
1139
1140VALUE
1142{
1143 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1144}
1145
1146VALUE
1148{
1149 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1150}
1151
1152VALUE
1154{
1155 must_not_null(ptr);
1156 if (rb_enc_mbminlen(enc) != 1) {
1157 rb_raise(rb_eArgError, "wchar encoding given");
1158 }
1159 return rb_enc_str_new(ptr, strlen(ptr), enc);
1160}
1161
1162static VALUE
1163str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1164{
1165 VALUE str;
1166
1167 if (len < 0) {
1168 rb_raise(rb_eArgError, "negative string size (or size too big)");
1169 }
1170
1171 if (!ptr) {
1172 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1173 }
1174 else {
1175 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1176 str = str_alloc_heap(klass);
1177 RSTRING(str)->len = len;
1178 RSTRING(str)->as.heap.ptr = (char *)ptr;
1179 RSTRING(str)->as.heap.aux.capa = len;
1180 RBASIC(str)->flags |= STR_NOFREE;
1181 rb_enc_associate_index(str, encindex);
1182 }
1183 return str;
1184}
1185
1186VALUE
1187rb_str_new_static(const char *ptr, long len)
1188{
1189 return str_new_static(rb_cString, ptr, len, 0);
1190}
1191
1192VALUE
1194{
1195 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1196}
1197
1198VALUE
1200{
1201 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1202}
1203
1204VALUE
1206{
1207 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1208}
1209
1210static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1211 rb_encoding *from, rb_encoding *to,
1212 int ecflags, VALUE ecopts);
1213
1214static inline bool
1215is_enc_ascii_string(VALUE str, rb_encoding *enc)
1216{
1217 int encidx = rb_enc_to_index(enc);
1218 if (rb_enc_get_index(str) == encidx)
1219 return is_ascii_string(str);
1220 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1221}
1222
1223VALUE
1224rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1225{
1226 long len;
1227 const char *ptr;
1228 VALUE newstr;
1229
1230 if (!to) return str;
1231 if (!from) from = rb_enc_get(str);
1232 if (from == to) return str;
1233 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1234 rb_is_ascii8bit_enc(to)) {
1235 if (STR_ENC_GET(str) != to) {
1236 str = rb_str_dup(str);
1237 rb_enc_associate(str, to);
1238 }
1239 return str;
1240 }
1241
1242 RSTRING_GETMEM(str, ptr, len);
1243 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1244 from, to, ecflags, ecopts);
1245 if (NIL_P(newstr)) {
1246 /* some error, return original */
1247 return str;
1248 }
1249 return newstr;
1250}
1251
1252VALUE
1253rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1254 rb_encoding *from, int ecflags, VALUE ecopts)
1255{
1256 long olen;
1257
1258 olen = RSTRING_LEN(newstr);
1259 if (ofs < -olen || olen < ofs)
1260 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1261 if (ofs < 0) ofs += olen;
1262 if (!from) {
1263 STR_SET_LEN(newstr, ofs);
1264 return rb_str_cat(newstr, ptr, len);
1265 }
1266
1267 rb_str_modify(newstr);
1268 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1269 rb_enc_get(newstr),
1270 ecflags, ecopts);
1271}
1272
1273VALUE
1274rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1275{
1276 STR_SET_LEN(str, 0);
1277 rb_enc_associate(str, enc);
1278 rb_str_cat(str, ptr, len);
1279 return str;
1280}
1281
1282static VALUE
1283str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1284 rb_encoding *from, rb_encoding *to,
1285 int ecflags, VALUE ecopts)
1286{
1287 rb_econv_t *ec;
1289 long olen;
1290 VALUE econv_wrapper;
1291 const unsigned char *start, *sp;
1292 unsigned char *dest, *dp;
1293 size_t converted_output = (size_t)ofs;
1294
1295 olen = rb_str_capacity(newstr);
1296
1297 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1298 RBASIC_CLEAR_CLASS(econv_wrapper);
1299 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1300 if (!ec) return Qnil;
1301 DATA_PTR(econv_wrapper) = ec;
1302
1303 sp = (unsigned char*)ptr;
1304 start = sp;
1305 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1306 (dp = dest + converted_output),
1307 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1309 /* destination buffer short */
1310 size_t converted_input = sp - start;
1311 size_t rest = len - converted_input;
1312 converted_output = dp - dest;
1313 rb_str_set_len(newstr, converted_output);
1314 if (converted_input && converted_output &&
1315 rest < (LONG_MAX / converted_output)) {
1316 rest = (rest * converted_output) / converted_input;
1317 }
1318 else {
1319 rest = olen;
1320 }
1321 olen += rest < 2 ? 2 : rest;
1322 rb_str_resize(newstr, olen);
1323 }
1324 DATA_PTR(econv_wrapper) = 0;
1325 RB_GC_GUARD(econv_wrapper);
1326 rb_econv_close(ec);
1327 switch (ret) {
1328 case econv_finished:
1329 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1330 rb_str_set_len(newstr, len);
1331 rb_enc_associate(newstr, to);
1332 return newstr;
1333
1334 default:
1335 return Qnil;
1336 }
1337}
1338
1339VALUE
1341{
1342 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1343}
1344
1345VALUE
1347{
1348 rb_encoding *ienc;
1349 VALUE str;
1350 const int eidx = rb_enc_to_index(eenc);
1351
1352 if (!ptr) {
1353 return rb_enc_str_new(ptr, len, eenc);
1354 }
1355
1356 /* ASCII-8BIT case, no conversion */
1357 if ((eidx == rb_ascii8bit_encindex()) ||
1358 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1359 return rb_str_new(ptr, len);
1360 }
1361 /* no default_internal or same encoding, no conversion */
1362 ienc = rb_default_internal_encoding();
1363 if (!ienc || eenc == ienc) {
1364 return rb_enc_str_new(ptr, len, eenc);
1365 }
1366 /* ASCII compatible, and ASCII only string, no conversion in
1367 * default_internal */
1368 if ((eidx == rb_ascii8bit_encindex()) ||
1369 (eidx == rb_usascii_encindex()) ||
1370 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1371 return rb_enc_str_new(ptr, len, ienc);
1372 }
1373 /* convert from the given encoding to default_internal */
1374 str = rb_enc_str_new(NULL, 0, ienc);
1375 /* when the conversion failed for some reason, just ignore the
1376 * default_internal and result in the given encoding as-is. */
1377 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1378 rb_str_initialize(str, ptr, len, eenc);
1379 }
1380 return str;
1381}
1382
1383VALUE
1384rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1385{
1386 int eidx = rb_enc_to_index(eenc);
1387 if (eidx == rb_usascii_encindex() &&
1388 !is_ascii_string(str)) {
1389 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1390 return str;
1391 }
1392 rb_enc_associate_index(str, eidx);
1393 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1394}
1395
1396VALUE
1397rb_external_str_new(const char *ptr, long len)
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1406}
1407
1408VALUE
1409rb_locale_str_new(const char *ptr, long len)
1410{
1411 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1424}
1425
1426VALUE
1428{
1429 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1430}
1431
1432VALUE
1434{
1435 return rb_str_export_to_enc(str, rb_default_external_encoding());
1436}
1437
1438VALUE
1440{
1441 return rb_str_export_to_enc(str, rb_locale_encoding());
1442}
1443
1444VALUE
1446{
1447 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1448}
1449
/*
 * Makes str2 hold the same bytes as str without touching str2's encoding.
 * If the bytes plus terminator fit in str2's embedded area they are copied
 * there; otherwise str2 is pointed at a frozen root's buffer and marked as
 * sharing that root.  Returns str2.
 */
1450static VALUE
1451str_replace_shared_without_enc(VALUE str2, VALUE str)
1452{
1453 const int termlen = TERM_LEN(str);
1454 char *ptr;
1455 long len;
1456
1457 RSTRING_GETMEM(str, ptr, len);
/* Small enough: copy into str2's embedded storage and terminate it. */
1458 if (str_embed_capa(str2) >= len + termlen) {
1459 char *ptr2 = RSTRING(str2)->as.embed.ary;
1460 STR_SET_EMBED(str2);
1461 memcpy(ptr2, RSTRING_PTR(str), len);
1462 TERM_FILL(ptr2+len, termlen);
1463 }
1464 else {
1465 VALUE root;
/* Pick the root to share: str's existing root, or a new frozen copy of str. */
1466 if (STR_SHARED_P(str)) {
1467 root = RSTRING(str)->as.heap.aux.shared;
1468 RSTRING_GETMEM(str, ptr, len);
1469 }
1470 else {
1471 root = rb_str_new_frozen(str);
/* ptr/len are re-read from the root, which is what str2 will point at. */
1472 RSTRING_GETMEM(root, ptr, len);
1473 }
1474 RUBY_ASSERT(OBJ_FROZEN(root));
1475
/* str2 has its own heap buffer (not embedded, not shared, not NOFREE):
 * release it unless it is the very buffer about to be installed. */
1476 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1477 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1478 rb_fatal("about to free a possible shared root");
1479 }
1480 char *ptr2 = STR_HEAP_PTR(str2);
1481 if (ptr2 != ptr) {
1482 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1483 }
1484 }
1485 FL_SET(str2, STR_NOEMBED);
1486 RSTRING(str2)->as.heap.ptr = ptr;
1487 STR_SET_SHARED(str2, root);
1488 }
1489
1490 STR_SET_LEN(str2, len);
1491
1492 return str2;
1493}
1494
1495static VALUE
1496str_replace_shared(VALUE str2, VALUE str)
1497{
1498 str_replace_shared_without_enc(str2, str);
1499 rb_enc_cr_str_exact_copy(str2, str);
1500 return str2;
1501}
1502
1503static VALUE
1504str_new_shared(VALUE klass, VALUE str)
1505{
1506 return str_replace_shared(str_alloc_heap(klass), str);
1507}
1508
1509VALUE
1511{
1512 return str_new_shared(rb_obj_class(str), str);
1513}
1514
1515VALUE
1517{
1518 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1519 return str_new_frozen(rb_obj_class(orig), orig);
1520}
1521
1522static VALUE
1523rb_str_new_frozen_String(VALUE orig)
1524{
1525 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1526 return str_new_frozen(rb_cString, orig);
1527}
1528
1529
1530VALUE
1531rb_str_frozen_bare_string(VALUE orig)
1532{
1533 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1534 return str_new_frozen(rb_cString, orig);
1535}
1536
1537VALUE
1538rb_str_tmp_frozen_acquire(VALUE orig)
1539{
1540 if (OBJ_FROZEN_RAW(orig)) return orig;
1541 return str_new_frozen_buffer(0, orig, FALSE);
1542}
1543
/*
 * Returns a frozen, heap-allocated (never embedded) string holding orig's
 * bytes.  orig is returned as-is when it already qualifies; otherwise a
 * class-less frozen string flagged STR_SHARED_ROOT is created, either by
 * copying orig's bytes or by taking over orig's heap buffer and making orig
 * share it.
 */
1544VALUE
1545rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1546{
/* Already frozen, on the heap, and not eligible for re-embedding: use as-is. */
1547 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
/* Shared with a heap-allocated root: the ordinary acquire path suffices. */
1548 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1549
1550 VALUE str = str_alloc_heap(0);
1551 OBJ_FREEZE(str);
1552 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1553 FL_SET(str, STR_SHARED_ROOT);
1554
1555 size_t capa = str_capacity(orig, TERM_LEN(orig));
1556
1557 /* If the string is embedded then we want to create a copy that is heap
1558 * allocated. If the string is shared then the shared root must be
1559 * embedded, so we want to create a copy. If the string is a shared root
1560 * then it must be embedded, so we want to create a copy. */
1561 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
/* Room for capa bytes plus the terminator is allocated; capa bytes are copied. */
1562 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1563 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1564 }
1565 else {
1566 /* orig must be heap allocated and not shared, so we can safely transfer
1567 * the pointer to str. */
1568 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* STR_NOFREE moves along with the buffer from orig to str. */
1569 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1570 RBASIC(orig)->flags &= ~STR_NOFREE;
1571 STR_SET_SHARED(orig, str);
1572 if (RB_OBJ_SHAREABLE_P(orig)) {
1573 RB_OBJ_SET_SHAREABLE(str);
1574 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1575 }
1576 }
1577
1578 RSTRING(str)->len = RSTRING(orig)->len;
/* Adjust the recorded capacity for any difference in terminator length. */
1579 RSTRING(str)->as.heap.aux.capa = capa + (TERM_LEN(orig) - TERM_LEN(str));
1580
1581 return str;
1582}
1583
1584void
1585rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1586{
1587 if (RBASIC_CLASS(tmp) != 0)
1588 return;
1589
1590 if (STR_EMBED_P(tmp)) {
1592 }
1593 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1594 !OBJ_FROZEN_RAW(orig)) {
1595 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1596
1597 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1598 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1599 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1600
1601 /* Unshare orig since the root (tmp) only has this one child. */
1602 FL_UNSET_RAW(orig, STR_SHARED);
1603 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1604 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1606
1607 /* Make tmp embedded and empty so it is safe for sweeping. */
1608 STR_SET_EMBED(tmp);
1609 STR_SET_LEN(tmp, 0);
1610 }
1611 }
1612}
1613
1614static VALUE
1615str_new_frozen(VALUE klass, VALUE orig)
1616{
1617 return str_new_frozen_buffer(klass, orig, TRUE);
1618}
1619
/*
 * Creates a new heap string of class klass that takes over orig's buffer
 * pointer, capacity and STR_NOFREE flag, then makes orig a shared child of
 * the new string.  orig must be neither embedded nor already shared.
 * Returns the new string.
 */
1620static VALUE
1621heap_str_make_shared(VALUE klass, VALUE orig)
1622{
1623 RUBY_ASSERT(!STR_EMBED_P(orig));
1624 RUBY_ASSERT(!STR_SHARED_P(orig));
1626
1627 VALUE str = str_alloc_heap(klass);
1628 STR_SET_LEN(str, RSTRING_LEN(orig));
1629 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1630 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
/* STR_NOFREE moves from orig to the new string along with the buffer. */
1631 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1632 RBASIC(orig)->flags &= ~STR_NOFREE;
1633 STR_SET_SHARED(orig, str);
/* A class-less result starts out with STR_BORROWED cleared. */
1634 if (klass == 0)
1635 FL_UNSET_RAW(str, STR_BORROWED);
1636 return str;
1637}
1638
/*
 * Returns a frozen string of class klass holding orig's bytes.
 *
 * With copy_encoding set, orig's encoding and terminator length are used
 * and its encoding/code range are copied onto the result; otherwise
 * rb_ascii8bit_encoding() and a terminator length of 1 are used.
 *
 * Embedded or embeddable contents are copied.  A shared orig yields either
 * its root itself or a new string sharing that root.  Any other heap string
 * is copied when RB_OBJ_SHAREABLE_P(orig) holds and otherwise has its buffer
 * moved into a new root via heap_str_make_shared().
 */
1639static VALUE
1640str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1641{
1642 VALUE str;
1643
1644 long len = RSTRING_LEN(orig);
1645 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1646 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1647
1648 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1649 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1650 RUBY_ASSERT(STR_EMBED_P(str));
1651 }
1652 else {
1653 if (FL_TEST_RAW(orig, STR_SHARED)) {
1654 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs: bytes of the root before orig's window; rest: bytes after it. */
1655 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1656 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1657 RUBY_ASSERT(ofs >= 0);
1658 RUBY_ASSERT(rest >= 0);
1659 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1661
/* The root cannot stand in for orig if orig is a sub-window of it, or if
 * class or encoding differ: create another child and narrow it. */
1662 if ((ofs > 0) || (rest > 0) ||
1663 (klass != RBASIC(shared)->klass) ||
1664 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1665 str = str_new_shared(klass, shared);
1666 RUBY_ASSERT(!STR_EMBED_P(str));
1667 RSTRING(str)->as.heap.ptr += ofs;
1668 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1669 }
1670 else {
/* The root is handed out directly; a class-less root is marked STR_BORROWED.
 * This path returns before the encoding copy and OBJ_FREEZE below. */
1671 if (RBASIC_CLASS(shared) == 0)
1672 FL_SET_RAW(shared, STR_BORROWED);
1673 return shared;
1674 }
1675 }
1676 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1677 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1678 STR_SET_EMBED(str);
1679 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1680 STR_SET_LEN(str, RSTRING_LEN(orig));
1681 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1682 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1683 }
1684 else {
1685 if (RB_OBJ_SHAREABLE_P(orig)) {
1686 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1687 }
1688 else {
1689 str = heap_str_make_shared(klass, orig);
1690 }
1691 }
1692 }
1693
1694 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1695 OBJ_FREEZE(str);
1696 return str;
1697}
1698
1699VALUE
1700rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1701{
1702 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1703}
1704
1705static VALUE
1706str_new_empty_String(VALUE str)
1707{
1708 VALUE v = rb_str_new(0, 0);
1709 rb_enc_copy(v, str);
1710 return v;
1711}
1712
1713#define STR_BUF_MIN_SIZE 63
1714
1715VALUE
1717{
1718 if (STR_EMBEDDABLE_P(capa, 1)) {
1719 return str_alloc_embed(rb_cString, capa + 1);
1720 }
1721
1722 VALUE str = str_alloc_heap(rb_cString);
1723
1724 RSTRING(str)->as.heap.aux.capa = capa;
1725 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1726 RSTRING(str)->as.heap.ptr[0] = '\0';
1727
1728 return str;
1729}
1730
1731VALUE
1733{
1734 VALUE str;
1735 long len = strlen(ptr);
1736
1737 str = rb_str_buf_new(len);
1738 rb_str_buf_cat(str, ptr, len);
1739
1740 return str;
1741}
1742
1743VALUE
1745{
1746 return str_new(0, 0, len);
1747}
1748
1749void
1751{
1752 if (STR_EMBED_P(str)) {
1753 RB_DEBUG_COUNTER_INC(obj_str_embed);
1754 }
1755 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1756 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1757 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1758 }
1759 else {
1760 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1761 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1762 }
1763}
1764
1765size_t
1766rb_str_memsize(VALUE str)
1767{
1768 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1769 return STR_HEAP_SIZE(str);
1770 }
1771 else {
1772 return 0;
1773 }
1774}
1775
1776VALUE
1778{
1779 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1780}
1781
1782static inline void str_discard(VALUE str);
1783static void str_shared_replace(VALUE str, VALUE str2);
1784
1785void
1787{
1788 if (str != str2) str_shared_replace(str, str2);
1789}
1790
/*
 * Gives str the contents, encoding and code range of str2 (str2 != str).
 * If the bytes fit in str's embedded area they are copied and str2 is left
 * alone.  Otherwise str takes over str2's heap buffer (or its shared root)
 * and str2 is reset to an empty embedded string.
 */
1791static void
1792str_shared_replace(VALUE str, VALUE str2)
1793{
1794 rb_encoding *enc;
1795 int cr;
1796 int termlen;
1797
1798 RUBY_ASSERT(str2 != str);
/* Capture str2's encoding and code range before either string is modified. */
1799 enc = STR_ENC_GET(str2);
1800 cr = ENC_CODERANGE(str2);
1801 str_discard(str);
1802 termlen = rb_enc_mbminlen(enc);
1803
1804 STR_SET_LEN(str, RSTRING_LEN(str2));
1805
1806 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1807 STR_SET_EMBED(str);
/* The copy includes the terminator bytes. */
1808 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1809 rb_enc_associate(str, enc);
1810 ENC_CODERANGE_SET(str, cr);
1811 }
1812 else {
/* An embedded str2 is first moved to a heap buffer so that the buffer can be
 * handed to str below. */
1813 if (STR_EMBED_P(str2)) {
1814 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1815 long len = RSTRING_LEN(str2);
1816 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1817
1818 char *new_ptr = ALLOC_N(char, len + termlen);
1819 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1820 RSTRING(str2)->as.heap.ptr = new_ptr;
1821 STR_SET_LEN(str2, len);
1822 RSTRING(str2)->as.heap.aux.capa = len;
1823 STR_SET_NOEMBED(str2);
1824 }
1825
1826 STR_SET_NOEMBED(str);
1827 FL_UNSET(str, STR_SHARED);
1828 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1829
/* Inherit either str2's shared root or its capacity. */
1830 if (FL_TEST(str2, STR_SHARED)) {
1831 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1832 STR_SET_SHARED(str, shared);
1833 }
1834 else {
1835 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1836 }
1837
1838 /* abandon str2 */
1839 STR_SET_EMBED(str2);
1840 RSTRING_PTR(str2)[0] = 0;
1841 STR_SET_LEN(str2, 0);
1842 rb_enc_associate(str, enc);
1843 ENC_CODERANGE_SET(str, cr);
1844 }
1845}
1846
1847VALUE
1849{
1850 VALUE str;
1851
1852 if (RB_TYPE_P(obj, T_STRING)) {
1853 return obj;
1854 }
1855 str = rb_funcall(obj, idTo_s, 0);
1856 return rb_obj_as_string_result(str, obj);
1857}
1858
1859VALUE
1860rb_obj_as_string_result(VALUE str, VALUE obj)
1861{
1862 if (!RB_TYPE_P(str, T_STRING))
1863 return rb_any_to_s(obj);
1864 return str;
1865}
1866
1867static VALUE
1868str_replace(VALUE str, VALUE str2)
1869{
1870 long len;
1871
1872 len = RSTRING_LEN(str2);
1873 if (STR_SHARED_P(str2)) {
1874 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1876 STR_SET_NOEMBED(str);
1877 STR_SET_LEN(str, len);
1878 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1879 STR_SET_SHARED(str, shared);
1880 rb_enc_cr_str_exact_copy(str, str2);
1881 }
1882 else {
1883 str_replace_shared(str, str2);
1884 }
1885
1886 return str;
1887}
1888
1889static inline VALUE
1890ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1891{
1892 size_t size = rb_str_embed_size(capa, 0);
1893 RUBY_ASSERT(size > 0);
1894 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1895
1896 NEWOBJ_OF(str, struct RString, klass,
1898
1899 str->len = 0;
1900
1901 return (VALUE)str;
1902}
1903
1904static inline VALUE
1905ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1906{
1907 NEWOBJ_OF(str, struct RString, klass,
1908 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1909
1910 str->as.heap.aux.capa = 0;
1911 str->as.heap.ptr = NULL;
1912
1913 return (VALUE)str;
1914}
1915
1916static inline VALUE
1917str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1918{
1919 int encidx = 0;
1920 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1921 encidx = rb_enc_get_index(str);
1922 flags &= ~ENCODING_MASK;
1923 }
1924 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1925 if (encidx) rb_enc_associate_index(dup, encidx);
1926 return dup;
1927}
1928
1929static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1930
1931static inline VALUE
1932str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1933{
1934 VALUE flags = FL_TEST_RAW(str, flag_mask);
1935 long len = RSTRING_LEN(str);
1936
1937 RUBY_ASSERT(STR_EMBED_P(dup));
1938 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1939 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1940 STR_SET_LEN(dup, RSTRING_LEN(str));
1941 return str_duplicate_setup_encoding(str, dup, flags);
1942}
1943
/*
 * Sets up the already allocated heap string dup as a shared copy of str.
 * The root is str's existing shared root, str itself when it is frozen, or
 * a new frozen string made from str.  dup points at the same bytes and is
 * registered as sharing the root.  Returns dup.
 */
1944static inline VALUE
1945str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1946{
1947 VALUE flags = FL_TEST_RAW(str, flag_mask);
1948 VALUE root = str;
1949 if (FL_TEST_RAW(str, STR_SHARED)) {
1950 root = RSTRING(str)->as.heap.aux.shared;
1951 }
1952 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
/* An unfrozen, unshared str is replaced by a frozen string which becomes
 * both the root and the source; flags are re-read from it. */
1953 root = str = str_new_frozen(klass, str);
1954 flags = FL_TEST_RAW(str, flag_mask);
1955 }
1956 RUBY_ASSERT(!STR_SHARED_P(root));
1958
1959 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1960 FL_SET_RAW(dup, RSTRING_NOEMBED);
1961 STR_SET_SHARED(dup, root);
1962 flags |= RSTRING_NOEMBED | STR_SHARED;
1963
1964 STR_SET_LEN(dup, RSTRING_LEN(str));
1965 return str_duplicate_setup_encoding(str, dup, flags);
1966}
1967
1968static inline VALUE
1969str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1970{
1971 if (STR_EMBED_P(str)) {
1972 return str_duplicate_setup_embed(klass, str, dup);
1973 }
1974 else {
1975 return str_duplicate_setup_heap(klass, str, dup);
1976 }
1977}
1978
1979static inline VALUE
1980str_duplicate(VALUE klass, VALUE str)
1981{
1982 VALUE dup;
1983 if (STR_EMBED_P(str)) {
1984 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1985 }
1986 else {
1987 dup = str_alloc_heap(klass);
1988 }
1989
1990 return str_duplicate_setup(klass, str, dup);
1991}
1992
1993VALUE
1995{
1996 return str_duplicate(rb_obj_class(str), str);
1997}
1998
1999/* :nodoc: */
2000VALUE
2001rb_str_dup_m(VALUE str)
2002{
2003 if (LIKELY(BARE_STRING_P(str))) {
2004 return str_duplicate(rb_cString, str);
2005 }
2006 else {
2007 return rb_obj_dup(str);
2008 }
2009}
2010
2011VALUE
2013{
2014 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2015 return str_duplicate(rb_cString, str);
2016}
2017
2018VALUE
2019rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2020{
2021 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2022 VALUE new_str, klass = rb_cString;
2023
2024 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2025 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2026 str_duplicate_setup_embed(klass, str, new_str);
2027 }
2028 else {
2029 new_str = ec_str_alloc_heap(ec, klass);
2030 str_duplicate_setup_heap(klass, str, new_str);
2031 }
2032 if (chilled) {
2033 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2034 }
2035 return new_str;
2036}
2037
2038VALUE
2039rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2040{
2041 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2042 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2043 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2044 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2045 return rb_str_freeze(str);
2046}
2047
2048/*
2049 * The documentation block below uses an include (instead of inline text)
2050 * because the included text has non-ASCII characters (which are not allowed in a C file).
2051 */
2052
2053/*
2054 *
2055 * call-seq:
2056 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2057 *
2058 * :include: doc/string/new.rdoc
2059 *
2060 */
2061
/*
 * Initializes str from an optional source string and the optional keyword
 * arguments encoding: and capacity:.
 *
 * With capacity: the string is forced onto the heap with at least
 * STR_BUF_MIN_SIZE bytes (or the source length if larger) and the source
 * bytes, if any, are copied in.  Without it the source simply replaces the
 * contents.  A given encoding is associated last.  Returns str.
 */
2062static VALUE
2063rb_str_init(int argc, VALUE *argv, VALUE str)
2064{
2065 static ID keyword_ids[2];
2066 VALUE orig, opt, venc, vcapa;
2067 VALUE kwargs[2];
2068 rb_encoding *enc = 0;
2069 int n;
2070
/* The keyword IDs are looked up once and cached in the static array. */
2071 if (!keyword_ids[0]) {
2072 keyword_ids[0] = rb_id_encoding();
2073 CONST_ID(keyword_ids[1], "capacity");
2074 }
2075
/* "01:" = at most one positional argument plus an options hash. */
2076 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2077 if (!NIL_P(opt)) {
2078 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2079 venc = kwargs[0];
2080 vcapa = kwargs[1];
2081 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2082 enc = rb_to_encoding(venc);
2083 }
2084 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2085 long capa = NUM2LONG(vcapa);
2086 long len = 0;
2087 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2088
2089 if (capa < STR_BUF_MIN_SIZE) {
2090 capa = STR_BUF_MIN_SIZE;
2091 }
2092 if (n == 1) {
2093 StringValue(orig);
2094 len = RSTRING_LEN(orig);
2095 if (capa < len) {
2096 capa = len;
2097 }
/* Initializing from itself: skip the copy step further down. */
2098 if (orig == str) n = 0;
2099 }
2100 str_modifiable(str);
2101 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2102 /* make noembed always */
2103 const size_t size = (size_t)capa + termlen;
2104 const char *const old_ptr = RSTRING_PTR(str);
2105 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2106 char *new_ptr = ALLOC_N(char, size);
2107 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
/* Carry over the old bytes, limited to what fits in the new buffer. */
2108 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2109 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2110 RSTRING(str)->as.heap.ptr = new_ptr;
2111 }
/* Already an owned heap buffer: resize it only when the size differs. */
2112 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2113 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2114 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2115 }
2116 STR_SET_LEN(str, len);
2117 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2118 if (n == 1) {
2119 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2120 rb_enc_cr_str_exact_copy(str, orig);
2121 }
2122 FL_SET(str, STR_NOEMBED);
2123 RSTRING(str)->as.heap.aux.capa = capa;
2124 }
2125 else if (n == 1) {
2126 rb_str_replace(str, orig);
2127 }
2128 if (enc) {
2129 rb_enc_associate(str, enc);
2131 }
2132 }
2133 else if (n == 1) {
2134 rb_str_replace(str, orig);
2135 }
2136 return str;
2137}
2138
2139/* :nodoc: */
/*
 * Allocation-plus-initialization entry point taking the same arguments as
 * rb_str_init (optional source string, encoding: and capacity: keywords).
 *
 * Subclasses, and calls without keyword arguments, go through
 * rb_class_new_instance_pass_kw().  For rb_cString with keywords the result
 * is built directly: without capacity: it is an empty string or a duplicate
 * of the source; with capacity: it is a new buffer of at least that many
 * bytes to which the source is appended.
 */
2140static VALUE
2141rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2142{
2143 if (klass != rb_cString) {
2144 return rb_class_new_instance_pass_kw(argc, argv, klass);
2145 }
2146
2147 static ID keyword_ids[2];
2148 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2149 VALUE kwargs[2];
2150 rb_encoding *enc = NULL;
2151
2152 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2153 if (NIL_P(opt)) {
2154 return rb_class_new_instance_pass_kw(argc, argv, klass);
2155 }
2156
2157 keyword_ids[0] = rb_id_encoding();
2158 CONST_ID(keyword_ids[1], "capacity");
2159 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2160 encoding = kwargs[0];
2161 capacity = kwargs[1];
2162
2163 if (n == 1) {
2164 orig = StringValue(orig);
2165 }
2166 else {
2167 orig = Qnil;
2168 }
2169
/* Without an explicit encoding: keyword, fall back to the source's encoding. */
2170 if (UNDEF_P(encoding)) {
2171 if (!NIL_P(orig)) {
2172 encoding = rb_obj_encoding(orig);
2173 }
2174 }
2175
2176 if (!UNDEF_P(encoding)) {
2177 enc = rb_to_encoding(encoding);
2178 }
2179
2180 // If capacity is nil, we're basically just duping `orig`.
2181 if (UNDEF_P(capacity)) {
2182 if (NIL_P(orig)) {
2183 VALUE empty_str = str_new(klass, "", 0);
2184 if (enc) {
2185 rb_enc_associate(empty_str, enc);
2186 }
2187 return empty_str;
2188 }
2189 VALUE copy = str_duplicate(klass, orig);
2190 rb_enc_associate(copy, enc);
2191 ENC_CODERANGE_CLEAR(copy);
2192 return copy;
2193 }
2194
2195 long capa = 0;
2196 capa = NUM2LONG(capacity);
/* A negative capacity is treated as zero. */
2197 if (capa < 0) {
2198 capa = 0;
2199 }
2200
/* The new buffer is never smaller than the source's capacity. */
2201 if (!NIL_P(orig)) {
2202 long orig_capa = rb_str_capacity(orig);
2203 if (orig_capa > capa) {
2204 capa = orig_capa;
2205 }
2206 }
2207
/* Allocate capa bytes, then reset the length to 0 and terminate at the start. */
2208 VALUE str = str_enc_new(klass, NULL, capa, enc);
2209 STR_SET_LEN(str, 0);
2210 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2211
2212 if (!NIL_P(orig)) {
2213 rb_str_buf_append(str, orig);
2214 }
2215
2216 return str;
2217}
2218
2219#ifdef NONASCII_MASK
2220#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2221
2222/*
2223 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2224 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2225 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2226 *
2227 * if (!(byte & 0x80))
2228 * byte |= 0x40; // turn on bit6
2229 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2230 *
2231 * This function calculates whether a byte is leading or not for all bytes
2232 * in the argument word by concurrently using the above logic, and then
2233 * adds up the number of leading bytes in the word.
2234 */
/* Returns the number of UTF-8 leading bytes among the bytes of the word at *s. */
2235static inline uintptr_t
2236count_utf8_lead_bytes_with_word(const uintptr_t *s)
2237{
2238 uintptr_t d = *s;
2239
2240 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2241 d = (d>>6) | (~d>>7);
/* Keep only bit0 of every byte; the other bits hold leftovers from the shifts. */
2242 d &= NONASCII_MASK >> 7;
2243
2244 /* Gather all bytes. */
2245#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2246 /* use only if it can use POPCNT */
2247 return rb_popcount_intptr(d);
2248#else
/* Fold the per-byte flags down into the lowest byte by repeated shift-and-add. */
2249 d += (d>>8);
2250 d += (d>>16);
2251# if SIZEOF_VOIDP == 8
2252 d += (d>>32);
2253# endif
/* At most eight flags were added up, so the total fits in four bits. */
2254 return (d&0xF);
2255#endif
2256}
2257#endif
2258
/*
 * Counts the characters in the byte range [p, e) under encoding enc, using
 * the caller-supplied code range cr to choose a strategy:
 *   - fixed-width encodings: byte length divided by the width, rounded up;
 *   - valid UTF-8 (only when NONASCII_MASK is defined): count leading bytes,
 *     a word at a time over the aligned middle of the range;
 *   - ASCII-compatible encodings: skip ASCII runs with search_nonascii()
 *     and step over the remaining characters one by one;
 *   - everything else: step one character at a time with rb_enc_mbclen().
 */
2259static inline long
2260enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2261{
2262 long c;
2263 const char *q;
2264
2265 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2266 long diff = (long)(e - p);
/* A trailing partial character counts as one more character. */
2267 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2268 }
2269#ifdef NONASCII_MASK
2270 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2271 uintptr_t len = 0;
/* Word-at-a-time counting is only used for ranges longer than two words. */
2272 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2273 const uintptr_t *s, *t;
2274 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2275 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2276 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2277 while (p < (const char *)s) {
2278 if (is_utf8_lead_byte(*p)) len++;
2279 p++;
2280 }
2281 while (s < t) {
2282 len += count_utf8_lead_bytes_with_word(s);
2283 s++;
2284 }
2285 p = (const char *)s;
2286 }
/* Count whatever remains byte by byte. */
2287 while (p < e) {
2288 if (is_utf8_lead_byte(*p)) len++;
2289 p++;
2290 }
2291 return (long)len;
2292 }
2293#endif
2294 else if (rb_enc_asciicompat(enc)) {
2295 c = 0;
/* The two loops differ only in the stepping function: rb_enc_fast_mbclen()
 * when ENC_CODERANGE_CLEAN_P(cr) holds, rb_enc_mbclen() otherwise. */
2296 if (ENC_CODERANGE_CLEAN_P(cr)) {
2297 while (p < e) {
2298 if (ISASCII(*p)) {
2299 q = search_nonascii(p, e);
2300 if (!q)
2301 return c + (e - p);
2302 c += q - p;
2303 p = q;
2304 }
2305 p += rb_enc_fast_mbclen(p, e, enc);
2306 c++;
2307 }
2308 }
2309 else {
2310 while (p < e) {
2311 if (ISASCII(*p)) {
2312 q = search_nonascii(p, e);
2313 if (!q)
2314 return c + (e - p);
2315 c += q - p;
2316 p = q;
2317 }
2318 p += rb_enc_mbclen(p, e, enc);
2319 c++;
2320 }
2321 }
2322 return c;
2323 }
2324
2325 for (c=0; p<e; c++) {
2326 p += rb_enc_mbclen(p, e, enc);
2327 }
2328 return c;
2329}
2330
2331long
2332rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2333{
2334 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2335}
2336
2337/* To get strlen with cr
2338 * Note that given cr is not used.
2339 */
2340long
2341rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2342{
2343 long c;
2344 const char *q;
2345 int ret;
2346
2347 *cr = 0;
2348 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2349 long diff = (long)(e - p);
2350 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2351 }
2352 else if (rb_enc_asciicompat(enc)) {
2353 c = 0;
2354 while (p < e) {
2355 if (ISASCII(*p)) {
2356 q = search_nonascii(p, e);
2357 if (!q) {
2358 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2359 return c + (e - p);
2360 }
2361 c += q - p;
2362 p = q;
2363 }
2364 ret = rb_enc_precise_mbclen(p, e, enc);
2365 if (MBCLEN_CHARFOUND_P(ret)) {
2366 *cr |= ENC_CODERANGE_VALID;
2367 p += MBCLEN_CHARFOUND_LEN(ret);
2368 }
2369 else {
2371 p++;
2372 }
2373 c++;
2374 }
2375 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2376 return c;
2377 }
2378
2379 for (c=0; p<e; c++) {
2380 ret = rb_enc_precise_mbclen(p, e, enc);
2381 if (MBCLEN_CHARFOUND_P(ret)) {
2382 *cr |= ENC_CODERANGE_VALID;
2383 p += MBCLEN_CHARFOUND_LEN(ret);
2384 }
2385 else {
2387 if (p + rb_enc_mbminlen(enc) <= e)
2388 p += rb_enc_mbminlen(enc);
2389 else
2390 p = e;
2391 }
2392 }
2393 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2394 return c;
2395}
2396
2397/* enc must be str's enc or rb_enc_check(str, str2) */
2398static long
2399str_strlen(VALUE str, rb_encoding *enc)
2400{
2401 const char *p, *e;
2402 int cr;
2403
2404 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2405 if (!enc) enc = STR_ENC_GET(str);
2406 p = RSTRING_PTR(str);
2407 e = RSTRING_END(str);
2408 cr = ENC_CODERANGE(str);
2409
2410 if (cr == ENC_CODERANGE_UNKNOWN) {
2411 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2412 if (cr) ENC_CODERANGE_SET(str, cr);
2413 return n;
2414 }
2415 else {
2416 return enc_strlen(p, e, enc, cr);
2417 }
2418}
2419
2420long
2422{
2423 return str_strlen(str, NULL);
2424}
2425
2426/*
2427 * call-seq:
2428 * length -> integer
2429 *
2430 * :include: doc/string/length.rdoc
2431 *
2432 */
2433
2434VALUE
2436{
2437 return LONG2NUM(str_strlen(str, NULL));
2438}
2439
2440/*
2441 * call-seq:
2442 * bytesize -> integer
2443 *
2444 * :include: doc/string/bytesize.rdoc
2445 *
2446 */
2447
2448VALUE
2449rb_str_bytesize(VALUE str)
2450{
2451 return LONG2NUM(RSTRING_LEN(str));
2452}
2453
2454/*
2455 * call-seq:
2456 * empty? -> true or false
2457 *
2458 * Returns whether the length of +self+ is zero:
2459 *
2460 * 'hello'.empty? # => false
2461 * ' '.empty? # => false
2462 * ''.empty? # => true
2463 *
2464 * Related: see {Querying}[rdoc-ref:String@Querying].
2465 */
2466
2467static VALUE
2468rb_str_empty(VALUE str)
2469{
2470 return RBOOL(RSTRING_LEN(str) == 0);
2471}
2472
2473/*
2474 * call-seq:
2475 * self + other_string -> new_string
2476 *
2477 * Returns a new string containing +other_string+ concatenated to +self+:
2478 *
2479 * 'Hello from ' + self.to_s # => "Hello from main"
2480 *
2481 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2482 */
2483
2484VALUE
2486{
2487 VALUE str3;
2488 rb_encoding *enc;
2489 char *ptr1, *ptr2, *ptr3;
2490 long len1, len2;
2491 int termlen;
2492
2493 StringValue(str2);
2494 enc = rb_enc_check_str(str1, str2);
2495 RSTRING_GETMEM(str1, ptr1, len1);
2496 RSTRING_GETMEM(str2, ptr2, len2);
2497 termlen = rb_enc_mbminlen(enc);
2498 if (len1 > LONG_MAX - len2) {
2499 rb_raise(rb_eArgError, "string size too big");
2500 }
2501 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2502 ptr3 = RSTRING_PTR(str3);
2503 memcpy(ptr3, ptr1, len1);
2504 memcpy(ptr3+len1, ptr2, len2);
2505 TERM_FILL(&ptr3[len1+len2], termlen);
2506
2507 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2509 RB_GC_GUARD(str1);
2510 RB_GC_GUARD(str2);
2511 return str3;
2512}
2513
2514/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2515VALUE
2516rb_str_opt_plus(VALUE str1, VALUE str2)
2517{
2520 long len1, len2;
2521 MAYBE_UNUSED(char) *ptr1, *ptr2;
2522 RSTRING_GETMEM(str1, ptr1, len1);
2523 RSTRING_GETMEM(str2, ptr2, len2);
2524 int enc1 = rb_enc_get_index(str1);
2525 int enc2 = rb_enc_get_index(str2);
2526
2527 if (enc1 < 0) {
2528 return Qundef;
2529 }
2530 else if (enc2 < 0) {
2531 return Qundef;
2532 }
2533 else if (enc1 != enc2) {
2534 return Qundef;
2535 }
2536 else if (len1 > LONG_MAX - len2) {
2537 return Qundef;
2538 }
2539 else {
2540 return rb_str_plus(str1, str2);
2541 }
2542
2543}
2544
2545/*
2546 * call-seq:
2547 * self * n -> new_string
2548 *
2549 * Returns a new string containing +n+ copies of +self+:
2550 *
2551 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2552 * 'No!' * 0 # => ""
2553 *
2554 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2555 */
2556
2557VALUE
2559{
2560 VALUE str2;
2561 long n, len;
2562 char *ptr2;
2563 int termlen;
2564
2565 if (times == INT2FIX(1)) {
2566 return str_duplicate(rb_cString, str);
2567 }
2568 if (times == INT2FIX(0)) {
2569 str2 = str_alloc_embed(rb_cString, 0);
2570 rb_enc_copy(str2, str);
2571 return str2;
2572 }
2573 len = NUM2LONG(times);
2574 if (len < 0) {
2575 rb_raise(rb_eArgError, "negative argument");
2576 }
2577 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2578 if (STR_EMBEDDABLE_P(len, 1)) {
2579 str2 = str_alloc_embed(rb_cString, len + 1);
2580 memset(RSTRING_PTR(str2), 0, len + 1);
2581 }
2582 else {
2583 str2 = str_alloc_heap(rb_cString);
2584 RSTRING(str2)->as.heap.aux.capa = len;
2585 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2586 }
2587 STR_SET_LEN(str2, len);
2588 rb_enc_copy(str2, str);
2589 return str2;
2590 }
2591 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2592 rb_raise(rb_eArgError, "argument too big");
2593 }
2594
2595 len *= RSTRING_LEN(str);
2596 termlen = TERM_LEN(str);
2597 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2598 ptr2 = RSTRING_PTR(str2);
2599 if (len) {
2600 n = RSTRING_LEN(str);
2601 memcpy(ptr2, RSTRING_PTR(str), n);
2602 while (n <= len/2) {
2603 memcpy(ptr2 + n, ptr2, n);
2604 n *= 2;
2605 }
2606 memcpy(ptr2 + n, ptr2, len-n);
2607 }
2608 STR_SET_LEN(str2, len);
2609 TERM_FILL(&ptr2[len], termlen);
2610 rb_enc_cr_str_copy_for_substr(str2, str);
2611
2612 return str2;
2613}
2614
2615/*
2616 * call-seq:
2617 * self % object -> new_string
2618 *
2619 * Returns the result of formatting +object+ into the format specifications
2620 * contained in +self+
2621 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2622 *
2623 * '%05d' % 123 # => "00123"
2624 *
2625 * If +self+ contains multiple format specifications,
2626 * +object+ must be an array or hash containing the objects to be formatted:
2627 *
2628 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2629 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2630 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2631 *
2632 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2633 */
2634
2635static VALUE
2636rb_str_format_m(VALUE str, VALUE arg)
2637{
2638 VALUE tmp = rb_check_array_type(arg);
2639
2640 if (!NIL_P(tmp)) {
2641 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2642 RB_GC_GUARD(tmp);
2643 return result;
2644 }
2645 return rb_str_format(1, &arg, str);
2646}
2647
2648static inline void
2649rb_check_lockedtmp(VALUE str)
2650{
2651 if (FL_TEST(str, STR_TMPLOCK)) {
2652 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2653 }
2654}
2655
2656// If none of these flags are set, we know we have an modifiable string.
2657// If any is set, we need to do more detailed checks.
2658#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2659static inline void
2660str_modifiable(VALUE str)
2661{
2662 RUBY_ASSERT(ruby_thread_has_gvl_p());
2663
2664 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2665 if (CHILLED_STRING_P(str)) {
2666 CHILLED_STRING_MUTATED(str);
2667 }
2668 rb_check_lockedtmp(str);
2669 rb_check_frozen(str);
2670 }
2671}
2672
2673static inline int
2674str_dependent_p(VALUE str)
2675{
2676 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2677 return FALSE;
2678 }
2679 else {
2680 return TRUE;
2681 }
2682}
2683
2684// If none of these flags are set, we know we have an independent string.
2685// If any is set, we need to do more detailed checks.
2686#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2687static inline int
2688str_independent(VALUE str)
2689{
2690 RUBY_ASSERT(ruby_thread_has_gvl_p());
2691
2692 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2693 str_modifiable(str);
2694 return !str_dependent_p(str);
2695 }
2696 return TRUE;
2697}
2698
2699static void
2700str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2701{
2702 RUBY_ASSERT(ruby_thread_has_gvl_p());
2703
2704 char *ptr;
2705 char *oldptr;
2706 long capa = len + expand;
2707
2708 if (len > capa) len = capa;
2709
2710 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2711 ptr = RSTRING(str)->as.heap.ptr;
2712 STR_SET_EMBED(str);
2713 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2714 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2715 STR_SET_LEN(str, len);
2716 return;
2717 }
2718
2719 ptr = ALLOC_N(char, (size_t)capa + termlen);
2720 oldptr = RSTRING_PTR(str);
2721 if (oldptr) {
2722 memcpy(ptr, oldptr, len);
2723 }
2724 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2725 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2726 }
2727 STR_SET_NOEMBED(str);
2728 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2729 TERM_FILL(ptr + len, termlen);
2730 RSTRING(str)->as.heap.ptr = ptr;
2731 STR_SET_LEN(str, len);
2732 RSTRING(str)->as.heap.aux.capa = capa;
2733}
2734
2735void
2736rb_str_modify(VALUE str)
2737{
2738 if (!str_independent(str))
2739 str_make_independent(str);
2741}
2742
2743void
2745{
2746 RUBY_ASSERT(ruby_thread_has_gvl_p());
2747
2748 int termlen = TERM_LEN(str);
2749 long len = RSTRING_LEN(str);
2750
2751 if (expand < 0) {
2752 rb_raise(rb_eArgError, "negative expanding string size");
2753 }
2754 if (expand >= LONG_MAX - len) {
2755 rb_raise(rb_eArgError, "string size too big");
2756 }
2757
2758 if (!str_independent(str)) {
2759 str_make_independent_expand(str, len, expand, termlen);
2760 }
2761 else if (expand > 0) {
2762 RESIZE_CAPA_TERM(str, len + expand, termlen);
2763 }
2765}
2766
2767/* As rb_str_modify(), but don't clear coderange */
2768static void
2769str_modify_keep_cr(VALUE str)
2770{
2771 if (!str_independent(str))
2772 str_make_independent(str);
2774 /* Force re-scan later */
2776}
2777
2778static inline void
2779str_discard(VALUE str)
2780{
2781 str_modifiable(str);
2782 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2783 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2784 RSTRING(str)->as.heap.ptr = 0;
2785 STR_SET_LEN(str, 0);
2786 }
2787}
2788
2789void
2791{
2792 int encindex = rb_enc_get_index(str);
2793
2794 if (RB_UNLIKELY(encindex == -1)) {
2795 rb_raise(rb_eTypeError, "not encoding capable object");
2796 }
2797
2798 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2799 return;
2800 }
2801
2802 rb_encoding *enc = rb_enc_from_index(encindex);
2803 if (!rb_enc_asciicompat(enc)) {
2804 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2805 }
2806}
2807
2808VALUE
2810{
2811 RUBY_ASSERT(ruby_thread_has_gvl_p());
2812
2813 VALUE s = *ptr;
2814 if (!RB_TYPE_P(s, T_STRING)) {
2815 s = rb_str_to_str(s);
2816 *ptr = s;
2817 }
2818 return s;
2819}
2820
2821char *
2823{
2824 VALUE str = rb_string_value(ptr);
2825 return RSTRING_PTR(str);
2826}
2827
2828static const char *
2829str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2830{
2831 const char *e = s + len;
2832
2833 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2834 if (zero_filled(s, minlen)) return s;
2835 }
2836 return 0;
2837}
2838
2839static char *
2840str_fill_term(VALUE str, char *s, long len, int termlen)
2841{
2842 /* This function assumes that (capa + termlen) bytes of memory
2843 * is allocated, like many other functions in this file.
2844 */
2845 if (str_dependent_p(str)) {
2846 if (!zero_filled(s + len, termlen))
2847 str_make_independent_expand(str, len, 0L, termlen);
2848 }
2849 else {
2850 TERM_FILL(s + len, termlen);
2851 return s;
2852 }
2853 return RSTRING_PTR(str);
2854}
2855
2856void
2857rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2858{
2859 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2860 long len = RSTRING_LEN(str);
2861
2862 RUBY_ASSERT(capa >= len);
2863 if (capa - len < termlen) {
2864 rb_check_lockedtmp(str);
2865 str_make_independent_expand(str, len, 0L, termlen);
2866 }
2867 else if (str_dependent_p(str)) {
2868 if (termlen > oldtermlen)
2869 str_make_independent_expand(str, len, 0L, termlen);
2870 }
2871 else {
2872 if (!STR_EMBED_P(str)) {
2873 /* modify capa instead of realloc */
2874 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2875 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2876 }
2877 if (termlen > oldtermlen) {
2878 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2879 }
2880 }
2881
2882 return;
2883}
2884
2885static char *
2886str_null_check(VALUE str, int *w)
2887{
2888 char *s = RSTRING_PTR(str);
2889 long len = RSTRING_LEN(str);
2890 int minlen = 1;
2891
2892 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2893 rb_encoding *enc = rb_str_enc_get(str);
2894 minlen = rb_enc_mbminlen(enc);
2895
2896 if (minlen > 1) {
2897 *w = 1;
2898 if (str_null_char(s, len, minlen, enc)) {
2899 return NULL;
2900 }
2901 return str_fill_term(str, s, len, minlen);
2902 }
2903 }
2904
2905 *w = 0;
2906 if (!s || memchr(s, 0, len)) {
2907 return NULL;
2908 }
2909 if (s[len]) {
2910 s = str_fill_term(str, s, len, minlen);
2911 }
2912 return s;
2913}
2914
2915const char *
2916rb_str_null_check(VALUE str)
2917{
2919
2920 char *s;
2921 long len;
2922 RSTRING_GETMEM(str, s, len);
2923
2924 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2925 if (!s || memchr(s, 0, len)) {
2926 rb_raise(rb_eArgError, "string contains null byte");
2927 }
2928 }
2929 else {
2930 int w;
2931 const char *s = str_null_check(str, &w);
2932 if (!s) {
2933 if (w) {
2934 rb_raise(rb_eArgError, "string contains null char");
2935 }
2936 rb_raise(rb_eArgError, "string contains null byte");
2937 }
2938 }
2939
2940 return s;
2941}
2942
2943char *
2944rb_str_to_cstr(VALUE str)
2945{
2946 int w;
2947 return str_null_check(str, &w);
2948}
2949
2950char *
2952{
2953 VALUE str = rb_string_value(ptr);
2954 int w;
2955 char *s = str_null_check(str, &w);
2956 if (!s) {
2957 if (w) {
2958 rb_raise(rb_eArgError, "string contains null char");
2959 }
2960 rb_raise(rb_eArgError, "string contains null byte");
2961 }
2962 return s;
2963}
2964
2965char *
2966rb_str_fill_terminator(VALUE str, const int newminlen)
2967{
2968 char *s = RSTRING_PTR(str);
2969 long len = RSTRING_LEN(str);
2970 return str_fill_term(str, s, len, newminlen);
2971}
2972
2973VALUE
2975{
2976 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2977 return str;
2978}
2979
2980/*
2981 * call-seq:
2982 * String.try_convert(object) -> object, new_string, or nil
2983 *
2984 * Attempts to convert the given +object+ to a string.
2985 *
2986 * If +object+ is already a string, returns +object+, unmodified.
2987 *
2988 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2989 * calls <tt>object.to_str</tt> and returns the result.
2990 *
2991 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2992 *
2993 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2994 */
2995static VALUE
2996rb_str_s_try_convert(VALUE dummy, VALUE str)
2997{
2998 return rb_check_string_type(str);
2999}
3000
3001static char*
3002str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3003{
3004 long nth = *nthp;
3005 if (rb_enc_mbmaxlen(enc) == 1) {
3006 p += nth;
3007 }
3008 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3009 p += nth * rb_enc_mbmaxlen(enc);
3010 }
3011 else if (rb_enc_asciicompat(enc)) {
3012 const char *p2, *e2;
3013 int n;
3014
3015 while (p < e && 0 < nth) {
3016 e2 = p + nth;
3017 if (e < e2) {
3018 *nthp = nth;
3019 return (char *)e;
3020 }
3021 if (ISASCII(*p)) {
3022 p2 = search_nonascii(p, e2);
3023 if (!p2) {
3024 nth -= e2 - p;
3025 *nthp = nth;
3026 return (char *)e2;
3027 }
3028 nth -= p2 - p;
3029 p = p2;
3030 }
3031 n = rb_enc_mbclen(p, e, enc);
3032 p += n;
3033 nth--;
3034 }
3035 *nthp = nth;
3036 if (nth != 0) {
3037 return (char *)e;
3038 }
3039 return (char *)p;
3040 }
3041 else {
3042 while (p < e && nth--) {
3043 p += rb_enc_mbclen(p, e, enc);
3044 }
3045 }
3046 if (p > e) p = e;
3047 *nthp = nth;
3048 return (char*)p;
3049}
3050
3051char*
3052rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3053{
3054 return str_nth_len(p, e, &nth, enc);
3055}
3056
3057static char*
3058str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3059{
3060 if (singlebyte)
3061 p += nth;
3062 else {
3063 p = str_nth_len(p, e, &nth, enc);
3064 }
3065 if (!p) return 0;
3066 if (p > e) p = e;
3067 return (char *)p;
3068}
3069
3070/* char offset to byte offset */
3071static long
3072str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3073{
3074 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3075 if (!pp) return e - p;
3076 return pp - p;
3077}
3078
3079long
3080rb_str_offset(VALUE str, long pos)
3081{
3082 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3083 STR_ENC_GET(str), single_byte_optimizable(str));
3084}
3085
3086#ifdef NONASCII_MASK
3087static char *
3088str_utf8_nth(const char *p, const char *e, long *nthp)
3089{
3090 long nth = *nthp;
3091 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3092 const uintptr_t *s, *t;
3093 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3094 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3095 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3096 while (p < (const char *)s) {
3097 if (is_utf8_lead_byte(*p)) nth--;
3098 p++;
3099 }
3100 do {
3101 nth -= count_utf8_lead_bytes_with_word(s);
3102 s++;
3103 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3104 p = (char *)s;
3105 }
3106 while (p < e) {
3107 if (is_utf8_lead_byte(*p)) {
3108 if (nth == 0) break;
3109 nth--;
3110 }
3111 p++;
3112 }
3113 *nthp = nth;
3114 return (char *)p;
3115}
3116
/* Distance in bytes from p to the position str_utf8_nth() finds for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const target = str_utf8_nth(p, e, &remaining);

    return target - p;
}
3123#endif
3124
3125/* byte offset to char offset */
3126long
3127rb_str_sublen(VALUE str, long pos)
3128{
3129 if (single_byte_optimizable(str) || pos < 0)
3130 return pos;
3131 else {
3132 char *p = RSTRING_PTR(str);
3133 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3134 }
3135}
3136
3137static VALUE
3138str_subseq(VALUE str, long beg, long len)
3139{
3140 VALUE str2;
3141
3142 RUBY_ASSERT(beg >= 0);
3143 RUBY_ASSERT(len >= 0);
3144 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3145
3146 const int termlen = TERM_LEN(str);
3147 if (!SHARABLE_SUBSTRING_P(str, beg, len)) {
3148 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg, len, rb_str_enc_get(str));
3149 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3151 }
3152 RB_GC_GUARD(str);
3153 return str2;
3154 }
3155
3156 str2 = str_alloc_heap(rb_cString);
3157 if (str_embed_capa(str2) >= len + termlen) {
3158 char *ptr2 = RSTRING(str2)->as.embed.ary;
3159 STR_SET_EMBED(str2);
3160 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3161 TERM_FILL(ptr2+len, termlen);
3162
3163 STR_SET_LEN(str2, len);
3164 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3166 }
3167
3168 RB_GC_GUARD(str);
3169 }
3170 else {
3171 str_replace_shared(str2, str);
3172 RUBY_ASSERT(!STR_EMBED_P(str2));
3173 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3174 ENC_CODERANGE_CLEAR(str2);
3175 }
3176
3177 RSTRING(str2)->as.heap.ptr += beg;
3178 if (RSTRING_LEN(str2) > len) {
3179 STR_SET_LEN(str2, len);
3180 }
3181 }
3182
3183 return str2;
3184}
3185
3186VALUE
3187rb_str_subseq(VALUE str, long beg, long len)
3188{
3189 VALUE str2 = str_subseq(str, beg, len);
3190 rb_enc_cr_str_copy_for_substr(str2, str);
3191 return str2;
3192}
3193
3194char *
3195rb_str_subpos(VALUE str, long beg, long *lenp)
3196{
3197 long len = *lenp;
3198 long slen = -1L;
3199 const long blen = RSTRING_LEN(str);
3200 rb_encoding *enc = STR_ENC_GET(str);
3201 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3202
3203 if (len < 0) return 0;
3204 if (beg < 0 && -beg < 0) return 0;
3205 if (!blen) {
3206 len = 0;
3207 }
3208 if (single_byte_optimizable(str)) {
3209 if (beg > blen) return 0;
3210 if (beg < 0) {
3211 beg += blen;
3212 if (beg < 0) return 0;
3213 }
3214 if (len > blen - beg)
3215 len = blen - beg;
3216 if (len < 0) return 0;
3217 p = s + beg;
3218 goto end;
3219 }
3220 if (beg < 0) {
3221 if (len > -beg) len = -beg;
3222 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3223 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3224 beg = -beg;
3225 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3226 p = e;
3227 if (!p) return 0;
3228 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3229 if (!p) return 0;
3230 len = e - p;
3231 goto end;
3232 }
3233 else {
3234 slen = str_strlen(str, enc);
3235 beg += slen;
3236 if (beg < 0) return 0;
3237 p = s + beg;
3238 if (len == 0) goto end;
3239 }
3240 }
3241 else if (beg > 0 && beg > blen) {
3242 return 0;
3243 }
3244 if (len == 0) {
3245 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3246 p = s + beg;
3247 }
3248#ifdef NONASCII_MASK
3249 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3250 enc == rb_utf8_encoding()) {
3251 p = str_utf8_nth(s, e, &beg);
3252 if (beg > 0) return 0;
3253 len = str_utf8_offset(p, e, len);
3254 }
3255#endif
3256 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3257 int char_sz = rb_enc_mbmaxlen(enc);
3258
3259 p = s + beg * char_sz;
3260 if (p > e) {
3261 return 0;
3262 }
3263 else if (len * char_sz > e - p)
3264 len = e - p;
3265 else
3266 len *= char_sz;
3267 }
3268 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3269 if (beg > 0) return 0;
3270 len = 0;
3271 }
3272 else {
3273 len = str_offset(p, e, len, enc, 0);
3274 }
3275 end:
3276 *lenp = len;
3277 RB_GC_GUARD(str);
3278 return p;
3279}
3280
3281static VALUE str_substr(VALUE str, long beg, long len, int empty);
3282
3283VALUE
3284rb_str_substr(VALUE str, long beg, long len)
3285{
3286 return str_substr(str, beg, len, TRUE);
3287}
3288
3289VALUE
3290rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3291{
3292 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3293}
3294
3295static VALUE
3296str_substr(VALUE str, long beg, long len, int empty)
3297{
3298 char *p = rb_str_subpos(str, beg, &len);
3299
3300 if (!p) return Qnil;
3301 if (!len && !empty) return Qnil;
3302
3303 beg = p - RSTRING_PTR(str);
3304
3305 VALUE str2 = str_subseq(str, beg, len);
3306 rb_enc_cr_str_copy_for_substr(str2, str);
3307 return str2;
3308}
3309
3310/* :nodoc: */
3311VALUE
3313{
3314 if (CHILLED_STRING_P(str)) {
3315 FL_UNSET_RAW(str, STR_CHILLED);
3316 }
3317
3318 if (OBJ_FROZEN(str)) return str;
3319 rb_str_resize(str, RSTRING_LEN(str));
3320 return rb_obj_freeze(str);
3321}
3322
3323/*
3324 * call-seq:
3325 * +string -> new_string or self
3326 *
3327 * Returns +self+ if +self+ is not frozen and can be mutated
3328 * without warning issuance.
3329 *
3330 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3331 *
3332 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3333 */
3334static VALUE
3335str_uplus(VALUE str)
3336{
3337 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3338 return rb_str_dup(str);
3339 }
3340 else {
3341 return str;
3342 }
3343}
3344
3345/*
3346 * call-seq:
3347 * -self -> frozen_string
3348 *
3349 * Returns a frozen string equal to +self+.
3350 *
3351 * The returned string is +self+ if and only if all of the following are true:
3352 *
3353 * - +self+ is already frozen.
3354 * - +self+ is an instance of \String (rather than of a subclass of \String)
3355 * - +self+ has no instance variables set on it.
3356 *
3357 * Otherwise, the returned string is a frozen copy of +self+.
3358 *
3359 * Returning +self+, when possible, saves duplicating +self+;
3360 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3361 *
3362 * It may also save duplicating other, already-existing, strings:
3363 *
3364 * s0 = 'foo'
3365 * s1 = 'foo'
3366 * s0.object_id == s1.object_id # => false
3367 * (-s0).object_id == (-s1).object_id # => true
3368 *
3369 * Note that method #-@ is convenient for defining a constant:
3370 *
3371 * FileName = -'config/database.yml'
3372 *
3373 * While its alias #dedup is better suited for chaining:
3374 *
3375 * 'foo'.dedup.gsub!('o')
3376 *
3377 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3378 */
3379static VALUE
3380str_uminus(VALUE str)
3381{
3382 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3383 str = rb_str_dup(str);
3384 }
3385 return rb_fstring(str);
3386}
3387
3388RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3389#define rb_str_dup_frozen rb_str_new_frozen
3390
3391VALUE
3393{
3394 rb_check_frozen(str);
3395 if (FL_TEST(str, STR_TMPLOCK)) {
3396 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3397 }
3398 FL_SET(str, STR_TMPLOCK);
3399 return str;
3400}
3401
3402VALUE
3404{
3405 rb_check_frozen(str);
3406 if (!FL_TEST(str, STR_TMPLOCK)) {
3407 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3408 }
3409 FL_UNSET(str, STR_TMPLOCK);
3410 return str;
3411}
3412
3413VALUE
3414rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3415{
3416 rb_str_locktmp(str);
3417 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3418}
3419
3420void
3422{
3423 RUBY_ASSERT(ruby_thread_has_gvl_p());
3424
3425 long capa;
3426 const int termlen = TERM_LEN(str);
3427
3428 str_modifiable(str);
3429 if (STR_SHARED_P(str)) {
3430 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3431 }
3432 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3433 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3434 }
3435
3436 int cr = ENC_CODERANGE(str);
3437 if (len == 0) {
3438 /* Empty string does not contain non-ASCII */
3440 }
3441 else if (cr == ENC_CODERANGE_UNKNOWN) {
3442 /* Leave unknown. */
3443 }
3444 else if (len > RSTRING_LEN(str)) {
3445 if (ENC_CODERANGE_CLEAN_P(cr)) {
3446 /* Update the coderange regarding the extended part. */
3447 const char *const prev_end = RSTRING_END(str);
3448 const char *const new_end = RSTRING_PTR(str) + len;
3449 rb_encoding *enc = rb_enc_get(str);
3450 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3451 ENC_CODERANGE_SET(str, cr);
3452 }
3453 else if (cr == ENC_CODERANGE_BROKEN) {
3454 /* May be valid now, by appended part. */
3456 }
3457 }
3458 else if (len < RSTRING_LEN(str)) {
3459 if (cr != ENC_CODERANGE_7BIT) {
3460 /* ASCII-only string is keeping after truncated. Valid
3461 * and broken may be invalid or valid, leave unknown. */
3463 }
3464 }
3465
3466 STR_SET_LEN(str, len);
3467 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3468}
3469
3470VALUE
3471rb_str_resize(VALUE str, long len)
3472{
3473 if (len < 0) {
3474 rb_raise(rb_eArgError, "negative string size (or size too big)");
3475 }
3476
3477 int independent = str_independent(str);
3478 long slen = RSTRING_LEN(str);
3479 const int termlen = TERM_LEN(str);
3480
3481 if (slen > len || (termlen != 1 && slen < len)) {
3483 }
3484
3485 {
3486 long capa;
3487 if (STR_EMBED_P(str)) {
3488 if (len == slen) return str;
3489 if (str_embed_capa(str) >= len + termlen) {
3490 STR_SET_LEN(str, len);
3491 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3492 return str;
3493 }
3494 str_make_independent_expand(str, slen, len - slen, termlen);
3495 }
3496 else if (str_embed_capa(str) >= len + termlen) {
3497 capa = RSTRING(str)->as.heap.aux.capa;
3498 char *ptr = STR_HEAP_PTR(str);
3499 STR_SET_EMBED(str);
3500 if (slen > len) slen = len;
3501 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3502 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3503 STR_SET_LEN(str, len);
3504 if (independent) {
3505 SIZED_FREE_N(ptr, capa + termlen);
3506 }
3507 return str;
3508 }
3509 else if (!independent) {
3510 if (len == slen) return str;
3511 str_make_independent_expand(str, slen, len - slen, termlen);
3512 }
3513 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3514 (capa - len) > (len < 1024 ? len : 1024)) {
3515 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3516 (size_t)len + termlen, STR_HEAP_SIZE(str));
3517 RSTRING(str)->as.heap.aux.capa = len;
3518 }
3519 else if (len == slen) return str;
3520 STR_SET_LEN(str, len);
3521 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3522 }
3523 return str;
3524}
3525
3526static void
3527str_ensure_available_capa(VALUE str, long len)
3528{
3529 str_modify_keep_cr(str);
3530
3531 const int termlen = TERM_LEN(str);
3532 long olen = RSTRING_LEN(str);
3533
3534 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3535 rb_raise(rb_eArgError, "string sizes too big");
3536 }
3537
3538 long total = olen + len;
3539 long capa = str_capacity(str, termlen);
3540
3541 if (capa < total) {
3542 if (total >= LONG_MAX / 2) {
3543 capa = total;
3544 }
3545 while (total > capa) {
3546 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3547 }
3548 RESIZE_CAPA_TERM(str, capa, termlen);
3549 }
3550}
3551
3552static VALUE
3553str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3554{
3555 if (keep_cr) {
3556 str_modify_keep_cr(str);
3557 }
3558 else {
3559 rb_str_modify(str);
3560 }
3561 if (len == 0) return 0;
3562
3563 long total, olen, off = -1;
3564 char *sptr;
3565 const int termlen = TERM_LEN(str);
3566
3567 RSTRING_GETMEM(str, sptr, olen);
3568 if (ptr >= sptr && ptr <= sptr + olen) {
3569 off = ptr - sptr;
3570 }
3571
3572 long capa = str_capacity(str, termlen);
3573
3574 if (olen > LONG_MAX - len) {
3575 rb_raise(rb_eArgError, "string sizes too big");
3576 }
3577 total = olen + len;
3578 if (capa < total) {
3579 if (total >= LONG_MAX / 2) {
3580 capa = total;
3581 }
3582 while (total > capa) {
3583 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3584 }
3585 RESIZE_CAPA_TERM(str, capa, termlen);
3586 sptr = RSTRING_PTR(str);
3587 }
3588 if (off != -1) {
3589 ptr = sptr + off;
3590 }
3591 memcpy(sptr + olen, ptr, len);
3592 STR_SET_LEN(str, total);
3593 TERM_FILL(sptr + total, termlen); /* sentinel */
3594
3595 return str;
3596}
3597
3598#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3599#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3600
3601VALUE
3602rb_str_cat(VALUE str, const char *ptr, long len)
3603{
3604 if (len == 0) return str;
3605 if (len < 0) {
3606 rb_raise(rb_eArgError, "negative string size (or size too big)");
3607 }
3608 return str_buf_cat(str, ptr, len);
3609}
3610
3611VALUE
3612rb_str_cat_cstr(VALUE str, const char *ptr)
3613{
3614 must_not_null(ptr);
3615 return rb_str_buf_cat(str, ptr, strlen(ptr));
3616}
3617
3618static void
3619rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3620{
3621 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3622
3623 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3624 if (UNLIKELY(!str_independent(str))) {
3625 str_make_independent(str);
3626 }
3627
3628 long string_length = -1;
3629 const int null_terminator_length = 1;
3630 char *sptr;
3631 RSTRING_GETMEM(str, sptr, string_length);
3632
3633 // Ensure the resulting string wouldn't be too long.
3634 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3635 rb_raise(rb_eArgError, "string sizes too big");
3636 }
3637
3638 long string_capacity = str_capacity(str, null_terminator_length);
3639
3640 // Get the code range before any modifications since those might clear the code range.
3641 int cr = ENC_CODERANGE(str);
3642
3643 // Check if the string has spare string_capacity to write the new byte.
3644 if (LIKELY(string_capacity >= string_length + 1)) {
3645 // In fast path we can write the new byte and note the string's new length.
3646 sptr[string_length] = byte;
3647 STR_SET_LEN(str, string_length + 1);
3648 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3649 }
3650 else {
3651 // If there's not enough string_capacity, make a call into the general string concatenation function.
3652 str_buf_cat(str, (char *)&byte, 1);
3653 }
3654
3655 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3656 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3657 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3658 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3659 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3660 if (ISASCII(byte)) {
3662 }
3663 else {
3665
3666 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3667 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3668 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3669 }
3670 }
3671 }
3672}
3673
/* Alternative entry-point names: rb_str_buf_cat is declared as an alias of
 * rb_str_cat (forwarding str, ptr, len), and both rb_str_buf_cat2 and
 * rb_str_cat2 as aliases of rb_str_cat_cstr (forwarding str, ptr).
 * NOTE(review): how RUBY_ALIAS_FUNCTION expands is defined outside this
 * file section -- confirm there before relying on linkage details. */
3674RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3675RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3676RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3677
3678static VALUE
3679rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3680 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3681{
3682 int str_encindex = ENCODING_GET(str);
3683 int res_encindex;
3684 int str_cr, res_cr;
3685 rb_encoding *str_enc, *ptr_enc;
3686
3687 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3688
3689 if (str_encindex == ptr_encindex) {
3690 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3691 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3692 }
3693 }
3694 else {
3695 str_enc = rb_enc_from_index(str_encindex);
3696 ptr_enc = rb_enc_from_index(ptr_encindex);
3697 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3698 if (len == 0)
3699 return str;
3700 if (RSTRING_LEN(str) == 0) {
3701 rb_str_buf_cat(str, ptr, len);
3702 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3703 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3704 return str;
3705 }
3706 goto incompatible;
3707 }
3708 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3709 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3710 }
3711 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3712 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3713 str_cr = rb_enc_str_coderange(str);
3714 }
3715 }
3716 }
3717 if (ptr_cr_ret)
3718 *ptr_cr_ret = ptr_cr;
3719
3720 if (str_encindex != ptr_encindex &&
3721 str_cr != ENC_CODERANGE_7BIT &&
3722 ptr_cr != ENC_CODERANGE_7BIT) {
3723 str_enc = rb_enc_from_index(str_encindex);
3724 ptr_enc = rb_enc_from_index(ptr_encindex);
3725 goto incompatible;
3726 }
3727
3728 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3729 res_encindex = str_encindex;
3730 res_cr = ENC_CODERANGE_UNKNOWN;
3731 }
3732 else if (str_cr == ENC_CODERANGE_7BIT) {
3733 if (ptr_cr == ENC_CODERANGE_7BIT) {
3734 res_encindex = str_encindex;
3735 res_cr = ENC_CODERANGE_7BIT;
3736 }
3737 else {
3738 res_encindex = ptr_encindex;
3739 res_cr = ptr_cr;
3740 }
3741 }
3742 else if (str_cr == ENC_CODERANGE_VALID) {
3743 res_encindex = str_encindex;
3744 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3745 res_cr = str_cr;
3746 else
3747 res_cr = ptr_cr;
3748 }
3749 else { /* str_cr == ENC_CODERANGE_BROKEN */
3750 res_encindex = str_encindex;
3751 res_cr = str_cr;
3752 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3753 }
3754
3755 if (len < 0) {
3756 rb_raise(rb_eArgError, "negative string size (or size too big)");
3757 }
3758 str_buf_cat(str, ptr, len);
3759 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3760 return str;
3761
3762 incompatible:
3763 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3764 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3766}
3767
3768VALUE
3769rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3770{
3771 return rb_enc_cr_str_buf_cat(str, ptr, len,
3772 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3773}
3774
3775VALUE
3777{
3778 /* ptr must reference NUL terminated ASCII string. */
3779 int encindex = ENCODING_GET(str);
3780 rb_encoding *enc = rb_enc_from_index(encindex);
3781 if (rb_enc_asciicompat(enc)) {
3782 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3783 encindex, ENC_CODERANGE_7BIT, 0);
3784 }
3785 else {
3786 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3787 while (*ptr) {
3788 unsigned int c = (unsigned char)*ptr;
3789 int len = rb_enc_codelen(c, enc);
3790 rb_enc_mbcput(c, buf, enc);
3791 rb_enc_cr_str_buf_cat(str, buf, len,
3792 encindex, ENC_CODERANGE_VALID, 0);
3793 ptr++;
3794 }
3795 return str;
3796 }
3797}
3798
3799VALUE
3801{
3802 int str2_cr = rb_enc_str_coderange(str2);
3803
3804 if (rb_str_enc_fastpath(str)) {
3805 switch (str2_cr) {
3806 case ENC_CODERANGE_7BIT:
3807 // If RHS is 7bit we can do simple concatenation
3808 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3809 RB_GC_GUARD(str2);
3810 return str;
3812 // If RHS is valid, we can do simple concatenation if encodings are the same
3813 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3814 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3815 int str_cr = ENC_CODERANGE(str);
3816 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3817 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3818 }
3819 RB_GC_GUARD(str2);
3820 return str;
3821 }
3822 }
3823 }
3824
3825 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3826 ENCODING_GET(str2), str2_cr, &str2_cr);
3827
3828 ENC_CODERANGE_SET(str2, str2_cr);
3829
3830 return str;
3831}
3832
3833VALUE
3835{
3836 StringValue(str2);
3837 return rb_str_buf_append(str, str2);
3838}
3839
3840VALUE
3841rb_str_concat_literals(size_t num, const VALUE *strary)
3842{
3843 VALUE str;
3844 size_t i, s = 0;
3845 unsigned long len = 1;
3846
3847 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3848 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3849
3850 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3851 str = rb_str_buf_new(len);
3852 str_enc_copy_direct(str, strary[0]);
3853
3854 for (i = s; i < num; ++i) {
3855 const VALUE v = strary[i];
3856 int encidx = ENCODING_GET(v);
3857
3858 rb_str_buf_append(str, v);
3859 if (encidx != ENCINDEX_US_ASCII) {
3860 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3861 rb_enc_set_index(str, encidx);
3862 }
3863 }
3864 return str;
3865}
3866
3867/*
3868 * call-seq:
3869 * concat(*objects) -> string
3870 *
3871 * :include: doc/string/concat.rdoc
3872 */
3873static VALUE
3874rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3875{
3876 str_modifiable(str);
3877
3878 if (argc == 1) {
3879 return rb_str_concat(str, argv[0]);
3880 }
3881 else if (argc > 1) {
3882 int i;
3883 VALUE arg_str = rb_str_tmp_new(0);
3884 rb_enc_copy(arg_str, str);
3885 for (i = 0; i < argc; i++) {
3886 rb_str_concat(arg_str, argv[i]);
3887 }
3888 rb_str_buf_append(str, arg_str);
3889 }
3890
3891 return str;
3892}
3893
3894/*
3895 * call-seq:
3896 * append_as_bytes(*objects) -> self
3897 *
3898 * Concatenates each object in +objects+ into +self+; returns +self+;
3899 * performs no encoding validation or conversion:
3900 *
3901 * s = 'foo'
3902 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3903 * s.valid_encoding? # => false
3904 * s.append_as_bytes("\xAC 12")
3905 * s.valid_encoding? # => true
3906 *
3907 * When a given object is an integer,
3908 * the value is considered an 8-bit byte;
3909 * if the integer occupies more than one byte (i.e., is greater than 255),
3910 * appends only the low-order byte (similar to String#setbyte):
3911 *
3912 * s = ""
3913 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3914 * s.bytesize # => 2
3915 *
3916 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3917 */
3918
3919VALUE
3920rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3921{
3922 long needed_capacity = 0;
3923 volatile VALUE t0;
3924 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3925
3926 for (int index = 0; index < argc; index++) {
3927 VALUE obj = argv[index];
3928 enum ruby_value_type type = types[index] = rb_type(obj);
3929 switch (type) {
3930 case T_FIXNUM:
3931 case T_BIGNUM:
3932 needed_capacity++;
3933 break;
3934 case T_STRING:
3935 needed_capacity += RSTRING_LEN(obj);
3936 break;
3937 default:
3938 rb_raise(
3940 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3941 rb_obj_class(obj)
3942 );
3943 break;
3944 }
3945 }
3946
3947 str_ensure_available_capa(str, needed_capacity);
3948 char *sptr = RSTRING_END(str);
3949
3950 for (int index = 0; index < argc; index++) {
3951 VALUE obj = argv[index];
3952 enum ruby_value_type type = types[index];
3953 switch (type) {
3954 case T_FIXNUM:
3955 case T_BIGNUM: {
3956 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3957 char byte = (char)(NUM2INT(obj) & 0xFF);
3958 *sptr = byte;
3959 sptr++;
3960 break;
3961 }
3962 case T_STRING: {
3963 const char *ptr;
3964 long len;
3965 RSTRING_GETMEM(obj, ptr, len);
3966 memcpy(sptr, ptr, len);
3967 sptr += len;
3968 break;
3969 }
3970 default:
3971 rb_bug("append_as_bytes arguments should have been validated");
3972 }
3973 }
3974
3975 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3976 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3977
3978 int cr = ENC_CODERANGE(str);
3979 switch (cr) {
3980 case ENC_CODERANGE_7BIT: {
3981 for (int index = 0; index < argc; index++) {
3982 VALUE obj = argv[index];
3983 enum ruby_value_type type = types[index];
3984 switch (type) {
3985 case T_FIXNUM:
3986 case T_BIGNUM: {
3987 if (!ISASCII(NUM2INT(obj))) {
3988 goto clear_cr;
3989 }
3990 break;
3991 }
3992 case T_STRING: {
3993 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3994 goto clear_cr;
3995 }
3996 break;
3997 }
3998 default:
3999 rb_bug("append_as_bytes arguments should have been validated");
4000 }
4001 }
4002 break;
4003 }
4005 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4006 goto keep_cr;
4007 }
4008 else {
4009 goto clear_cr;
4010 }
4011 break;
4012 default:
4013 goto clear_cr;
4014 break;
4015 }
4016
4017 RB_GC_GUARD(t0);
4018
4019 clear_cr:
4020 // If no fast path was hit, we clear the coderange.
4021 // append_as_bytes is predominantly meant to be used in
4022 // buffering situation, hence it's likely the coderange
4023 // will never be scanned, so it's not worth spending time
4024 // precomputing the coderange except for simple and common
4025 // situations.
4027 keep_cr:
4028 return str;
4029}
4030
4031/*
4032 * call-seq:
4033 * self << object -> self
4034 *
4035 * Appends a string representation of +object+ to +self+;
4036 * returns +self+.
4037 *
4038 * If +object+ is a string, appends it to +self+:
4039 *
4040 * s = 'foo'
4041 * s << 'bar' # => "foobar"
4042 * s # => "foobar"
4043 *
4044 * If +object+ is an integer,
4045 * its value is considered a codepoint;
4046 * converts the value to a character before concatenating:
4047 *
4048 * s = 'foo'
4049 * s << 33 # => "foo!"
4050 *
4051 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
4052 * and the encoding of +self+ is Encoding::US_ASCII,
4053 * changes the encoding to Encoding::ASCII_8BIT:
4054 *
4055 * s = 'foo'.encode(Encoding::US_ASCII)
4056 * s.encoding # => #<Encoding:US-ASCII>
4057 * s << 0xff # => "foo\xFF"
4058 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4059 *
4060 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4061 *
4062 * s = 'foo'
4063 * s.encoding # => <Encoding:UTF-8>
4064 * s << 0x00110000 # 1114112 out of char range (RangeError)
4065 * s = 'foo'.encode(Encoding::EUC_JP)
4066 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4067 *
4068 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4069 */
4070VALUE
4072{
4073 unsigned int code;
4074 rb_encoding *enc = STR_ENC_GET(str1);
4075 int encidx;
4076
4077 if (RB_INTEGER_TYPE_P(str2)) {
4078 if (rb_num_to_uint(str2, &code) == 0) {
4079 }
4080 else if (FIXNUM_P(str2)) {
4081 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4082 }
4083 else {
4084 rb_raise(rb_eRangeError, "bignum out of char range");
4085 }
4086 }
4087 else {
4088 return rb_str_append(str1, str2);
4089 }
4090
4091 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4092
4093 if (encidx >= 0) {
4094 rb_str_buf_cat_byte(str1, (unsigned char)code);
4095 }
4096 else {
4097 long pos = RSTRING_LEN(str1);
4098 int cr = ENC_CODERANGE(str1);
4099 int len;
4100 char *buf;
4101
4102 switch (len = rb_enc_codelen(code, enc)) {
4103 case ONIGERR_INVALID_CODE_POINT_VALUE:
4104 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4105 break;
4106 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4107 case 0:
4108 rb_raise(rb_eRangeError, "%u out of char range", code);
4109 break;
4110 }
4111 buf = ALLOCA_N(char, len + 1);
4112 rb_enc_mbcput(code, buf, enc);
4113 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4114 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4115 }
4116 rb_str_resize(str1, pos+len);
4117 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4118 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4120 }
4121 else if (cr == ENC_CODERANGE_BROKEN) {
4123 }
4124 ENC_CODERANGE_SET(str1, cr);
4125 }
4126 return str1;
4127}
4128
4129int
4130rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4131{
4132 int encidx = rb_enc_to_index(enc);
4133
4134 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4135 /* US-ASCII automatically extended to ASCII-8BIT */
4136 if (code > 0xFF) {
4137 rb_raise(rb_eRangeError, "%u out of char range", code);
4138 }
4139 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4140 return ENCINDEX_ASCII_8BIT;
4141 }
4142 return encidx;
4143 }
4144 else {
4145 return -1;
4146 }
4147}
4148
4149/*
4150 * call-seq:
4151 * prepend(*other_strings) -> self
4152 *
4153 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4154 *
4155 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4156 *
4157 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4158 *
4159 */
4160
4161static VALUE
4162rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4163{
4164 str_modifiable(str);
4165
4166 if (argc == 1) {
4167 rb_str_update(str, 0L, 0L, argv[0]);
4168 }
4169 else if (argc > 1) {
4170 int i;
4171 VALUE arg_str = rb_str_tmp_new(0);
4172 rb_enc_copy(arg_str, str);
4173 for (i = 0; i < argc; i++) {
4174 rb_str_append(arg_str, argv[i]);
4175 }
4176 rb_str_update(str, 0L, 0L, arg_str);
4177 }
4178
4179 return str;
4180}
4181
4182st_index_t
4184{
4185 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4186 st_index_t precomputed_hash;
4187 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4188
4189 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4190 return precomputed_hash;
4191 }
4192
4193 return str_do_hash(str);
4194}
4195
4196int
4198{
4199 long len1, len2;
4200 const char *ptr1, *ptr2;
4201 RSTRING_GETMEM(str1, ptr1, len1);
4202 RSTRING_GETMEM(str2, ptr2, len2);
4203 return (len1 != len2 ||
4204 !rb_str_comparable(str1, str2) ||
4205 memcmp(ptr1, ptr2, len1) != 0);
4206}
4207
4208/*
4209 * call-seq:
4210 * hash -> integer
4211 *
4212 * :include: doc/string/hash.rdoc
4213 *
4214 */
4215
4216static VALUE
4217rb_str_hash_m(VALUE str)
4218{
4219 st_index_t hval = rb_str_hash(str);
4220 return ST2FIX(hval);
4221}
4222
/* Yields the smaller of a and b (b when they compare equal).  Function-like
 * macro: the selected argument is evaluated twice, so avoid side effects. */
4223#define lesser(a,b) (((a)>(b))?(b):(a))
4224
4225int
4227{
4228 int idx1, idx2;
4229 int rc1, rc2;
4230
4231 if (RSTRING_LEN(str1) == 0) return TRUE;
4232 if (RSTRING_LEN(str2) == 0) return TRUE;
4233 idx1 = ENCODING_GET(str1);
4234 idx2 = ENCODING_GET(str2);
4235 if (idx1 == idx2) return TRUE;
4236 rc1 = rb_enc_str_coderange(str1);
4237 rc2 = rb_enc_str_coderange(str2);
4238 if (rc1 == ENC_CODERANGE_7BIT) {
4239 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4240 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4241 return TRUE;
4242 }
4243 if (rc2 == ENC_CODERANGE_7BIT) {
4244 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4245 return TRUE;
4246 }
4247 return FALSE;
4248}
4249
4250int
4252{
4253 long len1, len2;
4254 const char *ptr1, *ptr2;
4255 int retval;
4256
4257 if (str1 == str2) return 0;
4258 RSTRING_GETMEM(str1, ptr1, len1);
4259 RSTRING_GETMEM(str2, ptr2, len2);
4260 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4261 if (len1 == len2) {
4262 if (!rb_str_comparable(str1, str2)) {
4263 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4264 return 1;
4265 return -1;
4266 }
4267 return 0;
4268 }
4269 if (len1 > len2) return 1;
4270 return -1;
4271 }
4272 if (retval > 0) return 1;
4273 return -1;
4274}
4275
4276/*
4277 * call-seq:
4278 * self == other -> true or false
4279 *
4280 * Returns whether +other+ is equal to +self+.
4281 *
4282 * When +other+ is a string, returns whether +other+ has the same length and content as +self+:
4283 *
4284 * s = 'foo'
4285 * s == 'foo' # => true
4286 * s == 'food' # => false
4287 * s == 'FOO' # => false
4288 *
4289 * Returns +false+ if the two strings' encodings are not compatible:
4290 *
4291 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4292 *
4293 * When +other+ is not a string:
4294 *
4295 * - If +other+ responds to method <tt>to_str</tt>,
4296 * <tt>other == self</tt> is called and its return value is returned.
4297 * - If +other+ does not respond to <tt>to_str</tt>,
4298 * +false+ is returned.
4299 *
4300 * Related: {Comparing}[rdoc-ref:String@Comparing].
4301 */
4302
4303VALUE
4305{
4306 if (str1 == str2) return Qtrue;
4307 if (!RB_TYPE_P(str2, T_STRING)) {
4308 if (!rb_respond_to(str2, idTo_str)) {
4309 return Qfalse;
4310 }
4311 return rb_equal(str2, str1);
4312 }
4313 return rb_str_eql_internal(str1, str2);
4314}
4315
4316/*
4317 * call-seq:
4318 * eql?(object) -> true or false
4319 *
4320 * :include: doc/string/eql_p.rdoc
4321 *
4322 */
4323
4324VALUE
4325rb_str_eql(VALUE str1, VALUE str2)
4326{
4327 if (str1 == str2) return Qtrue;
4328 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4329 return rb_str_eql_internal(str1, str2);
4330}
4331
4332/*
4333 * call-seq:
4334 * self <=> other -> -1, 0, 1, or nil
4335 *
4336 * Compares +self+ and +other+,
4337 * evaluating their _contents_, not their _lengths_.
4338 *
4339 * Returns:
4340 *
4341 * - +-1+, if +self+ is smaller.
4342 * - +0+, if the two are equal.
4343 * - +1+, if +self+ is larger.
4344 * - +nil+, if the two are incomparable.
4345 *
4346 * Examples:
4347 *
4348 * 'a' <=> 'b' # => -1
4349 * 'a' <=> 'ab' # => -1
4350 * 'a' <=> 'a' # => 0
4351 * 'b' <=> 'a' # => 1
4352 * 'ab' <=> 'a' # => 1
4353 * 'a' <=> :a # => nil
4354 *
4355 * \Class \String includes module Comparable,
4356 * each of whose methods uses String#<=> for comparison.
4357 *
4358 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4359 */
4360
4361static VALUE
4362rb_str_cmp_m(VALUE str1, VALUE str2)
4363{
4364 int result;
4365 VALUE s = rb_check_string_type(str2);
4366 if (NIL_P(s)) {
4367 return rb_invcmp(str1, str2);
4368 }
4369 result = rb_str_cmp(str1, s);
4370 return INT2FIX(result);
4371}
4372
4373static VALUE str_casecmp(VALUE str1, VALUE str2);
4374static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4375
4376/*
4377 * call-seq:
4378 * casecmp(other_string) -> -1, 0, 1, or nil
4379 *
4380 * Ignoring case, compares +self+ and +other_string+; returns:
4381 *
4382 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4383 * - 0 if the two are equal.
4384 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4385 * - +nil+ if the two are incomparable.
4386 *
4387 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4388 *
4389 * Examples:
4390 *
4391 * 'foo'.casecmp('goo') # => -1
4392 * 'goo'.casecmp('foo') # => 1
4393 * 'foo'.casecmp('food') # => -1
4394 * 'food'.casecmp('foo') # => 1
4395 * 'FOO'.casecmp('foo') # => 0
4396 * 'foo'.casecmp('FOO') # => 0
4397 * 'foo'.casecmp(1) # => nil
4398 *
4399 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4400 */
4401
4402static VALUE
4403rb_str_casecmp(VALUE str1, VALUE str2)
4404{
4405 VALUE s = rb_check_string_type(str2);
4406 if (NIL_P(s)) {
4407 return Qnil;
4408 }
4409 return str_casecmp(str1, s);
4410}
4411
4412static VALUE
4413str_casecmp(VALUE str1, VALUE str2)
4414{
4415 long len;
4416 rb_encoding *enc;
4417 const char *p1, *p1end, *p2, *p2end;
4418
4419 enc = rb_enc_compatible(str1, str2);
4420 if (!enc) {
4421 return Qnil;
4422 }
4423
4424 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4425 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4426 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4427 while (p1 < p1end && p2 < p2end) {
4428 if (*p1 != *p2) {
4429 unsigned int c1 = TOLOWER(*p1 & 0xff);
4430 unsigned int c2 = TOLOWER(*p2 & 0xff);
4431 if (c1 != c2)
4432 return INT2FIX(c1 < c2 ? -1 : 1);
4433 }
4434 p1++;
4435 p2++;
4436 }
4437 }
4438 else {
4439 while (p1 < p1end && p2 < p2end) {
4440 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4441 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4442
4443 if (0 <= c1 && 0 <= c2) {
4444 c1 = TOLOWER(c1);
4445 c2 = TOLOWER(c2);
4446 if (c1 != c2)
4447 return INT2FIX(c1 < c2 ? -1 : 1);
4448 }
4449 else {
4450 int r;
4451 l1 = rb_enc_mbclen(p1, p1end, enc);
4452 l2 = rb_enc_mbclen(p2, p2end, enc);
4453 len = l1 < l2 ? l1 : l2;
4454 r = memcmp(p1, p2, len);
4455 if (r != 0)
4456 return INT2FIX(r < 0 ? -1 : 1);
4457 if (l1 != l2)
4458 return INT2FIX(l1 < l2 ? -1 : 1);
4459 }
4460 p1 += l1;
4461 p2 += l2;
4462 }
4463 }
4464 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4465 if (p1 == p1end) return INT2FIX(-1);
4466 return INT2FIX(1);
4467}
4468
4469/*
4470 * call-seq:
4471 * casecmp?(other_string) -> true, false, or nil
4472 *
4473 * Returns +true+ if +self+ and +other_string+ are equal after
4474 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4475 *
4476 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4477 *
4478 * Examples:
4479 *
4480 * 'foo'.casecmp?('goo') # => false
4481 * 'goo'.casecmp?('foo') # => false
4482 * 'foo'.casecmp?('food') # => false
4483 * 'food'.casecmp?('foo') # => false
4484 * 'FOO'.casecmp?('foo') # => true
4485 * 'foo'.casecmp?('FOO') # => true
4486 * 'foo'.casecmp?(1) # => nil
4487 *
4488 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4489 */
4490
4491static VALUE
4492rb_str_casecmp_p(VALUE str1, VALUE str2)
4493{
4494 VALUE s = rb_check_string_type(str2);
4495 if (NIL_P(s)) {
4496 return Qnil;
4497 }
4498 return str_casecmp_p(str1, s);
4499}
4500
4501static VALUE
4502str_casecmp_p(VALUE str1, VALUE str2)
4503{
4504 rb_encoding *enc;
4505 VALUE folded_str1, folded_str2;
4506 VALUE fold_opt = sym_fold;
4507
4508 enc = rb_enc_compatible(str1, str2);
4509 if (!enc) {
4510 return Qnil;
4511 }
4512
4513 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4514 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4515
4516 return rb_str_eql(folded_str1, folded_str2);
4517}
4518
4519static long
4520strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4521 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4522{
4523 const char *search_start = str_ptr;
4524 long pos, search_len = str_len - offset;
4525
4526 for (;;) {
4527 const char *t;
4528 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4529 if (pos < 0) return pos;
4530 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4531 if (t == search_start + pos) break;
4532 search_len -= t - search_start;
4533 if (search_len <= 0) return -1;
4534 offset += t - search_start;
4535 search_start = t;
4536 }
4537 return pos + offset;
4538}
4539
4540/* found index in byte */
/* Convenience wrappers around rb_strseq_index: rb_str_index passes
 * in_byte = 0 (the offset is counted in characters), rb_str_byteindex passes
 * in_byte = 1 (the offset is counted in bytes). */
4541#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4542#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4543
4544static long
4545rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4546{
4547 const char *str_ptr, *str_ptr_end, *sub_ptr;
4548 long str_len, sub_len;
4549 rb_encoding *enc;
4550
4551 enc = rb_enc_check(str, sub);
4552 if (is_broken_string(sub)) return -1;
4553
4554 str_ptr = RSTRING_PTR(str);
4555 str_ptr_end = RSTRING_END(str);
4556 str_len = RSTRING_LEN(str);
4557 sub_ptr = RSTRING_PTR(sub);
4558 sub_len = RSTRING_LEN(sub);
4559
4560 if (str_len < sub_len) return -1;
4561
4562 if (offset != 0) {
4563 long str_len_char, sub_len_char;
4564 int single_byte = single_byte_optimizable(str);
4565 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4566 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4567 if (offset < 0) {
4568 offset += str_len_char;
4569 if (offset < 0) return -1;
4570 }
4571 if (str_len_char - offset < sub_len_char) return -1;
4572 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4573 str_ptr += offset;
4574 }
4575 if (sub_len == 0) return offset;
4576
4577 /* need proceed one character at a time */
4578 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4579}
4580
4581
4582/*
4583 * call-seq:
4584 * index(pattern, offset = 0) -> integer or nil
4585 *
4586 * :include: doc/string/index.rdoc
4587 *
4588 */
4589
4590static VALUE
4591rb_str_index_m(int argc, VALUE *argv, VALUE str)
4592{
4593 VALUE sub;
4594 VALUE initpos;
4595 rb_encoding *enc = STR_ENC_GET(str);
4596 long pos;
4597
4598 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4599 long slen = str_strlen(str, enc); /* str's enc */
4600 pos = NUM2LONG(initpos);
4601 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4602 if (RB_TYPE_P(sub, T_REGEXP)) {
4604 }
4605 return Qnil;
4606 }
4607 }
4608 else {
4609 pos = 0;
4610 }
4611
4612 if (RB_TYPE_P(sub, T_REGEXP)) {
4613 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4614 enc, single_byte_optimizable(str));
4615
4616 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4617 VALUE match = rb_backref_get();
4618 struct re_registers *regs = RMATCH_REGS(match);
4619 pos = rb_str_sublen(str, BEG(0));
4620 return LONG2NUM(pos);
4621 }
4622 }
4623 else {
4624 StringValue(sub);
4625 pos = rb_str_index(str, sub, pos);
4626 if (pos >= 0) {
4627 pos = rb_str_sublen(str, pos);
4628 return LONG2NUM(pos);
4629 }
4630 }
4631 return Qnil;
4632}
4633
4634/* Ensure that the given pos is a valid character boundary.
4635 * Note that in this function, "character" means a code point
4636 * (Unicode scalar value), not a grapheme cluster.
4637 */
4638static void
4639str_ensure_byte_pos(VALUE str, long pos)
4640{
4641 if (!single_byte_optimizable(str)) {
4642 const char *s = RSTRING_PTR(str);
4643 const char *e = RSTRING_END(str);
4644 const char *p = s + pos;
4645 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4646 rb_raise(rb_eIndexError,
4647 "offset %ld does not land on character boundary", pos);
4648 }
4649 }
4650}
4651
4652/*
4653 * call-seq:
4654 * byteindex(object, offset = 0) -> integer or nil
4655 *
4656 * Returns the 0-based integer index of a substring of +self+
4657 * specified by +object+ (a string or Regexp) and +offset+,
4658 * or +nil+ if there is no such substring;
4659 * the returned index is the count of _bytes_ (not characters).
4660 *
4661 * When +object+ is a string,
4662 * returns the index of the first found substring equal to +object+:
4663 *
4664 * s = 'foo' # => "foo"
4665 * s.size # => 3 # Three 1-byte characters.
4666 * s.bytesize # => 3 # Three bytes.
4667 * s.byteindex('f') # => 0
4668 * s.byteindex('o') # => 1
4669 * s.byteindex('oo') # => 1
4670 * s.byteindex('ooo') # => nil
4671 *
4672 * When +object+ is a Regexp,
4673 * returns the index of the first found substring matching +object+;
4674 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4675 *
4676 * s = 'foo'
4677 * s.byteindex(/f/) # => 0
4678 * $~ # => #<MatchData "f">
4679 * s.byteindex(/o/) # => 1
4680 * s.byteindex(/oo/) # => 1
4681 * s.byteindex(/ooo/) # => nil
4682 * $~ # => nil
4683 *
4684 * \Integer argument +offset+, if given, specifies the 0-based index
4685 * of the byte where searching is to begin.
4686 *
4687 * When +offset+ is non-negative,
4688 * searching begins at byte position +offset+:
4689 *
4690 * s = 'foo'
4691 * s.byteindex('o', 1) # => 1
4692 * s.byteindex('o', 2) # => 2
4693 * s.byteindex('o', 3) # => nil
4694 *
4695 * When +offset+ is negative, counts backward from the end of +self+:
4696 *
4697 * s = 'foo'
4698 * s.byteindex('o', -1) # => 2
4699 * s.byteindex('o', -2) # => 1
4700 * s.byteindex('o', -3) # => 1
4701 * s.byteindex('o', -4) # => nil
4702 *
4703 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4704 *
4705 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4706 * s.size # => 2 # Two 3-byte characters.
4707 * s.bytesize # => 6 # Six bytes.
4708 * s.byteindex("\uFFFF") # => 0
4709 * s.byteindex("\uFFFF", 1) # Raises IndexError
4710 * s.byteindex("\uFFFF", 2) # Raises IndexError
4711 * s.byteindex("\uFFFF", 3) # => 3
4712 * s.byteindex("\uFFFF", 4) # Raises IndexError
4713 * s.byteindex("\uFFFF", 5) # Raises IndexError
4714 * s.byteindex("\uFFFF", 6) # => nil
4715 *
4716 * Related: see {Querying}[rdoc-ref:String@Querying].
4717 */
4718
4719static VALUE
4720rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4721{
4722 VALUE sub;
4723 VALUE initpos;
4724 long pos;
4725
4726 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4727 long slen = RSTRING_LEN(str);
4728 pos = NUM2LONG(initpos);
4729 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4730 if (RB_TYPE_P(sub, T_REGEXP)) {
4732 }
4733 return Qnil;
4734 }
4735 }
4736 else {
4737 pos = 0;
4738 }
4739
4740 str_ensure_byte_pos(str, pos);
4741
4742 if (RB_TYPE_P(sub, T_REGEXP)) {
4743 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4744 VALUE match = rb_backref_get();
4745 struct re_registers *regs = RMATCH_REGS(match);
4746 pos = BEG(0);
4747 return LONG2NUM(pos);
4748 }
4749 }
4750 else {
4751 StringValue(sub);
4752 pos = rb_str_byteindex(str, sub, pos);
4753 if (pos >= 0) return LONG2NUM(pos);
4754 }
4755 return Qnil;
4756}
4757
#ifndef HAVE_MEMRCHR
/* Fallback used when HAVE_MEMRCHR is not defined.  Scans the first
 * +search_len+ bytes of +search_str+ backwards and returns a pointer to the
 * last byte equal to +chr+, or a null pointer if there is none.  As with the
 * libc function, +chr+ is converted to unsigned char before comparing. */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        --ptr;
        if ((unsigned char)*ptr == wanted) return (void *)ptr;
    }

    return ((void *)0);
}
#endif
4770
4771static long
4772str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4773{
4774 char *hit, *adjusted;
4775 int c;
4776 long slen, searchlen;
4777 char *sbeg, *e, *t;
4778
4779 sbeg = RSTRING_PTR(str);
4780 slen = RSTRING_LEN(sub);
4781 if (slen == 0) return s - sbeg;
4782 e = RSTRING_END(str);
4783 t = RSTRING_PTR(sub);
4784 c = *t & 0xff;
4785 searchlen = s - sbeg + 1;
4786
4787 if (memcmp(s, t, slen) == 0) {
4788 return s - sbeg;
4789 }
4790
4791 do {
4792 hit = memrchr(sbeg, c, searchlen);
4793 if (!hit) break;
4794 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4795 if (hit != adjusted) {
4796 searchlen = adjusted - sbeg;
4797 continue;
4798 }
4799 if (memcmp(hit, t, slen) == 0)
4800 return hit - sbeg;
4801 searchlen = adjusted - sbeg;
4802 } while (searchlen > 0);
4803
4804 return -1;
4805}
4806
4807/* found index in byte */
4808static long
4809rb_str_rindex(VALUE str, VALUE sub, long pos)
4810{
4811 long len, slen;
4812 char *sbeg, *s;
4813 rb_encoding *enc;
4814 int singlebyte;
4815
4816 enc = rb_enc_check(str, sub);
4817 if (is_broken_string(sub)) return -1;
4818 singlebyte = single_byte_optimizable(str);
4819 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4820 slen = str_strlen(sub, enc); /* rb_enc_check */
4821
4822 /* substring longer than string */
4823 if (len < slen) return -1;
4824 if (len - pos < slen) pos = len - slen;
4825 if (len == 0) return pos;
4826
4827 sbeg = RSTRING_PTR(str);
4828
4829 if (pos == 0) {
4830 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4831 return 0;
4832 else
4833 return -1;
4834 }
4835
4836 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4837 return str_rindex(str, sub, s, enc);
4838}
4839
4840/*
4841 * call-seq:
4842 * rindex(pattern, offset = self.length) -> integer or nil
4843 *
4844 * :include:doc/string/rindex.rdoc
4845 *
4846 */
4847
4848static VALUE
4849rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4850{
4851 VALUE sub;
4852 VALUE initpos;
4853 rb_encoding *enc = STR_ENC_GET(str);
4854 long pos, len = str_strlen(str, enc); /* str's enc */
4855
4856 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4857 pos = NUM2LONG(initpos);
4858 if (pos < 0 && (pos += len) < 0) {
4859 if (RB_TYPE_P(sub, T_REGEXP)) {
4861 }
4862 return Qnil;
4863 }
4864 if (pos > len) pos = len;
4865 }
4866 else {
4867 pos = len;
4868 }
4869
4870 if (RB_TYPE_P(sub, T_REGEXP)) {
4871 /* enc = rb_enc_check(str, sub); */
4872 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4873 enc, single_byte_optimizable(str));
4874
4875 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4876 VALUE match = rb_backref_get();
4877 struct re_registers *regs = RMATCH_REGS(match);
4878 pos = rb_str_sublen(str, BEG(0));
4879 return LONG2NUM(pos);
4880 }
4881 }
4882 else {
4883 StringValue(sub);
4884 pos = rb_str_rindex(str, sub, pos);
4885 if (pos >= 0) {
4886 pos = rb_str_sublen(str, pos);
4887 return LONG2NUM(pos);
4888 }
4889 }
4890 return Qnil;
4891}
4892
4893static long
4894rb_str_byterindex(VALUE str, VALUE sub, long pos)
4895{
4896 long len, slen;
4897 char *sbeg, *s;
4898 rb_encoding *enc;
4899
4900 enc = rb_enc_check(str, sub);
4901 if (is_broken_string(sub)) return -1;
4902 len = RSTRING_LEN(str);
4903 slen = RSTRING_LEN(sub);
4904
4905 /* substring longer than string */
4906 if (len < slen) return -1;
4907 if (len - pos < slen) pos = len - slen;
4908 if (len == 0) return pos;
4909
4910 sbeg = RSTRING_PTR(str);
4911
4912 if (pos == 0) {
4913 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4914 return 0;
4915 else
4916 return -1;
4917 }
4918
4919 s = sbeg + pos;
4920 return str_rindex(str, sub, s, enc);
4921}
4922
4923/*
4924 * call-seq:
4925 * byterindex(object, offset = self.bytesize) -> integer or nil
4926 *
4927 * Returns the 0-based integer index of a substring of +self+
4928 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4929 * or +nil+ if there is no such substring;
4930 * the returned index is the count of _bytes_ (not characters).
4931 *
4932 * When +object+ is a string,
4933 * returns the index of the _last_ found substring equal to +object+:
4934 *
4935 * s = 'foo' # => "foo"
4936 * s.size # => 3 # Three 1-byte characters.
4937 * s.bytesize # => 3 # Three bytes.
4938 * s.byterindex('f') # => 0
4939 * s.byterindex('o') # => 2
4940 * s.byterindex('oo') # => 1
4941 * s.byterindex('ooo') # => nil
4942 *
4943 * When +object+ is a Regexp,
4944 * returns the index of the last found substring matching +object+;
4945 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4946 *
4947 * s = 'foo'
4948 * s.byterindex(/f/) # => 0
4949 * $~ # => #<MatchData "f">
4950 * s.byterindex(/o/) # => 2
4951 * s.byterindex(/oo/) # => 1
4952 * s.byterindex(/ooo/) # => nil
4953 * $~ # => nil
4954 *
4955 * The last match means starting at the possible last position,
4956 * not the last of the longest matches:
4957 *
4958 * s = 'foo'
4959 * s.byterindex(/o+/) # => 2
4960 * $~ #=> #<MatchData "o">
4961 *
4962 * To get the last longest match, use a negative lookbehind:
4963 *
4964 * s = 'foo'
4965 * s.byterindex(/(?<!o)o+/) # => 1
4966 * $~ # => #<MatchData "oo">
4967 *
4968 * Or use method #byteindex with negative lookahead:
4969 *
4970 * s = 'foo'
4971 * s.byteindex(/o+(?!.*o)/) # => 1
4972 * $~ #=> #<MatchData "oo">
4973 *
4974 * \Integer argument +offset+, if given, specifies the 0-based index
4975 * of the byte where searching is to end.
4976 *
4977 * When +offset+ is non-negative,
4978 * searching ends at byte position +offset+:
4979 *
4980 * s = 'foo'
4981 * s.byterindex('o', 0) # => nil
4982 * s.byterindex('o', 1) # => 1
4983 * s.byterindex('o', 2) # => 2
4984 * s.byterindex('o', 3) # => 2
4985 *
4986 * When +offset+ is negative, counts backward from the end of +self+:
4987 *
4988 * s = 'foo'
4989 * s.byterindex('o', -1) # => 2
4990 * s.byterindex('o', -2) # => 1
4991 * s.byterindex('o', -3) # => nil
4992 *
4993 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4994 *
4995 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4996 * s.size # => 2 # Two 3-byte characters.
4997 * s.bytesize # => 6 # Six bytes.
4998 * s.byterindex("\uFFFF") # => 3
4999 * s.byterindex("\uFFFF", 1) # Raises IndexError
5000 * s.byterindex("\uFFFF", 2) # Raises IndexError
5001 * s.byterindex("\uFFFF", 3) # => 3
5002 * s.byterindex("\uFFFF", 4) # Raises IndexError
5003 * s.byterindex("\uFFFF", 5) # Raises IndexError
 *    s.byterindex("\uFFFF", 6) # => 3
5005 *
5006 * Related: see {Querying}[rdoc-ref:String@Querying].
5007 */
5008
5009static VALUE
5010rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5011{
5012 VALUE sub;
5013 VALUE initpos;
5014 long pos, len = RSTRING_LEN(str);
5015
5016 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5017 pos = NUM2LONG(initpos);
5018 if (pos < 0 && (pos += len) < 0) {
5019 if (RB_TYPE_P(sub, T_REGEXP)) {
5021 }
5022 return Qnil;
5023 }
5024 if (pos > len) pos = len;
5025 }
5026 else {
5027 pos = len;
5028 }
5029
5030 str_ensure_byte_pos(str, pos);
5031
5032 if (RB_TYPE_P(sub, T_REGEXP)) {
5033 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5034 VALUE match = rb_backref_get();
5035 struct re_registers *regs = RMATCH_REGS(match);
5036 pos = BEG(0);
5037 return LONG2NUM(pos);
5038 }
5039 }
5040 else {
5041 StringValue(sub);
5042 pos = rb_str_byterindex(str, sub, pos);
5043 if (pos >= 0) return LONG2NUM(pos);
5044 }
5045 return Qnil;
5046}
5047
5048/*
5049 * call-seq:
5050 * self =~ other -> integer or nil
5051 *
5052 * When +other+ is a Regexp:
5053 *
5054 * - Returns the integer index (in characters) of the first match
5055 * for +self+ and +other+, or +nil+ if none;
5056 * - Updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables].
5057 *
5058 * Examples:
5059 *
5060 * 'foo' =~ /f/ # => 0
5061 * $~ # => #<MatchData "f">
5062 * 'foo' =~ /o/ # => 1
5063 * $~ # => #<MatchData "o">
5064 * 'foo' =~ /x/ # => nil
5065 * $~ # => nil
5066 *
5067 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5068 * (see Regexp#=~):
5069 *
5070 * number = nil
5071 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5072 * number # => nil # Not assigned.
5073 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5074 * number # => "9" # Assigned.
5075 *
5076 * When +other+ is not a Regexp, returns the value
5077 * returned by <tt>other =~ self</tt>.
5078 *
5079 * Related: see {Querying}[rdoc-ref:String@Querying].
5080 */
5081
5082static VALUE
5083rb_str_match(VALUE x, VALUE y)
5084{
5085 switch (OBJ_BUILTIN_TYPE(y)) {
5086 case T_STRING:
5087 rb_raise(rb_eTypeError, "type mismatch: String given");
5088
5089 case T_REGEXP:
5090 return rb_reg_match(y, x);
5091
5092 default:
5093 return rb_funcall(y, idEqTilde, 1, x);
5094 }
5095}
5096
5097
/* Forward declaration: get_pat() is defined further down in this file but
 * is already needed by rb_str_match_m() and rb_str_match_m_p(). */
static VALUE get_pat(VALUE);
5099
5100
5101/*
5102 * call-seq:
5103 * match(pattern, offset = 0) -> matchdata or nil
5104 * match(pattern, offset = 0) {|matchdata| ... } -> object
5105 *
5106 * Creates a MatchData object based on +self+ and the given arguments;
5107 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5108 *
5109 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5110 *
5111 * regexp = Regexp.new(pattern)
5112 *
5113 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5114 * (see Regexp#match):
5115 *
5116 * matchdata = regexp.match(self[offset..])
5117 *
5118 * With no block given, returns the computed +matchdata+ or +nil+:
5119 *
5120 * 'foo'.match('f') # => #<MatchData "f">
5121 * 'foo'.match('o') # => #<MatchData "o">
5122 * 'foo'.match('x') # => nil
5123 * 'foo'.match('f', 1) # => nil
5124 * 'foo'.match('o', 1) # => #<MatchData "o">
5125 *
5126 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5127 * returns the block's return value:
5128 *
5129 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5130 *
5131 * With a block given and +nil+ +matchdata+, does not call the block:
5132 *
5133 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5134 *
5135 * Related: see {Querying}[rdoc-ref:String@Querying].
5136 */
5137
5138static VALUE
5139rb_str_match_m(int argc, VALUE *argv, VALUE str)
5140{
5141 VALUE re, result;
5142 if (argc < 1)
5143 rb_check_arity(argc, 1, 2);
5144 re = argv[0];
5145 argv[0] = str;
5146 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5147 if (!NIL_P(result) && rb_block_given_p()) {
5148 return rb_yield(result);
5149 }
5150 return result;
5151}
5152
5153/*
5154 * call-seq:
5155 * match?(pattern, offset = 0) -> true or false
5156 *
5157 * Returns whether a match is found for +self+ and the given arguments;
5158 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5159 *
5160 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5161 *
5162 * regexp = Regexp.new(pattern)
5163 *
5164 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5165 * +false+ otherwise:
5166 *
5167 * 'foo'.match?(/o/) # => true
5168 * 'foo'.match?('o') # => true
5169 * 'foo'.match?(/x/) # => false
5170 * 'foo'.match?('f', 1) # => false
5171 * 'foo'.match?('o', 1) # => true
5172 *
5173 * Related: see {Querying}[rdoc-ref:String@Querying].
5174 */
5175
5176static VALUE
5177rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5178{
5179 VALUE re;
5180 rb_check_arity(argc, 1, 2);
5181 re = get_pat(argv[0]);
5182 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5183}
5184
/* Result of stepping one character to its successor or predecessor
 * (see enc_succ_char(), enc_pred_char() and enc_succ_alnum_char()). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,	/* no usable neighbor, e.g. the bytes are not a valid character */
    NEIGHBOR_FOUND,	/* the character was overwritten in place with its neighbor */
    NEIGHBOR_WRAPPED	/* ran off the end of the range of same-length characters */
};
5190
/*
 * Overwrites the +len+-byte character at +p+ with the next character of
 * the same byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND when +p+ now holds a valid character of +len+
 * bytes, NEIGHBOR_WRAPPED when no such character exists past the current
 * one, and NEIGHBOR_NOT_CHAR (wide encodings only) when the input or the
 * re-encoded result is not a valid character.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        /* decode, add one to the code point, and encode it back */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the successor needs a different number of bytes */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as one big-endian counter and keep
     * incrementing it until it spells a valid character of len bytes. */
    while (1) {
        /* trailing 0xff bytes roll over to 0 and carry to the left */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        /* every byte was 0xff: the counter overflowed */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognized: saturate the
                 * remaining bytes so the next increment carries into it */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* find the longest prefix that is not invalid and saturate the
             * bytes after it -- presumably to skip the whole run of
             * sequences sharing the invalid prefix in one step */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
5242
/*
 * Overwrites the +len+-byte character at +p+ with the previous character
 * of the same byte length in +enc+; the mirror image of enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND when +p+ now holds a valid character of +len+
 * bytes, NEIGHBOR_WRAPPED when no such character exists before the current
 * one, and NEIGHBOR_NOT_CHAR (wide encodings only) when the input is
 * invalid, is code point 0, or cannot be re-encoded.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        /* decode, subtract one from the code point, and encode it back */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        /* code point 0 has no predecessor */
        if (!c) return NEIGHBOR_NOT_CHAR;
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the predecessor needs a different number of bytes */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as one big-endian counter and keep
     * decrementing it until it spells a valid character of len bytes. */
    while (1) {
        /* trailing 0x00 bytes roll under to 0xff and borrow from the left */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        /* every byte was 0: the counter underflowed */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognized: zero the remaining
                 * bytes so the next decrement borrows from it */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* find the longest prefix that is not invalid and zero the
             * bytes after it -- presumably to skip the whole run of
             * sequences sharing the invalid prefix in one step */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
5295
5296/*
5297 overwrite +p+ by succeeding letter in +enc+ and returns
5298 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5299 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5300 assuming each ranges are successive, and mbclen
5301 never change in each ranges.
5302 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5303 character.
5304 */
5305static enum neighbor_char
5306enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5307{
5308 enum neighbor_char ret;
5309 unsigned int c;
5310 int ctype;
5311 int range;
5312 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5313
5314 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5315 int try;
5316 const int max_gaps = 1;
5317
5318 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5319 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5320 ctype = ONIGENC_CTYPE_DIGIT;
5321 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5322 ctype = ONIGENC_CTYPE_ALPHA;
5323 else
5324 return NEIGHBOR_NOT_CHAR;
5325
5326 MEMCPY(save, p, char, len);
5327 for (try = 0; try <= max_gaps; ++try) {
5328 ret = enc_succ_char(p, len, enc);
5329 if (ret == NEIGHBOR_FOUND) {
5330 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5331 if (rb_enc_isctype(c, ctype, enc))
5332 return NEIGHBOR_FOUND;
5333 }
5334 }
5335 MEMCPY(p, save, char, len);
5336 range = 1;
5337 while (1) {
5338 MEMCPY(save, p, char, len);
5339 ret = enc_pred_char(p, len, enc);
5340 if (ret == NEIGHBOR_FOUND) {
5341 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5342 if (!rb_enc_isctype(c, ctype, enc)) {
5343 MEMCPY(p, save, char, len);
5344 break;
5345 }
5346 }
5347 else {
5348 MEMCPY(p, save, char, len);
5349 break;
5350 }
5351 range++;
5352 }
5353 if (range == 1) {
5354 return NEIGHBOR_NOT_CHAR;
5355 }
5356
5357 if (ctype != ONIGENC_CTYPE_DIGIT) {
5358 MEMCPY(carry, p, char, len);
5359 return NEIGHBOR_WRAPPED;
5360 }
5361
5362 MEMCPY(carry, p, char, len);
5363 enc_succ_char(carry, len, enc);
5364 return NEIGHBOR_WRAPPED;
5365}
5366
5367
/* Forward declaration: str_succ() is defined right after rb_str_succ(),
 * which calls it. */
static VALUE str_succ(VALUE str);
5369
5370/*
5371 * call-seq:
5372 * succ -> new_str
5373 *
5374 * :include: doc/string/succ.rdoc
5375 *
5376 */
5377
5378VALUE
5380{
5381 VALUE str;
5382 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5383 rb_enc_cr_str_copy_for_substr(str, orig);
5384 return str_succ(str);
5385}
5386
5387static VALUE
5388str_succ(VALUE str)
5389{
5390 rb_encoding *enc;
5391 char *sbeg, *s, *e, *last_alnum = 0;
5392 int found_alnum = 0;
5393 long l, slen;
5394 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5395 long carry_pos = 0, carry_len = 1;
5396 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5397
5398 slen = RSTRING_LEN(str);
5399 if (slen == 0) return str;
5400
5401 enc = STR_ENC_GET(str);
5402 sbeg = RSTRING_PTR(str);
5403 s = e = sbeg + slen;
5404
5405 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5406 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5407 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5408 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5409 break;
5410 }
5411 }
5412 l = rb_enc_precise_mbclen(s, e, enc);
5413 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5414 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5415 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5416 switch (neighbor) {
5417 case NEIGHBOR_NOT_CHAR:
5418 continue;
5419 case NEIGHBOR_FOUND:
5420 return str;
5421 case NEIGHBOR_WRAPPED:
5422 last_alnum = s;
5423 break;
5424 }
5425 found_alnum = 1;
5426 carry_pos = s - sbeg;
5427 carry_len = l;
5428 }
5429 if (!found_alnum) { /* str contains no alnum */
5430 s = e;
5431 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5432 enum neighbor_char neighbor;
5433 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5434 l = rb_enc_precise_mbclen(s, e, enc);
5435 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5436 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5437 MEMCPY(tmp, s, char, l);
5438 neighbor = enc_succ_char(tmp, l, enc);
5439 switch (neighbor) {
5440 case NEIGHBOR_FOUND:
5441 MEMCPY(s, tmp, char, l);
5442 return str;
5443 break;
5444 case NEIGHBOR_WRAPPED:
5445 MEMCPY(s, tmp, char, l);
5446 break;
5447 case NEIGHBOR_NOT_CHAR:
5448 break;
5449 }
5450 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5451 /* wrapped to \0...\0. search next valid char. */
5452 enc_succ_char(s, l, enc);
5453 }
5454 if (!rb_enc_asciicompat(enc)) {
5455 MEMCPY(carry, s, char, l);
5456 carry_len = l;
5457 }
5458 carry_pos = s - sbeg;
5459 }
5461 }
5462 RESIZE_CAPA(str, slen + carry_len);
5463 sbeg = RSTRING_PTR(str);
5464 s = sbeg + carry_pos;
5465 memmove(s + carry_len, s, slen - carry_pos);
5466 memmove(s, carry, carry_len);
5467 slen += carry_len;
5468 STR_SET_LEN(str, slen);
5469 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5471 return str;
5472}
5473
5474
5475/*
5476 * call-seq:
5477 * succ! -> self
5478 *
5479 * Like String#succ, but modifies +self+ in place; returns +self+.
5480 *
5481 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5482 */
5483
5484static VALUE
5485rb_str_succ_bang(VALUE str)
5486{
5487 rb_str_modify(str);
5488 str_succ(str);
5489 return str;
5490}
5491
/* Returns 1 when each of the first +len+ bytes of +s+ is a decimal digit
 * (trivially 1 for len <= 0), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5501
5502static int
5503str_upto_i(VALUE str, VALUE arg)
5504{
5505 rb_yield(str);
5506 return 0;
5507}
5508
5509/*
5510 * call-seq:
5511 * upto(other_string, exclusive = false) {|string| ... } -> self
5512 * upto(other_string, exclusive = false) -> new_enumerator
5513 *
5514 * :include: doc/string/upto.rdoc
5515 *
5516 */
5517
5518static VALUE
5519rb_str_upto(int argc, VALUE *argv, VALUE beg)
5520{
5521 VALUE end, exclusive;
5522
5523 rb_scan_args(argc, argv, "11", &end, &exclusive);
5524 RETURN_ENUMERATOR(beg, argc, argv);
5525 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5526}
5527
/*
 * Calls +each+ with every string from +beg+ up to +end+ (up to but not
 * including +end+ when +excl+ is non-zero), passing +arg+ through as the
 * second argument.  Iteration stops early as soon as +each+ returns
 * non-zero.  Always returns +beg+.
 *
 * Three strategies are used, in this order:
 * - both edges are a single ASCII character: step the byte value;
 * - both edges are all ASCII digits: count numerically, keeping the width
 *   of +beg+ as the minimum number of digits;
 * - otherwise: repeatedly send #succ.
 */
VALUE
rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
{
    VALUE current, after_end;
    ID succ;
    int n, ascii;
    rb_encoding *enc;

    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];

        /* empty range */
        if (c > e || (excl && c == e)) return beg;
        for (;;) {
            VALUE str = rb_enc_str_new(&c, 1, enc);
            if ((*each)(str, arg)) break;
            /* inclusive: stop after yielding the end itself */
            if (!excl && c == e) break;
            c++;
            /* exclusive: stop before yielding the end */
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
        all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
        all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
        VALUE b, e;
        int width;

        /* the length of beg is used as the precision when formatting */
        width = RSTRING_LENINT(beg);
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* fast path: plain C longs */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();

            while (bi <= ei) {
                if (excl && bi == ei) break;
                if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
                bi++;
            }
        }
        else {
            /* at least one edge is not a Fixnum: compare and increment
             * through method calls */
            ID op = excl ? '<' : idLE;
            VALUE args[2], fmt = rb_fstring_lit("%.*d");

            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
                b = rb_funcallv(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* normal case */
    n = rb_str_cmp(beg, end);
    /* empty range */
    if (n > 0 || (excl && n == 0)) return beg;

    /* the loop is bounded by the successor of end */
    after_end = rb_funcallv(end, succ, 0, 0);
    current = str_duplicate(rb_cString, beg);
    while (!rb_str_equal(current, after_end)) {
        VALUE next = Qnil;
        /* the successor is computed before yielding; it is left nil once
         * an inclusive iteration has reached end */
        if (excl || !rb_str_equal(current, end))
            next = rb_funcallv(current, succ, 0, 0);
        if ((*each)(current, arg)) break;
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* give up once the value is longer than end or became empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }

    return beg;
}
5611
5612VALUE
5613rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5614{
5615 VALUE current;
5616 ID succ;
5617
5618 CONST_ID(succ, "succ");
5619 /* both edges are all digits */
5620 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5621 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5622 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5623 int width = RSTRING_LENINT(beg);
5624 b = rb_str_to_inum(beg, 10, FALSE);
5625 if (FIXNUM_P(b)) {
5626 long bi = FIX2LONG(b);
5627 rb_encoding *usascii = rb_usascii_encoding();
5628
5629 while (FIXABLE(bi)) {
5630 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5631 bi++;
5632 }
5633 b = LONG2NUM(bi);
5634 }
5635 args[0] = INT2FIX(width);
5636 while (1) {
5637 args[1] = b;
5638 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5639 b = rb_funcallv(b, succ, 0, 0);
5640 }
5641 }
5642 /* normal case */
5643 current = str_duplicate(rb_cString, beg);
5644 while (1) {
5645 VALUE next = rb_funcallv(current, succ, 0, 0);
5646 if ((*each)(current, arg)) break;
5647 current = next;
5648 StringValue(current);
5649 if (RSTRING_LEN(current) == 0)
5650 break;
5651 }
5652
5653 return beg;
5654}
5655
5656static int
5657include_range_i(VALUE str, VALUE arg)
5658{
5659 VALUE *argp = (VALUE *)arg;
5660 if (!rb_equal(str, *argp)) return 0;
5661 *argp = Qnil;
5662 return 1;
5663}
5664
5665VALUE
5666rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5667{
5668 beg = rb_str_new_frozen(beg);
5669 StringValue(end);
5670 end = rb_str_new_frozen(end);
5671 if (NIL_P(val)) return Qfalse;
5672 val = rb_check_string_type(val);
5673 if (NIL_P(val)) return Qfalse;
5674 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5675 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5676 rb_enc_asciicompat(STR_ENC_GET(val))) {
5677 const char *bp = RSTRING_PTR(beg);
5678 const char *ep = RSTRING_PTR(end);
5679 const char *vp = RSTRING_PTR(val);
5680 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5681 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5682 return Qfalse;
5683 else {
5684 char b = *bp;
5685 char e = *ep;
5686 char v = *vp;
5687
5688 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5689 if (b <= v && v < e) return Qtrue;
5690 return RBOOL(!RTEST(exclusive) && v == e);
5691 }
5692 }
5693 }
5694#if 0
5695 /* both edges are all digits */
5696 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5697 all_digits_p(bp, RSTRING_LEN(beg)) &&
5698 all_digits_p(ep, RSTRING_LEN(end))) {
5699 /* TODO */
5700 }
5701#endif
5702 }
5703 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5704
5705 return RBOOL(NIL_P(val));
5706}
5707
5708static VALUE
5709rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5710{
5711 if (rb_reg_search(re, str, 0, 0) >= 0) {
5712 VALUE match = rb_backref_get();
5713 int nth = rb_reg_backref_number(match, backref);
5714 return rb_reg_nth_match(nth, match);
5715 }
5716 return Qnil;
5717}
5718
5719static VALUE
5720rb_str_aref(VALUE str, VALUE indx)
5721{
5722 long idx;
5723
5724 if (FIXNUM_P(indx)) {
5725 idx = FIX2LONG(indx);
5726 }
5727 else if (RB_TYPE_P(indx, T_REGEXP)) {
5728 return rb_str_subpat(str, indx, INT2FIX(0));
5729 }
5730 else if (RB_TYPE_P(indx, T_STRING)) {
5731 if (rb_str_index(str, indx, 0) != -1)
5732 return str_duplicate(rb_cString, indx);
5733 return Qnil;
5734 }
5735 else {
5736 /* check if indx is Range */
5737 long beg, len = str_strlen(str, NULL);
5738 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5739 case Qfalse:
5740 break;
5741 case Qnil:
5742 return Qnil;
5743 default:
5744 return rb_str_substr(str, beg, len);
5745 }
5746 idx = NUM2LONG(indx);
5747 }
5748
5749 return str_substr(str, idx, 1, FALSE);
5750}
5751
5752
5753/*
5754 * call-seq:
5755 * self[offset] -> new_string or nil
5756 * self[offset, size] -> new_string or nil
5757 * self[range] -> new_string or nil
5758 * self[regexp, capture = 0] -> new_string or nil
5759 * self[substring] -> new_string or nil
5760 *
5761 * :include: doc/string/aref.rdoc
5762 *
5763 */
5764
5765static VALUE
5766rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5767{
5768 if (argc == 2) {
5769 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5770 return rb_str_subpat(str, argv[0], argv[1]);
5771 }
5772 else {
5773 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5774 }
5775 }
5776 rb_check_arity(argc, 1, 2);
5777 return rb_str_aref(str, argv[0]);
5778}
5779
5780VALUE
5782{
5783 char *ptr = RSTRING_PTR(str);
5784 long olen = RSTRING_LEN(str), nlen;
5785
5786 str_modifiable(str);
5787 if (len > olen) len = olen;
5788 nlen = olen - len;
5789 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5790 char *oldptr = ptr;
5791 size_t old_capa = RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5792 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5793 STR_SET_EMBED(str);
5794 ptr = RSTRING(str)->as.embed.ary;
5795 memmove(ptr, oldptr + len, nlen);
5796 if (fl == STR_NOEMBED) {
5797 SIZED_FREE_N(oldptr, old_capa);
5798 }
5799 }
5800 else {
5801 if (!STR_SHARED_P(str)) {
5802 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5803 rb_enc_cr_str_exact_copy(shared, str);
5804 OBJ_FREEZE(shared);
5805 }
5806 ptr = RSTRING(str)->as.heap.ptr += len;
5807 }
5808 STR_SET_LEN(str, nlen);
5809
5810 if (!SHARABLE_MIDDLE_SUBSTRING) {
5811 TERM_FILL(ptr + nlen, TERM_LEN(str));
5812 }
5814 return str;
5815}
5816
5817static void
5818rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5819{
5820 char *sptr;
5821 long slen;
5822 int cr;
5823
5824 if (beg == 0 && vlen == 0) {
5825 rb_str_drop_bytes(str, len);
5826 return;
5827 }
5828
5829 str_modify_keep_cr(str);
5830 RSTRING_GETMEM(str, sptr, slen);
5831 if (len < vlen) {
5832 /* expand string */
5833 RESIZE_CAPA(str, slen + vlen - len);
5834 sptr = RSTRING_PTR(str);
5835 }
5836
5838 cr = rb_enc_str_coderange(val);
5839 else
5841
5842 if (vlen != len) {
5843 memmove(sptr + beg + vlen,
5844 sptr + beg + len,
5845 slen - (beg + len));
5846 }
5847 if (vlen < beg && len < 0) {
5848 MEMZERO(sptr + slen, char, -len);
5849 }
5850 if (vlen > 0) {
5851 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5852 }
5853 slen += vlen - len;
5854 STR_SET_LEN(str, slen);
5855 TERM_FILL(&sptr[slen], TERM_LEN(str));
5856 ENC_CODERANGE_SET(str, cr);
5857}
5858
5859static inline void
5860rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5861{
5862 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5863}
5864
5865void
5866rb_str_update(VALUE str, long beg, long len, VALUE val)
5867{
5868 long slen;
5869 char *p, *e;
5870 rb_encoding *enc;
5871 int singlebyte = single_byte_optimizable(str);
5872 int cr;
5873
5874 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5875
5876 StringValue(val);
5877 enc = rb_enc_check(str, val);
5878 slen = str_strlen(str, enc); /* rb_enc_check */
5879
5880 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5881 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5882 }
5883 if (beg < 0) {
5884 beg += slen;
5885 }
5886 RUBY_ASSERT(beg >= 0);
5887 RUBY_ASSERT(beg <= slen);
5888
5889 if (len > slen - beg) {
5890 len = slen - beg;
5891 }
5892 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5893 if (!p) p = RSTRING_END(str);
5894 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5895 if (!e) e = RSTRING_END(str);
5896 /* error check */
5897 beg = p - RSTRING_PTR(str); /* physical position */
5898 len = e - p; /* physical length */
5899 rb_str_update_0(str, beg, len, val);
5900 rb_enc_associate(str, enc);
5902 if (cr != ENC_CODERANGE_BROKEN)
5903 ENC_CODERANGE_SET(str, cr);
5904}
5905
5906static void
5907rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5908{
5909 int nth;
5910 VALUE match;
5911 long start, end, len;
5912 rb_encoding *enc;
5913 struct re_registers *regs;
5914
5915 if (rb_reg_search(re, str, 0, 0) < 0) {
5916 rb_raise(rb_eIndexError, "regexp not matched");
5917 }
5918 match = rb_backref_get();
5919 nth = rb_reg_backref_number(match, backref);
5920 regs = RMATCH_REGS(match);
5921 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5922 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5923 }
5924 if (nth < 0) {
5925 nth += regs->num_regs;
5926 }
5927
5928 start = BEG(nth);
5929 if (start == -1) {
5930 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5931 }
5932 end = END(nth);
5933 len = end - start;
5934 StringValue(val);
5935 enc = rb_enc_check_str(str, val);
5936 rb_str_update_0(str, start, len, val);
5937 rb_enc_associate(str, enc);
5938}
5939
/* Two-argument form of String#[]=: dispatches on the kind of +indx+
 * (Regexp, String, Range, or integer) and always returns +val+. */
static VALUE
rb_str_aset(VALUE str, VALUE indx, VALUE val)
{
    long idx, beg;

    switch (TYPE(indx)) {
      case T_REGEXP:
        /* replace the whole match, i.e. capture 0 */
        rb_str_subpat_set(str, indx, INT2FIX(0), val);
        return val;

      case T_STRING:
        /* replace the first occurrence of the substring */
        beg = rb_str_index(str, indx, 0);
        if (beg < 0) {
            rb_raise(rb_eIndexError, "string not matched");
        }
        /* rb_str_update() takes character positions and lengths */
        beg = rb_str_sublen(str, beg);
        rb_str_update(str, beg, str_strlen(indx, NULL), val);
        return val;

      default:
        /* check if indx is Range */
        {
            long beg, len;
            if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
                rb_str_update(str, beg, len, val);
                return val;
            }
        }
        /* not a Range: treat it like an integer index */
        /* FALLTHROUGH */

      case T_FIXNUM:
        /* replace the single character at idx */
        idx = NUM2LONG(indx);
        rb_str_update(str, idx, 1, val);
        return val;
    }
}
5976
5977/*
5978 * call-seq:
5979 * self[index] = other_string -> new_string
5980 * self[start, length] = other_string -> new_string
5981 * self[range] = other_string -> new_string
5982 * self[regexp, capture = 0] = other_string -> new_string
5983 * self[substring] = other_string -> new_string
5984 *
5985 * :include: doc/string/aset.rdoc
5986 *
5987 */
5988
5989static VALUE
5990rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5991{
5992 if (argc == 3) {
5993 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5994 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5995 }
5996 else {
5997 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5998 }
5999 return argv[2];
6000 }
6001 rb_check_arity(argc, 2, 3);
6002 return rb_str_aset(str, argv[0], argv[1]);
6003}
6004
6005/*
6006 * call-seq:
6007 * insert(offset, other_string) -> self
6008 *
6009 * :include: doc/string/insert.rdoc
6010 *
6011 */
6012
6013static VALUE
6014rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6015{
6016 long pos = NUM2LONG(idx);
6017
6018 if (pos == -1) {
6019 return rb_str_append(str, str2);
6020 }
6021 else if (pos < 0) {
6022 pos++;
6023 }
6024 rb_str_update(str, pos, 0, str2);
6025 return str;
6026}
6027
6028
6029/*
6030 * call-seq:
6031 * slice!(index) -> new_string or nil
6032 * slice!(start, length) -> new_string or nil
6033 * slice!(range) -> new_string or nil
6034 * slice!(regexp, capture = 0) -> new_string or nil
6035 * slice!(substring) -> new_string or nil
6036 *
6037 * Like String#[] (and its alias String#slice), except that:
6038 *
6039 * - Performs substitutions in +self+ (not in a copy of +self+).
6040 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6041 *
6042 * A few examples:
6043 *
6044 * s = 'hello'
6045 * s.slice!('e') # => "e"
6046 * s # => "hllo"
6047 * s.slice!('e') # => nil
6048 * s # => "hllo"
6049 *
6050 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6051 */
6052
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;
    char *p;

    rb_check_arity(argc, 1, 2);
    /* done up front, before it is known whether anything gets removed */
    str_modify_keep_cr(str);
    indx = argv[0];
    /*
     * Each branch below determines the region to remove and jumps to one
     * of three labels:
     *   num_index - beg/len still have to be resolved by rb_str_subpos()
     *   subseq    - beg/len are byte offsets; result is not built yet
     *   squash    - result is already set; only the removal is left
     */
    if (RB_TYPE_P(indx, T_REGEXP)) {
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        /* nth stays 0 (the whole match) unless a capture was given; a
         * negative capture index counts from the last group */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        /* match registers hold byte offsets */
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* (start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* single index: len is still 1 here */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* substring: the returned string is a copy of the argument */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: handle like a single integer index */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* rb_str_subpos() yields the start pointer and updates len; from here
     * on both are used as byte quantities */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* copy the bytes that are about to be removed */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            /* removing a prefix */
            rb_str_drop_bytes(str, len);
        }
        else {
            /* close the gap by moving the tail down */
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6139
6140static VALUE
6141get_pat(VALUE pat)
6142{
6143 VALUE val;
6144
6145 switch (OBJ_BUILTIN_TYPE(pat)) {
6146 case T_REGEXP:
6147 return pat;
6148
6149 case T_STRING:
6150 break;
6151
6152 default:
6153 val = rb_check_string_type(pat);
6154 if (NIL_P(val)) {
6155 Check_Type(pat, T_REGEXP);
6156 }
6157 pat = val;
6158 }
6159
6160 return rb_reg_regcomp(pat);
6161}
6162
6163static VALUE
6164get_pat_quoted(VALUE pat, int check)
6165{
6166 VALUE val;
6167
6168 switch (OBJ_BUILTIN_TYPE(pat)) {
6169 case T_REGEXP:
6170 return pat;
6171
6172 case T_STRING:
6173 break;
6174
6175 default:
6176 val = rb_check_string_type(pat);
6177 if (NIL_P(val)) {
6178 Check_Type(pat, T_REGEXP);
6179 }
6180 pat = val;
6181 }
6182 if (check && is_broken_string(pat)) {
6183 rb_exc_raise(rb_reg_check_preprocess(pat));
6184 }
6185 return pat;
6186}
6187
6188static long
6189rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6190{
6191 if (BUILTIN_TYPE(pat) == T_STRING) {
6192 pos = rb_str_byteindex(str, pat, pos);
6193 if (set_backref_str) {
6194 if (pos >= 0) {
6195 str = rb_str_new_frozen_String(str);
6196 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6197 if (match) {
6198 *match = match_data;
6199 }
6200 }
6201 else {
6203 }
6204 }
6205 return pos;
6206 }
6207 else {
6208 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6209 }
6210}
6211
6212static long
6213rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6214{
6215 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6216}
6217
6218
6219/*
6220 * call-seq:
6221 * sub!(pattern, replacement) -> self or nil
6222 * sub!(pattern) {|match| ... } -> self or nil
6223 *
6224 * Like String#sub, except that:
6225 *
6226 * - Changes are made to +self+, not to a copy of +self+.
6227 * - Returns +self+ if any changes are made, +nil+ otherwise.
6228 *
6229 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6230 */
6231
6232static VALUE
6233rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6234{
6235 VALUE pat, repl, hash = Qnil;
6236 int iter = 0;
6237 long plen;
6238 int min_arity = rb_block_given_p() ? 1 : 2;
6239 long beg;
6240
6241 rb_check_arity(argc, min_arity, 2);
6242 if (argc == 1) {
6243 iter = 1;
6244 }
6245 else {
6246 repl = argv[1];
6247 if (!RB_TYPE_P(repl, T_STRING)) {
6248 hash = rb_check_hash_type(repl);
6249 if (NIL_P(hash)) {
6250 StringValue(repl);
6251 }
6252 }
6253 }
6254
6255 pat = get_pat_quoted(argv[0], 1);
6256
6257 str_modifiable(str);
6258 beg = rb_pat_search(pat, str, 0, 1);
6259 if (beg >= 0) {
6260 rb_encoding *enc;
6261 int cr = ENC_CODERANGE(str);
6262 long beg0, end0;
6263 VALUE match, match0 = Qnil;
6264 struct re_registers *regs;
6265 char *p, *rp;
6266 long len, rlen;
6267
6268 match = rb_backref_get();
6269 regs = RMATCH_REGS(match);
6270 if (RB_TYPE_P(pat, T_STRING)) {
6271 beg0 = beg;
6272 end0 = beg0 + RSTRING_LEN(pat);
6273 match0 = pat;
6274 }
6275 else {
6276 beg0 = BEG(0);
6277 end0 = END(0);
6278 if (iter) match0 = rb_reg_nth_match(0, match);
6279 }
6280
6281 if (iter || !NIL_P(hash)) {
6282 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6283
6284 if (iter) {
6285 repl = rb_obj_as_string(rb_yield(match0));
6286 }
6287 else {
6288 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6289 repl = rb_obj_as_string(repl);
6290 }
6291 str_mod_check(str, p, len);
6292 rb_check_frozen(str);
6293 }
6294 else {
6295 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6296 }
6297
6298 enc = rb_enc_compatible(str, repl);
6299 if (!enc) {
6300 rb_encoding *str_enc = STR_ENC_GET(str);
6301 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6302 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6303 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6304 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6305 rb_enc_inspect_name(str_enc),
6306 rb_enc_inspect_name(STR_ENC_GET(repl)));
6307 }
6308 enc = STR_ENC_GET(repl);
6309 }
6310 rb_str_modify(str);
6311 rb_enc_associate(str, enc);
6313 int cr2 = ENC_CODERANGE(repl);
6314 if (cr2 == ENC_CODERANGE_BROKEN ||
6315 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6317 else
6318 cr = cr2;
6319 }
6320 plen = end0 - beg0;
6321 rlen = RSTRING_LEN(repl);
6322 len = RSTRING_LEN(str);
6323 if (rlen > plen) {
6324 RESIZE_CAPA(str, len + rlen - plen);
6325 }
6326 p = RSTRING_PTR(str);
6327 if (rlen != plen) {
6328 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6329 }
6330 rp = RSTRING_PTR(repl);
6331 memmove(p + beg0, rp, rlen);
6332 len += rlen - plen;
6333 STR_SET_LEN(str, len);
6334 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6335 ENC_CODERANGE_SET(str, cr);
6336
6337 RB_GC_GUARD(match);
6338
6339 return str;
6340 }
6341 return Qnil;
6342}
6343
6344
6345/*
6346 * call-seq:
6347 * sub(pattern, replacement) -> new_string
6348 * sub(pattern) {|match| ... } -> new_string
6349 *
6350 * :include: doc/string/sub.rdoc
6351 */
6352
6353static VALUE
6354rb_str_sub(int argc, VALUE *argv, VALUE str)
6355{
6356 str = str_duplicate(rb_cString, str);
6357 rb_str_sub_bang(argc, argv, str);
6358 return str;
6359}
6360
6361static VALUE
6362str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6363{
6364 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6365 long beg, beg0, end0;
6366 long offset, blen, slen, len, last;
6367 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6368 char *sp, *cp;
6369 int need_backref_str = -1;
6370 rb_encoding *str_enc;
6371
6372 switch (argc) {
6373 case 1:
6374 RETURN_ENUMERATOR(str, argc, argv);
6375 mode = ITER;
6376 break;
6377 case 2:
6378 repl = argv[1];
6379 if (!RB_TYPE_P(repl, T_STRING)) {
6380 hash = rb_check_hash_type(repl);
6381 if (NIL_P(hash)) {
6382 StringValue(repl);
6383 }
6384 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6385 mode = FAST_MAP;
6386 }
6387 else {
6388 mode = MAP;
6389 }
6390 }
6391 break;
6392 default:
6393 rb_error_arity(argc, 1, 2);
6394 }
6395
6396 pat = get_pat_quoted(argv[0], 1);
6397 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6398
6399 if (beg < 0) {
6400 if (bang) return Qnil; /* no match, no substitution */
6401 return str_duplicate(rb_cString, str);
6402 }
6403
6404 offset = 0;
6405 blen = RSTRING_LEN(str) + 30; /* len + margin */
6406 dest = rb_str_buf_new(blen);
6407 sp = RSTRING_PTR(str);
6408 slen = RSTRING_LEN(str);
6409 cp = sp;
6410 str_enc = STR_ENC_GET(str);
6411 rb_enc_associate(dest, str_enc);
6412 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6413
6414 do {
6415 struct re_registers *regs = RMATCH_REGS(match);
6416 if (RB_TYPE_P(pat, T_STRING)) {
6417 beg0 = beg;
6418 end0 = beg0 + RSTRING_LEN(pat);
6419 match0 = pat;
6420 }
6421 else {
6422 beg0 = BEG(0);
6423 end0 = END(0);
6424 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6425 }
6426
6427 if (mode != STR) {
6428 if (mode == ITER) {
6429 val = rb_obj_as_string(rb_yield(match0));
6430 }
6431 else {
6432 struct RString fake_str = {RBASIC_INIT};
6433 VALUE key;
6434 if (mode == FAST_MAP) {
6435 // It is safe to use a fake_str here because we established that it won't escape,
6436 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6437 // default proc.
6438 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6439 }
6440 else {
6441 key = rb_str_subseq(str, beg0, end0 - beg0);
6442 }
6443 val = rb_hash_aref(hash, key);
6444 val = rb_obj_as_string(val);
6445 }
6446 str_mod_check(str, sp, slen);
6447 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6448 rb_raise(rb_eRuntimeError, "block should not cheat");
6449 }
6450 }
6451 else if (need_backref_str) {
6452 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6453 if (need_backref_str < 0) {
6454 need_backref_str = val != repl;
6455 }
6456 }
6457 else {
6458 val = repl;
6459 }
6460
6461 len = beg0 - offset; /* copy pre-match substr */
6462 if (len) {
6463 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6464 }
6465
6466 rb_str_buf_append(dest, val);
6467
6468 last = offset;
6469 offset = end0;
6470 if (beg0 == end0) {
6471 /*
6472 * Always consume at least one character of the input string
6473 * in order to prevent infinite loops.
6474 */
6475 if (RSTRING_LEN(str) <= end0) break;
6476 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6477 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6478 offset = end0 + len;
6479 }
6480 cp = RSTRING_PTR(str) + offset;
6481 if (offset > RSTRING_LEN(str)) break;
6482
6483 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6484 if (mode != FAST_MAP && mode != STR) {
6485 match = Qnil;
6486 }
6487 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6488
6489 RB_GC_GUARD(match);
6490 } while (beg >= 0);
6491
6492 if (RSTRING_LEN(str) > offset) {
6493 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6494 }
6495 rb_pat_search0(pat, str, last, 1, &match);
6496 if (bang) {
6497 str_shared_replace(str, dest);
6498 }
6499 else {
6500 str = dest;
6501 }
6502
6503 return str;
6504}
6505
6506
6507/*
6508 * call-seq:
6509 * gsub!(pattern, replacement) -> self or nil
6510 * gsub!(pattern) {|match| ... } -> self or nil
6511 * gsub!(pattern) -> an_enumerator
6512 *
6513 * Like String#gsub, except that:
6514 *
6515 * - Performs substitutions in +self+ (not in a copy of +self+).
6516 * - Returns +self+ if any substitutions are made, +nil+ otherwise.
6517 *
6518 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6519 */
6520
6521static VALUE
6522rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6523{
6524 str_modify_keep_cr(str);
6525 return str_gsub(argc, argv, str, 1);
6526}
6527
6528
6529/*
6530 * call-seq:
6531 * gsub(pattern, replacement) -> new_string
6532 * gsub(pattern) {|match| ... } -> new_string
6533 * gsub(pattern) -> enumerator
6534 *
6535 * Returns a copy of +self+ with zero or more substrings replaced.
6536 *
6537 * Argument +pattern+ may be a string or a Regexp;
6538 * argument +replacement+ may be a string or a Hash.
6539 * Varying types for the argument values makes this method very versatile.
6540 *
6541 * Below are some simple examples;
6542 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6543 *
6544 * With arguments +pattern+ and string +replacement+ given,
6545 * replaces each matching substring with the given +replacement+ string:
6546 *
6547 * s = 'abracadabra'
6548 * s.gsub('ab', 'AB') # => "ABracadABra"
6549 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6550 *
6551 * With arguments +pattern+ and hash +replacement+ given,
6552 * replaces each matching substring with a value from the given +replacement+ hash,
6553 * or removes it:
6554 *
6555 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6556 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6557 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6558 *
6559 * With argument +pattern+ and a block given,
6560 * calls the block with each matching substring;
6561 * replaces that substring with the block's return value:
6562 *
6563 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6564 * # => "ABrACADABrA"
6565 *
6566 * With argument +pattern+ and no block given,
6567 * returns a new Enumerator.
6568 *
6569 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6570 */
6571
6572static VALUE
6573rb_str_gsub(int argc, VALUE *argv, VALUE str)
6574{
6575 return str_gsub(argc, argv, str, 0);
6576}
6577
6578
6579/*
6580 * call-seq:
6581 * replace(other_string) -> self
6582 *
6583 * Replaces the contents of +self+ with the contents of +other_string+;
6584 * returns +self+:
6585 *
6586 * s = 'foo' # => "foo"
6587 * s.replace('bar') # => "bar"
6588 *
6589 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6590 */
6591
6592VALUE
6594{
6595 str_modifiable(str);
6596 if (str == str2) return str;
6597
6598 StringValue(str2);
6599 str_discard(str);
6600 return str_replace(str, str2);
6601}
6602
6603/*
6604 * call-seq:
6605 * clear -> self
6606 *
6607 * Removes the contents of +self+:
6608 *
6609 * s = 'foo'
6610 * s.clear # => ""
6611 * s # => ""
6612 *
6613 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6614 */
6615
6616static VALUE
6617rb_str_clear(VALUE str)
6618{
6619 str_discard(str);
6620 STR_SET_EMBED(str);
6621 STR_SET_LEN(str, 0);
6622 RSTRING_PTR(str)[0] = 0;
6623 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6625 else
6627 return str;
6628}
6629
6630/*
6631 * call-seq:
6632 * chr -> string
6633 *
6634 * :include: doc/string/chr.rdoc
6635 *
6636 */
6637
6638static VALUE
6639rb_str_chr(VALUE str)
6640{
6641 return rb_str_substr(str, 0, 1);
6642}
6643
6644/*
6645 * call-seq:
6646 * getbyte(index) -> integer or nil
6647 *
6648 * :include: doc/string/getbyte.rdoc
6649 *
6650 */
6651VALUE
6652rb_str_getbyte(VALUE str, VALUE index)
6653{
6654 long pos = NUM2LONG(index);
6655
6656 if (pos < 0)
6657 pos += RSTRING_LEN(str);
6658 if (pos < 0 || RSTRING_LEN(str) <= pos)
6659 return Qnil;
6660
6661 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6662}
6663
6664/*
6665 * call-seq:
6666 * setbyte(index, integer) -> integer
6667 *
6668 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6669 * returns +integer+:
6670 *
6671 * s = 'xyzzy'
6672 * s.setbyte(2, 129) # => 129
6673 * s # => "xy\x81zy"
6674 *
6675 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6676 */
6677VALUE
6678rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6679{
6680 long pos = NUM2LONG(index);
6681 long len = RSTRING_LEN(str);
6682 char *ptr, *head, *left = 0;
6683 rb_encoding *enc;
6684 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6685
6686 if (pos < -len || len <= pos)
6687 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6688 if (pos < 0)
6689 pos += len;
6690
6691 VALUE v = rb_to_int(value);
6692 VALUE w = rb_int_and(v, INT2FIX(0xff));
6693 char byte = (char)(NUM2INT(w) & 0xFF);
6694
6695 if (!str_independent(str))
6696 str_make_independent(str);
6697 enc = STR_ENC_GET(str);
6698 head = RSTRING_PTR(str);
6699 ptr = &head[pos];
6700 if (!STR_EMBED_P(str)) {
6701 cr = ENC_CODERANGE(str);
6702 switch (cr) {
6703 case ENC_CODERANGE_7BIT:
6704 left = ptr;
6705 *ptr = byte;
6706 if (ISASCII(byte)) goto end;
6707 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6708 if (!MBCLEN_CHARFOUND_P(nlen))
6710 else
6712 goto end;
6714 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6715 width = rb_enc_precise_mbclen(left, head+len, enc);
6716 *ptr = byte;
6717 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6718 if (!MBCLEN_CHARFOUND_P(nlen))
6720 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6722 goto end;
6723 }
6724 }
6726 *ptr = byte;
6727
6728 end:
6729 return value;
6730}
6731
6732static VALUE
6733str_byte_substr(VALUE str, long beg, long len, int empty)
6734{
6735 long n = RSTRING_LEN(str);
6736
6737 if (beg > n || len < 0) return Qnil;
6738 if (beg < 0) {
6739 beg += n;
6740 if (beg < 0) return Qnil;
6741 }
6742 if (len > n - beg)
6743 len = n - beg;
6744 if (len <= 0) {
6745 if (!empty) return Qnil;
6746 len = 0;
6747 }
6748
6749 VALUE str2 = str_subseq(str, beg, len);
6750
6751 str_enc_copy_direct(str2, str);
6752
6753 if (RSTRING_LEN(str2) == 0) {
6754 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6756 else
6758 }
6759 else {
6760 switch (ENC_CODERANGE(str)) {
6761 case ENC_CODERANGE_7BIT:
6763 break;
6764 default:
6766 break;
6767 }
6768 }
6769
6770 return str2;
6771}
6772
6773VALUE
6774rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6775{
6776 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6777}
6778
6779static VALUE
6780str_byte_aref(VALUE str, VALUE indx)
6781{
6782 long idx;
6783 if (FIXNUM_P(indx)) {
6784 idx = FIX2LONG(indx);
6785 }
6786 else {
6787 /* check if indx is Range */
6788 long beg, len = RSTRING_LEN(str);
6789
6790 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6791 case Qfalse:
6792 break;
6793 case Qnil:
6794 return Qnil;
6795 default:
6796 return str_byte_substr(str, beg, len, TRUE);
6797 }
6798
6799 idx = NUM2LONG(indx);
6800 }
6801 return str_byte_substr(str, idx, 1, FALSE);
6802}
6803
6804/*
6805 * call-seq:
6806 * byteslice(offset, length = 1) -> string or nil
6807 * byteslice(range) -> string or nil
6808 *
6809 * :include: doc/string/byteslice.rdoc
6810 */
6811
6812static VALUE
6813rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6814{
6815 if (argc == 2) {
6816 long beg = NUM2LONG(argv[0]);
6817 long len = NUM2LONG(argv[1]);
6818 return str_byte_substr(str, beg, len, TRUE);
6819 }
6820 rb_check_arity(argc, 1, 2);
6821 return str_byte_aref(str, argv[0]);
6822}
6823
6824static void
6825str_check_beg_len(VALUE str, long *beg, long *len)
6826{
6827 long end, slen = RSTRING_LEN(str);
6828
6829 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6830 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6831 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6832 }
6833 if (*beg < 0) {
6834 *beg += slen;
6835 }
6836 RUBY_ASSERT(*beg >= 0);
6837 RUBY_ASSERT(*beg <= slen);
6838
6839 if (*len > slen - *beg) {
6840 *len = slen - *beg;
6841 }
6842 end = *beg + *len;
6843 str_ensure_byte_pos(str, *beg);
6844 str_ensure_byte_pos(str, end);
6845}
6846
6847/*
6848 * call-seq:
6849 * bytesplice(offset, length, str) -> self
6850 * bytesplice(offset, length, str, str_offset, str_length) -> self
6851 * bytesplice(range, str) -> self
6852 * bytesplice(range, str, str_range) -> self
6853 *
6854 * :include: doc/string/bytesplice.rdoc
6855 */
6856
6857static VALUE
6858rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6859{
6860 long beg, len, vbeg, vlen;
6861 VALUE val;
6862 int cr;
6863
6864 rb_check_arity(argc, 2, 5);
6865 if (!(argc == 2 || argc == 3 || argc == 5)) {
6866 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6867 }
6868 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6869 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6870 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6871 rb_builtin_class_name(argv[0]));
6872 }
6873 val = argv[1];
6874 StringValue(val);
6875 if (argc == 2) {
6876 /* bytesplice(range, str) */
6877 vbeg = 0;
6878 vlen = RSTRING_LEN(val);
6879 }
6880 else {
6881 /* bytesplice(range, str, str_range) */
6882 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6883 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6884 rb_builtin_class_name(argv[2]));
6885 }
6886 }
6887 }
6888 else {
6889 beg = NUM2LONG(argv[0]);
6890 len = NUM2LONG(argv[1]);
6891 val = argv[2];
6892 StringValue(val);
6893 if (argc == 3) {
6894 /* bytesplice(index, length, str) */
6895 vbeg = 0;
6896 vlen = RSTRING_LEN(val);
6897 }
6898 else {
6899 /* bytesplice(index, length, str, str_index, str_length) */
6900 vbeg = NUM2LONG(argv[3]);
6901 vlen = NUM2LONG(argv[4]);
6902 }
6903 }
6904 str_check_beg_len(str, &beg, &len);
6905 str_check_beg_len(val, &vbeg, &vlen);
6906 str_modify_keep_cr(str);
6907
6908 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6909 rb_enc_associate(str, rb_enc_check(str, val));
6910 }
6911
6912 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6914 if (cr != ENC_CODERANGE_BROKEN)
6915 ENC_CODERANGE_SET(str, cr);
6916 return str;
6917}
6918
6919/*
6920 * call-seq:
6921 * reverse -> new_string
6922 *
6923 * Returns a new string with the characters from +self+ in reverse order.
6924 *
6925 * 'drawer'.reverse # => "reward"
6926 * 'reviled'.reverse # => "deliver"
6927 * 'stressed'.reverse # => "desserts"
6928 * 'semordnilaps'.reverse # => "spalindromes"
6929 *
6930 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6931 */
6932
6933static VALUE
6934rb_str_reverse(VALUE str)
6935{
6936 rb_encoding *enc;
6937 VALUE rev;
6938 char *s, *e, *p;
6939 int cr;
6940
6941 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6942 enc = STR_ENC_GET(str);
6943 rev = rb_str_new(0, RSTRING_LEN(str));
6944 s = RSTRING_PTR(str); e = RSTRING_END(str);
6945 p = RSTRING_END(rev);
6946 cr = ENC_CODERANGE(str);
6947
6948 if (RSTRING_LEN(str) > 1) {
6949 if (single_byte_optimizable(str)) {
6950 while (s < e) {
6951 *--p = *s++;
6952 }
6953 }
6954 else if (cr == ENC_CODERANGE_VALID) {
6955 while (s < e) {
6956 int clen = rb_enc_fast_mbclen(s, e, enc);
6957
6958 p -= clen;
6959 memcpy(p, s, clen);
6960 s += clen;
6961 }
6962 }
6963 else {
6964 cr = rb_enc_asciicompat(enc) ?
6966 while (s < e) {
6967 int clen = rb_enc_mbclen(s, e, enc);
6968
6969 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6970 p -= clen;
6971 memcpy(p, s, clen);
6972 s += clen;
6973 }
6974 }
6975 }
6976 STR_SET_LEN(rev, RSTRING_LEN(str));
6977 str_enc_copy_direct(rev, str);
6978 ENC_CODERANGE_SET(rev, cr);
6979
6980 return rev;
6981}
6982
6983
6984/*
6985 * call-seq:
6986 * reverse! -> self
6987 *
6988 * Returns +self+ with its characters reversed:
6989 *
6990 * 'drawer'.reverse! # => "reward"
6991 * 'reviled'.reverse! # => "deliver"
6992 * 'stressed'.reverse! # => "desserts"
6993 * 'semordnilaps'.reverse! # => "spalindromes"
6994 *
6995 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6996 */
6997
6998static VALUE
6999rb_str_reverse_bang(VALUE str)
7000{
7001 if (RSTRING_LEN(str) > 1) {
7002 if (single_byte_optimizable(str)) {
7003 char *s, *e, c;
7004
7005 str_modify_keep_cr(str);
7006 s = RSTRING_PTR(str);
7007 e = RSTRING_END(str) - 1;
7008 while (s < e) {
7009 c = *s;
7010 *s++ = *e;
7011 *e-- = c;
7012 }
7013 }
7014 else {
7015 str_shared_replace(str, rb_str_reverse(str));
7016 }
7017 }
7018 else {
7019 str_modify_keep_cr(str);
7020 }
7021 return str;
7022}
7023
7024
7025/*
7026 * call-seq:
7027 * include?(other_string) -> true or false
7028 *
7029 * Returns whether +self+ contains +other_string+:
7030 *
7031 * s = 'bar'
7032 * s.include?('ba') # => true
7033 * s.include?('ar') # => true
7034 * s.include?('bar') # => true
7035 * s.include?('a') # => true
7036 * s.include?('') # => true
7037 * s.include?('foo') # => false
7038 *
7039 * Related: see {Querying}[rdoc-ref:String@Querying].
7040 */
7041
7042VALUE
7043rb_str_include(VALUE str, VALUE arg)
7044{
7045 long i;
7046
7047 StringValue(arg);
7048 i = rb_str_index(str, arg, 0);
7049
7050 return RBOOL(i != -1);
7051}
7052
7053
7054/*
7055 * call-seq:
7056 * to_i(base = 10) -> integer
7057 *
7058 * Returns the result of interpreting leading characters in +self+
7059 * as an integer in the given +base+;
7060 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7061 *
7062 * '123456'.to_i # => 123456
7063 * '123def'.to_i(16) # => 1195503
7064 *
7065 * With +base+ zero given, +self+ may contain leading characters
7066 * to specify the actual base:
7067 *
7068 * '123def'.to_i(0) # => 123
7069 * '0123def'.to_i(0) # => 83
7070 * '0b123def'.to_i(0) # => 1
7071 * '0o123def'.to_i(0) # => 83
7072 * '0d123def'.to_i(0) # => 123
7073 * '0x123def'.to_i(0) # => 1195503
7074 *
7075 * Characters past a leading valid number (in the given +base+) are ignored:
7076 *
7077 * '12.345'.to_i # => 12
7078 * '12345'.to_i(2) # => 1
7079 *
7080 * Returns zero if there is no leading valid number:
7081 *
7082 * 'abcdef'.to_i # => 0
7083 * '2'.to_i(2) # => 0
7084 *
7085 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7086 */
7087
7088static VALUE
7089rb_str_to_i(int argc, VALUE *argv, VALUE str)
7090{
7091 int base = 10;
7092
7093 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7094 rb_raise(rb_eArgError, "invalid radix %d", base);
7095 }
7096 return rb_str_to_inum(str, base, FALSE);
7097}
7098
7099
7100/*
7101 * call-seq:
7102 * to_f -> float
7103 *
7104 * Returns the result of interpreting leading characters in +self+ as a Float:
7105 *
7106 * '3.14159'.to_f # => 3.14159
7107 * '1.234e-2'.to_f # => 0.01234
7108 *
7109 * Characters past a leading valid number are ignored:
7110 *
7111 * '3.14 (pi to two places)'.to_f # => 3.14
7112 *
7113 * Returns zero if there is no leading valid number:
7114 *
7115 * 'abcdef'.to_f # => 0.0
7116 *
7117 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7118 */
7119
7120static VALUE
7121rb_str_to_f(VALUE str)
7122{
7123 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7124}
7125
7126
7127/*
7128 * call-seq:
7129 * to_s -> self or new_string
7130 *
7131 * Returns +self+ if +self+ is a +String+,
7132 * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7133 *
7134 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7135 */
7136
7137static VALUE
7138rb_str_to_s(VALUE str)
7139{
7140 if (rb_obj_class(str) != rb_cString) {
7141 return str_duplicate(rb_cString, str);
7142 }
7143 return str;
7144}
7145
#if 0
/* Disabled helper: encodes code point c in enc and appends it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7157
/* Size of the longest escape, "\x{" + 8 hex digits of a 32-bit value + "}", including the NUL. */
#define CHAR_ESC_LEN 13
7159
7160int
7161rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7162{
7163 char buf[CHAR_ESC_LEN + 1];
7164 int l;
7165
7166#if SIZEOF_INT > 4
7167 c &= 0xffffffff;
7168#endif
7169 if (unicode_p) {
7170 if (c < 0x7F && ISPRINT(c)) {
7171 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7172 }
7173 else if (c < 0x10000) {
7174 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7175 }
7176 else {
7177 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7178 }
7179 }
7180 else {
7181 if (c < 0x100) {
7182 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7183 }
7184 else {
7185 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7186 }
7187 }
7188 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7189 rb_str_buf_cat(result, buf, l);
7190 return l;
7191}
7192
/*
 * Returns the backslash notation for the control characters listed below,
 * or NULL when c has no such notation.
 */
const char *
ruby_escaped_char(int c)
{
    const char *escaped = NULL;

    switch (c) {
      case '\0':   escaped = "\\0";  break;
      case '\n':   escaped = "\\n";  break;
      case '\r':   escaped = "\\r";  break;
      case '\t':   escaped = "\\t";  break;
      case '\f':   escaped = "\\f";  break;
      case '\013': escaped = "\\v";  break;
      case '\010': escaped = "\\b";  break;
      case '\007': escaped = "\\a";  break;
      case '\033': escaped = "\\e";  break;
      case '\x7f': escaped = "\\c?"; break;
      default:     break;
    }
    return escaped;
}
7210
7211VALUE
7212rb_str_escape(VALUE str)
7213{
7214 int encidx = ENCODING_GET(str);
7215 rb_encoding *enc = rb_enc_from_index(encidx);
7216 const char *p = RSTRING_PTR(str);
7217 const char *pend = RSTRING_END(str);
7218 const char *prev = p;
7219 char buf[CHAR_ESC_LEN + 1];
7220 VALUE result = rb_str_buf_new(0);
7221 int unicode_p = rb_enc_unicode_p(enc);
7222 int asciicompat = rb_enc_asciicompat(enc);
7223
7224 while (p < pend) {
7225 unsigned int c;
7226 const char *cc;
7227 int n = rb_enc_precise_mbclen(p, pend, enc);
7228 if (!MBCLEN_CHARFOUND_P(n)) {
7229 if (p > prev) str_buf_cat(result, prev, p - prev);
7230 n = rb_enc_mbminlen(enc);
7231 if (pend < p + n)
7232 n = (int)(pend - p);
7233 while (n--) {
7234 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7235 str_buf_cat(result, buf, strlen(buf));
7236 prev = ++p;
7237 }
7238 continue;
7239 }
7240 n = MBCLEN_CHARFOUND_LEN(n);
7241 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7242 p += n;
7243 cc = ruby_escaped_char(c);
7244 if (cc) {
7245 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7246 str_buf_cat(result, cc, strlen(cc));
7247 prev = p;
7248 }
7249 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7250 }
7251 else {
7252 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7253 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7254 prev = p;
7255 }
7256 }
7257 if (p > prev) str_buf_cat(result, prev, p - prev);
7258 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7259
7260 return result;
7261}
7262
7263/*
7264 * call-seq:
7265 * inspect -> string
7266 *
7267 * :include: doc/string/inspect.rdoc
7268 *
7269 */
7270
7271VALUE
7273{
7274 int encidx = ENCODING_GET(str);
7275 rb_encoding *enc = rb_enc_from_index(encidx);
7276 const char *p, *pend, *prev;
7277 char buf[CHAR_ESC_LEN + 1];
7278 VALUE result = rb_str_buf_new(0);
7279 rb_encoding *resenc = rb_default_internal_encoding();
7280 int unicode_p = rb_enc_unicode_p(enc);
7281 int asciicompat = rb_enc_asciicompat(enc);
7282
7283 if (resenc == NULL) resenc = rb_default_external_encoding();
7284 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7285 rb_enc_associate(result, resenc);
7286 str_buf_cat2(result, "\"");
7287
7288 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7289 prev = p;
7290 while (p < pend) {
7291 unsigned int c, cc;
7292 int n;
7293
7294 n = rb_enc_precise_mbclen(p, pend, enc);
7295 if (!MBCLEN_CHARFOUND_P(n)) {
7296 if (p > prev) str_buf_cat(result, prev, p - prev);
7297 n = rb_enc_mbminlen(enc);
7298 if (pend < p + n)
7299 n = (int)(pend - p);
7300 while (n--) {
7301 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7302 str_buf_cat(result, buf, strlen(buf));
7303 prev = ++p;
7304 }
7305 continue;
7306 }
7307 n = MBCLEN_CHARFOUND_LEN(n);
7308 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7309 p += n;
7310 if ((asciicompat || unicode_p) &&
7311 (c == '"'|| c == '\\' ||
7312 (c == '#' &&
7313 p < pend &&
7314 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7315 (cc = rb_enc_codepoint(p,pend,enc),
7316 (cc == '$' || cc == '@' || cc == '{'))))) {
7317 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7318 str_buf_cat2(result, "\\");
7319 if (asciicompat || enc == resenc) {
7320 prev = p - n;
7321 continue;
7322 }
7323 }
7324 switch (c) {
7325 case '\n': cc = 'n'; break;
7326 case '\r': cc = 'r'; break;
7327 case '\t': cc = 't'; break;
7328 case '\f': cc = 'f'; break;
7329 case '\013': cc = 'v'; break;
7330 case '\010': cc = 'b'; break;
7331 case '\007': cc = 'a'; break;
7332 case 033: cc = 'e'; break;
7333 default: cc = 0; break;
7334 }
7335 if (cc) {
7336 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7337 buf[0] = '\\';
7338 buf[1] = (char)cc;
7339 str_buf_cat(result, buf, 2);
7340 prev = p;
7341 continue;
7342 }
7343 /* The special casing of 0x85 (NEXT_LINE) here is because
7344 * Oniguruma historically treats it as printable, but it
7345 * doesn't match the print POSIX bracket class or character
7346 * property in regexps.
7347 *
7348 * See Ruby Bug #16842 for details:
7349 * https://bugs.ruby-lang.org/issues/16842
7350 */
7351 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7352 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7353 continue;
7354 }
7355 else {
7356 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7357 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7358 prev = p;
7359 continue;
7360 }
7361 }
7362 if (p > prev) str_buf_cat(result, prev, p - prev);
7363 str_buf_cat2(result, "\"");
7364
7365 return result;
7366}
7367
/* True when p (bounded by e) points at '$', '@' or '{'; rb_str_dump puts a
 * backslash before a '#' that is followed by one of these. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7369
7370/*
7371 * call-seq:
7372 * dump -> new_string
7373 *
7374 * :include: doc/string/dump.rdoc
7375 *
7376 */
7377
7378VALUE
7380{
7381 int encidx = rb_enc_get_index(str);
7382 rb_encoding *enc = rb_enc_from_index(encidx);
7383 long len;
7384 const char *p, *pend;
7385 char *q, *qend;
7386 VALUE result;
7387 int u8 = (encidx == rb_utf8_encindex());
7388 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7389
7390 len = 2; /* "" */
7391 if (!rb_enc_asciicompat(enc)) {
7392 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7393 len += strlen(enc->name);
7394 }
7395
7396 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7397 while (p < pend) {
7398 int clen;
7399 unsigned char c = *p++;
7400
7401 switch (c) {
7402 case '"': case '\\':
7403 case '\n': case '\r':
7404 case '\t': case '\f':
7405 case '\013': case '\010': case '\007': case '\033':
7406 clen = 2;
7407 break;
7408
7409 case '#':
7410 clen = IS_EVSTR(p, pend) ? 2 : 1;
7411 break;
7412
7413 default:
7414 if (ISPRINT(c)) {
7415 clen = 1;
7416 }
7417 else {
7418 if (u8 && c > 0x7F) { /* \u notation */
7419 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7420 if (MBCLEN_CHARFOUND_P(n)) {
7421 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7422 if (cc <= 0xFFFF)
7423 clen = 6; /* \uXXXX */
7424 else if (cc <= 0xFFFFF)
7425 clen = 9; /* \u{XXXXX} */
7426 else
7427 clen = 10; /* \u{XXXXXX} */
7428 p += MBCLEN_CHARFOUND_LEN(n)-1;
7429 break;
7430 }
7431 }
7432 clen = 4; /* \xNN */
7433 }
7434 break;
7435 }
7436
7437 if (clen > LONG_MAX - len) {
7438 rb_raise(rb_eRuntimeError, "string size too big");
7439 }
7440 len += clen;
7441 }
7442
7443 result = rb_str_new(0, len);
7444 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7445 q = RSTRING_PTR(result); qend = q + len + 1;
7446
7447 *q++ = '"';
7448 while (p < pend) {
7449 unsigned char c = *p++;
7450
7451 if (c == '"' || c == '\\') {
7452 *q++ = '\\';
7453 *q++ = c;
7454 }
7455 else if (c == '#') {
7456 if (IS_EVSTR(p, pend)) *q++ = '\\';
7457 *q++ = '#';
7458 }
7459 else if (c == '\n') {
7460 *q++ = '\\';
7461 *q++ = 'n';
7462 }
7463 else if (c == '\r') {
7464 *q++ = '\\';
7465 *q++ = 'r';
7466 }
7467 else if (c == '\t') {
7468 *q++ = '\\';
7469 *q++ = 't';
7470 }
7471 else if (c == '\f') {
7472 *q++ = '\\';
7473 *q++ = 'f';
7474 }
7475 else if (c == '\013') {
7476 *q++ = '\\';
7477 *q++ = 'v';
7478 }
7479 else if (c == '\010') {
7480 *q++ = '\\';
7481 *q++ = 'b';
7482 }
7483 else if (c == '\007') {
7484 *q++ = '\\';
7485 *q++ = 'a';
7486 }
7487 else if (c == '\033') {
7488 *q++ = '\\';
7489 *q++ = 'e';
7490 }
7491 else if (ISPRINT(c)) {
7492 *q++ = c;
7493 }
7494 else {
7495 *q++ = '\\';
7496 if (u8) {
7497 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7498 if (MBCLEN_CHARFOUND_P(n)) {
7499 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7500 p += n;
7501 if (cc <= 0xFFFF)
7502 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7503 else
7504 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7505 q += strlen(q);
7506 continue;
7507 }
7508 }
7509 snprintf(q, qend-q, "x%02X", c);
7510 q += 3;
7511 }
7512 }
7513 *q++ = '"';
7514 *q = '\0';
7515 if (!rb_enc_asciicompat(enc)) {
7516 snprintf(q, qend-q, nonascii_suffix, enc->name);
7517 encidx = rb_ascii8bit_encindex();
7518 }
7519 /* result from dump is ASCII */
7520 rb_enc_associate_index(result, encidx);
7522 return result;
7523}
7524
/*
 * Maps the letter of a simple backslash escape (n, r, t, f, v, b, a, e)
 * to the control character it stands for.  Any other letter yields -1,
 * so the return value is defined for every input.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        return -1;
    }
}
7548
7549static void
7550undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7551{
7552 const char *s = *ss;
7553 unsigned int c;
7554 int codelen;
7555 size_t hexlen;
7556 unsigned char buf[6];
7557 static rb_encoding *enc_utf8 = NULL;
7558
7559 switch (*s) {
7560 case '\\':
7561 case '"':
7562 case '#':
7563 rb_str_cat(undumped, s, 1); /* cat itself */
7564 s++;
7565 break;
7566 case 'n':
7567 case 'r':
7568 case 't':
7569 case 'f':
7570 case 'v':
7571 case 'b':
7572 case 'a':
7573 case 'e':
7574 *buf = unescape_ascii(*s);
7575 rb_str_cat(undumped, (char *)buf, 1);
7576 s++;
7577 break;
7578 case 'u':
7579 if (*binary) {
7580 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7581 }
7582 *utf8 = true;
7583 if (++s >= s_end) {
7584 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7585 }
7586 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7587 if (*penc != enc_utf8) {
7588 *penc = enc_utf8;
7589 rb_enc_associate(undumped, enc_utf8);
7590 }
7591 if (*s == '{') { /* handle \u{...} form */
7592 s++;
7593 for (;;) {
7594 if (s >= s_end) {
7595 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7596 }
7597 if (*s == '}') {
7598 s++;
7599 break;
7600 }
7601 if (ISSPACE(*s)) {
7602 s++;
7603 continue;
7604 }
7605 c = scan_hex(s, s_end-s, &hexlen);
7606 if (hexlen == 0 || hexlen > 6) {
7607 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7608 }
7609 if (c > 0x10ffff) {
7610 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7611 }
7612 if (0xd800 <= c && c <= 0xdfff) {
7613 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7614 }
7615 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7616 rb_str_cat(undumped, (char *)buf, codelen);
7617 s += hexlen;
7618 }
7619 }
7620 else { /* handle \uXXXX form */
7621 c = scan_hex(s, 4, &hexlen);
7622 if (hexlen != 4) {
7623 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7624 }
7625 if (0xd800 <= c && c <= 0xdfff) {
7626 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7627 }
7628 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7629 rb_str_cat(undumped, (char *)buf, codelen);
7630 s += hexlen;
7631 }
7632 break;
7633 case 'x':
7634 if (++s >= s_end) {
7635 rb_raise(rb_eRuntimeError, "invalid hex escape");
7636 }
7637 *buf = scan_hex(s, 2, &hexlen);
7638 if (hexlen != 2) {
7639 rb_raise(rb_eRuntimeError, "invalid hex escape");
7640 }
7641 if (!ISASCII(*buf)) {
7642 if (*utf8) {
7643 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7644 }
7645 *binary = true;
7646 }
7647 rb_str_cat(undumped, (char *)buf, 1);
7648 s += hexlen;
7649 break;
7650 default:
7651 rb_str_cat(undumped, s-1, 2);
7652 s++;
7653 }
7654
7655 *ss = s;
7656}
7657
/* Forward declaration: str_undump calls this before its definition appears. */
static VALUE rb_str_is_ascii_only_p(VALUE str);
7659
7660/*
7661 * call-seq:
7662 * undump -> new_string
7663 *
7664 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7665 *
7666 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7667 */
7668
7669static VALUE
7670str_undump(VALUE str)
7671{
7672 const char *s = RSTRING_PTR(str);
7673 const char *s_end = RSTRING_END(str);
7674 rb_encoding *enc = rb_enc_get(str);
7675 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7676 bool utf8 = false;
7677 bool binary = false;
7678 int w;
7679
7681 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7682 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7683 }
7684 if (!str_null_check(str, &w)) {
7685 rb_raise(rb_eRuntimeError, "string contains null byte");
7686 }
7687 if (RSTRING_LEN(str) < 2) goto invalid_format;
7688 if (*s != '"') goto invalid_format;
7689
7690 /* strip '"' at the start */
7691 s++;
7692
7693 for (;;) {
7694 if (s >= s_end) {
7695 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7696 }
7697
7698 if (*s == '"') {
7699 /* epilogue */
7700 s++;
7701 if (s == s_end) {
7702 /* ascii compatible dumped string */
7703 break;
7704 }
7705 else {
7706 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7707 static const char dup_suffix[] = ".dup";
7708 const char *encname;
7709 int encidx;
7710 ptrdiff_t size;
7711
7712 /* check separately for strings dumped by older versions */
7713 size = sizeof(dup_suffix) - 1;
7714 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7715
7716 size = sizeof(force_encoding_suffix) - 1;
7717 if (s_end - s <= size) goto invalid_format;
7718 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7719 s += size;
7720
7721 if (utf8) {
7722 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7723 }
7724
7725 encname = s;
7726 s = memchr(s, '"', s_end-s);
7727 size = s - encname;
7728 if (!s) goto invalid_format;
7729 if (s_end - s != 2) goto invalid_format;
7730 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7731
7732 encidx = rb_enc_find_index2(encname, (long)size);
7733 if (encidx < 0) {
7734 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7735 }
7736 rb_enc_associate_index(undumped, encidx);
7737 }
7738 break;
7739 }
7740
7741 if (*s == '\\') {
7742 s++;
7743 if (s >= s_end) {
7744 rb_raise(rb_eRuntimeError, "invalid escape");
7745 }
7746 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7747 }
7748 else {
7749 rb_str_cat(undumped, s++, 1);
7750 }
7751 }
7752
7753 RB_GC_GUARD(str);
7754
7755 return undumped;
7756invalid_format:
7757 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7758}
7759
7760static void
7761rb_str_check_dummy_enc(rb_encoding *enc)
7762{
7763 if (rb_enc_dummy_p(enc)) {
7764 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7765 rb_enc_name(enc));
7766 }
7767}
7768
7769static rb_encoding *
7770str_true_enc(VALUE str)
7771{
7772 rb_encoding *enc = STR_ENC_GET(str);
7773 rb_str_check_dummy_enc(enc);
7774 return enc;
7775}
7776
7777static OnigCaseFoldType
7778check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7779{
7780 if (argc==0)
7781 return flags;
7782 if (argc>2)
7783 rb_raise(rb_eArgError, "too many options");
7784 if (argv[0]==sym_turkic) {
7785 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7786 if (argc==2) {
7787 if (argv[1]==sym_lithuanian)
7788 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7789 else
7790 rb_raise(rb_eArgError, "invalid second option");
7791 }
7792 }
7793 else if (argv[0]==sym_lithuanian) {
7794 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7795 if (argc==2) {
7796 if (argv[1]==sym_turkic)
7797 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7798 else
7799 rb_raise(rb_eArgError, "invalid second option");
7800 }
7801 }
7802 else if (argc>1)
7803 rb_raise(rb_eArgError, "too many options");
7804 else if (argv[0]==sym_ascii)
7805 flags |= ONIGENC_CASE_ASCII_ONLY;
7806 else if (argv[0]==sym_fold) {
7807 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7808 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7809 else
7810 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7811 }
7812 else
7813 rb_raise(rb_eArgError, "invalid option");
7814 return flags;
7815}
7816
7817static inline bool
7818case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7819{
7820 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7821 return true;
7822 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7823}
7824
/* Extra room added to every mapping buffer.  16 bytes should already be
 * long enough to absorb any kind of single character length increase;
 * the value used here leaves some additional margin. */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Define CASEMAP_DEBUG as non-zero to get buffer tuning output on stderr. */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
7830
7831struct mapping_buffer;
7832typedef struct mapping_buffer {
7833 size_t capa;
7834 size_t used;
7835 struct mapping_buffer *next;
7836 OnigUChar space[FLEX_ARY_LEN];
7838
7839static void
7840mapping_buffer_free(void *p)
7841{
7842 mapping_buffer *previous_buffer;
7843 mapping_buffer *current_buffer = p;
7844 while (current_buffer) {
7845 previous_buffer = current_buffer;
7846 current_buffer = current_buffer->next;
7847 ruby_xfree_sized(previous_buffer, offsetof(mapping_buffer, space) + previous_buffer->capa);
7848 }
7849}
7850
/* Typed-data descriptor for the object that anchors a mapping buffer
 * chain; its free callback is mapping_buffer_free. */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7856
/*
 * Case-maps source through enc->case_map and returns the result as a new
 * string carrying source's encoding.  *flags is passed to the encoding's
 * mapper by pointer.  An empty source yields a duplicate.  Raises
 * ArgumentError when the mapper reports invalid input.
 */
static VALUE
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
{
    VALUE target;

    const OnigUChar *source_current, *source_end;
    int target_length = 0;
    VALUE buffer_anchor;
    mapping_buffer *current_buffer = 0;
    mapping_buffer **pre_buffer;
    size_t buffer_count = 0;
    int buffer_length_or_invalid;

    if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);

    source_current = (OnigUChar*)RSTRING_PTR(source);
    source_end = (OnigUChar*)RSTRING_END(source);

    /* The chain head lives in a typed-data object whose free function is
     * mapping_buffer_free -- presumably so the buffers are reclaimed if
     * an exception unwinds this frame. */
    buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
    pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    /* Each pass maps as much of the remaining input as fits in one buffer. */
    while (source_current < source_end) {
        /* increase multiplier using buffer count to converge quickly */
        size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
        if (CASEMAP_DEBUG) {
            fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
        }
        current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
        /* append the new node at the tail of the chain */
        *pre_buffer = current_buffer;
        pre_buffer = &current_buffer->next;
        current_buffer->next = NULL;
        current_buffer->capa = capa;
        buffer_length_or_invalid = enc->case_map(flags,
                                                 &source_current, source_end,
                                                 current_buffer->space,
                                                 current_buffer->space+current_buffer->capa,
                                                 enc);
        if (buffer_length_or_invalid < 0) {
            /* detach the chain from the anchor and free it before raising */
            current_buffer = DATA_PTR(buffer_anchor);
            DATA_PTR(buffer_anchor) = 0;
            mapping_buffer_free(current_buffer);
            rb_raise(rb_eArgError, "input string invalid");
        }
        target_length += current_buffer->used = buffer_length_or_invalid;
    }
    if (CASEMAP_DEBUG) {
        fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
    }

    if (buffer_count==1) {
        /* single buffer: build the string straight from it */
        target = rb_str_new((const char*)current_buffer->space, target_length);
    }
    else {
        char *target_current;

        /* several buffers: concatenate them in chain order */
        target = rb_str_new(0, target_length);
        target_current = RSTRING_PTR(target);
        current_buffer = DATA_PTR(buffer_anchor);
        while (current_buffer) {
            memcpy(target_current, current_buffer->space, current_buffer->used);
            target_current += current_buffer->used;
            current_buffer = current_buffer->next;
        }
    }
    /* detach and free the chain so the anchor's free callback has nothing left */
    current_buffer = DATA_PTR(buffer_anchor);
    DATA_PTR(buffer_anchor) = 0;
    mapping_buffer_free(current_buffer);

    RB_GC_GUARD(buffer_anchor);

    /* TODO: check about string terminator character */
    str_enc_copy_direct(target, source);
    /*ENC_CODERANGE_SET(mapped, cr);*/

    return target;
}
7932
7933static VALUE
7934rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7935{
7936 const OnigUChar *source_current, *source_end;
7937 OnigUChar *target_current, *target_end;
7938 long old_length = RSTRING_LEN(source);
7939 int length_or_invalid;
7940
7941 if (old_length == 0) return Qnil;
7942
7943 source_current = (OnigUChar*)RSTRING_PTR(source);
7944 source_end = (OnigUChar*)RSTRING_END(source);
7945 if (source == target) {
7946 target_current = (OnigUChar*)source_current;
7947 target_end = (OnigUChar*)source_end;
7948 }
7949 else {
7950 target_current = (OnigUChar*)RSTRING_PTR(target);
7951 target_end = (OnigUChar*)RSTRING_END(target);
7952 }
7953
7954 length_or_invalid = onigenc_ascii_only_case_map(flags,
7955 &source_current, source_end,
7956 target_current, target_end, enc);
7957 if (length_or_invalid < 0)
7958 rb_raise(rb_eArgError, "input string invalid");
7959 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7960 fprintf(stderr, "problem with rb_str_ascii_casemap"
7961 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7962 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7963 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7964 }
7965
7966 str_enc_copy(target, source);
7967
7968 return target;
7969}
7970
7971static bool
7972upcase_single(VALUE str)
7973{
7974 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7975 bool modified = false;
7976
7977 while (s < send) {
7978 unsigned int c = *(unsigned char*)s;
7979
7980 if ('a' <= c && c <= 'z') {
7981 *s = 'A' + (c - 'a');
7982 modified = true;
7983 }
7984 s++;
7985 }
7986 return modified;
7987}
7988
7989/*
7990 * call-seq:
7991 * upcase!(mapping) -> self or nil
7992 *
7993 * Like String#upcase, except that:
7994 *
7995 * - Changes character casings in +self+ (not in a copy of +self+).
7996 * - Returns +self+ if any changes are made, +nil+ otherwise.
7997 *
7998 * Related: See {Modifying}[rdoc-ref:String@Modifying].
7999 */
8000
8001static VALUE
8002rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8003{
8004 rb_encoding *enc;
8005 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8006
8007 flags = check_case_options(argc, argv, flags);
8008 str_modify_keep_cr(str);
8009 enc = str_true_enc(str);
8010 if (case_option_single_p(flags, enc, str)) {
8011 if (upcase_single(str))
8012 flags |= ONIGENC_CASE_MODIFIED;
8013 }
8014 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8015 rb_str_ascii_casemap(str, str, &flags, enc);
8016 else
8017 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8018
8019 if (ONIGENC_CASE_MODIFIED&flags) return str;
8020 return Qnil;
8021}
8022
8023
8024/*
8025 * call-seq:
8026 * upcase(mapping = :ascii) -> new_string
8027 *
8028 * :include: doc/string/upcase.rdoc
8029 */
8030
8031static VALUE
8032rb_str_upcase(int argc, VALUE *argv, VALUE str)
8033{
8034 rb_encoding *enc;
8035 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8036 VALUE ret;
8037
8038 flags = check_case_options(argc, argv, flags);
8039 enc = str_true_enc(str);
8040 if (case_option_single_p(flags, enc, str)) {
8041 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8042 str_enc_copy_direct(ret, str);
8043 upcase_single(ret);
8044 }
8045 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8046 ret = rb_str_new(0, RSTRING_LEN(str));
8047 rb_str_ascii_casemap(str, ret, &flags, enc);
8048 }
8049 else {
8050 ret = rb_str_casemap(str, &flags, enc);
8051 }
8052
8053 return ret;
8054}
8055
8056static bool
8057downcase_single(VALUE str)
8058{
8059 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8060 bool modified = false;
8061
8062 while (s < send) {
8063 unsigned int c = *(unsigned char*)s;
8064
8065 if ('A' <= c && c <= 'Z') {
8066 *s = 'a' + (c - 'A');
8067 modified = true;
8068 }
8069 s++;
8070 }
8071
8072 return modified;
8073}
8074
8075/*
8076 * call-seq:
8077 * downcase!(mapping) -> self or nil
8078 *
8079 * Like String#downcase, except that:
8080 *
8081 * - Changes character casings in +self+ (not in a copy of +self+).
8082 * - Returns +self+ if any changes are made, +nil+ otherwise.
8083 *
8084 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8085 */
8086
8087static VALUE
8088rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8089{
8090 rb_encoding *enc;
8091 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8092
8093 flags = check_case_options(argc, argv, flags);
8094 str_modify_keep_cr(str);
8095 enc = str_true_enc(str);
8096 if (case_option_single_p(flags, enc, str)) {
8097 if (downcase_single(str))
8098 flags |= ONIGENC_CASE_MODIFIED;
8099 }
8100 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8101 rb_str_ascii_casemap(str, str, &flags, enc);
8102 else
8103 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8104
8105 if (ONIGENC_CASE_MODIFIED&flags) return str;
8106 return Qnil;
8107}
8108
8109
8110/*
8111 * call-seq:
8112 * downcase(mapping = :ascii) -> new_string
8113 *
8114 * :include: doc/string/downcase.rdoc
8115 *
8116 */
8117
8118static VALUE
8119rb_str_downcase(int argc, VALUE *argv, VALUE str)
8120{
8121 rb_encoding *enc;
8122 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8123 VALUE ret;
8124
8125 flags = check_case_options(argc, argv, flags);
8126 enc = str_true_enc(str);
8127 if (case_option_single_p(flags, enc, str)) {
8128 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8129 str_enc_copy_direct(ret, str);
8130 downcase_single(ret);
8131 }
8132 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8133 ret = rb_str_new(0, RSTRING_LEN(str));
8134 rb_str_ascii_casemap(str, ret, &flags, enc);
8135 }
8136 else {
8137 ret = rb_str_casemap(str, &flags, enc);
8138 }
8139
8140 return ret;
8141}
8142
8143
8144/*
8145 * call-seq:
8146 * capitalize!(mapping = :ascii) -> self or nil
8147 *
8148 * Like String#capitalize, except that:
8149 *
8150 * - Changes character casings in +self+ (not in a copy of +self+).
8151 * - Returns +self+ if any changes are made, +nil+ otherwise.
8152 *
8153 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8154 */
8155
8156static VALUE
8157rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8158{
8159 rb_encoding *enc;
8160 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8161
8162 flags = check_case_options(argc, argv, flags);
8163 str_modify_keep_cr(str);
8164 enc = str_true_enc(str);
8165 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8166 if (flags&ONIGENC_CASE_ASCII_ONLY)
8167 rb_str_ascii_casemap(str, str, &flags, enc);
8168 else
8169 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8170
8171 if (ONIGENC_CASE_MODIFIED&flags) return str;
8172 return Qnil;
8173}
8174
8175
8176/*
8177 * call-seq:
8178 * capitalize(mapping = :ascii) -> new_string
8179 *
8180 * :include: doc/string/capitalize.rdoc
8181 *
8182 */
8183
8184static VALUE
8185rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8186{
8187 rb_encoding *enc;
8188 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8189 VALUE ret;
8190
8191 flags = check_case_options(argc, argv, flags);
8192 enc = str_true_enc(str);
8193 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8194 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8195 ret = rb_str_new(0, RSTRING_LEN(str));
8196 rb_str_ascii_casemap(str, ret, &flags, enc);
8197 }
8198 else {
8199 ret = rb_str_casemap(str, &flags, enc);
8200 }
8201 return ret;
8202}
8203
8204
8205/*
8206 * call-seq:
8207 * swapcase!(mapping) -> self or nil
8208 *
8209 * Like String#swapcase, except that:
8210 *
8211 * - Changes are made to +self+, not to copy of +self+.
8212 * - Returns +self+ if any changes are made, +nil+ otherwise.
8213 *
8214 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8215 */
8216
8217static VALUE
8218rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8219{
8220 rb_encoding *enc;
8221 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8222
8223 flags = check_case_options(argc, argv, flags);
8224 str_modify_keep_cr(str);
8225 enc = str_true_enc(str);
8226 if (flags&ONIGENC_CASE_ASCII_ONLY)
8227 rb_str_ascii_casemap(str, str, &flags, enc);
8228 else
8229 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8230
8231 if (ONIGENC_CASE_MODIFIED&flags) return str;
8232 return Qnil;
8233}
8234
8235
8236/*
8237 * call-seq:
8238 * swapcase(mapping = :ascii) -> new_string
8239 *
8240 * :include: doc/string/swapcase.rdoc
8241 *
8242 */
8243
8244static VALUE
8245rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8246{
8247 rb_encoding *enc;
8248 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8249 VALUE ret;
8250
8251 flags = check_case_options(argc, argv, flags);
8252 enc = str_true_enc(str);
8253 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8254 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8255 ret = rb_str_new(0, RSTRING_LEN(str));
8256 rb_str_ascii_casemap(str, ret, &flags, enc);
8257 }
8258 else {
8259 ret = rb_str_casemap(str, &flags, enc);
8260 }
8261 return ret;
8262}
8263
/* Shorthand for a pointer to unsigned bytes. */
typedef unsigned char *USTR;
8265
/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;                    /* non-zero while a range "a-z" is being expanded */
    unsigned int now, max;      /* current code point and last one of the range */
    char *p, *pend;             /* unread part of the specification */
};
8271
/*
 * Returns the next code point described by the specification in t, or
 * (unsigned int)-1 once it is exhausted.  A backslash makes the following
 * character literal; "x-y" is expanded one code point per call.  Raises
 * ArgumentError for a range whose start is greater than its end.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next item of the specification */
            if (t->p == t->pend) return -1;
            /* skip a backslash unless it is the last character */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            /* a '-' that is not the last character starts a range */
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    else if (t->now < c) {
                        /* remember the range; later calls generate it */
                        t->gen = 1;
                        t->max = c;
                    }
                }
            }
            return t->now;
        }
        else {
            /* inside a range: advance, skipping code points for which the
             * encoding reports no positive byte length */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* the end of the range has been reached */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
8327
/* Forward declaration: tr_trans delegates to this when the replacement is empty. */
static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8329
8330static VALUE
8331tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8332{
8333 const unsigned int errc = -1;
8334 unsigned int trans[256];
8335 rb_encoding *enc, *e1, *e2;
8336 struct tr trsrc, trrepl;
8337 int cflag = 0;
8338 unsigned int c, c0, last = 0;
8339 int modify = 0, i, l;
8340 unsigned char *s, *send;
8341 VALUE hash = 0;
8342 int singlebyte = single_byte_optimizable(str);
8343 int termlen;
8344 int cr;
8345
8346#define CHECK_IF_ASCII(c) \
8347 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8348 (cr = ENC_CODERANGE_VALID) : 0)
8349
8350 StringValue(src);
8351 StringValue(repl);
8352 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8353 if (RSTRING_LEN(repl) == 0) {
8354 return rb_str_delete_bang(1, &src, str);
8355 }
8356
8357 cr = ENC_CODERANGE(str);
8358 e1 = rb_enc_check(str, src);
8359 e2 = rb_enc_check(str, repl);
8360 if (e1 == e2) {
8361 enc = e1;
8362 }
8363 else {
8364 enc = rb_enc_check(src, repl);
8365 }
8366 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8367 if (RSTRING_LEN(src) > 1 &&
8368 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8369 trsrc.p + l < trsrc.pend) {
8370 cflag = 1;
8371 trsrc.p += l;
8372 }
8373 trrepl.p = RSTRING_PTR(repl);
8374 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8375 trsrc.gen = trrepl.gen = 0;
8376 trsrc.now = trrepl.now = 0;
8377 trsrc.max = trrepl.max = 0;
8378
8379 if (cflag) {
8380 for (i=0; i<256; i++) {
8381 trans[i] = 1;
8382 }
8383 while ((c = trnext(&trsrc, enc)) != errc) {
8384 if (c < 256) {
8385 trans[c] = errc;
8386 }
8387 else {
8388 if (!hash) hash = rb_hash_new();
8389 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8390 }
8391 }
8392 while ((c = trnext(&trrepl, enc)) != errc)
8393 /* retrieve last replacer */;
8394 last = trrepl.now;
8395 for (i=0; i<256; i++) {
8396 if (trans[i] != errc) {
8397 trans[i] = last;
8398 }
8399 }
8400 }
8401 else {
8402 unsigned int r;
8403
8404 for (i=0; i<256; i++) {
8405 trans[i] = errc;
8406 }
8407 while ((c = trnext(&trsrc, enc)) != errc) {
8408 r = trnext(&trrepl, enc);
8409 if (r == errc) r = trrepl.now;
8410 if (c < 256) {
8411 trans[c] = r;
8412 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8413 }
8414 else {
8415 if (!hash) hash = rb_hash_new();
8416 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8417 }
8418 }
8419 }
8420
8421 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8422 cr = ENC_CODERANGE_7BIT;
8423 str_modify_keep_cr(str);
8424 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8425 termlen = rb_enc_mbminlen(enc);
8426 if (sflag) {
8427 int clen, tlen;
8428 long offset, max = RSTRING_LEN(str);
8429 unsigned int save = -1;
8430 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8431
8432 while (s < send) {
8433 int may_modify = 0;
8434
8435 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8436 if (!MBCLEN_CHARFOUND_P(r)) {
8437 SIZED_FREE_N(buf, max + termlen);
8438 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8439 }
8440 clen = MBCLEN_CHARFOUND_LEN(r);
8441 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8442
8443 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8444
8445 s += clen;
8446 if (c < 256) {
8447 c = trans[c];
8448 }
8449 else if (hash) {
8450 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8451 if (NIL_P(tmp)) {
8452 if (cflag) c = last;
8453 else c = errc;
8454 }
8455 else if (cflag) c = errc;
8456 else c = NUM2INT(tmp);
8457 }
8458 else {
8459 c = errc;
8460 }
8461 if (c != (unsigned int)-1) {
8462 if (save == c) {
8463 CHECK_IF_ASCII(c);
8464 continue;
8465 }
8466 save = c;
8467 tlen = rb_enc_codelen(c, enc);
8468 modify = 1;
8469 }
8470 else {
8471 save = -1;
8472 c = c0;
8473 if (enc != e1) may_modify = 1;
8474 }
8475 if ((offset = t - buf) + tlen > max) {
8476 size_t MAYBE_UNUSED(old) = max + termlen;
8477 max = offset + tlen + (send - s);
8478 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8479 t = buf + offset;
8480 }
8481 rb_enc_mbcput(c, t, enc);
8482 if (may_modify && memcmp(s, t, tlen) != 0) {
8483 modify = 1;
8484 }
8485 CHECK_IF_ASCII(c);
8486 t += tlen;
8487 }
8488 if (!STR_EMBED_P(str)) {
8489 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8490 }
8491 TERM_FILL((char *)t, termlen);
8492 RSTRING(str)->as.heap.ptr = (char *)buf;
8493 STR_SET_LEN(str, t - buf);
8494 STR_SET_NOEMBED(str);
8495 RSTRING(str)->as.heap.aux.capa = max;
8496 }
8497 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8498 while (s < send) {
8499 c = (unsigned char)*s;
8500 if (trans[c] != errc) {
8501 if (!cflag) {
8502 c = trans[c];
8503 *s = c;
8504 modify = 1;
8505 }
8506 else {
8507 *s = last;
8508 modify = 1;
8509 }
8510 }
8511 CHECK_IF_ASCII(c);
8512 s++;
8513 }
8514 }
8515 else {
8516 int clen, tlen;
8517 long offset, max = (long)((send - s) * 1.2);
8518 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8519
8520 while (s < send) {
8521 int may_modify = 0;
8522
8523 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8524 if (!MBCLEN_CHARFOUND_P(r)) {
8525 SIZED_FREE_N(buf, max + termlen);
8526 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8527 }
8528 clen = MBCLEN_CHARFOUND_LEN(r);
8529 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8530
8531 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8532
8533 if (c < 256) {
8534 c = trans[c];
8535 }
8536 else if (hash) {
8537 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8538 if (NIL_P(tmp)) {
8539 if (cflag) c = last;
8540 else c = errc;
8541 }
8542 else if (cflag) c = errc;
8543 else c = NUM2INT(tmp);
8544 }
8545 else {
8546 c = cflag ? last : errc;
8547 }
8548 if (c != errc) {
8549 tlen = rb_enc_codelen(c, enc);
8550 modify = 1;
8551 }
8552 else {
8553 c = c0;
8554 if (enc != e1) may_modify = 1;
8555 }
8556 if ((offset = t - buf) + tlen > max) {
8557 size_t MAYBE_UNUSED(old) = max + termlen;
8558 max = offset + tlen + (long)((send - s) * 1.2);
8559 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8560 t = buf + offset;
8561 }
8562 if (s != t) {
8563 rb_enc_mbcput(c, t, enc);
8564 if (may_modify && memcmp(s, t, tlen) != 0) {
8565 modify = 1;
8566 }
8567 }
8568 CHECK_IF_ASCII(c);
8569 s += clen;
8570 t += tlen;
8571 }
8572 if (!STR_EMBED_P(str)) {
8573 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8574 }
8575 TERM_FILL((char *)t, termlen);
8576 RSTRING(str)->as.heap.ptr = (char *)buf;
8577 STR_SET_LEN(str, t - buf);
8578 STR_SET_NOEMBED(str);
8579 RSTRING(str)->as.heap.aux.capa = max;
8580 }
8581
8582 if (modify) {
8583 if (cr != ENC_CODERANGE_BROKEN)
8584 ENC_CODERANGE_SET(str, cr);
8585 rb_enc_associate(str, enc);
8586 return str;
8587 }
8588 return Qnil;
8589}
8590
8591
8592/*
8593 * call-seq:
8594 * tr!(selector, replacements) -> self or nil
8595 *
8596 * Like String#tr, except:
8597 *
8598 * - Performs substitutions in +self+ (not in a copy of +self+).
8599 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8600 *
 *  Related: see {Modifying}[rdoc-ref:String@Modifying].
8602 */
8603
8604static VALUE
8605rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8606{
8607 return tr_trans(str, src, repl, 0);
8608}
8609
8610
8611/*
8612 * call-seq:
8613 * tr(selector, replacements) -> new_string
8614 *
8615 * Returns a copy of +self+ with each character specified by string +selector+
8616 * translated to the corresponding character in string +replacements+.
8617 * The correspondence is _positional_:
8618 *
8619 * - Each occurrence of the first character specified by +selector+
8620 * is translated to the first character in +replacements+.
8621 * - Each occurrence of the second character specified by +selector+
8622 * is translated to the second character in +replacements+.
8623 * - And so on.
8624 *
8625 * Example:
8626 *
8627 * 'hello'.tr('el', 'ip') #=> "hippo"
8628 *
8629 * If +replacements+ is shorter than +selector+,
8630 * it is implicitly padded with its own last character:
8631 *
8632 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8633 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8634 *
8635 * Arguments +selector+ and +replacements+ must be valid character selectors
8636 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8637 * and may use any of its valid forms, including negation, ranges, and escapes:
8638 *
8639 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8640 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8641 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8642 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8643 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8644 *
8645 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8646 */
8647
8648static VALUE
8649rb_str_tr(VALUE str, VALUE src, VALUE repl)
8650{
8651 str = str_duplicate(rb_cString, str);
8652 tr_trans(str, src, repl, 0);
8653 return str;
8654}
8655
8656#define TR_TABLE_MAX (UCHAR_MAX+1)
8657#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8658static void
8659tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8660 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8661{
8662 const unsigned int errc = -1;
8663 char buf[TR_TABLE_MAX];
8664 struct tr tr;
8665 unsigned int c;
8666 VALUE table = 0, ptable = 0;
8667 int i, l, cflag = 0;
8668
8669 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8670 tr.gen = tr.now = tr.max = 0;
8671
8672 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8673 cflag = 1;
8674 tr.p += l;
8675 }
8676 if (first) {
8677 for (i=0; i<TR_TABLE_MAX; i++) {
8678 stable[i] = 1;
8679 }
8680 stable[TR_TABLE_MAX] = cflag;
8681 }
8682 else if (stable[TR_TABLE_MAX] && !cflag) {
8683 stable[TR_TABLE_MAX] = 0;
8684 }
8685 for (i=0; i<TR_TABLE_MAX; i++) {
8686 buf[i] = cflag;
8687 }
8688
8689 while ((c = trnext(&tr, enc)) != errc) {
8690 if (c < TR_TABLE_MAX) {
8691 buf[(unsigned char)c] = !cflag;
8692 }
8693 else {
8694 VALUE key = UINT2NUM(c);
8695
8696 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8697 if (cflag) {
8698 ptable = *ctablep;
8699 table = ptable ? ptable : rb_hash_new();
8700 *ctablep = table;
8701 }
8702 else {
8703 table = rb_hash_new();
8704 ptable = *tablep;
8705 *tablep = table;
8706 }
8707 }
8708 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8709 rb_hash_aset(table, key, Qtrue);
8710 }
8711 }
8712 }
8713 for (i=0; i<TR_TABLE_MAX; i++) {
8714 stable[i] = stable[i] && buf[i];
8715 }
8716 if (!table && !cflag) {
8717 *tablep = 0;
8718 }
8719}
8720
8721
8722static int
8723tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8724{
8725 if (c < TR_TABLE_MAX) {
8726 return table[c] != 0;
8727 }
8728 else {
8729 VALUE v = UINT2NUM(c);
8730
8731 if (del) {
8732 if (!NIL_P(rb_hash_lookup(del, v)) &&
8733 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8734 return TRUE;
8735 }
8736 }
8737 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8738 return FALSE;
8739 }
8740 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8741 }
8742}
8743
8744/*
8745 * call-seq:
8746 * delete!(*selectors) -> self or nil
8747 *
8748 * Like String#delete, but modifies +self+ in place;
8749 * returns +self+ if any characters were deleted, +nil+ otherwise.
8750 *
8751 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8752 */
8753
8754static VALUE
8755rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8756{
8757 char squeez[TR_TABLE_SIZE];
8758 rb_encoding *enc = 0;
8759 char *s, *send, *t;
8760 VALUE del = 0, nodel = 0;
8761 int modify = 0;
8762 int i, ascompat, cr;
8763
8764 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8766 for (i=0; i<argc; i++) {
8767 VALUE s = argv[i];
8768
8769 StringValue(s);
8770 enc = rb_enc_check(str, s);
8771 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8772 }
8773
8774 str_modify_keep_cr(str);
8775 ascompat = rb_enc_asciicompat(enc);
8776 s = t = RSTRING_PTR(str);
8777 send = RSTRING_END(str);
8778 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8779 while (s < send) {
8780 unsigned int c;
8781 int clen;
8782
8783 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8784 if (squeez[c]) {
8785 modify = 1;
8786 }
8787 else {
8788 if (t != s) *t = c;
8789 t++;
8790 }
8791 s++;
8792 }
8793 else {
8794 c = rb_enc_codepoint_len(s, send, &clen, enc);
8795
8796 if (tr_find(c, squeez, del, nodel)) {
8797 modify = 1;
8798 }
8799 else {
8800 if (t != s) rb_enc_mbcput(c, t, enc);
8801 t += clen;
8803 }
8804 s += clen;
8805 }
8806 }
8807 TERM_FILL(t, TERM_LEN(str));
8808 STR_SET_LEN(str, t - RSTRING_PTR(str));
8809 ENC_CODERANGE_SET(str, cr);
8810
8811 if (modify) return str;
8812 return Qnil;
8813}
8814
8815
8816/*
8817 * call-seq:
8818 * delete(*selectors) -> new_string
8819 *
8820 * :include: doc/string/delete.rdoc
8821 *
8822 */
8823
8824static VALUE
8825rb_str_delete(int argc, VALUE *argv, VALUE str)
8826{
8827 str = str_duplicate(rb_cString, str);
8828 rb_str_delete_bang(argc, argv, str);
8829 return str;
8830}
8831
8832
8833/*
8834 * call-seq:
8835 * squeeze!(*selectors) -> self or nil
8836 *
8837 * Like String#squeeze, except that:
8838 *
8839 * - Characters are squeezed in +self+ (not in a copy of +self+).
8840 * - Returns +self+ if any changes are made, +nil+ otherwise.
8841 *
 *  Related: see {Modifying}[rdoc-ref:String@Modifying].
8843 */
8844
8845static VALUE
8846rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8847{
8848 char squeez[TR_TABLE_SIZE];
8849 rb_encoding *enc = 0;
8850 VALUE del = 0, nodel = 0;
8851 unsigned char *s, *send, *t;
8852 int i, modify = 0;
8853 int ascompat, singlebyte = single_byte_optimizable(str);
8854 unsigned int save;
8855
8856 if (argc == 0) {
8857 enc = STR_ENC_GET(str);
8858 }
8859 else {
8860 for (i=0; i<argc; i++) {
8861 VALUE s = argv[i];
8862
8863 StringValue(s);
8864 enc = rb_enc_check(str, s);
8865 if (singlebyte && !single_byte_optimizable(s))
8866 singlebyte = 0;
8867 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8868 }
8869 }
8870
8871 str_modify_keep_cr(str);
8872 s = t = (unsigned char *)RSTRING_PTR(str);
8873 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8874 send = (unsigned char *)RSTRING_END(str);
8875 save = -1;
8876 ascompat = rb_enc_asciicompat(enc);
8877
8878 if (singlebyte) {
8879 while (s < send) {
8880 unsigned int c = *s++;
8881 if (c != save || (argc > 0 && !squeez[c])) {
8882 *t++ = save = c;
8883 }
8884 }
8885 }
8886 else {
8887 while (s < send) {
8888 unsigned int c;
8889 int clen;
8890
8891 if (ascompat && (c = *s) < 0x80) {
8892 if (c != save || (argc > 0 && !squeez[c])) {
8893 *t++ = save = c;
8894 }
8895 s++;
8896 }
8897 else {
8898 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8899
8900 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8901 if (t != s) rb_enc_mbcput(c, t, enc);
8902 save = c;
8903 t += clen;
8904 }
8905 s += clen;
8906 }
8907 }
8908 }
8909
8910 TERM_FILL((char *)t, TERM_LEN(str));
8911 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8912 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8913 modify = 1;
8914 }
8915
8916 if (modify) return str;
8917 return Qnil;
8918}
8919
8920
8921/*
8922 * call-seq:
8923 * squeeze(*selectors) -> new_string
8924 *
8925 * :include: doc/string/squeeze.rdoc
8926 *
8927 */
8928
8929static VALUE
8930rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8931{
8932 str = str_duplicate(rb_cString, str);
8933 rb_str_squeeze_bang(argc, argv, str);
8934 return str;
8935}
8936
8937
8938/*
8939 * call-seq:
8940 * tr_s!(selector, replacements) -> self or nil
8941 *
8942 * Like String#tr_s, except:
8943 *
8944 * - Modifies +self+ in place (not a copy of +self+).
8945 * - Returns +self+ if any changes were made, +nil+ otherwise.
8946 *
 *  Related: see {Modifying}[rdoc-ref:String@Modifying].
8948 */
8949
8950static VALUE
8951rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8952{
8953 return tr_trans(str, src, repl, 1);
8954}
8955
8956
8957/*
8958 * call-seq:
8959 * tr_s(selector, replacements) -> new_string
8960 *
8961 * Like String#tr, except:
8962 *
8963 * - Also squeezes the modified portions of the translated string;
8964 * see String#squeeze.
8965 * - Returns the translated and squeezed string.
8966 *
8967 * Examples:
8968 *
8969 * 'hello'.tr_s('l', 'r') #=> "hero"
8970 * 'hello'.tr_s('el', '-') #=> "h-o"
8971 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8972 *
8973 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8974 *
8975 */
8976
8977static VALUE
8978rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8979{
8980 str = str_duplicate(rb_cString, str);
8981 tr_trans(str, src, repl, 1);
8982 return str;
8983}
8984
8985
8986/*
8987 * call-seq:
8988 * count(*selectors) -> integer
8989 *
8990 * :include: doc/string/count.rdoc
8991 */
8992
8993static VALUE
8994rb_str_count(int argc, VALUE *argv, VALUE str)
8995{
8996 char table[TR_TABLE_SIZE];
8997 rb_encoding *enc = 0;
8998 VALUE del = 0, nodel = 0, tstr;
8999 char *s, *send;
9000 int i;
9001 int ascompat;
9002 size_t n = 0;
9003
9005
9006 tstr = argv[0];
9007 StringValue(tstr);
9008 enc = rb_enc_check(str, tstr);
9009 if (argc == 1) {
9010 const char *ptstr;
9011 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9012 (ptstr = RSTRING_PTR(tstr),
9013 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9014 !is_broken_string(str)) {
9015 int clen;
9016 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9017
9018 s = RSTRING_PTR(str);
9019 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9020 send = RSTRING_END(str);
9021 while (s < send) {
9022 if (*(unsigned char*)s++ == c) n++;
9023 }
9024 return SIZET2NUM(n);
9025 }
9026 }
9027
9028 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9029 for (i=1; i<argc; i++) {
9030 tstr = argv[i];
9031 StringValue(tstr);
9032 enc = rb_enc_check(str, tstr);
9033 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9034 }
9035
9036 s = RSTRING_PTR(str);
9037 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9038 send = RSTRING_END(str);
9039 ascompat = rb_enc_asciicompat(enc);
9040 while (s < send) {
9041 unsigned int c;
9042
9043 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9044 if (table[c]) {
9045 n++;
9046 }
9047 s++;
9048 }
9049 else {
9050 int clen;
9051 c = rb_enc_codepoint_len(s, send, &clen, enc);
9052 if (tr_find(c, table, del, nodel)) {
9053 n++;
9054 }
9055 s += clen;
9056 }
9057 }
9058
9059 return SIZET2NUM(n);
9060}
9061
9062static VALUE
9063rb_fs_check(VALUE val)
9064{
9065 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9066 val = rb_check_string_type(val);
9067 if (NIL_P(val)) return 0;
9068 }
9069 return val;
9070}
9071
/*
 * Lookup table indexed by byte value: 1 for bytes 0x09..0x0d and 0x20,
 * 0 for every other byte (entries not listed are zero-initialized).
 */
static const char isspacetable[256] = {
    [0x09] = 1,
    [0x0a] = 1,
    [0x0b] = 1,
    [0x0c] = 1,
    [0x0d] = 1,
    [0x20] = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9092
9093static long
9094split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9095{
9096 if (empty_count >= 0 && len == 0) {
9097 return empty_count + 1;
9098 }
9099 if (empty_count > 0) {
9100 /* make different substrings */
9101 if (result) {
9102 do {
9103 rb_ary_push(result, str_new_empty_String(str));
9104 } while (--empty_count > 0);
9105 }
9106 else {
9107 do {
9108 rb_yield(str_new_empty_String(str));
9109 } while (--empty_count > 0);
9110 }
9111 }
9112 str = rb_str_subseq(str, beg, len);
9113 if (result) {
9114 rb_ary_push(result, str);
9115 }
9116 else {
9117 rb_yield(str);
9118 }
9119 return empty_count;
9120}
9121
9122typedef enum {
9123 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9124} split_type_t;
9125
9126static split_type_t
9127literal_split_pattern(VALUE spat, split_type_t default_type)
9128{
9129 rb_encoding *enc = STR_ENC_GET(spat);
9130 const char *ptr;
9131 long len;
9132 RSTRING_GETMEM(spat, ptr, len);
9133 if (len == 0) {
9134 /* Special case - split into chars */
9135 return SPLIT_TYPE_CHARS;
9136 }
9137 else if (rb_enc_asciicompat(enc)) {
9138 if (len == 1 && ptr[0] == ' ') {
9139 return SPLIT_TYPE_AWK;
9140 }
9141 }
9142 else {
9143 int l;
9144 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9145 return SPLIT_TYPE_AWK;
9146 }
9147 }
9148 return default_type;
9149}
9150
9151/*
9152 * call-seq:
9153 * split(field_sep = $;, limit = 0) -> array_of_substrings
9154 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9155 *
9156 * :include: doc/string/split.rdoc
9157 *
9158 */
9159
9160static VALUE
9161rb_str_split_m(int argc, VALUE *argv, VALUE str)
9162{
9163 rb_encoding *enc;
9164 VALUE spat;
9165 VALUE limit;
9166 split_type_t split_type;
9167 long beg, end, i = 0, empty_count = -1;
9168 int lim = 0;
9169 VALUE result, tmp;
9170
9171 result = rb_block_given_p() ? Qfalse : Qnil;
9172 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9173 lim = NUM2INT(limit);
9174 if (lim <= 0) limit = Qnil;
9175 else if (lim == 1) {
9176 if (RSTRING_LEN(str) == 0)
9177 return result ? rb_ary_new2(0) : str;
9178 tmp = str_duplicate(rb_cString, str);
9179 if (!result) {
9180 rb_yield(tmp);
9181 return str;
9182 }
9183 return rb_ary_new3(1, tmp);
9184 }
9185 i = 1;
9186 }
9187 if (NIL_P(limit) && !lim) empty_count = 0;
9188
9189 enc = STR_ENC_GET(str);
9190 split_type = SPLIT_TYPE_REGEXP;
9191 if (!NIL_P(spat)) {
9192 spat = get_pat_quoted(spat, 0);
9193 }
9194 else if (NIL_P(spat = rb_fs)) {
9195 split_type = SPLIT_TYPE_AWK;
9196 }
9197 else if (!(spat = rb_fs_check(spat))) {
9198 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9199 }
9200 else {
9201 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9202 }
9203 if (split_type != SPLIT_TYPE_AWK) {
9204 switch (BUILTIN_TYPE(spat)) {
9205 case T_REGEXP:
9206 rb_reg_options(spat); /* check if uninitialized */
9207 tmp = RREGEXP_SRC(spat);
9208 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9209 if (split_type == SPLIT_TYPE_AWK) {
9210 spat = tmp;
9211 split_type = SPLIT_TYPE_STRING;
9212 }
9213 break;
9214
9215 case T_STRING:
9216 mustnot_broken(spat);
9217 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9218 break;
9219
9220 default:
9222 }
9223 }
9224
9225#define SPLIT_STR(beg, len) ( \
9226 empty_count = split_string(result, str, beg, len, empty_count), \
9227 str_mod_check(str, str_start, str_len))
9228
9229 beg = 0;
9230 char *ptr = RSTRING_PTR(str);
9231 char *const str_start = ptr;
9232 const long str_len = RSTRING_LEN(str);
9233 char *const eptr = str_start + str_len;
9234 if (split_type == SPLIT_TYPE_AWK) {
9235 char *bptr = ptr;
9236 int skip = 1;
9237 unsigned int c;
9238
9239 if (result) result = rb_ary_new();
9240 end = beg;
9241 if (is_ascii_string(str)) {
9242 while (ptr < eptr) {
9243 c = (unsigned char)*ptr++;
9244 if (skip) {
9245 if (ascii_isspace(c)) {
9246 beg = ptr - bptr;
9247 }
9248 else {
9249 end = ptr - bptr;
9250 skip = 0;
9251 if (!NIL_P(limit) && lim <= i) break;
9252 }
9253 }
9254 else if (ascii_isspace(c)) {
9255 SPLIT_STR(beg, end-beg);
9256 skip = 1;
9257 beg = ptr - bptr;
9258 if (!NIL_P(limit)) ++i;
9259 }
9260 else {
9261 end = ptr - bptr;
9262 }
9263 }
9264 }
9265 else {
9266 while (ptr < eptr) {
9267 int n;
9268
9269 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9270 ptr += n;
9271 if (skip) {
9272 if (rb_isspace(c)) {
9273 beg = ptr - bptr;
9274 }
9275 else {
9276 end = ptr - bptr;
9277 skip = 0;
9278 if (!NIL_P(limit) && lim <= i) break;
9279 }
9280 }
9281 else if (rb_isspace(c)) {
9282 SPLIT_STR(beg, end-beg);
9283 skip = 1;
9284 beg = ptr - bptr;
9285 if (!NIL_P(limit)) ++i;
9286 }
9287 else {
9288 end = ptr - bptr;
9289 }
9290 }
9291 }
9292 }
9293 else if (split_type == SPLIT_TYPE_STRING) {
9294 char *substr_start = ptr;
9295 char *sptr = RSTRING_PTR(spat);
9296 long slen = RSTRING_LEN(spat);
9297
9298 if (result) result = rb_ary_new();
9299 mustnot_broken(str);
9300 enc = rb_enc_check(str, spat);
9301 while (ptr < eptr &&
9302 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9303 /* Check we are at the start of a char */
9304 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9305 if (t != ptr + end) {
9306 ptr = t;
9307 continue;
9308 }
9309 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9310 str_mod_check(spat, sptr, slen);
9311 ptr += end + slen;
9312 substr_start = ptr;
9313 if (!NIL_P(limit) && lim <= ++i) break;
9314 }
9315 beg = ptr - str_start;
9316 }
9317 else if (split_type == SPLIT_TYPE_CHARS) {
9318 int n;
9319
9320 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9321 mustnot_broken(str);
9322 enc = rb_enc_get(str);
9323 while (ptr < eptr &&
9324 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9325 SPLIT_STR(ptr - str_start, n);
9326 ptr += n;
9327 if (!NIL_P(limit) && lim <= ++i) break;
9328 }
9329 beg = ptr - str_start;
9330 }
9331 else {
9332 if (result) result = rb_ary_new();
9333 long len = RSTRING_LEN(str);
9334 long start = beg;
9335 long idx;
9336 int last_null = 0;
9337 struct re_registers *regs;
9338 VALUE match = 0;
9339
9340 for (; rb_reg_search(spat, str, start, 0) >= 0;
9341 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9342 match = rb_backref_get();
9343 if (!result) rb_match_busy(match);
9344 regs = RMATCH_REGS(match);
9345 end = BEG(0);
9346 if (start == end && BEG(0) == END(0)) {
9347 if (!ptr) {
9348 SPLIT_STR(0, 0);
9349 break;
9350 }
9351 else if (last_null == 1) {
9352 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9353 beg = start;
9354 }
9355 else {
9356 if (start == len)
9357 start++;
9358 else
9359 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9360 last_null = 1;
9361 continue;
9362 }
9363 }
9364 else {
9365 SPLIT_STR(beg, end-beg);
9366 beg = start = END(0);
9367 }
9368 last_null = 0;
9369
9370 for (idx=1; idx < regs->num_regs; idx++) {
9371 if (BEG(idx) == -1) continue;
9372 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9373 }
9374 if (!NIL_P(limit) && lim <= ++i) break;
9375 }
9376 if (match) rb_match_unbusy(match);
9377 }
9378 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9379 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9380 }
9381
9382 return result ? result : str;
9383}
9384
9385VALUE
9386rb_str_split(VALUE str, const char *sep0)
9387{
9388 VALUE sep;
9389
9390 StringValue(str);
9391 sep = rb_str_new_cstr(sep0);
9392 return rb_str_split_m(1, &sep, str);
9393}
9394
9395#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9396
9397static inline int
9398enumerator_element(VALUE ary, VALUE e)
9399{
9400 if (ary) {
9401 rb_ary_push(ary, e);
9402 return 0;
9403 }
9404 else {
9405 rb_yield(e);
9406 return 1;
9407 }
9408}
9409
9410#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9411
9412static const char *
9413chomp_newline(const char *p, const char *e, rb_encoding *enc)
9414{
9415 const char *prev = rb_enc_prev_char(p, e, e, enc);
9416 if (rb_enc_is_newline(prev, e, enc)) {
9417 e = prev;
9418 prev = rb_enc_prev_char(p, e, e, enc);
9419 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9420 e = prev;
9421 }
9422 return e;
9423}
9424
9425static VALUE
9426get_rs(void)
9427{
9428 VALUE rs = rb_rs;
9429 if (!NIL_P(rs) &&
9430 (!RB_TYPE_P(rs, T_STRING) ||
9431 RSTRING_LEN(rs) != 1 ||
9432 RSTRING_PTR(rs)[0] != '\n')) {
9433 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9434 }
9435 return rs;
9436}
9437
9438#define rb_rs get_rs()
9439
/*
 * Shared implementation behind rb_str_each_line and rb_str_lines.
 *
 * Scans +str+ for the record separator (first optional argument, defaulting
 * to rb_rs) and delivers each line through ENUM_ELEM: pushed onto +ary+
 * when +ary+ is non-zero, yielded otherwise.  The +chomp+ keyword strips
 * the separator from each delivered line.  Returns +ary+ when it is
 * non-zero, otherwise the original +str+.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* Collapse "not given" and falsy values into a plain C boolean. */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* A nil separator means the whole string is the single line. */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* Work on a frozen copy; +orig+ keeps the receiver for the return value. */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n is the length of a leading '\r', or 0 if there is none. */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* A newline right after the previous one ends the paragraph. */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            if (!subptr) break;
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            /* ENUM_ELEM returns 1 only when it yielded to a block. */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* Note when the separator is exactly one minimum-length newline. */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* Re-encode the default separator for non-ASCII-compatible encodings. */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        /* Reject byte-level matches that start in the middle of a character. */
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            subptr = adjusted;
            continue;
        }
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* Deliver the remainder after the last separator, if any. */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9579
9580/*
9581 * call-seq:
9582 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9583 * each_line(record_separator = $/, chomp: false) -> enumerator
9584 *
9585 * :include: doc/string/each_line.rdoc
9586 *
9587 */
9588
9589static VALUE
9590rb_str_each_line(int argc, VALUE *argv, VALUE str)
9591{
9592 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9593 return rb_str_enumerate_lines(argc, argv, str, 0);
9594}
9595
9596/*
9597 * call-seq:
9598 * lines(record_separator = $/, chomp: false) -> array_of_strings
9599 *
9600 * Returns substrings ("lines") of +self+
9601 * according to the given arguments:
9602 *
9603 * s = <<~EOT
9604 * This is the first line.
9605 * This is line two.
9606 *
9607 * This is line four.
9608 * This is line five.
9609 * EOT
9610 *
9611 * With the default argument values:
9612 *
9613 * $/ # => "\n"
9614 * s.lines
9615 * # =>
9616 * ["This is the first line.\n",
9617 * "This is line two.\n",
9618 * "\n",
9619 * "This is line four.\n",
9620 * "This is line five.\n"]
9621 *
9622 * With a different +record_separator+:
9623 *
9624 * record_separator = ' is '
9625 * s.lines(record_separator)
9626 * # =>
9627 * ["This is ",
9628 * "the first line.\nThis is ",
9629 * "line two.\n\nThis is ",
9630 * "line four.\nThis is ",
9631 * "line five.\n"]
9632 *
9633 * With keyword argument +chomp+ as +true+,
9634 * removes the trailing newline from each line:
9635 *
9636 * s.lines(chomp: true)
9637 * # =>
9638 * ["This is the first line.",
9639 * "This is line two.",
9640 * "",
9641 * "This is line four.",
9642 * "This is line five."]
9643 *
9644 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
9645 */
9646
9647static VALUE
9648rb_str_lines(int argc, VALUE *argv, VALUE str)
9649{
9650 VALUE ary = WANTARRAY("lines", 0);
9651 return rb_str_enumerate_lines(argc, argv, str, ary);
9652}
9653
9654static VALUE
9655rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9656{
9657 return LONG2FIX(RSTRING_LEN(str));
9658}
9659
9660static VALUE
9661rb_str_enumerate_bytes(VALUE str, VALUE ary)
9662{
9663 long i;
9664
9665 for (i=0; i<RSTRING_LEN(str); i++) {
9666 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9667 }
9668 if (ary)
9669 return ary;
9670 else
9671 return str;
9672}
9673
9674/*
9675 * call-seq:
9676 * each_byte {|byte| ... } -> self
9677 * each_byte -> enumerator
9678 *
9679 * :include: doc/string/each_byte.rdoc
9680 *
9681 */
9682
9683static VALUE
9684rb_str_each_byte(VALUE str)
9685{
9686 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9687 return rb_str_enumerate_bytes(str, 0);
9688}
9689
9690/*
9691 * call-seq:
9692 * bytes -> array_of_bytes
9693 *
9694 * :include: doc/string/bytes.rdoc
9695 *
9696 */
9697
9698static VALUE
9699rb_str_bytes(VALUE str)
9700{
9701 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9702 return rb_str_enumerate_bytes(str, ary);
9703}
9704
9705static VALUE
9706rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9707{
9708 return rb_str_length(str);
9709}
9710
9711static VALUE
9712rb_str_enumerate_chars(VALUE str, VALUE ary)
9713{
9714 VALUE orig = str;
9715 long i, len, n;
9716 const char *ptr;
9717 rb_encoding *enc;
9718
9719 str = rb_str_new_frozen(str);
9720 ptr = RSTRING_PTR(str);
9721 len = RSTRING_LEN(str);
9722 enc = rb_enc_get(str);
9723
9725 for (i = 0; i < len; i += n) {
9726 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9727 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9728 }
9729 }
9730 else {
9731 for (i = 0; i < len; i += n) {
9732 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9733 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9734 }
9735 }
9736 RB_GC_GUARD(str);
9737 if (ary)
9738 return ary;
9739 else
9740 return orig;
9741}
9742
9743/*
9744 * call-seq:
9745 * each_char {|char| ... } -> self
9746 * each_char -> enumerator
9747 *
9748 * :include: doc/string/each_char.rdoc
9749 *
9750 */
9751
9752static VALUE
9753rb_str_each_char(VALUE str)
9754{
9755 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9756 return rb_str_enumerate_chars(str, 0);
9757}
9758
9759/*
9760 * call-seq:
9761 * chars -> array_of_characters
9762 *
9763 * :include: doc/string/chars.rdoc
9764 *
9765 */
9766
9767static VALUE
9768rb_str_chars(VALUE str)
9769{
9770 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9771 return rb_str_enumerate_chars(str, ary);
9772}
9773
9774static VALUE
9775rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9776{
9777 VALUE orig = str;
9778 int n;
9779 unsigned int c;
9780 const char *ptr, *end;
9781 rb_encoding *enc;
9782
9783 if (single_byte_optimizable(str))
9784 return rb_str_enumerate_bytes(str, ary);
9785
9786 str = rb_str_new_frozen(str);
9787 ptr = RSTRING_PTR(str);
9788 end = RSTRING_END(str);
9789 enc = STR_ENC_GET(str);
9790
9791 while (ptr < end) {
9792 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9793 ENUM_ELEM(ary, UINT2NUM(c));
9794 ptr += n;
9795 }
9796 RB_GC_GUARD(str);
9797 if (ary)
9798 return ary;
9799 else
9800 return orig;
9801}
9802
9803/*
9804 * call-seq:
9805 * each_codepoint {|codepoint| ... } -> self
9806 * each_codepoint -> enumerator
9807 *
9808 * :include: doc/string/each_codepoint.rdoc
9809 *
9810 */
9811
9812static VALUE
9813rb_str_each_codepoint(VALUE str)
9814{
9815 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9816 return rb_str_enumerate_codepoints(str, 0);
9817}
9818
9819/*
9820 * call-seq:
9821 * codepoints -> array_of_integers
9822 *
9823 * :include: doc/string/codepoints.rdoc
9824 *
9825 */
9826
9827static VALUE
9828rb_str_codepoints(VALUE str)
9829{
9830 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9831 return rb_str_enumerate_codepoints(str, ary);
9832}
9833
/*
 * Compiles and returns a new regex for the pattern `\X` in encoding +enc+.
 *
 * The pattern source defaults to the ASCII bytes "\\X"; for the UTF-16 and
 * UTF-32 encodings (both byte orders) it is replaced by the same two
 * characters spelled out in that encoding's code units.  Calls rb_fatal if
 * onig_new reports an error.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    /* Exclude the terminating NUL of the string literal. */
    size_t source_len = sizeof(source_ascii) - 1;

    switch (encidx) {
/* CHARS_* expand a character into its code-unit bytes for the given width
 * and byte order; CASE_UTF emits one switch case per encoding. */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* A non-zero result is an error code; turn it into a message. */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9875
9876static regex_t *
9877get_cached_reg_grapheme_cluster(rb_encoding *enc)
9878{
9879 int encidx = rb_enc_to_index(enc);
9880 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9881
9882 if (encidx == rb_utf8_encindex()) {
9883 if (!reg_grapheme_cluster_utf8) {
9884 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9885 }
9886
9887 return reg_grapheme_cluster_utf8;
9888 }
9889
9890 return NULL;
9891}
9892
9893static VALUE
9894rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9895{
9896 size_t grapheme_cluster_count = 0;
9897 rb_encoding *enc = get_encoding(str);
9898 const char *ptr, *end;
9899
9900 if (!rb_enc_unicode_p(enc)) {
9901 return rb_str_length(str);
9902 }
9903
9904 bool cached_reg_grapheme_cluster = true;
9905 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9906 if (!reg_grapheme_cluster) {
9907 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9908 cached_reg_grapheme_cluster = false;
9909 }
9910
9911 ptr = RSTRING_PTR(str);
9912 end = RSTRING_END(str);
9913
9914 while (ptr < end) {
9915 OnigPosition len = onig_match(reg_grapheme_cluster,
9916 (const OnigUChar *)ptr, (const OnigUChar *)end,
9917 (const OnigUChar *)ptr, NULL, 0);
9918 if (len <= 0) break;
9919 grapheme_cluster_count++;
9920 ptr += len;
9921 }
9922
9923 if (!cached_reg_grapheme_cluster) {
9924 onig_free(reg_grapheme_cluster);
9925 }
9926
9927 return SIZET2NUM(grapheme_cluster_count);
9928}
9929
9930static VALUE
9931rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9932{
9933 VALUE orig = str;
9934 rb_encoding *enc = get_encoding(str);
9935 const char *ptr0, *ptr, *end;
9936
9937 if (!rb_enc_unicode_p(enc)) {
9938 return rb_str_enumerate_chars(str, ary);
9939 }
9940
9941 if (!ary) str = rb_str_new_frozen(str);
9942
9943 bool cached_reg_grapheme_cluster = true;
9944 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9945 if (!reg_grapheme_cluster) {
9946 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9947 cached_reg_grapheme_cluster = false;
9948 }
9949
9950 ptr0 = ptr = RSTRING_PTR(str);
9951 end = RSTRING_END(str);
9952
9953 while (ptr < end) {
9954 OnigPosition len = onig_match(reg_grapheme_cluster,
9955 (const OnigUChar *)ptr, (const OnigUChar *)end,
9956 (const OnigUChar *)ptr, NULL, 0);
9957 if (len <= 0) break;
9958 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9959 ptr += len;
9960 }
9961
9962 if (!cached_reg_grapheme_cluster) {
9963 onig_free(reg_grapheme_cluster);
9964 }
9965
9966 RB_GC_GUARD(str);
9967 if (ary)
9968 return ary;
9969 else
9970 return orig;
9971}
9972
9973/*
9974 * call-seq:
9975 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9976 * each_grapheme_cluster -> enumerator
9977 *
9978 * :include: doc/string/each_grapheme_cluster.rdoc
9979 *
9980 */
9981
9982static VALUE
9983rb_str_each_grapheme_cluster(VALUE str)
9984{
9985 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9986 return rb_str_enumerate_grapheme_clusters(str, 0);
9987}
9988
9989/*
9990 * call-seq:
9991 * grapheme_clusters -> array_of_grapheme_clusters
9992 *
9993 * :include: doc/string/grapheme_clusters.rdoc
9994 *
9995 */
9996
9997static VALUE
9998rb_str_grapheme_clusters(VALUE str)
9999{
10000 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10001 return rb_str_enumerate_grapheme_clusters(str, ary);
10002}
10003
10004static long
10005chopped_length(VALUE str)
10006{
10007 rb_encoding *enc = STR_ENC_GET(str);
10008 const char *p, *p2, *beg, *end;
10009
10010 beg = RSTRING_PTR(str);
10011 end = beg + RSTRING_LEN(str);
10012 if (beg >= end) return 0;
10013 p = rb_enc_prev_char(beg, end, end, enc);
10014 if (!p) return 0;
10015 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10016 p2 = rb_enc_prev_char(beg, p, end, enc);
10017 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10018 }
10019 return p - beg;
10020}
10021
10022/*
10023 * call-seq:
10024 * chop! -> self or nil
10025 *
10026 * Like String#chop, except that:
10027 *
10028 * - Removes trailing characters from +self+ (not from a copy of +self+).
10029 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10030 *
10031 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10032 */
10033
10034static VALUE
10035rb_str_chop_bang(VALUE str)
10036{
10037 str_modify_keep_cr(str);
10038 if (RSTRING_LEN(str) > 0) {
10039 long len;
10040 len = chopped_length(str);
10041 STR_SET_LEN(str, len);
10042 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10043 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10045 }
10046 return str;
10047 }
10048 return Qnil;
10049}
10050
10051
10052/*
10053 * call-seq:
10054 * chop -> new_string
10055 *
10056 * :include: doc/string/chop.rdoc
10057 *
10058 */
10059
10060static VALUE
10061rb_str_chop(VALUE str)
10062{
10063 return rb_str_subseq(str, 0, chopped_length(str));
10064}
10065
10066static long
10067smart_chomp(VALUE str, const char *e, const char *p)
10068{
10069 rb_encoding *enc = rb_enc_get(str);
10070 if (rb_enc_mbminlen(enc) > 1) {
10071 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10072 if (rb_enc_is_newline(pp, e, enc)) {
10073 e = pp;
10074 }
10075 pp = e - rb_enc_mbminlen(enc);
10076 if (pp >= p) {
10077 pp = rb_enc_left_char_head(p, pp, e, enc);
10078 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10079 e = pp;
10080 }
10081 }
10082 }
10083 else {
10084 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10085 case '\n':
10086 if (--e > p && *(e-1) == '\r') {
10087 --e;
10088 }
10089 break;
10090 case '\r':
10091 --e;
10092 break;
10093 }
10094 }
10095 return e - p;
10096}
10097
10098static long
10099chompped_length(VALUE str, VALUE rs)
10100{
10101 rb_encoding *enc;
10102 int newline;
10103 char *pp, *e, *rsptr;
10104 long rslen;
10105 char *const p = RSTRING_PTR(str);
10106 long len = RSTRING_LEN(str);
10107
10108 if (len == 0) return 0;
10109 e = p + len;
10110 if (rs == rb_default_rs) {
10111 return smart_chomp(str, e, p);
10112 }
10113
10114 enc = rb_enc_get(str);
10115 RSTRING_GETMEM(rs, rsptr, rslen);
10116 if (rslen == 0) {
10117 if (rb_enc_mbminlen(enc) > 1) {
10118 while (e > p) {
10119 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10120 if (!rb_enc_is_newline(pp, e, enc)) break;
10121 e = pp;
10122 pp -= rb_enc_mbminlen(enc);
10123 if (pp >= p) {
10124 pp = rb_enc_left_char_head(p, pp, e, enc);
10125 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10126 e = pp;
10127 }
10128 }
10129 }
10130 }
10131 else {
10132 while (e > p && *(e-1) == '\n') {
10133 --e;
10134 if (e > p && *(e-1) == '\r')
10135 --e;
10136 }
10137 }
10138 return e - p;
10139 }
10140 if (rslen > len) return len;
10141
10142 enc = rb_enc_get(rs);
10143 newline = rsptr[rslen-1];
10144 if (rslen == rb_enc_mbminlen(enc)) {
10145 if (rslen == 1) {
10146 if (newline == '\n')
10147 return smart_chomp(str, e, p);
10148 }
10149 else {
10150 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10151 return smart_chomp(str, e, p);
10152 }
10153 }
10154
10155 enc = rb_enc_check(str, rs);
10156 if (is_broken_string(rs)) {
10157 return len;
10158 }
10159 pp = e - rslen;
10160 if (p[len-1] == newline &&
10161 (rslen <= 1 ||
10162 memcmp(rsptr, pp, rslen) == 0)) {
10163 if (at_char_boundary(p, pp, e, enc))
10164 return len - rslen;
10165 RB_GC_GUARD(rs);
10166 }
10167 return len;
10168}
10169
10175static VALUE
10176chomp_rs(int argc, const VALUE *argv)
10177{
10178 rb_check_arity(argc, 0, 1);
10179 if (argc > 0) {
10180 VALUE rs = argv[0];
10181 if (!NIL_P(rs)) StringValue(rs);
10182 return rs;
10183 }
10184 else {
10185 return rb_rs;
10186 }
10187}
10188
10189VALUE
10190rb_str_chomp_string(VALUE str, VALUE rs)
10191{
10192 long olen = RSTRING_LEN(str);
10193 long len = chompped_length(str, rs);
10194 if (len >= olen) return Qnil;
10195 str_modify_keep_cr(str);
10196 STR_SET_LEN(str, len);
10197 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10198 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10200 }
10201 return str;
10202}
10203
10204/*
10205 * call-seq:
10206 * chomp!(line_sep = $/) -> self or nil
10207 *
10208 * Like String#chomp, except that:
10209 *
10210 * - Removes trailing characters from +self+ (not from a copy of +self+).
10211 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10212 *
10213 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10214 */
10215
10216static VALUE
10217rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10218{
10219 VALUE rs;
10220 str_modifiable(str);
10221 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10222 rs = chomp_rs(argc, argv);
10223 if (NIL_P(rs)) return Qnil;
10224 return rb_str_chomp_string(str, rs);
10225}
10226
10227
10228/*
10229 * call-seq:
10230 * chomp(line_sep = $/) -> new_string
10231 *
10232 * :include: doc/string/chomp.rdoc
10233 *
10234 */
10235
10236static VALUE
10237rb_str_chomp(int argc, VALUE *argv, VALUE str)
10238{
10239 VALUE rs = chomp_rs(argc, argv);
10240 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10241 return rb_str_subseq(str, 0, chompped_length(str, rs));
10242}
10243
10244static void
10245tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10246 VALUE str, int num_selectors, VALUE *selectors)
10247{
10248 int i;
10249
10250 for (i=0; i<num_selectors; i++) {
10251 VALUE selector = selectors[i];
10252 rb_encoding *enc;
10253
10254 StringValue(selector);
10255 enc = rb_enc_check(str, selector);
10256 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10257 }
10258}
10259
10260static long
10261lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10262{
10263 const char *const start = s;
10264
10265 if (!s || s >= e) return 0;
10266
10267 /* remove spaces at head */
10268 if (single_byte_optimizable(str)) {
10269 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10270 }
10271 else {
10272 while (s < e) {
10273 int n;
10274 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10275
10276 if (cc && !rb_isspace(cc)) break;
10277 s += n;
10278 }
10279 }
10280 return s - start;
10281}
10282
10283static long
10284lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10285 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10286{
10287 const char *const start = s;
10288
10289 if (!s || s >= e) return 0;
10290
10291 /* remove leading characters in the table */
10292 while (s < e) {
10293 int n;
10294 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10295
10296 if (!tr_find(cc, table, del, nodel)) break;
10297 s += n;
10298 }
10299 return s - start;
10300}
10301
10302/*
10303 * call-seq:
10304 * lstrip!(*selectors) -> self or nil
10305 *
10306 * Like String#lstrip, except that:
10307 *
10308 * - Performs stripping in +self+ (not in a copy of +self+).
10309 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10310 *
10311 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10312 */
10313
10314static VALUE
10315rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10316{
10317 rb_encoding *enc;
10318 char *start, *s;
10319 long olen, loffset;
10320
10321 str_modify_keep_cr(str);
10322 enc = STR_ENC_GET(str);
10323 RSTRING_GETMEM(str, start, olen);
10324 if (argc > 0) {
10325 char table[TR_TABLE_SIZE];
10326 VALUE del = 0, nodel = 0;
10327
10328 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10329 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10330 }
10331 else {
10332 loffset = lstrip_offset(str, start, start+olen, enc);
10333 }
10334
10335 if (loffset > 0) {
10336 long len = olen-loffset;
10337 s = start + loffset;
10338 memmove(start, s, len);
10339 STR_SET_LEN(str, len);
10340 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10341 return str;
10342 }
10343 return Qnil;
10344}
10345
10346
10347/*
10348 * call-seq:
10349 * lstrip(*selectors) -> new_string
10350 *
10351 * Returns a copy of +self+ with leading whitespace removed;
10352 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10353 *
10354 * whitespace = "\x00\t\n\v\f\r "
10355 * s = whitespace + 'abc' + whitespace
10356 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10357 * s.lstrip
10358 * # => "abc\u0000\t\n\v\f\r "
10359 *
10360 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10361 *
10362 * s = "---abc+++"
10363 * s.lstrip("-") # => "abc+++"
10364 *
10365 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10366 * and may use any of its valid forms, including negation, ranges, and escapes:
10367 *
10368 * "01234abc56789".lstrip("0-9") # "abc56789"
10369 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10370 *
10371 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10372 */
10373
10374static VALUE
10375rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10376{
10377 char *start;
10378 long len, loffset;
10379
10380 RSTRING_GETMEM(str, start, len);
10381 if (argc > 0) {
10382 char table[TR_TABLE_SIZE];
10383 VALUE del = 0, nodel = 0;
10384
10385 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10386 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10387 }
10388 else {
10389 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10390 }
10391 if (loffset <= 0) return str_duplicate(rb_cString, str);
10392 return rb_str_subseq(str, loffset, len - loffset);
10393}
10394
10395static long
10396rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10397{
10398 const char *t;
10399
10400 rb_str_check_dummy_enc(enc);
10402 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10403 }
10404 if (!s || s >= e) return 0;
10405 t = e;
10406
10407 /* remove trailing spaces or '\0's */
10408 if (single_byte_optimizable(str)) {
10409 unsigned char c;
10410 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10411 }
10412 else {
10413 char *tp;
10414
10415 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10416 unsigned int c = rb_enc_codepoint(tp, e, enc);
10417 if (c && !rb_isspace(c)) break;
10418 t = tp;
10419 }
10420 }
10421 return e - t;
10422}
10423
10424static long
10425rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10426 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10427{
10428 const char *t;
10429 char *tp;
10430
10431 rb_str_check_dummy_enc(enc);
10433 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10434 }
10435 if (!s || s >= e) return 0;
10436 t = e;
10437
10438 /* remove trailing characters in the table */
10439 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10440 unsigned int c = rb_enc_codepoint(tp, e, enc);
10441 if (!tr_find(c, table, del, nodel)) break;
10442 t = tp;
10443 }
10444
10445 return e - t;
10446}
10447
10448/*
10449 * call-seq:
10450 * rstrip!(*selectors) -> self or nil
10451 *
10452 * Like String#rstrip, except that:
10453 *
10454 * - Performs stripping in +self+ (not in a copy of +self+).
10455 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10456 *
10457 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10458 */
10459
10460static VALUE
10461rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10462{
10463 rb_encoding *enc;
10464 char *start;
10465 long olen, roffset;
10466
10467 str_modify_keep_cr(str);
10468 enc = STR_ENC_GET(str);
10469 RSTRING_GETMEM(str, start, olen);
10470 if (argc > 0) {
10471 char table[TR_TABLE_SIZE];
10472 VALUE del = 0, nodel = 0;
10473
10474 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10475 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10476 }
10477 else {
10478 roffset = rstrip_offset(str, start, start+olen, enc);
10479 }
10480 if (roffset > 0) {
10481 long len = olen - roffset;
10482
10483 STR_SET_LEN(str, len);
10484 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10485 return str;
10486 }
10487 return Qnil;
10488}
10489
10490
10491/*
10492 * call-seq:
10493 * rstrip(*selectors) -> new_string
10494 *
10495 * Returns a copy of +self+ with trailing whitespace removed;
10496 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10497 *
10498 * whitespace = "\x00\t\n\v\f\r "
10499 * s = whitespace + 'abc' + whitespace
10500 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10501 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10502 *
10503 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10504 *
10505 * s = "---abc+++"
10506 * s.rstrip("+") # => "---abc"
10507 *
10508 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10509 * and may use any of its valid forms, including negation, ranges, and escapes:
10510 *
10511 * "01234abc56789".rstrip("0-9") # "01234abc"
10512 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10513 *
10514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10515 */
10516
10517static VALUE
10518rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10519{
10520 rb_encoding *enc;
10521 char *start;
10522 long olen, roffset;
10523
10524 enc = STR_ENC_GET(str);
10525 RSTRING_GETMEM(str, start, olen);
10526 if (argc > 0) {
10527 char table[TR_TABLE_SIZE];
10528 VALUE del = 0, nodel = 0;
10529
10530 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10531 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10532 }
10533 else {
10534 roffset = rstrip_offset(str, start, start+olen, enc);
10535 }
10536 if (roffset <= 0) return str_duplicate(rb_cString, str);
10537 return rb_str_subseq(str, 0, olen-roffset);
10538}
10539
10540
10541/*
10542 * call-seq:
10543 * strip!(*selectors) -> self or nil
10544 *
10545 * Like String#strip, except that:
10546 *
10547 * - Any modifications are made to +self+.
 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10549 *
10550 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10551 */
10552
10553static VALUE
10554rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10555{
10556 char *start;
10557 long olen, loffset, roffset;
10558 rb_encoding *enc;
10559
10560 str_modify_keep_cr(str);
10561 enc = STR_ENC_GET(str);
10562 RSTRING_GETMEM(str, start, olen);
10563
10564 if (argc > 0) {
10565 char table[TR_TABLE_SIZE];
10566 VALUE del = 0, nodel = 0;
10567
10568 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10569 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10570 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10571 }
10572 else {
10573 loffset = lstrip_offset(str, start, start+olen, enc);
10574 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10575 }
10576
10577 if (loffset > 0 || roffset > 0) {
10578 long len = olen-roffset;
10579 if (loffset > 0) {
10580 len -= loffset;
10581 memmove(start, start + loffset, len);
10582 }
10583 STR_SET_LEN(str, len);
10584 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10585 return str;
10586 }
10587 return Qnil;
10588}
10589
10590
10591/*
10592 * call-seq:
10593 * strip(*selectors) -> new_string
10594 *
10595 * Returns a copy of +self+ with leading and trailing whitespace removed;
10596 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10597 *
10598 * whitespace = "\x00\t\n\v\f\r "
10599 * s = whitespace + 'abc' + whitespace
10600 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10601 * s.strip # => "abc"
10602 *
10603 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10604 *
10605 * s = "---abc+++"
10606 * s.strip("-+") # => "abc"
10607 * s.strip("+-") # => "abc"
10608 *
10609 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10610 * and may use any of its valid forms, including negation, ranges, and escapes:
10611 *
10612 * "01234abc56789".strip("0-9") # "abc"
10613 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10614 *
10615 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10616 */
10617
10618static VALUE
10619rb_str_strip(int argc, VALUE *argv, VALUE str)
10620{
10621 char *start;
10622 long olen, loffset, roffset;
10623 rb_encoding *enc = STR_ENC_GET(str);
10624
10625 RSTRING_GETMEM(str, start, olen);
10626
10627 if (argc > 0) {
10628 char table[TR_TABLE_SIZE];
10629 VALUE del = 0, nodel = 0;
10630
10631 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10632 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10633 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10634 }
10635 else {
10636 loffset = lstrip_offset(str, start, start+olen, enc);
10637 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10638 }
10639
10640 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10641 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10642}
10643
10644static VALUE
10645scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10646{
10647 VALUE result = Qnil;
10648 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10649 if (pos >= 0) {
10650 VALUE match;
10651 struct re_registers *regs;
10652 if (BUILTIN_TYPE(pat) == T_STRING) {
10653 regs = NULL;
10654 end = pos + RSTRING_LEN(pat);
10655 }
10656 else {
10657 match = rb_backref_get();
10658 regs = RMATCH_REGS(match);
10659 pos = BEG(0);
10660 end = END(0);
10661 }
10662
10663 if (pos == end) {
10664 rb_encoding *enc = STR_ENC_GET(str);
10665 /*
10666 * Always consume at least one character of the input string
10667 */
10668 if (RSTRING_LEN(str) > end)
10669 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10670 RSTRING_END(str), enc);
10671 else
10672 *start = end + 1;
10673 }
10674 else {
10675 *start = end;
10676 }
10677
10678 if (!regs || regs->num_regs == 1) {
10679 result = rb_str_subseq(str, pos, end - pos);
10680 return result;
10681 }
10682 else {
10683 result = rb_ary_new2(regs->num_regs);
10684 for (int i = 1; i < regs->num_regs; i++) {
10685 VALUE s = Qnil;
10686 if (BEG(i) >= 0) {
10687 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10688 }
10689
10690 rb_ary_push(result, s);
10691 }
10692 }
10693
10694 RB_GC_GUARD(match);
10695 }
10696
10697 return result;
10698}
10699
10700
10701/*
10702 * call-seq:
10703 * scan(pattern) -> array_of_results
10704 * scan(pattern) {|result| ... } -> self
10705 *
10706 * :include: doc/string/scan.rdoc
10707 *
10708 */
10709
10710static VALUE
10711rb_str_scan(VALUE str, VALUE pat)
10712{
10713 VALUE result;
10714 long start = 0;
10715 long last = -1, prev = 0;
10716 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10717
10718 pat = get_pat_quoted(pat, 1);
10719 mustnot_broken(str);
10720 if (!rb_block_given_p()) {
10721 VALUE ary = rb_ary_new();
10722
10723 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10724 last = prev;
10725 prev = start;
10726 rb_ary_push(ary, result);
10727 }
10728 if (last >= 0) rb_pat_search(pat, str, last, 1);
10729 else rb_backref_set(Qnil);
10730 return ary;
10731 }
10732
10733 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10734 last = prev;
10735 prev = start;
10736 rb_yield(result);
10737 str_mod_check(str, p, len);
10738 }
10739 if (last >= 0) rb_pat_search(pat, str, last, 1);
10740 return str;
10741}
10742
10743
10744/*
10745 * call-seq:
10746 * hex -> integer
10747 *
10748 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10749 * returns its value as an integer.
10750 *
10751 * The leading substring is interpreted as hexadecimal when it begins with:
10752 *
 * - One or more characters representing hexadecimal digits
10754 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10755 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10756 *
10757 * 'f'.hex # => 15
10758 * '11'.hex # => 17
10759 * 'FFF'.hex # => 4095
10760 * 'fffg'.hex # => 4095
10761 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10762 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10763 * 'deadbeef'.hex # => 3735928559
10764 *
10765 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10766 *
10767 * '0xfff'.hex # => 4095
10768 * '0xfffg'.hex # => 4095
10769 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10771 *
10772 * '-fff'.hex # => -4095
10773 * '-0xFFF'.hex # => -4095
10774 *
10775 * For any substring not described above, returns zero:
10776 *
10777 * 'xxx'.hex # => 0
10778 * ''.hex # => 0
10779 *
10780 * Note that, unlike #oct, this method interprets only hexadecimal,
10781 * and not binary, octal, or decimal notations:
10782 *
10783 * '0b111'.hex # => 45329
10784 * '0o777'.hex # => 0
10785 * '0d999'.hex # => 55705
10786 *
10787 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10788 */
10789
10790static VALUE
10791rb_str_hex(VALUE str)
10792{
10793 return rb_str_to_inum(str, 16, FALSE);
10794}
10795
10796
10797/*
10798 * call-seq:
10799 * oct -> integer
10800 *
10801 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10802 * returns their value as an integer.
10803 *
10804 * In brief:
10805 *
10806 * # Interpreted as octal.
10807 * '777'.oct # => 511
10808 * '777x'.oct # => 511
10809 * '0777'.oct # => 511
10810 * '0o777'.oct # => 511
10811 * '-777'.oct # => -511
10812 * # Not interpreted as octal.
10813 * '0b111'.oct # => 7 # Interpreted as binary.
10814 * '0d999'.oct # => 999 # Interpreted as decimal.
10815 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10816 *
10817 * The leading substring is interpreted as octal when it begins with:
10818 *
 * - One or more characters representing octal digits
10820 * (each in the range <tt>'0'..'7'</tt>);
10821 * the string to be interpreted ends at the first character that does not represent an octal digit:
10822 *
 * '7'.oct # => 7
10824 * '11'.oct # => 9
10825 * '777'.oct # => 511
10826 * '0777'.oct # => 511
10827 * '7778'.oct # => 511
10828 * '777x'.oct # => 511
10829 *
10830 * - <tt>'0o'</tt>, followed by one or more octal digits:
10831 *
10832 * '0o777'.oct # => 511
10833 * '0o7778'.oct # => 511
10834 *
10835 * The leading substring is _not_ interpreted as octal when it begins with:
10836 *
10837 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10838 * (each in the range <tt>'0'..'1'</tt>);
10839 * the string to be interpreted ends at the first character that does not represent a binary digit.
10840 * the string is interpreted as binary digits (base 2):
10841 *
10842 * '0b111'.oct # => 7
10843 * '0b1112'.oct # => 7
10844 *
10845 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10846 * (each in the range <tt>'0'..'9'</tt>);
10847 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10848 * the string is interpreted as decimal digits (base 10):
10849 *
10850 * '0d999'.oct # => 999
10851 * '0d999x'.oct # => 999
10852 *
10853 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10854 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10855 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10856 * the string is interpreted as hexadecimal digits (base 16):
10857 *
10858 * '0xfff'.oct # => 4095
10859 * '0xfffg'.oct # => 4095
10860 *
 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10862 *
10863 * '-777'.oct # => -511
10864 * '-0777'.oct # => -511
10865 * '-0b111'.oct # => -7
10866 * '-0xfff'.oct # => -4095
10867 *
10868 * For any substring not described above, returns zero:
10869 *
10870 * 'foo'.oct # => 0
10871 * ''.oct # => 0
10872 *
10873 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10874 */
10875
10876static VALUE
10877rb_str_oct(VALUE str)
10878{
10879 return rb_str_to_inum(str, -8, FALSE);
10880}
10881
10882#ifndef HAVE_CRYPT_R
10883# include "ruby/thread_native.h"
10884# include "ruby/atomic.h"
10885
10886static struct {
10887 rb_nativethread_lock_t lock;
10888} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10889#endif
10890
10891/*
10892 * call-seq:
10893 * crypt(salt_str) -> new_string
10894 *
10895 * Returns the string generated by calling <code>crypt(3)</code>
10896 * standard library function with <code>str</code> and
10897 * <code>salt_str</code>, in this order, as its arguments. Please do
10898 * not use this method any longer. It is legacy; provided only for
10899 * backward compatibility with ruby scripts in earlier days. It is
10900 * bad to use in contemporary programs for several reasons:
10901 *
10902 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10903 * run. The generated string lacks data portability.
10904 *
10905 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10906 * (i.e. silently ends up in unexpected results).
10907 *
10908 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10909 * thread safe.
10910 *
10911 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10912 * very very weak. According to its manpage, Linux's traditional
10913 * <code>crypt(3)</code> output has only 2**56 variations; too
10914 * easy to brute force today. And this is the default behaviour.
10915 *
10916 * * In order to make things robust some OSes implement so-called
10917 * "modular" usage. To go through, you have to do a complex
10918 * build-up of the <code>salt_str</code> parameter, by hand.
10919 * Failure in generation of a proper salt string tends not to
10920 * yield any errors; typos in parameters are normally not
10921 * detectable.
10922 *
10923 * * For instance, in the following example, the second invocation
10924 * of String#crypt is wrong; it has a typo in "round=" (lacks
10925 * "s"). However the call does not fail and something unexpected
10926 * is generated.
10927 *
10928 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10929 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10930 *
10931 * * Even in the "modular" mode, some hash functions are considered
10932 * archaic and no longer recommended at all; for instance module
10933 * <code>$1$</code> is officially abandoned by its author: see
10934 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10935 * instance module <code>$3$</code> is considered completely
10936 * broken: see the manpage of FreeBSD.
10937 *
 * * On some OSes such as Mac OS, there is no modular mode. Yet, as
 * written above, <code>crypt(3)</code> on Mac OS never fails.
 * This means that even if you build up a proper salt string, it
 * generates a traditional DES hash anyway, and there is no way
 * for you to notice.
10943 *
10944 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10945 *
10946 * If for some reason you cannot migrate to other secure contemporary
10947 * password hashing algorithms, install the string-crypt gem and
10948 * <code>require 'string/crypt'</code> to continue using it.
10949 */
10950
10951static VALUE
10952rb_str_crypt(VALUE str, VALUE salt)
10953{
10954#ifdef HAVE_CRYPT_R
10955 VALUE databuf;
10956 struct crypt_data *data;
10957# define CRYPT_END() ALLOCV_END(databuf)
10958#else
10959 char *tmp_buf;
10960 extern char *crypt(const char *, const char *);
10961# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10962#endif
10963 VALUE result;
10964 const char *s, *saltp;
10965 char *res;
10966#ifdef BROKEN_CRYPT
10967 char salt_8bit_clean[3];
10968#endif
10969
10970 StringValue(salt);
10971 mustnot_wchar(str);
10972 mustnot_wchar(salt);
10973 s = StringValueCStr(str);
10974 saltp = RSTRING_PTR(salt);
10975 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10976 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10977 }
10978
10979#ifdef BROKEN_CRYPT
10980 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10981 salt_8bit_clean[0] = saltp[0] & 0x7f;
10982 salt_8bit_clean[1] = saltp[1] & 0x7f;
10983 salt_8bit_clean[2] = '\0';
10984 saltp = salt_8bit_clean;
10985 }
10986#endif
10987#ifdef HAVE_CRYPT_R
10988 data = ALLOCV(databuf, sizeof(struct crypt_data));
10989# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10990 data->initialized = 0;
10991# endif
10992 res = crypt_r(s, saltp, data);
10993#else
10994 rb_nativethread_lock_lock(&crypt_mutex.lock);
10995 res = crypt(s, saltp);
10996#endif
10997 if (!res) {
10998 int err = errno;
10999 CRYPT_END();
11000 rb_syserr_fail(err, "crypt");
11001 }
11002#ifdef HAVE_CRYPT_R
11003 result = rb_str_new_cstr(res);
11004 CRYPT_END();
11005#else
11006 // We need to copy this buffer because it's static and we need to unlock the mutex
11007 // before allocating a new object (the string to be returned). If we allocate while
11008 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
11009 // if other ractors are waiting on this lock.
11010 size_t res_size = strlen(res)+1;
11011 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
11012 memcpy(tmp_buf, res, res_size);
11013 res = tmp_buf;
11014 CRYPT_END();
11015 result = rb_str_new_cstr(res);
11016#endif
11017 return result;
11018}
11019
11020
11021/*
11022 * call-seq:
11023 * ord -> integer
11024 *
11025 * :include: doc/string/ord.rdoc
11026 *
11027 */
11028
11029static VALUE
11030rb_str_ord(VALUE s)
11031{
11032 unsigned int c;
11033
11034 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11035 return UINT2NUM(c);
11036}
11037/*
11038 * call-seq:
11039 * sum(n = 16) -> integer
11040 *
11041 * :include: doc/string/sum.rdoc
11042 *
11043 */
11044
11045static VALUE
11046rb_str_sum(int argc, VALUE *argv, VALUE str)
11047{
11048 int bits = 16;
11049 char *ptr, *p, *pend;
11050 long len;
11051 VALUE sum = INT2FIX(0);
11052 unsigned long sum0 = 0;
11053
11054 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11055 bits = 0;
11056 }
11057 ptr = p = RSTRING_PTR(str);
11058 len = RSTRING_LEN(str);
11059 pend = p + len;
11060
11061 while (p < pend) {
11062 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11063 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11064 str_mod_check(str, ptr, len);
11065 sum0 = 0;
11066 }
11067 sum0 += (unsigned char)*p;
11068 p++;
11069 }
11070
11071 if (bits == 0) {
11072 if (sum0) {
11073 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11074 }
11075 }
11076 else {
11077 if (sum == INT2FIX(0)) {
11078 if (bits < (int)sizeof(long)*CHAR_BIT) {
11079 sum0 &= (((unsigned long)1)<<bits)-1;
11080 }
11081 sum = LONG2FIX(sum0);
11082 }
11083 else {
11084 VALUE mod;
11085
11086 if (sum0) {
11087 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11088 }
11089
11090 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11091 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11092 sum = rb_funcall(sum, '&', 1, mod);
11093 }
11094 }
11095 return sum;
11096}
11097
11098static VALUE
11099rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11100{
11101 rb_encoding *enc;
11102 VALUE w;
11103 long width, len, flen = 1, fclen = 1;
11104 VALUE res;
11105 char *p;
11106 const char *f = " ";
11107 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11108 VALUE pad;
11109 int singlebyte = 1, cr;
11110 int termlen;
11111
11112 rb_scan_args(argc, argv, "11", &w, &pad);
11113 enc = STR_ENC_GET(str);
11114 termlen = rb_enc_mbminlen(enc);
11115 width = NUM2LONG(w);
11116 if (argc == 2) {
11117 StringValue(pad);
11118 enc = rb_enc_check(str, pad);
11119 f = RSTRING_PTR(pad);
11120 flen = RSTRING_LEN(pad);
11121 fclen = str_strlen(pad, enc); /* rb_enc_check */
11122 singlebyte = single_byte_optimizable(pad);
11123 if (flen == 0 || fclen == 0) {
11124 rb_raise(rb_eArgError, "zero width padding");
11125 }
11126 }
11127 len = str_strlen(str, enc); /* rb_enc_check */
11128 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11129 n = width - len;
11130 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11131 rlen = n - llen;
11132 cr = ENC_CODERANGE(str);
11133 if (flen > 1) {
11134 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11135 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11136 }
11137 size = RSTRING_LEN(str);
11138 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11139 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11140 (len += llen2 + rlen2) >= LONG_MAX - size) {
11141 rb_raise(rb_eArgError, "argument too big");
11142 }
11143 len += size;
11144 res = str_enc_new(rb_cString, 0, len, enc);
11145 p = RSTRING_PTR(res);
11146 if (flen <= 1) {
11147 memset(p, *f, llen);
11148 p += llen;
11149 }
11150 else {
11151 while (llen >= fclen) {
11152 memcpy(p,f,flen);
11153 p += flen;
11154 llen -= fclen;
11155 }
11156 if (llen > 0) {
11157 memcpy(p, f, llen2);
11158 p += llen2;
11159 }
11160 }
11161 memcpy(p, RSTRING_PTR(str), size);
11162 p += size;
11163 if (flen <= 1) {
11164 memset(p, *f, rlen);
11165 p += rlen;
11166 }
11167 else {
11168 while (rlen >= fclen) {
11169 memcpy(p,f,flen);
11170 p += flen;
11171 rlen -= fclen;
11172 }
11173 if (rlen > 0) {
11174 memcpy(p, f, rlen2);
11175 p += rlen2;
11176 }
11177 }
11178 TERM_FILL(p, termlen);
11179 STR_SET_LEN(res, p-RSTRING_PTR(res));
11180
11181 if (argc == 2)
11182 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11183 if (cr != ENC_CODERANGE_BROKEN)
11184 ENC_CODERANGE_SET(res, cr);
11185
11186 RB_GC_GUARD(pad);
11187 return res;
11188}
11189
11190
11191/*
11192 * call-seq:
11193 * ljust(width, pad_string = ' ') -> new_string
11194 *
11195 * :include: doc/string/ljust.rdoc
11196 *
11197 */
11198
11199static VALUE
11200rb_str_ljust(int argc, VALUE *argv, VALUE str)
11201{
11202 return rb_str_justify(argc, argv, str, 'l');
11203}
11204
11205/*
11206 * call-seq:
11207 * rjust(width, pad_string = ' ') -> new_string
11208 *
11209 * :include: doc/string/rjust.rdoc
11210 *
11211 */
11212
11213static VALUE
11214rb_str_rjust(int argc, VALUE *argv, VALUE str)
11215{
11216 return rb_str_justify(argc, argv, str, 'r');
11217}
11218
11219
11220/*
11221 * call-seq:
11222 * center(size, pad_string = ' ') -> new_string
11223 *
11224 * :include: doc/string/center.rdoc
11225 *
11226 */
11227
11228static VALUE
11229rb_str_center(int argc, VALUE *argv, VALUE str)
11230{
11231 return rb_str_justify(argc, argv, str, 'c');
11232}
11233
11234/*
11235 * call-seq:
11236 * partition(pattern) -> [pre_match, first_match, post_match]
11237 *
11238 * :include: doc/string/partition.rdoc
11239 *
11240 */
11241
11242static VALUE
11243rb_str_partition(VALUE str, VALUE sep)
11244{
11245 long pos;
11246
11247 sep = get_pat_quoted(sep, 0);
11248 if (RB_TYPE_P(sep, T_REGEXP)) {
11249 if (rb_reg_search(sep, str, 0, 0) < 0) {
11250 goto failed;
11251 }
11252 VALUE match = rb_backref_get();
11253 struct re_registers *regs = RMATCH_REGS(match);
11254
11255 pos = BEG(0);
11256 sep = rb_str_subseq(str, pos, END(0) - pos);
11257 }
11258 else {
11259 pos = rb_str_index(str, sep, 0);
11260 if (pos < 0) goto failed;
11261 }
11262 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11263 sep,
11264 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11265 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11266
11267 failed:
11268 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11269}
11270
11271/*
11272 * call-seq:
11273 * rpartition(pattern) -> [pre_match, last_match, post_match]
11274 *
11275 * :include: doc/string/rpartition.rdoc
11276 *
11277 */
11278
11279static VALUE
11280rb_str_rpartition(VALUE str, VALUE sep)
11281{
11282 long pos = RSTRING_LEN(str);
11283
11284 sep = get_pat_quoted(sep, 0);
11285 if (RB_TYPE_P(sep, T_REGEXP)) {
11286 if (rb_reg_search(sep, str, pos, 1) < 0) {
11287 goto failed;
11288 }
11289 VALUE match = rb_backref_get();
11290 struct re_registers *regs = RMATCH_REGS(match);
11291
11292 pos = BEG(0);
11293 sep = rb_str_subseq(str, pos, END(0) - pos);
11294 }
11295 else {
11296 pos = rb_str_sublen(str, pos);
11297 pos = rb_str_rindex(str, sep, pos);
11298 if (pos < 0) {
11299 goto failed;
11300 }
11301 }
11302
11303 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11304 sep,
11305 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11306 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11307 failed:
11308 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11309}
11310
11311/*
11312 * call-seq:
11313 * start_with?(*patterns) -> true or false
11314 *
11315 * :include: doc/string/start_with_p.rdoc
11316 *
11317 */
11318
11319static VALUE
11320rb_str_start_with(int argc, VALUE *argv, VALUE str)
11321{
11322 int i;
11323
11324 for (i=0; i<argc; i++) {
11325 VALUE tmp = argv[i];
11326 if (RB_TYPE_P(tmp, T_REGEXP)) {
11327 if (rb_reg_start_with_p(tmp, str))
11328 return Qtrue;
11329 }
11330 else {
11331 const char *p, *s, *e;
11332 long slen, tlen;
11333 rb_encoding *enc;
11334
11335 StringValue(tmp);
11336 enc = rb_enc_check(str, tmp);
11337 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11338 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11339 p = RSTRING_PTR(str);
11340 e = p + slen;
11341 s = p + tlen;
11342 if (!at_char_right_boundary(p, s, e, enc))
11343 continue;
11344 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11345 return Qtrue;
11346 }
11347 }
11348 return Qfalse;
11349}
11350
11351/*
11352 * call-seq:
11353 * end_with?(*strings) -> true or false
11354 *
11355 * :include: doc/string/end_with_p.rdoc
11356 *
11357 */
11358
11359static VALUE
11360rb_str_end_with(int argc, VALUE *argv, VALUE str)
11361{
11362 int i;
11363
11364 for (i=0; i<argc; i++) {
11365 VALUE tmp = argv[i];
11366 const char *p, *s, *e;
11367 long slen, tlen;
11368 rb_encoding *enc;
11369
11370 StringValue(tmp);
11371 enc = rb_enc_check(str, tmp);
11372 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11373 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11374 p = RSTRING_PTR(str);
11375 e = p + slen;
11376 s = e - tlen;
11377 if (!at_char_boundary(p, s, e, enc))
11378 continue;
11379 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11380 return Qtrue;
11381 }
11382 return Qfalse;
11383}
11384
11394static long
11395deleted_prefix_length(VALUE str, VALUE prefix)
11396{
11397 const char *strptr, *prefixptr;
11398 long olen, prefixlen;
11399 rb_encoding *enc = rb_enc_get(str);
11400
11401 StringValue(prefix);
11402
11403 if (!is_broken_string(prefix) ||
11404 !rb_enc_asciicompat(enc) ||
11405 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11406 enc = rb_enc_check(str, prefix);
11407 }
11408
11409 /* return 0 if not start with prefix */
11410 prefixlen = RSTRING_LEN(prefix);
11411 if (prefixlen <= 0) return 0;
11412 olen = RSTRING_LEN(str);
11413 if (olen < prefixlen) return 0;
11414 strptr = RSTRING_PTR(str);
11415 prefixptr = RSTRING_PTR(prefix);
11416 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11417 if (is_broken_string(prefix)) {
11418 if (!is_broken_string(str)) {
11419 /* prefix in a valid string cannot be broken */
11420 return 0;
11421 }
11422 const char *strend = strptr + olen;
11423 const char *after_prefix = strptr + prefixlen;
11424 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11425 /* prefix does not end at char-boundary */
11426 return 0;
11427 }
11428 }
11429 /* prefix part in `str` also should be valid. */
11430
11431 return prefixlen;
11432}
11433
11434/*
11435 * call-seq:
11436 * delete_prefix!(prefix) -> self or nil
11437 *
11438 * Like String#delete_prefix, except that +self+ is modified in place;
11439 * returns +self+ if the prefix is removed, +nil+ otherwise.
11440 *
11441 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11442 */
11443
11444static VALUE
11445rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11446{
11447 long prefixlen;
11448 str_modify_keep_cr(str);
11449
11450 prefixlen = deleted_prefix_length(str, prefix);
11451 if (prefixlen <= 0) return Qnil;
11452
11453 return rb_str_drop_bytes(str, prefixlen);
11454}
11455
11456/*
11457 * call-seq:
11458 * delete_prefix(prefix) -> new_string
11459 *
11460 * :include: doc/string/delete_prefix.rdoc
11461 *
11462 */
11463
11464static VALUE
11465rb_str_delete_prefix(VALUE str, VALUE prefix)
11466{
11467 long prefixlen;
11468
11469 prefixlen = deleted_prefix_length(str, prefix);
11470 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11471
11472 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11473}
11474
11484static long
11485deleted_suffix_length(VALUE str, VALUE suffix)
11486{
11487 const char *strptr, *suffixptr;
11488 long olen, suffixlen;
11489 rb_encoding *enc;
11490
11491 StringValue(suffix);
11492 if (is_broken_string(suffix)) return 0;
11493 enc = rb_enc_check(str, suffix);
11494
11495 /* return 0 if not start with suffix */
11496 suffixlen = RSTRING_LEN(suffix);
11497 if (suffixlen <= 0) return 0;
11498 olen = RSTRING_LEN(str);
11499 if (olen < suffixlen) return 0;
11500 strptr = RSTRING_PTR(str);
11501 suffixptr = RSTRING_PTR(suffix);
11502 const char *strend = strptr + olen;
11503 const char *before_suffix = strend - suffixlen;
11504 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11505 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11506
11507 return suffixlen;
11508}
11509
11510/*
11511 * call-seq:
11512 * delete_suffix!(suffix) -> self or nil
11513 *
11514 * Like String#delete_suffix, except that +self+ is modified in place;
11515 * returns +self+ if the suffix is removed, +nil+ otherwise.
11516 *
11517 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11518 */
11519
11520static VALUE
11521rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11522{
11523 long olen, suffixlen, len;
11524 str_modifiable(str);
11525
11526 suffixlen = deleted_suffix_length(str, suffix);
11527 if (suffixlen <= 0) return Qnil;
11528
11529 olen = RSTRING_LEN(str);
11530 str_modify_keep_cr(str);
11531 len = olen - suffixlen;
11532 STR_SET_LEN(str, len);
11533 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11534 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11536 }
11537 return str;
11538}
11539
11540/*
11541 * call-seq:
11542 * delete_suffix(suffix) -> new_string
11543 *
11544 * :include: doc/string/delete_suffix.rdoc
11545 *
11546 */
11547
11548static VALUE
11549rb_str_delete_suffix(VALUE str, VALUE suffix)
11550{
11551 long suffixlen;
11552
11553 suffixlen = deleted_suffix_length(str, suffix);
11554 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11555
11556 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11557}
11558
11559void
11560rb_str_setter(VALUE val, ID id, VALUE *var)
11561{
11562 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11563 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11564 }
11565 *var = val;
11566}
11567
11568static void
11569nil_setter_warning(ID id)
11570{
11571 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11572}
11573
11574void
11575rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11576{
11577 rb_str_setter(val, id, var);
11578 if (!NIL_P(*var)) {
11579 nil_setter_warning(id);
11580 }
11581}
11582
11583static void
11584rb_fs_setter(VALUE val, ID id, VALUE *var)
11585{
11586 val = rb_fs_check(val);
11587 if (!val) {
11588 rb_raise(rb_eTypeError,
11589 "value of %"PRIsVALUE" must be String or Regexp",
11590 rb_id2str(id));
11591 }
11592 if (!NIL_P(val)) {
11593 nil_setter_warning(id);
11594 }
11595 *var = val;
11596}
11597
11598
11599/*
11600 * call-seq:
11601 * force_encoding(encoding) -> self
11602 *
11603 * :include: doc/string/force_encoding.rdoc
11604 *
11605 */
11606
11607static VALUE
11608rb_str_force_encoding(VALUE str, VALUE enc)
11609{
11610 str_modifiable(str);
11611
11612 rb_encoding *encoding = rb_to_encoding(enc);
11613 int idx = rb_enc_to_index(encoding);
11614
11615 // If the encoding is unchanged, we do nothing.
11616 if (ENCODING_GET(str) == idx) {
11617 return str;
11618 }
11619
11620 rb_enc_associate_index(str, idx);
11621
11622 // If the coderange was 7bit and the new encoding is ASCII-compatible
11623 // we can keep the coderange.
11624 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11625 return str;
11626 }
11627
11629 return str;
11630}
11631
11632/*
11633 * call-seq:
11634 * b -> new_string
11635 *
11636 * :include: doc/string/b.rdoc
11637 *
11638 */
11639
11640static VALUE
11641rb_str_b(VALUE str)
11642{
11643 VALUE str2;
11644 if (STR_EMBED_P(str)) {
11645 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11646 }
11647 else {
11648 str2 = str_alloc_heap(rb_cString);
11649 }
11650 str_replace_shared_without_enc(str2, str);
11651
11652 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11653 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11654 // If we know the receiver's code range then we know the result's code range.
11655 int cr = ENC_CODERANGE(str);
11656 switch (cr) {
11657 case ENC_CODERANGE_7BIT:
11659 break;
11663 break;
11664 default:
11665 ENC_CODERANGE_CLEAR(str2);
11666 break;
11667 }
11668 }
11669
11670 return str2;
11671}
11672
11673/*
11674 * call-seq:
11675 * valid_encoding? -> true or false
11676 *
11677 * :include: doc/string/valid_encoding_p.rdoc
11678 *
11679 */
11680
11681static VALUE
11682rb_str_valid_encoding_p(VALUE str)
11683{
11684 int cr = rb_enc_str_coderange(str);
11685
11686 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11687}
11688
11689/*
11690 * call-seq:
11691 * ascii_only? -> true or false
11692 *
11693 * Returns whether +self+ contains only ASCII characters:
11694 *
11695 * 'abc'.ascii_only? # => true
11696 * "abc\u{6666}".ascii_only? # => false
11697 *
11698 * Related: see {Querying}[rdoc-ref:String@Querying].
11699 */
11700
11701static VALUE
11702rb_str_is_ascii_only_p(VALUE str)
11703{
11704 int cr = rb_enc_str_coderange(str);
11705
11706 return RBOOL(cr == ENC_CODERANGE_7BIT);
11707}
11708
11709VALUE
11711{
11712 static const char ellipsis[] = "...";
11713 const long ellipsislen = sizeof(ellipsis) - 1;
11714 rb_encoding *const enc = rb_enc_get(str);
11715 const long blen = RSTRING_LEN(str);
11716 const char *const p = RSTRING_PTR(str), *e = p + blen;
11717 VALUE estr, ret = 0;
11718
11719 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11720 if (len * rb_enc_mbminlen(enc) >= blen ||
11721 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11722 ret = str;
11723 }
11724 else if (len <= ellipsislen ||
11725 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11726 if (rb_enc_asciicompat(enc)) {
11727 ret = rb_str_new(ellipsis, len);
11728 rb_enc_associate(ret, enc);
11729 }
11730 else {
11731 estr = rb_usascii_str_new(ellipsis, len);
11732 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11733 }
11734 }
11735 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11736 rb_str_cat(ret, ellipsis, ellipsislen);
11737 }
11738 else {
11739 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11740 rb_enc_from_encoding(enc), 0, Qnil);
11741 rb_str_append(ret, estr);
11742 }
11743 return ret;
11744}
11745
11746static VALUE
11747str_compat_and_valid(VALUE str, rb_encoding *enc)
11748{
11749 int cr;
11750 str = StringValue(str);
11751 cr = rb_enc_str_coderange(str);
11752 if (cr == ENC_CODERANGE_BROKEN) {
11753 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11754 }
11755 else {
11756 rb_encoding *e = STR_ENC_GET(str);
11757 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11758 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11759 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11760 }
11761 }
11762 return str;
11763}
11764
11765static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11766
11767VALUE
11769{
11770 rb_encoding *enc = STR_ENC_GET(str);
11771 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11772}
11773
11774VALUE
11775rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11776{
11777 int cr = ENC_CODERANGE_UNKNOWN;
11778 if (enc == STR_ENC_GET(str)) {
11779 /* cached coderange makes sense only when enc equals the
11780 * actual encoding of str */
11781 cr = ENC_CODERANGE(str);
11782 }
11783 return enc_str_scrub(enc, str, repl, cr);
11784}
11785
11786static VALUE
11787enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11788{
11789 int encidx;
11790 VALUE buf = Qnil;
11791 const char *rep, *p, *e, *p1, *sp;
11792 long replen = -1;
11793 long slen;
11794
11795 if (rb_block_given_p()) {
11796 if (!NIL_P(repl))
11797 rb_raise(rb_eArgError, "both of block and replacement given");
11798 replen = 0;
11799 }
11800
11801 if (ENC_CODERANGE_CLEAN_P(cr))
11802 return Qnil;
11803
11804 if (!NIL_P(repl)) {
11805 repl = str_compat_and_valid(repl, enc);
11806 }
11807
11808 if (rb_enc_dummy_p(enc)) {
11809 return Qnil;
11810 }
11811 encidx = rb_enc_to_index(enc);
11812
11813#define DEFAULT_REPLACE_CHAR(str) do { \
11814 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11815 rep = replace; replen = (int)sizeof(replace); \
11816 } while (0)
11817
11818 slen = RSTRING_LEN(str);
11819 p = RSTRING_PTR(str);
11820 e = RSTRING_END(str);
11821 p1 = p;
11822 sp = p;
11823
11824 if (rb_enc_asciicompat(enc)) {
11825 int rep7bit_p;
11826 if (!replen) {
11827 rep = NULL;
11828 rep7bit_p = FALSE;
11829 }
11830 else if (!NIL_P(repl)) {
11831 rep = RSTRING_PTR(repl);
11832 replen = RSTRING_LEN(repl);
11833 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11834 }
11835 else if (encidx == rb_utf8_encindex()) {
11836 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11837 rep7bit_p = FALSE;
11838 }
11839 else {
11840 DEFAULT_REPLACE_CHAR("?");
11841 rep7bit_p = TRUE;
11842 }
11843 cr = ENC_CODERANGE_7BIT;
11844
11845 p = search_nonascii(p, e);
11846 if (!p) {
11847 p = e;
11848 }
11849 while (p < e) {
11850 int ret = rb_enc_precise_mbclen(p, e, enc);
11851 if (MBCLEN_NEEDMORE_P(ret)) {
11852 break;
11853 }
11854 else if (MBCLEN_CHARFOUND_P(ret)) {
11856 p += MBCLEN_CHARFOUND_LEN(ret);
11857 }
11858 else if (MBCLEN_INVALID_P(ret)) {
11859 /*
11860 * p1~p: valid ascii/multibyte chars
11861 * p ~e: invalid bytes + unknown bytes
11862 */
11863 long clen = rb_enc_mbmaxlen(enc);
11864 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11865 if (p > p1) {
11866 rb_str_buf_cat(buf, p1, p - p1);
11867 }
11868
11869 if (e - p < clen) clen = e - p;
11870 if (clen <= 2) {
11871 clen = 1;
11872 }
11873 else {
11874 const char *q = p;
11875 clen--;
11876 for (; clen > 1; clen--) {
11877 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11878 if (MBCLEN_NEEDMORE_P(ret)) break;
11879 if (MBCLEN_INVALID_P(ret)) continue;
11881 }
11882 }
11883 if (rep) {
11884 rb_str_buf_cat(buf, rep, replen);
11885 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11886 }
11887 else {
11888 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11889 str_mod_check(str, sp, slen);
11890 repl = str_compat_and_valid(repl, enc);
11891 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11894 }
11895 p += clen;
11896 p1 = p;
11897 p = search_nonascii(p, e);
11898 if (!p) {
11899 p = e;
11900 break;
11901 }
11902 }
11903 else {
11905 }
11906 }
11907 if (NIL_P(buf)) {
11908 if (p == e) {
11909 ENC_CODERANGE_SET(str, cr);
11910 return Qnil;
11911 }
11912 buf = rb_str_buf_new(RSTRING_LEN(str));
11913 }
11914 if (p1 < p) {
11915 rb_str_buf_cat(buf, p1, p - p1);
11916 }
11917 if (p < e) {
11918 if (rep) {
11919 rb_str_buf_cat(buf, rep, replen);
11920 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11921 }
11922 else {
11923 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11924 str_mod_check(str, sp, slen);
11925 repl = str_compat_and_valid(repl, enc);
11926 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11929 }
11930 }
11931 }
11932 else {
11933 /* ASCII incompatible */
11934 long mbminlen = rb_enc_mbminlen(enc);
11935 if (!replen) {
11936 rep = NULL;
11937 }
11938 else if (!NIL_P(repl)) {
11939 rep = RSTRING_PTR(repl);
11940 replen = RSTRING_LEN(repl);
11941 }
11942 else if (encidx == ENCINDEX_UTF_16BE) {
11943 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11944 }
11945 else if (encidx == ENCINDEX_UTF_16LE) {
11946 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11947 }
11948 else if (encidx == ENCINDEX_UTF_32BE) {
11949 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11950 }
11951 else if (encidx == ENCINDEX_UTF_32LE) {
11952 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11953 }
11954 else {
11955 DEFAULT_REPLACE_CHAR("?");
11956 }
11957
11958 while (p < e) {
11959 int ret = rb_enc_precise_mbclen(p, e, enc);
11960 if (MBCLEN_NEEDMORE_P(ret)) {
11961 break;
11962 }
11963 else if (MBCLEN_CHARFOUND_P(ret)) {
11964 p += MBCLEN_CHARFOUND_LEN(ret);
11965 }
11966 else if (MBCLEN_INVALID_P(ret)) {
11967 const char *q = p;
11968 long clen = rb_enc_mbmaxlen(enc);
11969 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11970 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11971
11972 if (e - p < clen) clen = e - p;
11973 if (clen <= mbminlen * 2) {
11974 clen = mbminlen;
11975 }
11976 else {
11977 clen -= mbminlen;
11978 for (; clen > mbminlen; clen-=mbminlen) {
11979 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11980 if (MBCLEN_NEEDMORE_P(ret)) break;
11981 if (MBCLEN_INVALID_P(ret)) continue;
11983 }
11984 }
11985 if (rep) {
11986 rb_str_buf_cat(buf, rep, replen);
11987 }
11988 else {
11989 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11990 str_mod_check(str, sp, slen);
11991 repl = str_compat_and_valid(repl, enc);
11992 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11993 }
11994 p += clen;
11995 p1 = p;
11996 }
11997 else {
11999 }
12000 }
12001 if (NIL_P(buf)) {
12002 if (p == e) {
12004 return Qnil;
12005 }
12006 buf = rb_str_buf_new(RSTRING_LEN(str));
12007 }
12008 if (p1 < p) {
12009 rb_str_buf_cat(buf, p1, p - p1);
12010 }
12011 if (p < e) {
12012 if (rep) {
12013 rb_str_buf_cat(buf, rep, replen);
12014 }
12015 else {
12016 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12017 str_mod_check(str, sp, slen);
12018 repl = str_compat_and_valid(repl, enc);
12019 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12020 }
12021 }
12023 }
12024 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12025 return buf;
12026}
12027
12028/*
12029 * call-seq:
12030 * scrub(replacement_string = default_replacement_string) -> new_string
12031 * scrub{|sequence| ... } -> new_string
12032 *
12033 * :include: doc/string/scrub.rdoc
12034 *
12035 */
12036static VALUE
12037str_scrub(int argc, VALUE *argv, VALUE str)
12038{
12039 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12040 VALUE new = rb_str_scrub(str, repl);
12041 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12042}
12043
12044/*
12045 * call-seq:
12046 * scrub!(replacement_string = default_replacement_string) -> self
12047 * scrub!{|sequence| ... } -> self
12048 *
12049 * Like String#scrub, except that:
12050 *
12051 * - Any replacements are made in +self+.
12052 * - Returns +self+.
12053 *
12054 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12055 *
12056 */
12057static VALUE
12058str_scrub_bang(int argc, VALUE *argv, VALUE str)
12059{
12060 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12061 VALUE new = rb_str_scrub(str, repl);
12062 if (!NIL_P(new)) rb_str_replace(str, new);
12063 return str;
12064}
12065
12066static ID id_normalize;
12067static ID id_normalized_p;
12068static VALUE mUnicodeNormalize;
12069
12070static VALUE
12071unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12072{
12073 static int UnicodeNormalizeRequired = 0;
12074 VALUE argv2[2];
12075
12076 if (!UnicodeNormalizeRequired) {
12077 rb_require("unicode_normalize/normalize.rb");
12078 UnicodeNormalizeRequired = 1;
12079 }
12080 argv2[0] = str;
12081 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12082 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12083}
12084
12085/*
12086 * call-seq:
12087 * unicode_normalize(form = :nfc) -> string
12088 *
12089 * :include: doc/string/unicode_normalize.rdoc
12090 *
12091 */
12092static VALUE
12093rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12094{
12095 return unicode_normalize_common(argc, argv, str, id_normalize);
12096}
12097
12098/*
12099 * call-seq:
12100 * unicode_normalize!(form = :nfc) -> self
12101 *
12102 * Like String#unicode_normalize, except that the normalization
12103 * is performed on +self+ (not on a copy of +self+).
12104 *
12105 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12106 *
12107 */
12108static VALUE
12109rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12110{
12111 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12112}
12113
12114/* call-seq:
12115 * unicode_normalized?(form = :nfc) -> true or false
12116 *
12117 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12118 * see String#unicode_normalize.
12119 *
12120 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12121 *
12122 * Examples:
12123 *
12124 * "a\u0300".unicode_normalized? # => false
12125 * "a\u0300".unicode_normalized?(:nfd) # => true
12126 * "\u00E0".unicode_normalized? # => true
12127 * "\u00E0".unicode_normalized?(:nfd) # => false
12128 *
12129 *
12130 * Raises an exception if +self+ is not in a Unicode encoding:
12131 *
12132 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12133 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12134 *
12135 * Related: see {Querying}[rdoc-ref:String@Querying].
12136 */
12137static VALUE
12138rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12139{
12140 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12141}
12142
12143/**********************************************************************
12144 * Document-class: Symbol
12145 *
12146 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12147 *
12148 * You can create a +Symbol+ object explicitly with:
12149 *
12150 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12151 *
12152 * The same +Symbol+ object will be
12153 * created for a given name or string for the duration of a program's
12154 * execution, regardless of the context or meaning of that name. Thus
12155 * if <code>Fred</code> is a constant in one context, a method in
12156 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12157 * will be the same object in all three contexts.
12158 *
12159 * module One
12160 * class Fred
12161 * end
12162 * $f1 = :Fred
12163 * end
12164 * module Two
12165 * Fred = 1
12166 * $f2 = :Fred
12167 * end
12168 * def Fred()
12169 * end
12170 * $f3 = :Fred
12171 * $f1.object_id #=> 2514190
12172 * $f2.object_id #=> 2514190
12173 * $f3.object_id #=> 2514190
12174 *
12175 * Constant, method, and variable names are returned as symbols:
12176 *
12177 * module One
12178 * Two = 2
12179 * def three; 3 end
12180 * @four = 4
12181 * @@five = 5
12182 * $six = 6
12183 * end
12184 * seven = 7
12185 *
12186 * One.constants
12187 * # => [:Two]
12188 * One.instance_methods(true)
12189 * # => [:three]
12190 * One.instance_variables
12191 * # => [:@four]
12192 * One.class_variables
12193 * # => [:@@five]
12194 * global_variables.grep(/six/)
12195 * # => [:$six]
12196 * local_variables
12197 * # => [:seven]
12198 *
12199 * A +Symbol+ object differs from a String object in that
12200 * a +Symbol+ object represents an identifier, while a String object
12201 * represents text or data.
12202 *
12203 * == What's Here
12204 *
12205 * First, what's elsewhere. Class +Symbol+:
12206 *
12207 * - Inherits from {class Object}[rdoc-ref:Object@Whats+Here].
12208 * - Includes {module Comparable}[rdoc-ref:Comparable@Whats+Here].
12209 *
12210 * Here, class +Symbol+ provides methods that are useful for:
12211 *
12212 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12213 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12214 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12215 *
12216 * === Methods for Querying
12217 *
12218 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12219 * - #=~: Returns the index of the first substring in symbol that matches a
12220 * given Regexp or other object; returns +nil+ if no match is found.
 * - #[], #slice: Returns a substring of symbol
12222 * determined by a given index, start/length, or range, or string.
12223 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12224 * - #encoding: Returns the Encoding object that represents the encoding
12225 * of symbol.
12226 * - #end_with?: Returns +true+ if symbol ends with
12227 * any of the given strings.
12228 * - #match: Returns a MatchData object if symbol
12229 * matches a given Regexp; +nil+ otherwise.
12230 * - #match?: Returns +true+ if symbol
12231 * matches a given Regexp; +false+ otherwise.
12232 * - #length, #size: Returns the number of characters in symbol.
12233 * - #start_with?: Returns +true+ if symbol starts with
12234 * any of the given strings.
12235 *
12236 * === Methods for Comparing
12237 *
 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
 *   or larger than a given symbol; returns +nil+ if the given object is not a symbol.
12240 * - #==, #===: Returns +true+ if a given symbol has the same content and
12241 * encoding.
 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol is smaller than,
 *   equal to, or larger than a given symbol; returns +nil+ if the given object is not a symbol.
12244 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12245 * after Unicode case folding; +false+ otherwise.
12246 *
12247 * === Methods for Converting
12248 *
12249 * - #capitalize: Returns symbol with the first character upcased
12250 * and all other characters downcased.
12251 * - #downcase: Returns symbol with all characters downcased.
12252 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12253 * - #name: Returns the frozen string corresponding to symbol.
12254 * - #succ, #next: Returns the symbol that is the successor to symbol.
12255 * - #swapcase: Returns symbol with all upcase characters downcased
12256 * and all downcase characters upcased.
12257 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12258 * - #to_s, #id2name: Returns the string corresponding to +self+.
12259 * - #to_sym, #intern: Returns +self+.
12260 * - #upcase: Returns symbol with all characters upcased.
12261 *
12262 */
12263
12264
/*
 * call-seq:
 *   self == other -> true or false
 *
 * Returns whether +other+ is the same object as +self+.
 */

#define sym_equal rb_obj_equal
/* sym_equal is purely another name for rb_obj_equal; Init_String registers
 * it for both Symbol#== and Symbol#===. */
12273
12274static int
12275sym_printable(const char *s, const char *send, rb_encoding *enc)
12276{
12277 while (s < send) {
12278 int n;
12279 int c = rb_enc_precise_mbclen(s, send, enc);
12280
12281 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12282 n = MBCLEN_CHARFOUND_LEN(c);
12283 c = rb_enc_mbc_to_codepoint(s, send, enc);
12284 if (!rb_enc_isprint(c, enc)) return FALSE;
12285 s += n;
12286 }
12287 return TRUE;
12288}
12289
12290int
12291rb_str_symname_p(VALUE sym)
12292{
12293 rb_encoding *enc;
12294 const char *ptr;
12295 long len;
12296 rb_encoding *resenc = rb_default_internal_encoding();
12297
12298 if (resenc == NULL) resenc = rb_default_external_encoding();
12299 enc = STR_ENC_GET(sym);
12300 ptr = RSTRING_PTR(sym);
12301 len = RSTRING_LEN(sym);
12302 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12303 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12304 return FALSE;
12305 }
12306 return TRUE;
12307}
12308
12309VALUE
12310rb_str_quote_unprintable(VALUE str)
12311{
12312 rb_encoding *enc;
12313 const char *ptr;
12314 long len;
12315 rb_encoding *resenc;
12316
12317 Check_Type(str, T_STRING);
12318 resenc = rb_default_internal_encoding();
12319 if (resenc == NULL) resenc = rb_default_external_encoding();
12320 enc = STR_ENC_GET(str);
12321 ptr = RSTRING_PTR(str);
12322 len = RSTRING_LEN(str);
12323 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12324 !sym_printable(ptr, ptr + len, enc)) {
12325 return rb_str_escape(str);
12326 }
12327 return str;
12328}
12329
12330VALUE
12331rb_id_quote_unprintable(ID id)
12332{
12333 VALUE str = rb_id2str(id);
12334 if (!rb_str_symname_p(str)) {
12335 return rb_str_escape(str);
12336 }
12337 return str;
12338}
12339
12340/*
12341 * call-seq:
12342 * inspect -> string
12343 *
12344 * Returns a string representation of +self+ (including the leading colon):
12345 *
12346 * :foo.inspect # => ":foo"
12347 *
12348 * Related: Symbol#to_s, Symbol#name.
12349 *
12350 */
12351
12352static VALUE
12353sym_inspect(VALUE sym)
12354{
12355 VALUE str = rb_sym2str(sym);
12356 const char *ptr;
12357 long len;
12358 char *dest;
12359
12360 if (!rb_str_symname_p(str)) {
12361 str = rb_str_inspect(str);
12362 len = RSTRING_LEN(str);
12363 rb_str_resize(str, len + 1);
12364 dest = RSTRING_PTR(str);
12365 memmove(dest + 1, dest, len);
12366 }
12367 else {
12368 rb_encoding *enc = STR_ENC_GET(str);
12369 VALUE orig_str = str;
12370
12371 len = RSTRING_LEN(orig_str);
12372 str = rb_enc_str_new(0, len + 1, enc);
12373
12374 // Get data pointer after allocation
12375 ptr = RSTRING_PTR(orig_str);
12376 dest = RSTRING_PTR(str);
12377 memcpy(dest + 1, ptr, len);
12378
12379 RB_GC_GUARD(orig_str);
12380 }
12381 dest[0] = ':';
12382
12384
12385 return str;
12386}
12387
/*
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing between the return type and the opening brace;
 * the body reads a value named +sym+. Restore it from the full file.
 *
 * Creates a String of class rb_cString from rb_sym2str(sym) with
 * str_new_shared(), sets the STR_CHILLED_SYMBOL_TO_S flag on it, and
 * returns it.
 */
VALUE
{
    VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
    FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
    return str;
}
12395
12396VALUE
12397rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12398{
12399 VALUE obj;
12400
12401 if (argc < 1) {
12402 rb_raise(rb_eArgError, "no receiver given");
12403 }
12404 obj = argv[0];
12405 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12406}
12407
12408/*
12409 * call-seq:
12410 * succ
12411 *
12412 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12413 *
12414 * :foo.succ # => :fop
12415 *
12416 * Related: String#succ.
12417 */
12418
12419static VALUE
12420sym_succ(VALUE sym)
12421{
12422 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12423}
12424
12425/*
12426 * call-seq:
12427 * self <=> other -> -1, 0, 1, or nil
12428 *
12429 * Compares +self+ and +other+, using String#<=>.
12430 *
12431 * Returns:
12432 *
12433 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12434 * - +nil+, otherwise.
12435 *
12436 * Examples:
12437 *
12438 * :bar <=> :foo # => -1
12439 * :foo <=> :foo # => 0
12440 * :foo <=> :bar # => 1
12441 * :foo <=> 'bar' # => nil
12442 *
12443 * \Class \Symbol includes module Comparable,
12444 * each of whose methods uses Symbol#<=> for comparison.
12445 *
12446 * Related: String#<=>.
12447 */
12448
12449static VALUE
12450sym_cmp(VALUE sym, VALUE other)
12451{
12452 if (!SYMBOL_P(other)) {
12453 return Qnil;
12454 }
12455 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12456}
12457
12458/*
12459 * call-seq:
12460 * casecmp(object) -> -1, 0, 1, or nil
12461 *
12462 * :include: doc/symbol/casecmp.rdoc
12463 *
12464 */
12465
12466static VALUE
12467sym_casecmp(VALUE sym, VALUE other)
12468{
12469 if (!SYMBOL_P(other)) {
12470 return Qnil;
12471 }
12472 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12473}
12474
12475/*
12476 * call-seq:
12477 * casecmp?(object) -> true, false, or nil
12478 *
12479 * :include: doc/symbol/casecmp_p.rdoc
12480 *
12481 */
12482
12483static VALUE
12484sym_casecmp_p(VALUE sym, VALUE other)
12485{
12486 if (!SYMBOL_P(other)) {
12487 return Qnil;
12488 }
12489 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12490}
12491
12492/*
12493 * call-seq:
12494 * self =~ other -> integer or nil
12495 *
12496 * Equivalent to <tt>self.to_s =~ other</tt>,
12497 * including possible updates to global variables;
12498 * see String#=~.
12499 *
12500 */
12501
12502static VALUE
12503sym_match(VALUE sym, VALUE other)
12504{
12505 return rb_str_match(rb_sym2str(sym), other);
12506}
12507
12508/*
12509 * call-seq:
12510 * match(pattern, offset = 0) -> matchdata or nil
12511 * match(pattern, offset = 0) {|matchdata| } -> object
12512 *
12513 * Equivalent to <tt>self.to_s.match</tt>,
12514 * including possible updates to global variables;
12515 * see String#match.
12516 *
12517 */
12518
12519static VALUE
12520sym_match_m(int argc, VALUE *argv, VALUE sym)
12521{
12522 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12523}
12524
12525/*
12526 * call-seq:
12527 * match?(pattern, offset) -> true or false
12528 *
12529 * Equivalent to <tt>sym.to_s.match?</tt>;
12530 * see String#match.
12531 *
12532 */
12533
12534static VALUE
12535sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12536{
12537 return rb_str_match_m_p(argc, argv, sym);
12538}
12539
12540/*
12541 * call-seq:
12542 * self[offset] -> string or nil
12543 * self[offset, size] -> string or nil
12544 * self[range] -> string or nil
12545 * self[regexp, capture = 0] -> string or nil
12546 * self[substring] -> string or nil
12547 *
12548 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12549 *
12550 */
12551
12552static VALUE
12553sym_aref(int argc, VALUE *argv, VALUE sym)
12554{
12555 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12556}
12557
12558/*
12559 * call-seq:
12560 * length -> integer
12561 *
12562 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12563 */
12564
12565static VALUE
12566sym_length(VALUE sym)
12567{
12568 return rb_str_length(rb_sym2str(sym));
12569}
12570
12571/*
12572 * call-seq:
12573 * empty? -> true or false
12574 *
12575 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12576 *
12577 */
12578
12579static VALUE
12580sym_empty(VALUE sym)
12581{
12582 return rb_str_empty(rb_sym2str(sym));
12583}
12584
12585/*
12586 * call-seq:
12587 * upcase(mapping) -> symbol
12588 *
12589 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12590 *
12591 * See String#upcase.
12592 *
12593 */
12594
12595static VALUE
12596sym_upcase(int argc, VALUE *argv, VALUE sym)
12597{
12598 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12599}
12600
12601/*
12602 * call-seq:
12603 * downcase(mapping) -> symbol
12604 *
12605 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12606 *
12607 * See String#downcase.
12608 *
12609 * Related: Symbol#upcase.
12610 *
12611 */
12612
12613static VALUE
12614sym_downcase(int argc, VALUE *argv, VALUE sym)
12615{
12616 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12617}
12618
12619/*
12620 * call-seq:
12621 * capitalize(mapping) -> symbol
12622 *
12623 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12624 *
12625 * See String#capitalize.
12626 *
12627 */
12628
12629static VALUE
12630sym_capitalize(int argc, VALUE *argv, VALUE sym)
12631{
12632 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12633}
12634
12635/*
12636 * call-seq:
12637 * swapcase(mapping) -> symbol
12638 *
12639 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12640 *
12641 * See String#swapcase.
12642 *
12643 */
12644
12645static VALUE
12646sym_swapcase(int argc, VALUE *argv, VALUE sym)
12647{
12648 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12649}
12650
12651/*
12652 * call-seq:
12653 * start_with?(*string_or_regexp) -> true or false
12654 *
12655 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12656 *
12657 */
12658
12659static VALUE
12660sym_start_with(int argc, VALUE *argv, VALUE sym)
12661{
12662 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12663}
12664
12665/*
12666 * call-seq:
12667 * end_with?(*strings) -> true or false
12668 *
12669 *
12670 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12671 *
12672 */
12673
12674static VALUE
12675sym_end_with(int argc, VALUE *argv, VALUE sym)
12676{
12677 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12678}
12679
12680/*
12681 * call-seq:
12682 * encoding -> encoding
12683 *
12684 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12685 *
12686 */
12687
12688static VALUE
12689sym_encoding(VALUE sym)
12690{
12691 return rb_obj_encoding(rb_sym2str(sym));
12692}
12693
12694static VALUE
12695string_for_symbol(VALUE name)
12696{
12697 if (!RB_TYPE_P(name, T_STRING)) {
12698 VALUE tmp = rb_check_string_type(name);
12699 if (NIL_P(tmp)) {
12700 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12701 name);
12702 }
12703 name = tmp;
12704 }
12705 return name;
12706}
12707
/*
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing between the return type and the opening brace;
 * the body reads a value named +name+. Restore it from the full file.
 *
 * Converts +name+ to an ID: a Symbol is unwrapped with SYM2ID; anything else
 * goes through string_for_symbol() (which raises TypeError for values that
 * cannot be converted to a String) and is then interned with rb_intern_str().
 */
ID
{
    if (SYMBOL_P(name)) {
        return SYM2ID(name);
    }
    name = string_for_symbol(name);
    return rb_intern_str(name);
}
12717
/*
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing between the return type and the opening brace;
 * the body reads a value named +name+. Restore it from the full file.
 *
 * Converts +name+ to a Symbol: a Symbol is returned unchanged; anything else
 * goes through string_for_symbol() (which raises TypeError for values that
 * cannot be converted to a String) and is then passed to rb_str_intern().
 */
VALUE
{
    if (SYMBOL_P(name)) {
        return name;
    }
    name = string_for_symbol(name);
    return rb_str_intern(name);
}
12727
12728/*
12729 * call-seq:
12730 * Symbol.all_symbols -> array_of_symbols
12731 *
12732 * Returns an array of all symbols currently in Ruby's symbol table:
12733 *
12734 * Symbol.all_symbols.size # => 9334
12735 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12736 *
12737 */
12738
12739static VALUE
12740sym_all_symbols(VALUE _)
12741{
12742 return rb_sym_all_symbols();
12743}
12744
12745VALUE
12746rb_str_to_interned_str(VALUE str)
12747{
12748 return rb_fstring(str);
12749}
12750
12751VALUE
12752rb_interned_str(const char *ptr, long len)
12753{
12754 struct RString fake_str = {RBASIC_INIT};
12755 int encidx = ENCINDEX_US_ASCII;
12756 int coderange = ENC_CODERANGE_7BIT;
12757 if (len > 0 && search_nonascii(ptr, ptr + len)) {
12758 encidx = ENCINDEX_ASCII_8BIT;
12759 coderange = ENC_CODERANGE_VALID;
12760 }
12761 VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
12762 ENC_CODERANGE_SET(str, coderange);
12763 return register_fstring(str, true, false);
12764}
12765
/*
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing between the return type and the opening brace;
 * the body reads a value named +ptr+. Restore it from the full file.
 *
 * Forwards to rb_interned_str() using strlen(ptr) as the length, so +ptr+
 * must be a NUL-terminated, non-NULL C string.
 */
VALUE
{
    return rb_interned_str(ptr, strlen(ptr));
}
12771
12772VALUE
12773rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12774{
12775 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12776 rb_enc_autoload(enc);
12777 }
12778
12779 struct RString fake_str = {RBASIC_INIT};
12780 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12781}
12782
12783VALUE
12784rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12785{
12786 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12787 rb_enc_autoload(enc);
12788 }
12789
12790 struct RString fake_str = {RBASIC_INIT};
12791 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12792 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12793 return str;
12794}
12795
/*
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing between the return type and the opening brace;
 * the body reads values named +ptr+ and +enc+. Restore it from the full file.
 *
 * Forwards to rb_enc_interned_str() using strlen(ptr) as the length, so
 * +ptr+ must be a NUL-terminated, non-NULL C string.
 */
VALUE
{
    return rb_enc_interned_str(ptr, strlen(ptr), enc);
}
12801
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+; compiled only when YJIT or ZJIT is enabled.
 *
 * Fast path: when the encoding index stored inline in +str+ is ASCII-8BIT
 * and +codepoint+ converts (RB_NUM2SSIZE) to a value in the byte range
 * 0..0xff inclusive, that single byte is appended with rb_str_buf_cat_byte().
 * Every other case is delegated to rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t code = RB_NUM2SSIZE(codepoint);

        /* The upper bound is inclusive: 0xff is a byte like 0x80..0xfe,
         * which already took this path. */
        if (RB_LIKELY(code >= 0 && code <= 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12818
12819static int
12820fstring_set_class_i(VALUE *str, void *data)
12821{
12822 RBASIC_SET_CLASS(*str, rb_cString);
12823
12824 return ST_CONTINUE;
12825}
12826
/*
 * Registers the String and Symbol methods implemented in this file, plus the
 * UnicodeNormalize module, the case-mapping option symbols, and the hooked
 * global variables $; and $-F.
 *
 * In the registration calls the last argument is the C-level arity passed
 * to rb_define_method / rb_define_singleton_method.
 *
 * NOTE(review): this listing appears to skip some source lines -- for
 * example, nothing visible here assigns rb_cString or rb_cSymbol before
 * they are used. Confirm against the full file before editing.
 */
void
Init_String(void)
{

    /* Visit every entry of fstring_table_obj with fstring_set_class_i, which
     * sets the entry's class to rb_cString. */
    rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);

    /* String: allocator, constructors, and instance methods. */
    rb_define_alloc_func(rb_cString, empty_str_alloc);
    rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
    rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
    rb_define_method(rb_cString, "initialize", rb_str_init, -1);
    rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
    rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
    rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
    rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
    rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
    rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
    rb_define_method(rb_cString, "%", rb_str_format_m, 1);
    rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
    rb_define_method(rb_cString, "insert", rb_str_insert, 2);
    rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
    rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
    rb_define_method(rb_cString, "=~", rb_str_match, 1);
    rb_define_method(rb_cString, "match", rb_str_match_m, -1);
    rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
    rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
    rb_define_method(rb_cString, "upto", rb_str_upto, -1);
    rb_define_method(rb_cString, "index", rb_str_index_m, -1);
    rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
    rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
    rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
    rb_define_method(rb_cString, "clear", rb_str_clear, 0);
    rb_define_method(rb_cString, "chr", rb_str_chr, 0);
    rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
    rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
    rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
    rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
    rb_define_method(rb_cString, "scrub", str_scrub, -1);
    rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
    rb_define_method(rb_cString, "+@", str_uplus, 0);
    rb_define_method(rb_cString, "-@", str_uminus, 0);
    rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
    /* "dedup" is an alias of "-@", so it must come after the "-@" definition. */
    rb_define_alias(rb_cString, "dedup", "-@");

    /* "to_s" and "to_str" share one implementation. */
    rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
    rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
    rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
    rb_define_method(rb_cString, "undump", str_undump, 0);

    /* Option symbols (:ascii, :turkic, :lithuanian, :fold) cached in
     * file-level variables. */
    sym_ascii = ID2SYM(rb_intern_const("ascii"));
    sym_turkic = ID2SYM(rb_intern_const("turkic"));
    sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
    sym_fold = ID2SYM(rb_intern_const("fold"));

    rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
    rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
    rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
    rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);

    rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
    rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
    rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
    rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);

    rb_define_method(rb_cString, "hex", rb_str_hex, 0);
    rb_define_method(rb_cString, "oct", rb_str_oct, 0);
    rb_define_method(rb_cString, "split", rb_str_split_m, -1);
    rb_define_method(rb_cString, "lines", rb_str_lines, -1);
    rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
    rb_define_method(rb_cString, "chars", rb_str_chars, 0);
    rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
    rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
    rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
    rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
    rb_define_method(rb_cString, "ord", rb_str_ord, 0);

    rb_define_method(rb_cString, "include?", rb_str_include, 1);
    rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
    rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);

    rb_define_method(rb_cString, "scan", rb_str_scan, 1);

    rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
    rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
    rb_define_method(rb_cString, "center", rb_str_center, -1);

    rb_define_method(rb_cString, "sub", rb_str_sub, -1);
    rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
    rb_define_method(rb_cString, "chop", rb_str_chop, 0);
    rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
    rb_define_method(rb_cString, "strip", rb_str_strip, -1);
    rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
    rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
    rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
    rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);

    rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
    rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
    rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
    rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
    rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
    rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
    rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
    rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
    rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);

    rb_define_method(rb_cString, "tr", rb_str_tr, 2);
    rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
    rb_define_method(rb_cString, "delete", rb_str_delete, -1);
    rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
    rb_define_method(rb_cString, "count", rb_str_count, -1);

    rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
    rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
    rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
    rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);

    rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
    rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
    rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
    rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
    rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);

    rb_define_method(rb_cString, "sum", rb_str_sum, -1);

    /* "slice" shares its implementation with "[]". */
    rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
    rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);

    rb_define_method(rb_cString, "partition", rb_str_partition, 1);
    rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);

    rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
    rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
    rb_define_method(rb_cString, "b", rb_str_b, 0);
    rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
    rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);

    /* define UnicodeNormalize module here so that we don't have to look it up */
    mUnicodeNormalize = rb_define_module("UnicodeNormalize");
    id_normalize = rb_intern_const("normalize");
    id_normalized_p = rb_intern_const("normalized?");

    rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
    rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
    rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);

    /* $; and $-F are both backed by the single rb_fs variable and share
     * rb_fs_setter; rb_fs starts out as nil and its address is registered
     * with the GC. */
    rb_fs = Qnil;
    rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
    rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
    rb_gc_register_address(&rb_fs);

    /* Symbol: singleton and instance methods. */
    rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);

    /* "==" and "===" share sym_equal; "succ" and "next" share sym_succ. */
    rb_define_method(rb_cSymbol, "==", sym_equal, 1);
    rb_define_method(rb_cSymbol, "===", sym_equal, 1);
    rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
    rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
    rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
    rb_define_method(rb_cSymbol, "next", sym_succ, 0);

    rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
    rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
    rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
    rb_define_method(rb_cSymbol, "=~", sym_match, 1);

    /* "[]" and "slice" share sym_aref; "length" and "size" share sym_length. */
    rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
    rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
    rb_define_method(rb_cSymbol, "length", sym_length, 0);
    rb_define_method(rb_cSymbol, "size", sym_length, 0);
    rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
    rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
    rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);

    rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
    rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
    rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
    rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);

    rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
    rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);

    rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
}
13037
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:696
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:404
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1730
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1523
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1636
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2890
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2700
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3180
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1018
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2969
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:130
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:133
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:131
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:128
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:125
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:122
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:127
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:129
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:126
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:134
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:477
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:661
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3967
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1431
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1427
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1434
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1425
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1429
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2254
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2272
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1325
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3650
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:235
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:141
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1313
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3334
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1340
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:945
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1205
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3052
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1224
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12773
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:255
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2332
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3769
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1153
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1445
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1346
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:964
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12797
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:829
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:755
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2714
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2977
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1742
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1120
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1207
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:712
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2042
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1091
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2048
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1949
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1236
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4304
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3796
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1490
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1927
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1750
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1510
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2485
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1584
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:946
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:940
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3834
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1421
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12389
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2558
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1397
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1744
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3080
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5379
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4197
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3187
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11710
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1791
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1786
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1682
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1187
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1533
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:999
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1516
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1994
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4183
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3602
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2421
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2012
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1640
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1568
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6593
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3195
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1147
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12767
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1427
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1605
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3800
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3127
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4304
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3421
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7272
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2790
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12752
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4251
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4071
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4226
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1693
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3776
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3312
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5866
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11768
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1626
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1700
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:632
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2974
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3284
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1657
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3403
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1199
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1550
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2744
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7379
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1409
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1716
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2435
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5781
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9386
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1193
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:968
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1848
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2034
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2113
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3474
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1731
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1024
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12719
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12709
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1866
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3506
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4548
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1375
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1439
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2951
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2809
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1433
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2822
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1777
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:122
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:531
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RString::@53::@55 embed
Embedded contents.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@53 as
String's specific fields.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@53::@54 heap
Strings that use separated memory region for contents use this pattern.
union RString::@53::@54::@56 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:229
Definition string.c:8266
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:308
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113