Ruby 4.1.0dev (2026-04-29 revision 3345854fa882f6bd70bbd7853010b43b9c8fe73d)
string.c (3345854fa882f6bd70bbd7853010b43b9c8fe73d)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Start/end offset of capture group `idx`; both expand to accesses on a
 * variable named `regs` that must be in scope where the macro is used. */
#define BEG(idx) (regs->beg[(idx)])
#define END(idx) (regs->end[(idx)])
65
/* Drop any macro versions of these names so the identifiers below refer to
 * (and can define) the real functions in this file. */
#undef rb_str_new
#undef rb_usascii_str_new
#undef rb_utf8_str_new
#undef rb_enc_str_new

#undef rb_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_utf8_str_new_cstr
#undef rb_enc_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr

#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_cat
#undef rb_str_buf_cat2
#undef rb_str_cat2
#undef rb_str_cat_cstr
#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
 *      follow the header. When a string is not embedded, the contents are
 *      stored in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
#define RUBY_MAX_CHAR_LEN    16

/* String-specific user flags; the bit numbers match the flag table in the
 * comment preceding these definitions. */
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT      FL_USER5
#define STR_BORROWED         FL_USER6
#define STR_TMPLOCK          FL_USER7
#define STR_NOFREE           FL_USER18
#define STR_FAKESTR          FL_USER19
138
/* Flag `s` as heap-backed: set STR_NOEMBED, then clear the shared,
 * shared-root and borrowed bits. */
#define STR_SET_NOEMBED(s) do {\
    FL_SET((s), STR_NOEMBED);\
    FL_UNSET((s), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)
/* Flag `s` as embedded by clearing the no-embed, shared and no-free bits. */
#define STR_SET_EMBED(s) FL_UNSET((s), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `new_len` as the byte length of `s`; nothing else is touched
 * (no terminator is written and the coderange is left as is). */
#define STR_SET_LEN(s, new_len) do { \
    RSTRING(s)->len = (new_len); \
} while (0)
148
/* Terminator width of `s`: 1 on the encoding fast path, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(s) (rb_str_enc_fastpath(s) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(s))))
/* Write a `width`-byte zero terminator at `dst`.  The first byte is always
 * cleared directly; memset is only used for multi-byte terminators. */
#define TERM_FILL(dst, width) do {\
    char *const term_fill_dst = (dst);\
    const int term_fill_width = (width);\
    *term_fill_dst = '\0';\
    if (UNLIKELY(term_fill_width > 1))\
        memset(term_fill_dst, 0, term_fill_width);\
} while (0)
157
/* Resize the buffer of `str` to `capacity` bytes plus the terminator width
 * reported by TERM_LEN for the string's current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of `str` to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise a heap buffer is allocated, the embedded area is
 *   copied into it and the string is switched to the heap representation.
 * - Heap string: the buffer is reallocated in place; the string must not be
 *   shared (asserted).
 *
 * `capacity` and `termlen` are parenthesized at every use so that arbitrary
 * expressions can be passed safely. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
181
/* Make `s` a shared view of `root`'s buffer.  Does nothing for fake
 * strings.  Otherwise: checks (in assert builds) that `s` points inside
 * `root`'s buffer, records `root` through the write barrier, flags `s` as
 * shared, registers it as a pinning object, flags `root` as a shared root,
 * and additionally marks a class-less `root` as borrowed. */
#define STR_SET_SHARED(s, root) do { \
    if (!FL_TEST((s), STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(root) <= RSTRING_PTR(s)); \
        RUBY_ASSERT(RSTRING_PTR(s) <= RSTRING_PTR(root) + RSTRING_LEN(root)); \
        RB_OBJ_WRITE((s), &RSTRING(s)->as.heap.aux.shared, (root)); \
        FL_SET((s), STR_SHARED); \
        rb_gc_register_pinning_obj(s); \
        FL_SET((root), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((root)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((root), STR_BORROWED); \
    } \
} while (0)
194
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(s)  (RSTRING(s)->as.heap.ptr)
/* Heap buffer size in bytes: recorded capacity plus the terminator width. */
#define STR_HEAP_SIZE(s) ((size_t)RSTRING(s)->as.heap.aux.capa + TERM_LEN(s))
/* TODO: include the terminator size in capa. */

/* Encoding object of `s`, looked up through get_encoding(). */
#define STR_ENC_GET(s) get_encoding(s)
200
/* Returns true when the first `n` bytes at `s` are all zero.
 * A non-positive `n` inspects nothing and yields true. */
static inline bool
zero_filled(const char *s, int n)
{
    int remaining = n;

    while (remaining > 0) {
        if (*s != '\0') return false;
        s++;
        remaining--;
    }
    return true;
}
209
210#if !defined SHARABLE_MIDDLE_SUBSTRING
211# define SHARABLE_MIDDLE_SUBSTRING 0
212#endif
213
214static inline bool
215SHARABLE_SUBSTRING_P(VALUE str, long beg, long len)
216{
217#if SHARABLE_MIDDLE_SUBSTRING
218 return true;
219#else
220 long end = beg + len;
221 long source_len = RSTRING_LEN(str);
222 return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
223#endif
224}
225
226static inline long
227str_embed_capa(VALUE str)
228{
229 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
230}
231
232bool
233rb_str_reembeddable_p(VALUE str)
234{
235 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
236}
237
238static inline size_t
239rb_str_embed_size(long capa, long termlen)
240{
241 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
242 if (size < sizeof(struct RString)) size = sizeof(struct RString);
243 return size;
244}
245
246size_t
247rb_str_size_as_embedded(VALUE str)
248{
249 size_t real_size;
250 if (STR_EMBED_P(str)) {
251 size_t capa = RSTRING(str)->len;
252 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
253
254 real_size = rb_str_embed_size(capa, TERM_LEN(str));
255 }
256 /* if the string is not currently embedded, but it can be embedded, how
257 * much space would it require */
258 else if (rb_str_reembeddable_p(str)) {
259 size_t capa = RSTRING(str)->as.heap.aux.capa;
260 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
261
262 real_size = rb_str_embed_size(capa, TERM_LEN(str));
263 }
264 else {
265 real_size = sizeof(struct RString);
266 }
267
268 return real_size;
269}
270
/* True when a string of `len` bytes plus a `termlen`-byte terminator fits
 * in an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len, termlen);

    return rb_gc_size_allocatable_p(needed);
}
276
277static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
278static VALUE str_new_frozen(VALUE klass, VALUE orig);
279static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
280static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
281static VALUE str_new(VALUE klass, const char *ptr, long len);
282static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
283static inline void str_modifiable(VALUE str);
284static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
285static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
286
287static inline void
288str_make_independent(VALUE str)
289{
290 long len = RSTRING_LEN(str);
291 int termlen = TERM_LEN(str);
292 str_make_independent_expand((str), len, 0L, termlen);
293}
294
295static inline int str_dependent_p(VALUE str);
296
297void
298rb_str_make_independent(VALUE str)
299{
300 if (str_dependent_p(str)) {
301 str_make_independent(str);
302 }
303}
304
305void
306rb_str_make_embedded(VALUE str)
307{
308 RUBY_ASSERT(rb_str_reembeddable_p(str));
309 RUBY_ASSERT(!STR_EMBED_P(str));
310
311 int termlen = TERM_LEN(str);
312 char *buf = RSTRING(str)->as.heap.ptr;
313 long old_capa = RSTRING(str)->as.heap.aux.capa + termlen;
314 long len = RSTRING(str)->len;
315
316 STR_SET_EMBED(str);
317 STR_SET_LEN(str, len);
318
319 if (len > 0) {
320 memcpy(RSTRING_PTR(str), buf, len);
321 SIZED_FREE_N(buf, old_capa);
322 }
323
324 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
325}
326
/* Debugging aid: prints a warning to stderr naming `func` as the function
 * that is about to return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
336
337/* symbols for [up|down|swap]case/capitalize options */
338static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
339
340static rb_encoding *
341get_encoding(VALUE str)
342{
343 return rb_enc_from_index(ENCODING_GET(str));
344}
345
346static void
347mustnot_broken(VALUE str)
348{
349 if (is_broken_string(str)) {
350 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
351 }
352}
353
354static void
355mustnot_wchar(VALUE str)
356{
357 rb_encoding *enc = STR_ENC_GET(str);
358 if (rb_enc_mbminlen(enc) > 1) {
359 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
360 }
361}
362
363static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
364
365#if SIZEOF_LONG == SIZEOF_VOIDP
366#define PRECOMPUTED_FAKESTR_HASH 1
367#else
368#endif
369
370static inline bool
371BARE_STRING_P(VALUE str)
372{
373 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
374}
375
376static inline st_index_t
377str_do_hash(VALUE str)
378{
379 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
380 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
381 if (e && !is_ascii_string(str)) {
382 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
383 }
384 return h;
385}
386
387static VALUE
388str_store_precomputed_hash(VALUE str, st_index_t hash)
389{
390 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
391 RUBY_ASSERT(STR_EMBED_P(str));
392
393#if RUBY_DEBUG
394 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
395 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
397#endif
398
399 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
400
401 FL_SET(str, STR_PRECOMPUTED_HASH);
402
403 return str;
404}
405
406VALUE
407rb_fstring(VALUE str)
408{
409 VALUE fstr;
410 int bare;
411
412 Check_Type(str, T_STRING);
413
414 if (FL_TEST(str, RSTRING_FSTR))
415 return str;
416
417 bare = BARE_STRING_P(str);
418 if (!bare) {
419 if (STR_EMBED_P(str)) {
420 OBJ_FREEZE(str);
421 return str;
422 }
423
424 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
426 return str;
427 }
428 }
429
430 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
431 rb_str_resize(str, RSTRING_LEN(str));
432
433 fstr = register_fstring(str, false, false);
434
435 if (!bare) {
436 str_replace_shared_without_enc(str, fstr);
437 OBJ_FREEZE(str);
438 return str;
439 }
440 return fstr;
441}
442
443static VALUE fstring_table_obj;
444
445static VALUE
446fstring_concurrent_set_hash(VALUE str)
447{
448#ifdef PRECOMPUTED_FAKESTR_HASH
449 st_index_t h;
450 if (FL_TEST_RAW(str, STR_FAKESTR)) {
451 // register_fstring precomputes the hash and stores it in capa for fake strings
452 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
453 }
454 else {
455 h = rb_str_hash(str);
456 }
457 // rb_str_hash doesn't include the encoding for ascii only strings, so
458 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
459 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
460#else
461 return (VALUE)rb_str_hash(str);
462#endif
463}
464
465static bool
466fstring_concurrent_set_cmp(VALUE a, VALUE b)
467{
468 long alen, blen;
469 const char *aptr, *bptr;
470
473
474 RSTRING_GETMEM(a, aptr, alen);
475 RSTRING_GETMEM(b, bptr, blen);
476 return (alen == blen &&
477 ENCODING_GET(a) == ENCODING_GET(b) &&
478 memcmp(aptr, bptr, alen) == 0);
479}
480
/* Options handed to fstring_concurrent_set_create() through the table's
 * opaque `data` pointer. */
struct fstr_create_arg {
    /* For a fake string: copy the bytes into a new string instead of
     * creating a static string that refers to the caller's buffer. */
    bool copy;
    /* When copying a fake string: allocate an embedded string with room for
     * the hash and store the precomputed hash in it, if it can be embedded. */
    bool force_precompute_hash;
};
485
486static VALUE
487fstring_concurrent_set_create(VALUE str, void *data)
488{
489 struct fstr_create_arg *arg = data;
490
491 // Unless the string is empty or binary, its coderange has been precomputed.
492 int coderange = ENC_CODERANGE(str);
493
494 if (FL_TEST_RAW(str, STR_FAKESTR)) {
495 if (arg->copy) {
496 VALUE new_str;
497 long len = RSTRING_LEN(str);
498 long capa = len + sizeof(st_index_t);
499 int term_len = TERM_LEN(str);
500
501 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
502 new_str = str_alloc_embed(rb_cString, capa + term_len);
503 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
504 STR_SET_LEN(new_str, RSTRING_LEN(str));
505 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
506 rb_enc_copy(new_str, str);
507 str_store_precomputed_hash(new_str, str_do_hash(str));
508 }
509 else {
510 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
511 rb_enc_copy(new_str, str);
512#ifdef PRECOMPUTED_FAKESTR_HASH
513 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
514 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
515 }
516#endif
517 }
518 str = new_str;
519 }
520 else {
521 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
522 RSTRING(str)->len,
523 ENCODING_GET(str));
524 }
525 OBJ_FREEZE(str);
526 }
527 else {
528 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
529 str = str_new_frozen(rb_cString, str);
530 }
531 if (STR_SHARED_P(str)) { /* str should not be shared */
532 /* shared substring */
533 str_make_independent(str);
535 }
536 if (!BARE_STRING_P(str)) {
537 str = str_new_frozen(rb_cString, str);
538 }
539 }
540
541 ENC_CODERANGE_SET(str, coderange);
542 RBASIC(str)->flags |= RSTRING_FSTR;
543 if (!RB_OBJ_SHAREABLE_P(str)) {
544 RB_OBJ_SET_SHAREABLE(str);
545 }
546 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
549 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
550 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
552 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
553
554 return str;
555}
556
557static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
558 .hash = fstring_concurrent_set_hash,
559 .cmp = fstring_concurrent_set_cmp,
560 .create = fstring_concurrent_set_create,
561 .free = NULL,
562};
563
564void
565Init_fstring_table(void)
566{
567 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
568 rb_gc_register_address(&fstring_table_obj);
569}
570
571static VALUE
572register_fstring(VALUE str, bool copy, bool force_precompute_hash)
573{
574 struct fstr_create_arg args = {
575 .copy = copy,
576 .force_precompute_hash = force_precompute_hash
577 };
578
579#if SIZEOF_VOIDP == SIZEOF_LONG
580 if (FL_TEST_RAW(str, STR_FAKESTR)) {
581 // if the string hasn't been interned, we'll need the hash twice, so we
582 // compute it once and store it in capa
583 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
584 }
585#endif
586
587 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
588
589 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
591 RUBY_ASSERT(OBJ_FROZEN(result));
593 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
594 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
596
597 return result;
598}
599
600bool
601rb_obj_is_fstring_table(VALUE obj)
602{
603 ASSERT_vm_locking();
604
605 return obj == fstring_table_obj;
606}
607
608void
609rb_gc_free_fstring(VALUE obj)
610{
611 ASSERT_vm_locking_with_barrier();
612
613 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
615 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
616
617 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
618
619 RB_DEBUG_COUNTER_INC(obj_str_fstr);
620
621 FL_UNSET(obj, RSTRING_FSTR);
622}
623
624void
625rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
626{
627 if (fstring_table_obj) {
628 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
629 }
630}
631
632static VALUE
633setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
634{
635 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
636 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
637
638 if (!name) {
640 name = "";
641 }
642
643 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
644
645 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
646 fake_str->len = len;
647 fake_str->as.heap.ptr = (char *)name;
648 fake_str->as.heap.aux.capa = len;
649 return (VALUE)fake_str;
650}
651
652/*
653 * set up a fake string which refers a static string literal.
654 */
655VALUE
656rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
657{
658 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
659}
660
661/*
662 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
663 * shared string which refers a static string literal. `ptr` must
664 * point a constant string.
665 */
666VALUE
667rb_fstring_new(const char *ptr, long len)
668{
669 struct RString fake_str = {RBASIC_INIT};
670 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
671}
672
673VALUE
674rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
675{
676 struct RString fake_str = {RBASIC_INIT};
677 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
678}
679
680VALUE
681rb_fstring_cstr(const char *ptr)
682{
683 return rb_fstring_new(ptr, strlen(ptr));
684}
685
686static inline bool
687single_byte_optimizable(VALUE str)
688{
689 int encindex = ENCODING_GET(str);
690 switch (encindex) {
691 case ENCINDEX_ASCII_8BIT:
692 case ENCINDEX_US_ASCII:
693 return true;
694 case ENCINDEX_UTF_8:
695 // For UTF-8 it's worth scanning the string coderange when unknown.
696 return rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT;
697 }
698 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
699 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
700 return true;
701 }
702
703 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
704 return true;
705 }
706
707 /* Conservative. Possibly single byte.
708 * "\xa1" in Shift_JIS for example. */
709 return false;
710}
711
713
714static inline const char *
715search_nonascii(const char *p, const char *e)
716{
717 const char *s, *t;
718
719#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
720# if SIZEOF_UINTPTR_T == 8
721# define NONASCII_MASK UINT64_C(0x8080808080808080)
722# elif SIZEOF_UINTPTR_T == 4
723# define NONASCII_MASK UINT32_C(0x80808080)
724# else
725# error "don't know what to do."
726# endif
727#else
728# if SIZEOF_UINTPTR_T == 8
729# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
730# elif SIZEOF_UINTPTR_T == 4
731# define NONASCII_MASK 0x80808080UL /* or...? */
732# else
733# error "don't know what to do."
734# endif
735#endif
736
737 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
738#if !UNALIGNED_WORD_ACCESS
739 if ((uintptr_t)p % SIZEOF_VOIDP) {
740 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
741 p += l;
742 switch (l) {
743 default: UNREACHABLE;
744#if SIZEOF_VOIDP > 4
745 case 7: if (p[-7]&0x80) return p-7;
746 case 6: if (p[-6]&0x80) return p-6;
747 case 5: if (p[-5]&0x80) return p-5;
748 case 4: if (p[-4]&0x80) return p-4;
749#endif
750 case 3: if (p[-3]&0x80) return p-3;
751 case 2: if (p[-2]&0x80) return p-2;
752 case 1: if (p[-1]&0x80) return p-1;
753 case 0: break;
754 }
755 }
756#endif
757#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
758#define aligned_ptr(value) \
759 __builtin_assume_aligned((value), sizeof(uintptr_t))
760#else
761#define aligned_ptr(value) (value)
762#endif
763 s = aligned_ptr(p);
764 t = (e - (SIZEOF_VOIDP-1));
765#undef aligned_ptr
766 for (;s < t; s += sizeof(uintptr_t)) {
767 uintptr_t word;
768 memcpy(&word, s, sizeof(word));
769 if (word & NONASCII_MASK) {
770#ifdef WORDS_BIGENDIAN
771 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
772#else
773 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
774#endif
775 }
776 }
777 p = (const char *)s;
778 }
779
780 switch (e - p) {
781 default: UNREACHABLE;
782#if SIZEOF_VOIDP > 4
783 case 7: if (e[-7]&0x80) return e-7;
784 case 6: if (e[-6]&0x80) return e-6;
785 case 5: if (e[-5]&0x80) return e-5;
786 case 4: if (e[-4]&0x80) return e-4;
787#endif
788 case 3: if (e[-3]&0x80) return e-3;
789 case 2: if (e[-2]&0x80) return e-2;
790 case 1: if (e[-1]&0x80) return e-1;
791 case 0: return NULL;
792 }
793}
794
795static int
796coderange_scan(const char *p, long len, rb_encoding *enc)
797{
798 const char *e = p + len;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 p = search_nonascii(p, e);
804 }
805
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
808 if (!p) return ENC_CODERANGE_7BIT;
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p += MBCLEN_CHARFOUND_LEN(ret);
813 if (p == e) break;
814 p = search_nonascii(p, e);
815 if (!p) break;
816 }
817 }
818 else {
819 while (p < e) {
820 int ret = rb_enc_precise_mbclen(p, e, enc);
822 p += MBCLEN_CHARFOUND_LEN(ret);
823 }
824 }
825 return ENC_CODERANGE_VALID;
826}
827
828long
829rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
830{
831 const char *p = s;
832
833 if (*cr == ENC_CODERANGE_BROKEN)
834 return e - s;
835
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
837 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
838 if (*cr == ENC_CODERANGE_VALID) return e - s;
839 p = search_nonascii(p, e);
841 return e - s;
842 }
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
845 if (!p) {
846 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
847 return e - s;
848 }
849 for (;;) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 if (p == e) break;
857 p = search_nonascii(p, e);
858 if (!p) break;
859 }
860 }
861 else {
862 while (p < e) {
863 int ret = rb_enc_precise_mbclen(p, e, enc);
864 if (!MBCLEN_CHARFOUND_P(ret)) {
866 return p - s;
867 }
868 p += MBCLEN_CHARFOUND_LEN(ret);
869 }
870 }
872 return e - s;
873}
874
875static inline void
876str_enc_copy(VALUE str1, VALUE str2)
877{
878 rb_enc_set_index(str1, ENCODING_GET(str2));
879}
880
881/* Like str_enc_copy, but does not check frozen status of str1.
882 * You should use this only if you're certain that str1 is not frozen. */
883static inline void
884str_enc_copy_direct(VALUE str1, VALUE str2)
885{
886 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
887 if (inlined_encoding == ENCODING_INLINE_MAX) {
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
889 }
890 else {
891 ENCODING_SET_INLINED(str1, inlined_encoding);
892 }
893}
894
895static void
896rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
897{
898 /* this function is designed for copying encoding and coderange
899 * from src to new string "dest" which is made from the part of src.
900 */
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
905 else
907 return;
908 }
909 switch (ENC_CODERANGE(src)) {
912 break;
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
917 else
919 break;
920 default:
921 break;
922 }
923}
924
925static void
926rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
927{
928 str_enc_copy(dest, src);
930}
931
932static int
933enc_coderange_scan(VALUE str, rb_encoding *enc)
934{
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
936}
937
938int
939rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
940{
941 return enc_coderange_scan(str, enc);
942}
943
944int
945rbimpl_enc_str_coderange_scan(VALUE str)
946{
947 int cr = enc_coderange_scan(str, get_encoding(str));
948 ENC_CODERANGE_SET(str, cr);
949 return cr;
950}
951
952#undef rb_enc_str_coderange
953int
954rb_enc_str_coderange(VALUE str)
955{
956 int cr = ENC_CODERANGE(str);
957
958 if (cr == ENC_CODERANGE_UNKNOWN) {
959 cr = rbimpl_enc_str_coderange_scan(str);
960 }
961 return cr;
962}
963#define rb_enc_str_coderange rb_enc_str_coderange_inline
964
965static inline bool
966rb_enc_str_asciicompat(VALUE str)
967{
968 int encindex = ENCODING_GET_INLINED(str);
969 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
970}
971
972int
974{
975 switch(ENC_CODERANGE(str)) {
977 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
979 return true;
980 default:
981 return false;
982 }
983}
984
985static inline void
986str_mod_check(VALUE s, const char *p, long len)
987{
988 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
989 rb_raise(rb_eRuntimeError, "string modified");
990 }
991}
992
993static size_t
994str_capacity(VALUE str, const int termlen)
995{
996 if (STR_EMBED_P(str)) {
997 return str_embed_capa(str) - termlen;
998 }
999 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
1000 return RSTRING(str)->len;
1001 }
1002 else {
1003 return RSTRING(str)->as.heap.aux.capa;
1004 }
1005}
1006
1007size_t
1009{
1010 return str_capacity(str, TERM_LEN(str));
1011}
1012
1013static inline void
1014must_not_null(const char *ptr)
1015{
1016 if (!ptr) {
1017 rb_raise(rb_eArgError, "NULL pointer given");
1018 }
1019}
1020
1021static inline VALUE
1022str_alloc_embed(VALUE klass, size_t capa)
1023{
1024 size_t size = rb_str_embed_size(capa, 0);
1025 RUBY_ASSERT(size > 0);
1026 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1027
1028 NEWOBJ_OF(str, struct RString, klass, T_STRING, size);
1029
1030 str->len = 0;
1031 str->as.embed.ary[0] = 0;
1032
1033 return (VALUE)str;
1034}
1035
1036static inline VALUE
1037str_alloc_heap(VALUE klass)
1038{
1039 NEWOBJ_OF(str, struct RString, klass, T_STRING | STR_NOEMBED, sizeof(struct RString));
1040
1041 str->len = 0;
1042 str->as.heap.aux.capa = 0;
1043 str->as.heap.ptr = NULL;
1044
1045 return (VALUE)str;
1046}
1047
1048static inline VALUE
1049empty_str_alloc(VALUE klass)
1050{
1051 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1052 VALUE str = str_alloc_embed(klass, 0);
1053 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1055 return str;
1056}
1057
1058static VALUE
1059str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1060{
1061 VALUE str;
1062
1063 if (len < 0) {
1064 rb_raise(rb_eArgError, "negative string size (or size too big)");
1065 }
1066
1067 if (enc == NULL) {
1068 enc = rb_ascii8bit_encoding();
1069 }
1070
1071 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1072
1073 int termlen = rb_enc_mbminlen(enc);
1074
1075 if (STR_EMBEDDABLE_P(len, termlen)) {
1076 str = str_alloc_embed(klass, len + termlen);
1077 if (len == 0) {
1078 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1079 }
1080 }
1081 else {
1082 str = str_alloc_heap(klass);
1083 RSTRING(str)->as.heap.aux.capa = len;
1084 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1085 * integer overflow. If we can STATIC_ASSERT that, the following
1086 * mul_add_mul can be reverted to a simple ALLOC_N. */
1087 RSTRING(str)->as.heap.ptr =
1088 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1089 }
1090
1091 rb_enc_raw_set(str, enc);
1092
1093 if (ptr) {
1094 memcpy(RSTRING_PTR(str), ptr, len);
1095 }
1096 else {
1097 memset(RSTRING_PTR(str), 0, len);
1098 }
1099
1100 STR_SET_LEN(str, len);
1101 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1102 return str;
1103}
1104
1105static VALUE
1106str_new(VALUE klass, const char *ptr, long len)
1107{
1108 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1109}
1110
1111VALUE
1112rb_str_new(const char *ptr, long len)
1113{
1114 return str_new(rb_cString, ptr, len);
1115}
1116
1117VALUE
1118rb_usascii_str_new(const char *ptr, long len)
1119{
1120 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1121}
1122
1123VALUE
1124rb_utf8_str_new(const char *ptr, long len)
1125{
1126 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1127}
1128
1129VALUE
1130rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1131{
1132 return str_enc_new(rb_cString, ptr, len, enc);
1133}
1134
1135VALUE
1137{
1138 must_not_null(ptr);
1139 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1140 * memory regions, and that cannot be detected by the MSAN. Just
1141 * trust the programmer that the argument passed here is a sane C
1142 * string. */
1143 __msan_unpoison_string(ptr);
1144 return rb_str_new(ptr, strlen(ptr));
1145}
1146
1147VALUE
1149{
1150 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1151}
1152
1153VALUE
1155{
1156 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1157}
1158
1159VALUE
1161{
1162 must_not_null(ptr);
1163 if (rb_enc_mbminlen(enc) != 1) {
1164 rb_raise(rb_eArgError, "wchar encoding given");
1165 }
1166 return rb_enc_str_new(ptr, strlen(ptr), enc);
1167}
1168
1169static VALUE
1170str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1171{
1172 VALUE str;
1173
1174 if (len < 0) {
1175 rb_raise(rb_eArgError, "negative string size (or size too big)");
1176 }
1177
1178 if (!ptr) {
1179 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1180 }
1181 else {
1182 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1183 str = str_alloc_heap(klass);
1184 RSTRING(str)->len = len;
1185 RSTRING(str)->as.heap.ptr = (char *)ptr;
1186 RSTRING(str)->as.heap.aux.capa = len;
1187 RBASIC(str)->flags |= STR_NOFREE;
1188 rb_enc_associate_index(str, encindex);
1189 }
1190 return str;
1191}
1192
1193VALUE
1194rb_str_new_static(const char *ptr, long len)
1195{
1196 return str_new_static(rb_cString, ptr, len, 0);
1197}
1198
1199VALUE
1201{
1202 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1203}
1204
1205VALUE
1207{
1208 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1209}
1210
1211VALUE
1213{
1214 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1215}
1216
1217static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1218 rb_encoding *from, rb_encoding *to,
1219 int ecflags, VALUE ecopts);
1220
1221static inline bool
1222is_enc_ascii_string(VALUE str, rb_encoding *enc)
1223{
1224 int encidx = rb_enc_to_index(enc);
1225 if (rb_enc_get_index(str) == encidx)
1226 return is_ascii_string(str);
1227 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1228}
1229
1230VALUE
1231rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1232{
1233 long len;
1234 const char *ptr;
1235 VALUE newstr;
1236
1237 if (!to) return str;
1238 if (!from) from = rb_enc_get(str);
1239 if (from == to) return str;
1240 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1241 rb_is_ascii8bit_enc(to)) {
1242 if (STR_ENC_GET(str) != to) {
1243 str = rb_str_dup(str);
1244 rb_enc_associate(str, to);
1245 }
1246 return str;
1247 }
1248
1249 RSTRING_GETMEM(str, ptr, len);
1250 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1251 from, to, ecflags, ecopts);
1252 if (NIL_P(newstr)) {
1253 /* some error, return original */
1254 return str;
1255 }
1256 return newstr;
1257}
1258
1259VALUE
1260rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1261 rb_encoding *from, int ecflags, VALUE ecopts)
1262{
1263 long olen;
1264
1265 olen = RSTRING_LEN(newstr);
1266 if (ofs < -olen || olen < ofs)
1267 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1268 if (ofs < 0) ofs += olen;
1269 if (!from) {
1270 STR_SET_LEN(newstr, ofs);
1271 return rb_str_cat(newstr, ptr, len);
1272 }
1273
1274 rb_str_modify(newstr);
1275 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1276 rb_enc_get(newstr),
1277 ecflags, ecopts);
1278}
1279
1280VALUE
1281rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1282{
1283 STR_SET_LEN(str, 0);
1284 rb_enc_associate(str, enc);
1285 rb_str_cat(str, ptr, len);
1286 return str;
1287}
1288
1289static VALUE
1290str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1291 rb_encoding *from, rb_encoding *to,
1292 int ecflags, VALUE ecopts)
1293{
1294 rb_econv_t *ec;
1296 long olen;
1297 VALUE econv_wrapper;
1298 const unsigned char *start, *sp;
1299 unsigned char *dest, *dp;
1300 size_t converted_output = (size_t)ofs;
1301
1302 olen = rb_str_capacity(newstr);
1303
1304 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1305 RBASIC_CLEAR_CLASS(econv_wrapper);
1306 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1307 if (!ec) return Qnil;
1308 DATA_PTR(econv_wrapper) = ec;
1309
1310 sp = (unsigned char*)ptr;
1311 start = sp;
1312 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1313 (dp = dest + converted_output),
1314 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1316 /* destination buffer short */
1317 size_t converted_input = sp - start;
1318 size_t rest = len - converted_input;
1319 converted_output = dp - dest;
1320 rb_str_set_len(newstr, converted_output);
1321 if (converted_input && converted_output &&
1322 rest < (LONG_MAX / converted_output)) {
1323 rest = (rest * converted_output) / converted_input;
1324 }
1325 else {
1326 rest = olen;
1327 }
1328 olen += rest < 2 ? 2 : rest;
1329 rb_str_resize(newstr, olen);
1330 }
1331 DATA_PTR(econv_wrapper) = 0;
1332 RB_GC_GUARD(econv_wrapper);
1333 rb_econv_close(ec);
1334 switch (ret) {
1335 case econv_finished:
1336 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1337 rb_str_set_len(newstr, len);
1338 rb_enc_associate(newstr, to);
1339 return newstr;
1340
1341 default:
1342 return Qnil;
1343 }
1344}
1345
1346VALUE
1348{
1349 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1350}
1351
1352VALUE
1354{
1355 rb_encoding *ienc;
1356 VALUE str;
1357 const int eidx = rb_enc_to_index(eenc);
1358
1359 if (!ptr) {
1360 return rb_enc_str_new(ptr, len, eenc);
1361 }
1362
1363 /* ASCII-8BIT case, no conversion */
1364 if ((eidx == rb_ascii8bit_encindex()) ||
1365 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1366 return rb_str_new(ptr, len);
1367 }
1368 /* no default_internal or same encoding, no conversion */
1369 ienc = rb_default_internal_encoding();
1370 if (!ienc || eenc == ienc) {
1371 return rb_enc_str_new(ptr, len, eenc);
1372 }
1373 /* ASCII compatible, and ASCII only string, no conversion in
1374 * default_internal */
1375 if ((eidx == rb_ascii8bit_encindex()) ||
1376 (eidx == rb_usascii_encindex()) ||
1377 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1378 return rb_enc_str_new(ptr, len, ienc);
1379 }
1380 /* convert from the given encoding to default_internal */
1381 str = rb_enc_str_new(NULL, 0, ienc);
1382 /* when the conversion failed for some reason, just ignore the
1383 * default_internal and result in the given encoding as-is. */
1384 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1385 rb_str_initialize(str, ptr, len, eenc);
1386 }
1387 return str;
1388}
1389
1390VALUE
1391rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1392{
1393 int eidx = rb_enc_to_index(eenc);
1394 if (eidx == rb_usascii_encindex() &&
1395 !is_ascii_string(str)) {
1396 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1397 return str;
1398 }
1399 rb_enc_associate_index(str, eidx);
1400 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1401}
1402
1403VALUE
1404rb_external_str_new(const char *ptr, long len)
1405{
1406 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1407}
1408
1409VALUE
1411{
1412 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1413}
1414
1415VALUE
1416rb_locale_str_new(const char *ptr, long len)
1417{
1418 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1419}
1420
1421VALUE
1423{
1424 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1425}
1426
1427VALUE
1429{
1430 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1431}
1432
1433VALUE
1435{
1436 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1437}
1438
1439VALUE
1441{
1442 return rb_str_export_to_enc(str, rb_default_external_encoding());
1443}
1444
1445VALUE
1447{
1448 return rb_str_export_to_enc(str, rb_locale_encoding());
1449}
1450
1451VALUE
1453{
1454 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1455}
1456
1457static VALUE
1458str_replace_shared_without_enc(VALUE str2, VALUE str)
1459{
1460 const int termlen = TERM_LEN(str);
1461 char *ptr;
1462 long len;
1463
1464 RSTRING_GETMEM(str, ptr, len);
1465 if (str_embed_capa(str2) >= len + termlen) {
1466 char *ptr2 = RSTRING(str2)->as.embed.ary;
1467 STR_SET_EMBED(str2);
1468 memcpy(ptr2, RSTRING_PTR(str), len);
1469 TERM_FILL(ptr2+len, termlen);
1470 }
1471 else {
1472 VALUE root;
1473 if (STR_SHARED_P(str)) {
1474 root = RSTRING(str)->as.heap.aux.shared;
1475 RSTRING_GETMEM(str, ptr, len);
1476 }
1477 else {
1478 root = rb_str_new_frozen(str);
1479 RSTRING_GETMEM(root, ptr, len);
1480 }
1481 RUBY_ASSERT(OBJ_FROZEN(root));
1482
1483 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1484 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1485 rb_fatal("about to free a possible shared root");
1486 }
1487 char *ptr2 = STR_HEAP_PTR(str2);
1488 if (ptr2 != ptr) {
1489 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1490 }
1491 }
1492 FL_SET(str2, STR_NOEMBED);
1493 RSTRING(str2)->as.heap.ptr = ptr;
1494 STR_SET_SHARED(str2, root);
1495 }
1496
1497 STR_SET_LEN(str2, len);
1498
1499 return str2;
1500}
1501
1502static VALUE
1503str_replace_shared(VALUE str2, VALUE str)
1504{
1505 str_replace_shared_without_enc(str2, str);
1506 rb_enc_cr_str_exact_copy(str2, str);
1507 return str2;
1508}
1509
1510static VALUE
1511str_new_shared(VALUE klass, VALUE str)
1512{
1513 return str_replace_shared(str_alloc_heap(klass), str);
1514}
1515
1516VALUE
1518{
1519 return str_new_shared(rb_obj_class(str), str);
1520}
1521
1522VALUE
1524{
1525 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1526 return str_new_frozen(rb_obj_class(orig), orig);
1527}
1528
1529static VALUE
1530rb_str_new_frozen_String(VALUE orig)
1531{
1532 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1533 return str_new_frozen(rb_cString, orig);
1534}
1535
1536
1537VALUE
1538rb_str_frozen_bare_string(VALUE orig)
1539{
1540 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1541 return str_new_frozen(rb_cString, orig);
1542}
1543
1544VALUE
1545rb_str_tmp_frozen_acquire(VALUE orig)
1546{
1547 if (OBJ_FROZEN_RAW(orig)) return orig;
1548 return str_new_frozen_buffer(0, orig, FALSE);
1549}
1550
1551VALUE
1552rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1553{
1554 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1555 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1556
1557 VALUE str = str_alloc_heap(0);
1558 OBJ_FREEZE(str);
1559 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1560 FL_SET(str, STR_SHARED_ROOT);
1561
1562 size_t capa = str_capacity(orig, TERM_LEN(orig));
1563
1564 /* If the string is embedded then we want to create a copy that is heap
1565 * allocated. If the string is shared then the shared root must be
1566 * embedded, so we want to create a copy. If the string is a shared root
1567 * then it must be embedded, so we want to create a copy. */
1568 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
1569 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1570 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1571 }
1572 else {
1573 /* orig must be heap allocated and not shared, so we can safely transfer
1574 * the pointer to str. */
1575 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1576 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1577 RBASIC(orig)->flags &= ~STR_NOFREE;
1578 STR_SET_SHARED(orig, str);
1579 if (RB_OBJ_SHAREABLE_P(orig)) {
1580 RB_OBJ_SET_SHAREABLE(str);
1581 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1582 }
1583 }
1584
1585 RSTRING(str)->len = RSTRING(orig)->len;
1586 RSTRING(str)->as.heap.aux.capa = capa + (TERM_LEN(orig) - TERM_LEN(str));
1587
1588 return str;
1589}
1590
1591void
1592rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1593{
1594 if (RBASIC_CLASS(tmp) != 0)
1595 return;
1596
1597 if (STR_EMBED_P(tmp)) {
1599 }
1600 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1601 !OBJ_FROZEN_RAW(orig)) {
1602 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1603
1604 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1605 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1606 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1607
1608 /* Unshare orig since the root (tmp) only has this one child. */
1609 FL_UNSET_RAW(orig, STR_SHARED);
1610 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1611 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1613
1614 /* Make tmp embedded and empty so it is safe for sweeping. */
1615 STR_SET_EMBED(tmp);
1616 STR_SET_LEN(tmp, 0);
1617 }
1618 }
1619}
1620
1621static VALUE
1622str_new_frozen(VALUE klass, VALUE orig)
1623{
1624 return str_new_frozen_buffer(klass, orig, TRUE);
1625}
1626
1627static VALUE
1628heap_str_make_shared(VALUE klass, VALUE orig)
1629{
1630 RUBY_ASSERT(!STR_EMBED_P(orig));
1631 RUBY_ASSERT(!STR_SHARED_P(orig));
1633
1634 VALUE str = str_alloc_heap(klass);
1635 STR_SET_LEN(str, RSTRING_LEN(orig));
1636 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1637 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1638 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1639 RBASIC(orig)->flags &= ~STR_NOFREE;
1640 STR_SET_SHARED(orig, str);
1641 if (klass == 0)
1642 FL_UNSET_RAW(str, STR_BORROWED);
1643 return str;
1644}
1645
1646static VALUE
1647str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1648{
1649 VALUE str;
1650
1651 long len = RSTRING_LEN(orig);
1652 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1653 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1654
1655 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1656 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1657 RUBY_ASSERT(STR_EMBED_P(str));
1658 }
1659 else {
1660 if (FL_TEST_RAW(orig, STR_SHARED)) {
1661 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1662 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1663 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1664 RUBY_ASSERT(ofs >= 0);
1665 RUBY_ASSERT(rest >= 0);
1666 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1668
1669 if ((ofs > 0) || (rest > 0) ||
1670 (klass != RBASIC(shared)->klass) ||
1671 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1672 str = str_new_shared(klass, shared);
1673 RUBY_ASSERT(!STR_EMBED_P(str));
1674 RSTRING(str)->as.heap.ptr += ofs;
1675 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1676 }
1677 else {
1678 if (RBASIC_CLASS(shared) == 0)
1679 FL_SET_RAW(shared, STR_BORROWED);
1680 return shared;
1681 }
1682 }
1683 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1684 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1685 STR_SET_EMBED(str);
1686 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1687 STR_SET_LEN(str, RSTRING_LEN(orig));
1688 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1689 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1690 }
1691 else {
1692 if (RB_OBJ_SHAREABLE_P(orig)) {
1693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1694 }
1695 else {
1696 str = heap_str_make_shared(klass, orig);
1697 }
1698 }
1699 }
1700
1701 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1702 OBJ_FREEZE(str);
1703 return str;
1704}
1705
1706VALUE
1707rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1708{
1709 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1710}
1711
1712static VALUE
1713str_new_empty_String(VALUE str)
1714{
1715 VALUE v = rb_str_new(0, 0);
1716 rb_enc_copy(v, str);
1717 return v;
1718}
1719
1720#define STR_BUF_MIN_SIZE 63
1721
1722VALUE
1724{
1725 if (STR_EMBEDDABLE_P(capa, 1)) {
1726 return str_alloc_embed(rb_cString, capa + 1);
1727 }
1728
1729 VALUE str = str_alloc_heap(rb_cString);
1730
1731 RSTRING(str)->as.heap.aux.capa = capa;
1732 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1733 RSTRING(str)->as.heap.ptr[0] = '\0';
1734
1735 return str;
1736}
1737
1738VALUE
1740{
1741 VALUE str;
1742 long len = strlen(ptr);
1743
1744 str = rb_str_buf_new(len);
1745 rb_str_buf_cat(str, ptr, len);
1746
1747 return str;
1748}
1749
1750VALUE
1752{
1753 return str_new(0, 0, len);
1754}
1755
1756void
1758{
1759 if (STR_EMBED_P(str)) {
1760 RB_DEBUG_COUNTER_INC(obj_str_embed);
1761 }
1762 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1763 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1764 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1765 }
1766 else {
1767 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1768 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1769 }
1770}
1771
1772size_t
1773rb_str_memsize(VALUE str)
1774{
1775 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1776 return STR_HEAP_SIZE(str);
1777 }
1778 else {
1779 return 0;
1780 }
1781}
1782
1783VALUE
1785{
1786 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1787}
1788
1789static inline void str_discard(VALUE str);
1790static void str_shared_replace(VALUE str, VALUE str2);
1791
1792void
1794{
1795 if (str != str2) str_shared_replace(str, str2);
1796}
1797
1798static void
1799str_shared_replace(VALUE str, VALUE str2)
1800{
1801 rb_encoding *enc;
1802 int cr;
1803 int termlen;
1804
1805 RUBY_ASSERT(str2 != str);
1806 enc = STR_ENC_GET(str2);
1807 cr = ENC_CODERANGE(str2);
1808 str_discard(str);
1809 termlen = rb_enc_mbminlen(enc);
1810
1811 STR_SET_LEN(str, RSTRING_LEN(str2));
1812
1813 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1814 STR_SET_EMBED(str);
1815 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1816 rb_enc_associate(str, enc);
1817 ENC_CODERANGE_SET(str, cr);
1818 }
1819 else {
1820 if (STR_EMBED_P(str2)) {
1821 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1822 long len = RSTRING_LEN(str2);
1823 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1824
1825 char *new_ptr = ALLOC_N(char, len + termlen);
1826 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1827 RSTRING(str2)->as.heap.ptr = new_ptr;
1828 STR_SET_LEN(str2, len);
1829 RSTRING(str2)->as.heap.aux.capa = len;
1830 STR_SET_NOEMBED(str2);
1831 }
1832
1833 STR_SET_NOEMBED(str);
1834 FL_UNSET(str, STR_SHARED);
1835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1836
1837 if (FL_TEST(str2, STR_SHARED)) {
1838 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1839 STR_SET_SHARED(str, shared);
1840 }
1841 else {
1842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1843 }
1844
1845 /* abandon str2 */
1846 STR_SET_EMBED(str2);
1847 RSTRING_PTR(str2)[0] = 0;
1848 STR_SET_LEN(str2, 0);
1849 rb_enc_associate(str, enc);
1850 ENC_CODERANGE_SET(str, cr);
1851 }
1852}
1853
1854VALUE
1856{
1857 VALUE str;
1858
1859 if (RB_TYPE_P(obj, T_STRING)) {
1860 return obj;
1861 }
1862 str = rb_funcall(obj, idTo_s, 0);
1863 return rb_obj_as_string_result(str, obj);
1864}
1865
1866VALUE
1867rb_obj_as_string_result(VALUE str, VALUE obj)
1868{
1869 if (!RB_TYPE_P(str, T_STRING))
1870 return rb_any_to_s(obj);
1871 return str;
1872}
1873
1874static VALUE
1875str_replace(VALUE str, VALUE str2)
1876{
1877 long len;
1878
1879 len = RSTRING_LEN(str2);
1880 if (STR_SHARED_P(str2)) {
1881 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1883 STR_SET_NOEMBED(str);
1884 STR_SET_LEN(str, len);
1885 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1886 STR_SET_SHARED(str, shared);
1887 rb_enc_cr_str_exact_copy(str, str2);
1888 }
1889 else {
1890 str_replace_shared(str, str2);
1891 }
1892
1893 return str;
1894}
1895
1896static inline VALUE
1897ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1898{
1899 size_t size = rb_str_embed_size(capa, 0);
1900 RUBY_ASSERT(size > 0);
1901 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1902
1903 EC_NEWOBJ_OF(str, struct RString, klass, T_STRING, size, ec);
1904
1905 str->len = 0;
1906
1907 return (VALUE)str;
1908}
1909
1910static inline VALUE
1911ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1912{
1913 EC_NEWOBJ_OF(str, struct RString, klass, T_STRING | STR_NOEMBED, sizeof(struct RString), ec);
1914
1915 str->as.heap.aux.capa = 0;
1916 str->as.heap.ptr = NULL;
1917
1918 return (VALUE)str;
1919}
1920
1921static inline VALUE
1922str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1923{
1924 int encidx = 0;
1925 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1926 encidx = rb_enc_get_index(str);
1927 flags &= ~ENCODING_MASK;
1928 }
1929 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1930 if (encidx) rb_enc_associate_index(dup, encidx);
1931 return dup;
1932}
1933
1934static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1935
1936static inline VALUE
1937str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1938{
1939 VALUE flags = FL_TEST_RAW(str, flag_mask);
1940 long len = RSTRING_LEN(str);
1941
1942 RUBY_ASSERT(STR_EMBED_P(dup));
1943 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1944 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1945 STR_SET_LEN(dup, RSTRING_LEN(str));
1946 return str_duplicate_setup_encoding(str, dup, flags);
1947}
1948
1949static inline VALUE
1950str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1951{
1952 VALUE flags = FL_TEST_RAW(str, flag_mask);
1953 VALUE root = str;
1954 if (FL_TEST_RAW(str, STR_SHARED)) {
1955 root = RSTRING(str)->as.heap.aux.shared;
1956 }
1957 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1958 root = str = str_new_frozen(klass, str);
1959 flags = FL_TEST_RAW(str, flag_mask);
1960 }
1961 RUBY_ASSERT(!STR_SHARED_P(root));
1963
1964 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1965 FL_SET_RAW(dup, RSTRING_NOEMBED);
1966 STR_SET_SHARED(dup, root);
1967 flags |= RSTRING_NOEMBED | STR_SHARED;
1968
1969 STR_SET_LEN(dup, RSTRING_LEN(str));
1970 return str_duplicate_setup_encoding(str, dup, flags);
1971}
1972
1973static inline VALUE
1974str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1975{
1976 if (STR_EMBED_P(str)) {
1977 return str_duplicate_setup_embed(klass, str, dup);
1978 }
1979 else {
1980 return str_duplicate_setup_heap(klass, str, dup);
1981 }
1982}
1983
1984static inline VALUE
1985str_duplicate(VALUE klass, VALUE str)
1986{
1987 VALUE dup;
1988 if (STR_EMBED_P(str)) {
1989 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1990 }
1991 else {
1992 dup = str_alloc_heap(klass);
1993 }
1994
1995 return str_duplicate_setup(klass, str, dup);
1996}
1997
1998VALUE
2000{
2001 return str_duplicate(rb_obj_class(str), str);
2002}
2003
2004/* :nodoc: */
2005VALUE
2006rb_str_dup_m(VALUE str)
2007{
2008 if (LIKELY(BARE_STRING_P(str))) {
2009 return str_duplicate(rb_cString, str);
2010 }
2011 else {
2012 return rb_obj_dup(str);
2013 }
2014}
2015
2016VALUE
2018{
2019 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2020 return str_duplicate(rb_cString, str);
2021}
2022
2023VALUE
2024rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2025{
2026 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2027 VALUE new_str, klass = rb_cString;
2028
2029 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2030 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2031 str_duplicate_setup_embed(klass, str, new_str);
2032 }
2033 else {
2034 new_str = ec_str_alloc_heap(ec, klass);
2035 str_duplicate_setup_heap(klass, str, new_str);
2036 }
2037 if (chilled) {
2038 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2039 }
2040 return new_str;
2041}
2042
2043VALUE
2044rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2045{
2046 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2047 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2048 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2049 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2050 return rb_str_freeze(str);
2051}
2052
2053/*
2054 * The documentation block below uses an include (instead of inline text)
2055 * because the included text has non-ASCII characters (which are not allowed in a C file).
2056 */
2057
2058/*
2059 *
2060 * call-seq:
2061 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2062 *
2063 * :include: doc/string/new.rdoc
2064 *
2065 */
2066
2067static VALUE
2068rb_str_init(int argc, VALUE *argv, VALUE str)
2069{
2070 static ID keyword_ids[2];
2071 VALUE orig, opt, venc, vcapa;
2072 VALUE kwargs[2];
2073 rb_encoding *enc = 0;
2074 int n;
2075
2076 if (!keyword_ids[0]) {
2077 keyword_ids[0] = rb_id_encoding();
2078 CONST_ID(keyword_ids[1], "capacity");
2079 }
2080
2081 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2082 if (!NIL_P(opt)) {
2083 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2084 venc = kwargs[0];
2085 vcapa = kwargs[1];
2086 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2087 enc = rb_to_encoding(venc);
2088 }
2089 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2090 long capa = NUM2LONG(vcapa);
2091 long len = 0;
2092 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2093
2094 if (capa < STR_BUF_MIN_SIZE) {
2095 capa = STR_BUF_MIN_SIZE;
2096 }
2097 if (n == 1) {
2098 StringValue(orig);
2099 len = RSTRING_LEN(orig);
2100 if (capa < len) {
2101 capa = len;
2102 }
2103 if (orig == str) n = 0;
2104 }
2105 str_modifiable(str);
2106 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2107 /* make noembed always */
2108 const size_t size = (size_t)capa + termlen;
2109 const char *const old_ptr = RSTRING_PTR(str);
2110 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2111 char *new_ptr = ALLOC_N(char, size);
2112 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2113 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2114 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2115 RSTRING(str)->as.heap.ptr = new_ptr;
2116 }
2117 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2118 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2119 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2120 }
2121 STR_SET_LEN(str, len);
2122 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2123 if (n == 1) {
2124 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2125 rb_enc_cr_str_exact_copy(str, orig);
2126 }
2127 FL_SET(str, STR_NOEMBED);
2128 RSTRING(str)->as.heap.aux.capa = capa;
2129 }
2130 else if (n == 1) {
2131 rb_str_replace(str, orig);
2132 }
2133 if (enc) {
2134 rb_enc_associate(str, enc);
2136 }
2137 }
2138 else if (n == 1) {
2139 rb_str_replace(str, orig);
2140 }
2141 return str;
2142}
2143
2144/* :nodoc: */
2145static VALUE
2146rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2147{
2148 if (klass != rb_cString) {
2149 return rb_class_new_instance_pass_kw(argc, argv, klass);
2150 }
2151
2152 static ID keyword_ids[2];
2153 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2154 VALUE kwargs[2];
2155 rb_encoding *enc = NULL;
2156
2157 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2158 if (NIL_P(opt)) {
2159 return rb_class_new_instance_pass_kw(argc, argv, klass);
2160 }
2161
2162 keyword_ids[0] = rb_id_encoding();
2163 CONST_ID(keyword_ids[1], "capacity");
2164 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2165 encoding = kwargs[0];
2166 capacity = kwargs[1];
2167
2168 if (n == 1) {
2169 orig = StringValue(orig);
2170 }
2171 else {
2172 orig = Qnil;
2173 }
2174
2175 if (UNDEF_P(encoding)) {
2176 if (!NIL_P(orig)) {
2177 encoding = rb_obj_encoding(orig);
2178 }
2179 }
2180
2181 if (!UNDEF_P(encoding)) {
2182 enc = rb_to_encoding(encoding);
2183 }
2184
2185 // If capacity is nil, we're basically just duping `orig`.
2186 if (UNDEF_P(capacity)) {
2187 if (NIL_P(orig)) {
2188 VALUE empty_str = str_new(klass, "", 0);
2189 if (enc) {
2190 rb_enc_associate(empty_str, enc);
2191 }
2192 return empty_str;
2193 }
2194 VALUE copy = str_duplicate(klass, orig);
2195 rb_enc_associate(copy, enc);
2196 ENC_CODERANGE_CLEAR(copy);
2197 return copy;
2198 }
2199
2200 long capa = 0;
2201 capa = NUM2LONG(capacity);
2202 if (capa < 0) {
2203 capa = 0;
2204 }
2205
2206 if (!NIL_P(orig)) {
2207 long orig_capa = rb_str_capacity(orig);
2208 if (orig_capa > capa) {
2209 capa = orig_capa;
2210 }
2211 }
2212
2213 VALUE str = str_enc_new(klass, NULL, capa, enc);
2214 STR_SET_LEN(str, 0);
2215 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2216
2217 if (!NIL_P(orig)) {
2218 rb_str_buf_append(str, orig);
2219 }
2220
2221 return str;
2222}
2223
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte has the bit pattern 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), so a single byte can be
 * classified with the following pseudocode:
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 represents whether this byte is leading or not.
 *
 * This function applies that logic to every byte of the word pointed to
 * by s at once and returns how many of those bytes are leading bytes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    uintptr_t lead = ((word >> 6) | (~word >> 7)) & (NONASCII_MASK >> 7);

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lead);
#else
    lead += (lead >> 8);
    lead += (lead >> 16);
# if SIZEOF_VOIDP == 8
    lead += (lead >> 32);
# endif
    return (lead & 0xF);
#endif
}
#endif
2263
2264static inline long
2265enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2266{
2267 long c;
2268 const char *q;
2269
2270 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2271 long diff = (long)(e - p);
2272 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2273 }
2274#ifdef NONASCII_MASK
2275 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2276 uintptr_t len = 0;
2277 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2278 const uintptr_t *s, *t;
2279 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2280 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2281 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2282 while (p < (const char *)s) {
2283 if (is_utf8_lead_byte(*p)) len++;
2284 p++;
2285 }
2286 while (s < t) {
2287 len += count_utf8_lead_bytes_with_word(s);
2288 s++;
2289 }
2290 p = (const char *)s;
2291 }
2292 while (p < e) {
2293 if (is_utf8_lead_byte(*p)) len++;
2294 p++;
2295 }
2296 return (long)len;
2297 }
2298#endif
2299 else if (rb_enc_asciicompat(enc)) {
2300 c = 0;
2301 if (ENC_CODERANGE_CLEAN_P(cr)) {
2302 while (p < e) {
2303 if (ISASCII(*p)) {
2304 q = search_nonascii(p, e);
2305 if (!q)
2306 return c + (e - p);
2307 c += q - p;
2308 p = q;
2309 }
2310 p += rb_enc_fast_mbclen(p, e, enc);
2311 c++;
2312 }
2313 }
2314 else {
2315 while (p < e) {
2316 if (ISASCII(*p)) {
2317 q = search_nonascii(p, e);
2318 if (!q)
2319 return c + (e - p);
2320 c += q - p;
2321 p = q;
2322 }
2323 p += rb_enc_mbclen(p, e, enc);
2324 c++;
2325 }
2326 }
2327 return c;
2328 }
2329
2330 for (c=0; p<e; c++) {
2331 p += rb_enc_mbclen(p, e, enc);
2332 }
2333 return c;
2334}
2335
2336long
2337rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2338{
2339 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2340}
2341
/* Like rb_enc_strlen(), but also stores a code range for the scanned
 * bytes in *cr.  Note that the value *cr holds on entry is not used;
 * it is overwritten.
 */
2345long
2346rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2347{
2348 long c;
2349 const char *q;
2350 int ret;
2351
2352 *cr = 0;
2353 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2354 long diff = (long)(e - p);
2355 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2356 }
2357 else if (rb_enc_asciicompat(enc)) {
2358 c = 0;
2359 while (p < e) {
2360 if (ISASCII(*p)) {
2361 q = search_nonascii(p, e);
2362 if (!q) {
2363 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2364 return c + (e - p);
2365 }
2366 c += q - p;
2367 p = q;
2368 }
2369 ret = rb_enc_precise_mbclen(p, e, enc);
2370 if (MBCLEN_CHARFOUND_P(ret)) {
2371 *cr |= ENC_CODERANGE_VALID;
2372 p += MBCLEN_CHARFOUND_LEN(ret);
2373 }
2374 else {
2376 p++;
2377 }
2378 c++;
2379 }
2380 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2381 return c;
2382 }
2383
2384 for (c=0; p<e; c++) {
2385 ret = rb_enc_precise_mbclen(p, e, enc);
2386 if (MBCLEN_CHARFOUND_P(ret)) {
2387 *cr |= ENC_CODERANGE_VALID;
2388 p += MBCLEN_CHARFOUND_LEN(ret);
2389 }
2390 else {
2392 if (p + rb_enc_mbminlen(enc) <= e)
2393 p += rb_enc_mbminlen(enc);
2394 else
2395 p = e;
2396 }
2397 }
2398 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2399 return c;
2400}
2401
2402/* enc must be str's enc or rb_enc_check(str, str2) */
2403static long
2404str_strlen(VALUE str, rb_encoding *enc)
2405{
2406 const char *p, *e;
2407 int cr;
2408
2409 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2410 if (!enc) enc = STR_ENC_GET(str);
2411 p = RSTRING_PTR(str);
2412 e = RSTRING_END(str);
2413 cr = ENC_CODERANGE(str);
2414
2415 if (cr == ENC_CODERANGE_UNKNOWN) {
2416 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2417 if (cr) ENC_CODERANGE_SET(str, cr);
2418 return n;
2419 }
2420 else {
2421 return enc_strlen(p, e, enc, cr);
2422 }
2423}
2424
2425long
2427{
2428 return str_strlen(str, NULL);
2429}
2430
2431/*
2432 * call-seq:
2433 * length -> integer
2434 *
2435 * :include: doc/string/length.rdoc
2436 *
2437 */
2438
2439VALUE
2441{
2442 return LONG2NUM(str_strlen(str, NULL));
2443}
2444
2445/*
2446 * call-seq:
2447 * bytesize -> integer
2448 *
2449 * :include: doc/string/bytesize.rdoc
2450 *
2451 */
2452
2453VALUE
2454rb_str_bytesize(VALUE str)
2455{
2456 return LONG2NUM(RSTRING_LEN(str));
2457}
2458
2459/*
2460 * call-seq:
2461 * empty? -> true or false
2462 *
2463 * Returns whether the length of +self+ is zero:
2464 *
2465 * 'hello'.empty? # => false
2466 * ' '.empty? # => false
2467 * ''.empty? # => true
2468 *
2469 * Related: see {Querying}[rdoc-ref:String@Querying].
2470 */
2471
2472static VALUE
2473rb_str_empty(VALUE str)
2474{
2475 return RBOOL(RSTRING_LEN(str) == 0);
2476}
2477
2478/*
2479 * call-seq:
2480 * self + other_string -> new_string
2481 *
2482 * Returns a new string containing +other_string+ concatenated to +self+:
2483 *
2484 * 'Hello from ' + self.to_s # => "Hello from main"
2485 *
2486 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2487 */
2488
2489VALUE
2491{
2492 VALUE str3;
2493 rb_encoding *enc;
2494 char *ptr1, *ptr2, *ptr3;
2495 long len1, len2;
2496 int termlen;
2497
2498 StringValue(str2);
2499 enc = rb_enc_check_str(str1, str2);
2500 RSTRING_GETMEM(str1, ptr1, len1);
2501 RSTRING_GETMEM(str2, ptr2, len2);
2502 termlen = rb_enc_mbminlen(enc);
2503 if (len1 > LONG_MAX - len2) {
2504 rb_raise(rb_eArgError, "string size too big");
2505 }
2506 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2507 ptr3 = RSTRING_PTR(str3);
2508 memcpy(ptr3, ptr1, len1);
2509 memcpy(ptr3+len1, ptr2, len2);
2510 TERM_FILL(&ptr3[len1+len2], termlen);
2511
2512 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2514 RB_GC_GUARD(str1);
2515 RB_GC_GUARD(str2);
2516 return str3;
2517}
2518
2519/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2520VALUE
2521rb_str_opt_plus(VALUE str1, VALUE str2)
2522{
2525 long len1, len2;
2526 MAYBE_UNUSED(char) *ptr1, *ptr2;
2527 RSTRING_GETMEM(str1, ptr1, len1);
2528 RSTRING_GETMEM(str2, ptr2, len2);
2529 int enc1 = rb_enc_get_index(str1);
2530 int enc2 = rb_enc_get_index(str2);
2531
2532 if (enc1 < 0) {
2533 return Qundef;
2534 }
2535 else if (enc2 < 0) {
2536 return Qundef;
2537 }
2538 else if (enc1 != enc2) {
2539 return Qundef;
2540 }
2541 else if (len1 > LONG_MAX - len2) {
2542 return Qundef;
2543 }
2544 else {
2545 return rb_str_plus(str1, str2);
2546 }
2547
2548}
2549
2550/*
2551 * call-seq:
2552 * self * n -> new_string
2553 *
2554 * Returns a new string containing +n+ copies of +self+:
2555 *
2556 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2557 * 'No!' * 0 # => ""
2558 *
2559 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2560 */
2561
2562VALUE
2564{
2565 VALUE str2;
2566 long n, len;
2567 char *ptr2;
2568 int termlen;
2569
2570 if (times == INT2FIX(1)) {
2571 return str_duplicate(rb_cString, str);
2572 }
2573 if (times == INT2FIX(0)) {
2574 str2 = str_alloc_embed(rb_cString, 0);
2575 rb_enc_copy(str2, str);
2576 return str2;
2577 }
2578 len = NUM2LONG(times);
2579 if (len < 0) {
2580 rb_raise(rb_eArgError, "negative argument");
2581 }
2582 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2583 if (STR_EMBEDDABLE_P(len, 1)) {
2584 str2 = str_alloc_embed(rb_cString, len + 1);
2585 memset(RSTRING_PTR(str2), 0, len + 1);
2586 }
2587 else {
2588 str2 = str_alloc_heap(rb_cString);
2589 RSTRING(str2)->as.heap.aux.capa = len;
2590 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2591 }
2592 STR_SET_LEN(str2, len);
2593 rb_enc_copy(str2, str);
2594 return str2;
2595 }
2596 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2597 rb_raise(rb_eArgError, "argument too big");
2598 }
2599
2600 len *= RSTRING_LEN(str);
2601 termlen = TERM_LEN(str);
2602 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2603 ptr2 = RSTRING_PTR(str2);
2604 if (len) {
2605 n = RSTRING_LEN(str);
2606 memcpy(ptr2, RSTRING_PTR(str), n);
2607 while (n <= len/2) {
2608 memcpy(ptr2 + n, ptr2, n);
2609 n *= 2;
2610 }
2611 memcpy(ptr2 + n, ptr2, len-n);
2612 }
2613 STR_SET_LEN(str2, len);
2614 TERM_FILL(&ptr2[len], termlen);
2615 rb_enc_cr_str_copy_for_substr(str2, str);
2616
2617 return str2;
2618}
2619
2620/*
2621 * call-seq:
2622 * self % object -> new_string
2623 *
2624 * Returns the result of formatting +object+ into the format specifications
2625 * contained in +self+
2626 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2627 *
2628 * '%05d' % 123 # => "00123"
2629 *
2630 * If +self+ contains multiple format specifications,
2631 * +object+ must be an array or hash containing the objects to be formatted:
2632 *
2633 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2634 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2635 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2636 *
2637 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2638 */
2639
2640static VALUE
2641rb_str_format_m(VALUE str, VALUE arg)
2642{
2643 VALUE tmp = rb_check_array_type(arg);
2644
2645 if (!NIL_P(tmp)) {
2646 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2647 RB_GC_GUARD(tmp);
2648 return result;
2649 }
2650 return rb_str_format(1, &arg, str);
2651}
2652
2653static inline void
2654rb_check_lockedtmp(VALUE str)
2655{
2656 if (FL_TEST(str, STR_TMPLOCK)) {
2657 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2658 }
2659}
2660
2661// If none of these flags are set, we know we have an modifiable string.
2662// If any is set, we need to do more detailed checks.
2663#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2664static inline void
2665str_modifiable(VALUE str)
2666{
2667 RUBY_ASSERT(ruby_thread_has_gvl_p());
2668
2669 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2670 if (CHILLED_STRING_P(str)) {
2671 CHILLED_STRING_MUTATED(str);
2672 }
2673 rb_check_lockedtmp(str);
2674 rb_check_frozen(str);
2675 }
2676}
2677
2678static inline int
2679str_dependent_p(VALUE str)
2680{
2681 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2682 return FALSE;
2683 }
2684 else {
2685 return TRUE;
2686 }
2687}
2688
2689// If none of these flags are set, we know we have an independent string.
2690// If any is set, we need to do more detailed checks.
2691#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2692static inline int
2693str_independent(VALUE str)
2694{
2695 RUBY_ASSERT(ruby_thread_has_gvl_p());
2696
2697 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2698 str_modifiable(str);
2699 return !str_dependent_p(str);
2700 }
2701 return TRUE;
2702}
2703
2704static void
2705str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2706{
2707 RUBY_ASSERT(ruby_thread_has_gvl_p());
2708
2709 char *ptr;
2710 char *oldptr;
2711 long capa = len + expand;
2712
2713 if (len > capa) len = capa;
2714
2715 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2716 ptr = RSTRING(str)->as.heap.ptr;
2717 STR_SET_EMBED(str);
2718 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2719 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2720 STR_SET_LEN(str, len);
2721 return;
2722 }
2723
2724 ptr = ALLOC_N(char, (size_t)capa + termlen);
2725 oldptr = RSTRING_PTR(str);
2726 if (oldptr) {
2727 memcpy(ptr, oldptr, len);
2728 }
2729 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2730 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2731 }
2732 STR_SET_NOEMBED(str);
2733 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2734 TERM_FILL(ptr + len, termlen);
2735 RSTRING(str)->as.heap.ptr = ptr;
2736 STR_SET_LEN(str, len);
2737 RSTRING(str)->as.heap.aux.capa = capa;
2738}
2739
2740void
2741rb_str_modify(VALUE str)
2742{
2743 if (!str_independent(str))
2744 str_make_independent(str);
2746}
2747
2748void
2750{
2751 RUBY_ASSERT(ruby_thread_has_gvl_p());
2752
2753 int termlen = TERM_LEN(str);
2754 long len = RSTRING_LEN(str);
2755
2756 if (expand < 0) {
2757 rb_raise(rb_eArgError, "negative expanding string size");
2758 }
2759 if (expand >= LONG_MAX - len) {
2760 rb_raise(rb_eArgError, "string size too big");
2761 }
2762
2763 if (!str_independent(str)) {
2764 str_make_independent_expand(str, len, expand, termlen);
2765 }
2766 else if (expand > 0) {
2767 RESIZE_CAPA_TERM(str, len + expand, termlen);
2768 }
2770}
2771
2772/* As rb_str_modify(), but don't clear coderange */
2773static void
2774str_modify_keep_cr(VALUE str)
2775{
2776 if (!str_independent(str))
2777 str_make_independent(str);
2779 /* Force re-scan later */
2781}
2782
2783static inline void
2784str_discard(VALUE str)
2785{
2786 str_modifiable(str);
2787 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2788 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2789 RSTRING(str)->as.heap.ptr = 0;
2790 STR_SET_LEN(str, 0);
2791 }
2792}
2793
2794void
2796{
2797 int encindex = rb_enc_get_index(str);
2798
2799 if (RB_UNLIKELY(encindex == -1)) {
2800 rb_raise(rb_eTypeError, "not encoding capable object");
2801 }
2802
2803 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2804 return;
2805 }
2806
2807 rb_encoding *enc = rb_enc_from_index(encindex);
2808 if (!rb_enc_asciicompat(enc)) {
2809 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2810 }
2811}
2812
2813VALUE
2815{
2816 RUBY_ASSERT(ruby_thread_has_gvl_p());
2817
2818 VALUE s = *ptr;
2819 if (!RB_TYPE_P(s, T_STRING)) {
2820 s = rb_str_to_str(s);
2821 *ptr = s;
2822 }
2823 return s;
2824}
2825
2826char *
2828{
2829 VALUE str = rb_string_value(ptr);
2830 return RSTRING_PTR(str);
2831}
2832
2833static const char *
2834str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2835{
2836 const char *e = s + len;
2837
2838 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2839 if (zero_filled(s, minlen)) return s;
2840 }
2841 return 0;
2842}
2843
2844static char *
2845str_fill_term(VALUE str, char *s, long len, int termlen)
2846{
2847 /* This function assumes that (capa + termlen) bytes of memory
2848 * is allocated, like many other functions in this file.
2849 */
2850 if (str_dependent_p(str)) {
2851 if (!zero_filled(s + len, termlen))
2852 str_make_independent_expand(str, len, 0L, termlen);
2853 }
2854 else {
2855 TERM_FILL(s + len, termlen);
2856 return s;
2857 }
2858 return RSTRING_PTR(str);
2859}
2860
2861void
2862rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2863{
2864 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2865 long len = RSTRING_LEN(str);
2866
2867 RUBY_ASSERT(capa >= len);
2868 if (capa - len < termlen) {
2869 rb_check_lockedtmp(str);
2870 str_make_independent_expand(str, len, 0L, termlen);
2871 }
2872 else if (str_dependent_p(str)) {
2873 if (termlen > oldtermlen)
2874 str_make_independent_expand(str, len, 0L, termlen);
2875 }
2876 else {
2877 if (!STR_EMBED_P(str)) {
2878 /* modify capa instead of realloc */
2879 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2880 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2881 }
2882 if (termlen > oldtermlen) {
2883 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2884 }
2885 }
2886
2887 return;
2888}
2889
2890static char *
2891str_null_check(VALUE str, int *w)
2892{
2893 char *s = RSTRING_PTR(str);
2894 long len = RSTRING_LEN(str);
2895 int minlen = 1;
2896
2897 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2898 rb_encoding *enc = rb_str_enc_get(str);
2899 minlen = rb_enc_mbminlen(enc);
2900
2901 if (minlen > 1) {
2902 *w = 1;
2903 if (str_null_char(s, len, minlen, enc)) {
2904 return NULL;
2905 }
2906 return str_fill_term(str, s, len, minlen);
2907 }
2908 }
2909
2910 *w = 0;
2911 if (!s || memchr(s, 0, len)) {
2912 return NULL;
2913 }
2914 if (s[len]) {
2915 s = str_fill_term(str, s, len, minlen);
2916 }
2917 return s;
2918}
2919
2920const char *
2921rb_str_null_check(VALUE str)
2922{
2924
2925 char *s;
2926 long len;
2927 RSTRING_GETMEM(str, s, len);
2928
2929 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2930 if (!s || memchr(s, 0, len)) {
2931 rb_raise(rb_eArgError, "string contains null byte");
2932 }
2933 }
2934 else {
2935 int w;
2936 const char *s = str_null_check(str, &w);
2937 if (!s) {
2938 if (w) {
2939 rb_raise(rb_eArgError, "string contains null char");
2940 }
2941 rb_raise(rb_eArgError, "string contains null byte");
2942 }
2943 }
2944
2945 return s;
2946}
2947
2948char *
2949rb_str_to_cstr(VALUE str)
2950{
2951 int w;
2952 return str_null_check(str, &w);
2953}
2954
2955char *
2957{
2958 VALUE str = rb_string_value(ptr);
2959 int w;
2960 char *s = str_null_check(str, &w);
2961 if (!s) {
2962 if (w) {
2963 rb_raise(rb_eArgError, "string contains null char");
2964 }
2965 rb_raise(rb_eArgError, "string contains null byte");
2966 }
2967 return s;
2968}
2969
2970char *
2971rb_str_fill_terminator(VALUE str, const int newminlen)
2972{
2973 char *s = RSTRING_PTR(str);
2974 long len = RSTRING_LEN(str);
2975 return str_fill_term(str, s, len, newminlen);
2976}
2977
2978VALUE
2980{
2981 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2982 return str;
2983}
2984
2985/*
2986 * call-seq:
2987 * String.try_convert(object) -> object, new_string, or nil
2988 *
2989 * Attempts to convert the given +object+ to a string.
2990 *
2991 * If +object+ is already a string, returns +object+, unmodified.
2992 *
2993 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2994 * calls <tt>object.to_str</tt> and returns the result.
2995 *
2996 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2997 *
2998 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2999 */
3000static VALUE
3001rb_str_s_try_convert(VALUE dummy, VALUE str)
3002{
3003 return rb_check_string_type(str);
3004}
3005
3006static char*
3007str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3008{
3009 long nth = *nthp;
3010 if (rb_enc_mbmaxlen(enc) == 1) {
3011 p += nth;
3012 }
3013 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3014 p += nth * rb_enc_mbmaxlen(enc);
3015 }
3016 else if (rb_enc_asciicompat(enc)) {
3017 const char *p2, *e2;
3018 int n;
3019
3020 while (p < e && 0 < nth) {
3021 e2 = p + nth;
3022 if (e < e2) {
3023 *nthp = nth;
3024 return (char *)e;
3025 }
3026 if (ISASCII(*p)) {
3027 p2 = search_nonascii(p, e2);
3028 if (!p2) {
3029 nth -= e2 - p;
3030 *nthp = nth;
3031 return (char *)e2;
3032 }
3033 nth -= p2 - p;
3034 p = p2;
3035 }
3036 n = rb_enc_mbclen(p, e, enc);
3037 p += n;
3038 nth--;
3039 }
3040 *nthp = nth;
3041 if (nth != 0) {
3042 return (char *)e;
3043 }
3044 return (char *)p;
3045 }
3046 else {
3047 while (p < e && nth--) {
3048 p += rb_enc_mbclen(p, e, enc);
3049 }
3050 }
3051 if (p > e) p = e;
3052 *nthp = nth;
3053 return (char*)p;
3054}
3055
3056char*
3057rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3058{
3059 return str_nth_len(p, e, &nth, enc);
3060}
3061
3062static char*
3063str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3064{
3065 if (singlebyte)
3066 p += nth;
3067 else {
3068 p = str_nth_len(p, e, &nth, enc);
3069 }
3070 if (!p) return 0;
3071 if (p > e) p = e;
3072 return (char *)p;
3073}
3074
3075/* char offset to byte offset */
3076static long
3077str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3078{
3079 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3080 if (!pp) return e - p;
3081 return pp - p;
3082}
3083
3084long
3085rb_str_offset(VALUE str, long pos)
3086{
3087 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3088 STR_ENC_GET(str), single_byte_optimizable(str));
3089}
3090
3091#ifdef NONASCII_MASK
3092static char *
3093str_utf8_nth(const char *p, const char *e, long *nthp)
3094{
3095 long nth = *nthp;
3096 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3097 const uintptr_t *s, *t;
3098 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3099 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3100 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3101 while (p < (const char *)s) {
3102 if (is_utf8_lead_byte(*p)) nth--;
3103 p++;
3104 }
3105 do {
3106 nth -= count_utf8_lead_bytes_with_word(s);
3107 s++;
3108 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3109 p = (char *)s;
3110 }
3111 while (p < e) {
3112 if (is_utf8_lead_byte(*p)) {
3113 if (nth == 0) break;
3114 nth--;
3115 }
3116 p++;
3117 }
3118 *nthp = nth;
3119 return (char *)p;
3120}
3121
/* Byte distance from p to the position nth characters further on. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *reached = str_utf8_nth(p, e, &remaining);

    return reached - p;
}
3128#endif
3129
3130/* byte offset to char offset */
3131long
3132rb_str_sublen(VALUE str, long pos)
3133{
3134 if (single_byte_optimizable(str) || pos < 0)
3135 return pos;
3136 else {
3137 char *p = RSTRING_PTR(str);
3138 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3139 }
3140}
3141
3142static VALUE
3143str_subseq(VALUE str, long beg, long len)
3144{
3145 VALUE str2;
3146
3147 RUBY_ASSERT(beg >= 0);
3148 RUBY_ASSERT(len >= 0);
3149 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3150
3151 const int termlen = TERM_LEN(str);
3152 if (!SHARABLE_SUBSTRING_P(str, beg, len)) {
3153 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg, len, rb_str_enc_get(str));
3154 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3156 }
3157 RB_GC_GUARD(str);
3158 return str2;
3159 }
3160
3161 str2 = str_alloc_heap(rb_cString);
3162 if (str_embed_capa(str2) >= len + termlen) {
3163 char *ptr2 = RSTRING(str2)->as.embed.ary;
3164 STR_SET_EMBED(str2);
3165 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3166 TERM_FILL(ptr2+len, termlen);
3167
3168 STR_SET_LEN(str2, len);
3169 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3171 }
3172
3173 RB_GC_GUARD(str);
3174 }
3175 else {
3176 str_replace_shared(str2, str);
3177 RUBY_ASSERT(!STR_EMBED_P(str2));
3178 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3179 ENC_CODERANGE_CLEAR(str2);
3180 }
3181
3182 RSTRING(str2)->as.heap.ptr += beg;
3183 if (RSTRING_LEN(str2) > len) {
3184 STR_SET_LEN(str2, len);
3185 }
3186 }
3187
3188 return str2;
3189}
3190
3191VALUE
3192rb_str_subseq(VALUE str, long beg, long len)
3193{
3194 VALUE str2 = str_subseq(str, beg, len);
3195 rb_enc_cr_str_copy_for_substr(str2, str);
3196 return str2;
3197}
3198
3199char *
3200rb_str_subpos(VALUE str, long beg, long *lenp)
3201{
3202 long len = *lenp;
3203 long slen = -1L;
3204 const long blen = RSTRING_LEN(str);
3205 rb_encoding *enc = STR_ENC_GET(str);
3206 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3207
3208 if (len < 0) return 0;
3209 if (beg < 0 && -beg < 0) return 0;
3210 if (!blen) {
3211 len = 0;
3212 }
3213 if (single_byte_optimizable(str)) {
3214 if (beg > blen) return 0;
3215 if (beg < 0) {
3216 beg += blen;
3217 if (beg < 0) return 0;
3218 }
3219 if (len > blen - beg)
3220 len = blen - beg;
3221 if (len < 0) return 0;
3222 p = s + beg;
3223 goto end;
3224 }
3225 if (beg < 0) {
3226 if (len > -beg) len = -beg;
3227 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3228 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3229 beg = -beg;
3230 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3231 p = e;
3232 if (!p) return 0;
3233 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3234 if (!p) return 0;
3235 len = e - p;
3236 goto end;
3237 }
3238 else {
3239 slen = str_strlen(str, enc);
3240 beg += slen;
3241 if (beg < 0) return 0;
3242 p = s + beg;
3243 if (len == 0) goto end;
3244 }
3245 }
3246 else if (beg > 0 && beg > blen) {
3247 return 0;
3248 }
3249 if (len == 0) {
3250 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3251 p = s + beg;
3252 }
3253#ifdef NONASCII_MASK
3254 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3255 enc == rb_utf8_encoding()) {
3256 p = str_utf8_nth(s, e, &beg);
3257 if (beg > 0) return 0;
3258 len = str_utf8_offset(p, e, len);
3259 }
3260#endif
3261 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3262 int char_sz = rb_enc_mbmaxlen(enc);
3263
3264 p = s + beg * char_sz;
3265 if (p > e) {
3266 return 0;
3267 }
3268 else if (len * char_sz > e - p)
3269 len = e - p;
3270 else
3271 len *= char_sz;
3272 }
3273 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3274 if (beg > 0) return 0;
3275 len = 0;
3276 }
3277 else {
3278 len = str_offset(p, e, len, enc, 0);
3279 }
3280 end:
3281 *lenp = len;
3282 RB_GC_GUARD(str);
3283 return p;
3284}
3285
3286static VALUE str_substr(VALUE str, long beg, long len, int empty);
3287
3288VALUE
3289rb_str_substr(VALUE str, long beg, long len)
3290{
3291 return str_substr(str, beg, len, TRUE);
3292}
3293
3294VALUE
3295rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3296{
3297 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3298}
3299
3300static VALUE
3301str_substr(VALUE str, long beg, long len, int empty)
3302{
3303 char *p = rb_str_subpos(str, beg, &len);
3304
3305 if (!p) return Qnil;
3306 if (!len && !empty) return Qnil;
3307
3308 beg = p - RSTRING_PTR(str);
3309
3310 VALUE str2 = str_subseq(str, beg, len);
3311 rb_enc_cr_str_copy_for_substr(str2, str);
3312 return str2;
3313}
3314
3315/* :nodoc: */
3316VALUE
3318{
3319 if (CHILLED_STRING_P(str)) {
3320 FL_UNSET_RAW(str, STR_CHILLED);
3321 }
3322
3323 if (OBJ_FROZEN(str)) return str;
3324 rb_str_resize(str, RSTRING_LEN(str));
3325 return rb_obj_freeze(str);
3326}
3327
3328/*
3329 * call-seq:
3330 * +string -> new_string or self
3331 *
3332 * Returns +self+ if +self+ is not frozen and can be mutated
3333 * without warning issuance.
3334 *
3335 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3336 *
3337 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3338 */
3339static VALUE
3340str_uplus(VALUE str)
3341{
3342 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3343 return rb_str_dup(str);
3344 }
3345 else {
3346 return str;
3347 }
3348}
3349
3350/*
3351 * call-seq:
3352 * -self -> frozen_string
3353 *
3354 * Returns a frozen string equal to +self+.
3355 *
3356 * The returned string is +self+ if and only if all of the following are true:
3357 *
3358 * - +self+ is already frozen.
3359 * - +self+ is an instance of \String (rather than of a subclass of \String)
3360 * - +self+ has no instance variables set on it.
3361 *
3362 * Otherwise, the returned string is a frozen copy of +self+.
3363 *
3364 * Returning +self+, when possible, saves duplicating +self+;
3365 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3366 *
3367 * It may also save duplicating other, already-existing, strings:
3368 *
3369 * s0 = 'foo'
3370 * s1 = 'foo'
3371 * s0.object_id == s1.object_id # => false
3372 * (-s0).object_id == (-s1).object_id # => true
3373 *
3374 * Note that method #-@ is convenient for defining a constant:
3375 *
3376 * FileName = -'config/database.yml'
3377 *
3378 * While its alias #dedup is better suited for chaining:
3379 *
3380 * 'foo'.dedup.gsub!('o')
3381 *
3382 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3383 */
3384static VALUE
3385str_uminus(VALUE str)
3386{
3387 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3388 str = rb_str_dup(str);
3389 }
3390 return rb_fstring(str);
3391}
3392
3393RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3394#define rb_str_dup_frozen rb_str_new_frozen
3395
3396VALUE
3398{
3399 rb_check_frozen(str);
3400 if (FL_TEST(str, STR_TMPLOCK)) {
3401 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3402 }
3403 FL_SET(str, STR_TMPLOCK);
3404 return str;
3405}
3406
3407VALUE
3409{
3410 rb_check_frozen(str);
3411 if (!FL_TEST(str, STR_TMPLOCK)) {
3412 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3413 }
3414 FL_UNSET(str, STR_TMPLOCK);
3415 return str;
3416}
3417
3418VALUE
3419rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3420{
3421 rb_str_locktmp(str);
3422 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3423}
3424
3425void
3427{
3428 RUBY_ASSERT(ruby_thread_has_gvl_p());
3429
3430 long capa;
3431 const int termlen = TERM_LEN(str);
3432
3433 str_modifiable(str);
3434 if (STR_SHARED_P(str)) {
3435 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3436 }
3437 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3438 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3439 }
3440
3441 int cr = ENC_CODERANGE(str);
3442 if (len == 0) {
3443 /* Empty string does not contain non-ASCII */
3445 }
3446 else if (cr == ENC_CODERANGE_UNKNOWN) {
3447 /* Leave unknown. */
3448 }
3449 else if (len > RSTRING_LEN(str)) {
3450 if (ENC_CODERANGE_CLEAN_P(cr)) {
3451 /* Update the coderange regarding the extended part. */
3452 const char *const prev_end = RSTRING_END(str);
3453 const char *const new_end = RSTRING_PTR(str) + len;
3454 rb_encoding *enc = rb_enc_get(str);
3455 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3456 ENC_CODERANGE_SET(str, cr);
3457 }
3458 else if (cr == ENC_CODERANGE_BROKEN) {
3459 /* May be valid now, by appended part. */
3461 }
3462 }
3463 else if (len < RSTRING_LEN(str)) {
3464 if (cr != ENC_CODERANGE_7BIT) {
3465 /* ASCII-only string is keeping after truncated. Valid
3466 * and broken may be invalid or valid, leave unknown. */
3468 }
3469 }
3470
3471 STR_SET_LEN(str, len);
3472 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3473}
3474
3475VALUE
3476rb_str_resize(VALUE str, long len)
3477{
3478 if (len < 0) {
3479 rb_raise(rb_eArgError, "negative string size (or size too big)");
3480 }
3481
3482 int independent = str_independent(str);
3483 long slen = RSTRING_LEN(str);
3484 const int termlen = TERM_LEN(str);
3485
3486 if (slen > len || (termlen != 1 && slen < len)) {
3488 }
3489
3490 {
3491 long capa;
3492 if (STR_EMBED_P(str)) {
3493 if (len == slen) return str;
3494 if (str_embed_capa(str) >= len + termlen) {
3495 STR_SET_LEN(str, len);
3496 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3497 return str;
3498 }
3499 str_make_independent_expand(str, slen, len - slen, termlen);
3500 }
3501 else if (str_embed_capa(str) >= len + termlen) {
3502 capa = RSTRING(str)->as.heap.aux.capa;
3503 char *ptr = STR_HEAP_PTR(str);
3504 STR_SET_EMBED(str);
3505 if (slen > len) slen = len;
3506 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3507 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3508 STR_SET_LEN(str, len);
3509 if (independent) {
3510 SIZED_FREE_N(ptr, capa + termlen);
3511 }
3512 return str;
3513 }
3514 else if (!independent) {
3515 if (len == slen) return str;
3516 str_make_independent_expand(str, slen, len - slen, termlen);
3517 }
3518 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3519 (capa - len) > (len < 1024 ? len : 1024)) {
3520 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3521 (size_t)len + termlen, STR_HEAP_SIZE(str));
3522 RSTRING(str)->as.heap.aux.capa = len;
3523 }
3524 else if (len == slen) return str;
3525 STR_SET_LEN(str, len);
3526 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3527 }
3528 return str;
3529}
3530
3531static void
3532str_ensure_available_capa(VALUE str, long len)
3533{
3534 str_modify_keep_cr(str);
3535
3536 const int termlen = TERM_LEN(str);
3537 long olen = RSTRING_LEN(str);
3538
3539 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3540 rb_raise(rb_eArgError, "string sizes too big");
3541 }
3542
3543 long total = olen + len;
3544 long capa = str_capacity(str, termlen);
3545
3546 if (capa < total) {
3547 if (total >= LONG_MAX / 2) {
3548 capa = total;
3549 }
3550 while (total > capa) {
3551 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3552 }
3553 RESIZE_CAPA_TERM(str, capa, termlen);
3554 }
3555}
3556
3557static VALUE
3558str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3559{
3560 if (keep_cr) {
3561 str_modify_keep_cr(str);
3562 }
3563 else {
3564 rb_str_modify(str);
3565 }
3566 if (len == 0) return 0;
3567
3568 long total, olen, off = -1;
3569 char *sptr;
3570 const int termlen = TERM_LEN(str);
3571
3572 RSTRING_GETMEM(str, sptr, olen);
3573 if (ptr >= sptr && ptr <= sptr + olen) {
3574 off = ptr - sptr;
3575 }
3576
3577 long capa = str_capacity(str, termlen);
3578
3579 if (olen > LONG_MAX - len) {
3580 rb_raise(rb_eArgError, "string sizes too big");
3581 }
3582 total = olen + len;
3583 if (capa < total) {
3584 if (total >= LONG_MAX / 2) {
3585 capa = total;
3586 }
3587 while (total > capa) {
3588 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3589 }
3590 RESIZE_CAPA_TERM(str, capa, termlen);
3591 sptr = RSTRING_PTR(str);
3592 }
3593 if (off != -1) {
3594 ptr = sptr + off;
3595 }
3596 memcpy(sptr + olen, ptr, len);
3597 STR_SET_LEN(str, total);
3598 TERM_FILL(sptr + total, termlen); /* sentinel */
3599
3600 return str;
3601}
3602
3603#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3604#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3605
3606VALUE
3607rb_str_cat(VALUE str, const char *ptr, long len)
3608{
3609 if (len == 0) return str;
3610 if (len < 0) {
3611 rb_raise(rb_eArgError, "negative string size (or size too big)");
3612 }
3613 return str_buf_cat(str, ptr, len);
3614}
3615
3616VALUE
3617rb_str_cat_cstr(VALUE str, const char *ptr)
3618{
3619 must_not_null(ptr);
3620 return rb_str_buf_cat(str, ptr, strlen(ptr));
3621}
3622
3623static void
3624rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3625{
3626 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3627
3628 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3629 if (UNLIKELY(!str_independent(str))) {
3630 str_make_independent(str);
3631 }
3632
3633 long string_length = -1;
3634 const int null_terminator_length = 1;
3635 char *sptr;
3636 RSTRING_GETMEM(str, sptr, string_length);
3637
3638 // Ensure the resulting string wouldn't be too long.
3639 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3640 rb_raise(rb_eArgError, "string sizes too big");
3641 }
3642
3643 long string_capacity = str_capacity(str, null_terminator_length);
3644
3645 // Get the code range before any modifications since those might clear the code range.
3646 int cr = ENC_CODERANGE(str);
3647
3648 // Check if the string has spare string_capacity to write the new byte.
3649 if (LIKELY(string_capacity >= string_length + 1)) {
3650 // In fast path we can write the new byte and note the string's new length.
3651 sptr[string_length] = byte;
3652 STR_SET_LEN(str, string_length + 1);
3653 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3654 }
3655 else {
3656 // If there's not enough string_capacity, make a call into the general string concatenation function.
3657 str_buf_cat(str, (char *)&byte, 1);
3658 }
3659
3660 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3661 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3662 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3663 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3664 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3665 if (ISASCII(byte)) {
3667 }
3668 else {
3670
3671 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3672 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3673 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3674 }
3675 }
3676 }
3677}
3678
/*
 * Older spellings of the concatenation functions.
 * NOTE(review): each RUBY_ALIAS_FUNCTION line appears to define the function
 * given by its first argument as an alias that forwards the listed arguments
 * to the function named by its second argument -- confirm against the macro
 * definition, which is not in this part of the file.
 */
3679RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3680RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3681RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3682
3683static VALUE
3684rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3685 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3686{
3687 int str_encindex = ENCODING_GET(str);
3688 int res_encindex;
3689 int str_cr, res_cr;
3690 rb_encoding *str_enc, *ptr_enc;
3691
3692 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3693
3694 if (str_encindex == ptr_encindex) {
3695 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3696 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3697 }
3698 }
3699 else {
3700 str_enc = rb_enc_from_index(str_encindex);
3701 ptr_enc = rb_enc_from_index(ptr_encindex);
3702 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3703 if (len == 0)
3704 return str;
3705 if (RSTRING_LEN(str) == 0) {
3706 rb_str_buf_cat(str, ptr, len);
3707 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3708 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3709 return str;
3710 }
3711 goto incompatible;
3712 }
3713 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3714 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3715 }
3716 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3717 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3718 str_cr = rb_enc_str_coderange(str);
3719 }
3720 }
3721 }
3722 if (ptr_cr_ret)
3723 *ptr_cr_ret = ptr_cr;
3724
3725 if (str_encindex != ptr_encindex &&
3726 str_cr != ENC_CODERANGE_7BIT &&
3727 ptr_cr != ENC_CODERANGE_7BIT) {
3728 str_enc = rb_enc_from_index(str_encindex);
3729 ptr_enc = rb_enc_from_index(ptr_encindex);
3730 goto incompatible;
3731 }
3732
3733 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3734 res_encindex = str_encindex;
3735 res_cr = ENC_CODERANGE_UNKNOWN;
3736 }
3737 else if (str_cr == ENC_CODERANGE_7BIT) {
3738 if (ptr_cr == ENC_CODERANGE_7BIT) {
3739 res_encindex = str_encindex;
3740 res_cr = ENC_CODERANGE_7BIT;
3741 }
3742 else {
3743 res_encindex = ptr_encindex;
3744 res_cr = ptr_cr;
3745 }
3746 }
3747 else if (str_cr == ENC_CODERANGE_VALID) {
3748 res_encindex = str_encindex;
3749 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3750 res_cr = str_cr;
3751 else
3752 res_cr = ptr_cr;
3753 }
3754 else { /* str_cr == ENC_CODERANGE_BROKEN */
3755 res_encindex = str_encindex;
3756 res_cr = str_cr;
3757 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3758 }
3759
3760 if (len < 0) {
3761 rb_raise(rb_eArgError, "negative string size (or size too big)");
3762 }
3763 str_buf_cat(str, ptr, len);
3764 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3765 return str;
3766
3767 incompatible:
3768 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3769 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3771}
3772
3773VALUE
3774rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3775{
3776 return rb_enc_cr_str_buf_cat(str, ptr, len,
3777 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3778}
3779
3780VALUE
3782{
3783 /* ptr must reference NUL terminated ASCII string. */
3784 int encindex = ENCODING_GET(str);
3785 rb_encoding *enc = rb_enc_from_index(encindex);
3786 if (rb_enc_asciicompat(enc)) {
3787 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3788 encindex, ENC_CODERANGE_7BIT, 0);
3789 }
3790 else {
3791 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3792 while (*ptr) {
3793 unsigned int c = (unsigned char)*ptr;
3794 int len = rb_enc_codelen(c, enc);
3795 rb_enc_mbcput(c, buf, enc);
3796 rb_enc_cr_str_buf_cat(str, buf, len,
3797 encindex, ENC_CODERANGE_VALID, 0);
3798 ptr++;
3799 }
3800 return str;
3801 }
3802}
3803
3804VALUE
3806{
3807 int str2_cr = rb_enc_str_coderange(str2);
3808
3809 if (rb_str_enc_fastpath(str)) {
3810 switch (str2_cr) {
3811 case ENC_CODERANGE_7BIT:
3812 // If RHS is 7bit we can do simple concatenation
3813 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3814 RB_GC_GUARD(str2);
3815 return str;
3817 // If RHS is valid, we can do simple concatenation if encodings are the same
3818 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3819 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3820 int str_cr = ENC_CODERANGE(str);
3821 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3822 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3823 }
3824 RB_GC_GUARD(str2);
3825 return str;
3826 }
3827 }
3828 }
3829
3830 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3831 ENCODING_GET(str2), str2_cr, &str2_cr);
3832
3833 ENC_CODERANGE_SET(str2, str2_cr);
3834
3835 return str;
3836}
3837
3838VALUE
3840{
3841 StringValue(str2);
3842 return rb_str_buf_append(str, str2);
3843}
3844
3845VALUE
3846rb_str_concat_literals(size_t num, const VALUE *strary)
3847{
3848 VALUE str;
3849 size_t i, s = 0;
3850 unsigned long len = 1;
3851
3852 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3853 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3854
3855 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3856 str = rb_str_buf_new(len);
3857 str_enc_copy_direct(str, strary[0]);
3858
3859 for (i = s; i < num; ++i) {
3860 const VALUE v = strary[i];
3861 int encidx = ENCODING_GET(v);
3862
3863 rb_str_buf_append(str, v);
3864 if (encidx != ENCINDEX_US_ASCII) {
3865 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3866 rb_enc_set_index(str, encidx);
3867 }
3868 }
3869 return str;
3870}
3871
3872/*
3873 * call-seq:
3874 * concat(*objects) -> string
3875 *
3876 * :include: doc/string/concat.rdoc
3877 */
3878static VALUE
3879rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3880{
3881 str_modifiable(str);
3882
3883 if (argc == 1) {
3884 return rb_str_concat(str, argv[0]);
3885 }
3886 else if (argc > 1) {
3887 int i;
3888 VALUE arg_str = rb_str_tmp_new(0);
3889 rb_enc_copy(arg_str, str);
3890 for (i = 0; i < argc; i++) {
3891 rb_str_concat(arg_str, argv[i]);
3892 }
3893 rb_str_buf_append(str, arg_str);
3894 }
3895
3896 return str;
3897}
3898
3899/*
3900 * call-seq:
3901 * append_as_bytes(*objects) -> self
3902 *
3903 * Concatenates each object in +objects+ into +self+; returns +self+;
3904 * performs no encoding validation or conversion:
3905 *
3906 * s = 'foo'
3907 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3908 * s.valid_encoding? # => false
3909 * s.append_as_bytes("\xAC 12")
3910 * s.valid_encoding? # => true
3911 *
3912 * When a given object is an integer,
3913 * the value is considered an 8-bit byte;
3914 * if the integer occupies more than one byte (i.e., is greater than 255),
3915 * appends only the low-order byte (similar to String#setbyte):
3916 *
3917 * s = ""
3918 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3919 * s.bytesize # => 2
3920 *
3921 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3922 */
3923
3924VALUE
3925rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3926{
3927 long needed_capacity = 0;
3928 volatile VALUE t0;
3929 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3930
3931 for (int index = 0; index < argc; index++) {
3932 VALUE obj = argv[index];
3933 enum ruby_value_type type = types[index] = rb_type(obj);
3934 switch (type) {
3935 case T_FIXNUM:
3936 case T_BIGNUM:
3937 needed_capacity++;
3938 break;
3939 case T_STRING:
3940 needed_capacity += RSTRING_LEN(obj);
3941 break;
3942 default:
3943 rb_raise(
3945 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3946 rb_obj_class(obj)
3947 );
3948 break;
3949 }
3950 }
3951
3952 str_ensure_available_capa(str, needed_capacity);
3953 char *sptr = RSTRING_END(str);
3954
3955 for (int index = 0; index < argc; index++) {
3956 VALUE obj = argv[index];
3957 enum ruby_value_type type = types[index];
3958 switch (type) {
3959 case T_FIXNUM:
3960 case T_BIGNUM: {
3961 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3962 char byte = (char)(NUM2INT(obj) & 0xFF);
3963 *sptr = byte;
3964 sptr++;
3965 break;
3966 }
3967 case T_STRING: {
3968 const char *ptr;
3969 long len;
3970 RSTRING_GETMEM(obj, ptr, len);
3971 memcpy(sptr, ptr, len);
3972 sptr += len;
3973 break;
3974 }
3975 default:
3976 rb_bug("append_as_bytes arguments should have been validated");
3977 }
3978 }
3979
3980 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3981 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3982
3983 int cr = ENC_CODERANGE(str);
3984 switch (cr) {
3985 case ENC_CODERANGE_7BIT: {
3986 for (int index = 0; index < argc; index++) {
3987 VALUE obj = argv[index];
3988 enum ruby_value_type type = types[index];
3989 switch (type) {
3990 case T_FIXNUM:
3991 case T_BIGNUM: {
3992 if (!ISASCII(NUM2INT(obj))) {
3993 goto clear_cr;
3994 }
3995 break;
3996 }
3997 case T_STRING: {
3998 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3999 goto clear_cr;
4000 }
4001 break;
4002 }
4003 default:
4004 rb_bug("append_as_bytes arguments should have been validated");
4005 }
4006 }
4007 break;
4008 }
4010 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4011 goto keep_cr;
4012 }
4013 else {
4014 goto clear_cr;
4015 }
4016 break;
4017 default:
4018 goto clear_cr;
4019 break;
4020 }
4021
4022 RB_GC_GUARD(t0);
4023
4024 clear_cr:
4025 // If no fast path was hit, we clear the coderange.
4026 // append_as_bytes is predominantly meant to be used in
4027 // buffering situation, hence it's likely the coderange
4028 // will never be scanned, so it's not worth spending time
4029 // precomputing the coderange except for simple and common
4030 // situations.
4032 keep_cr:
4033 return str;
4034}
4035
4036/*
4037 * call-seq:
4038 * self << object -> self
4039 *
4040 * Appends a string representation of +object+ to +self+;
4041 * returns +self+.
4042 *
4043 * If +object+ is a string, appends it to +self+:
4044 *
4045 * s = 'foo'
4046 * s << 'bar' # => "foobar"
4047 * s # => "foobar"
4048 *
4049 * If +object+ is an integer,
4050 * its value is considered a codepoint;
4051 * converts the value to a character before concatenating:
4052 *
4053 * s = 'foo'
4054 * s << 33 # => "foo!"
4055 *
4056 * Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4057 * and the encoding of +self+ is Encoding::US_ASCII,
4058 * changes the encoding to Encoding::ASCII_8BIT:
4059 *
4060 * s = 'foo'.encode(Encoding::US_ASCII)
4061 * s.encoding # => #<Encoding:US-ASCII>
4062 * s << 0xff # => "foo\xFF"
4063 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4064 *
4065 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4066 *
4067 * s = 'foo'
4068 * s.encoding # => #<Encoding:UTF-8>
4069 * s << 0x00110000 # 1114112 out of char range (RangeError)
4070 * s = 'foo'.encode(Encoding::EUC_JP)
4071 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4072 *
4073 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4074 */
4075VALUE
4077{
4078 unsigned int code;
4079 rb_encoding *enc = STR_ENC_GET(str1);
4080 int encidx;
4081
4082 if (RB_INTEGER_TYPE_P(str2)) {
4083 if (rb_num_to_uint(str2, &code) == 0) {
4084 }
4085 else if (FIXNUM_P(str2)) {
4086 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4087 }
4088 else {
4089 rb_raise(rb_eRangeError, "bignum out of char range");
4090 }
4091 }
4092 else {
4093 return rb_str_append(str1, str2);
4094 }
4095
4096 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4097
4098 if (encidx >= 0) {
4099 rb_str_buf_cat_byte(str1, (unsigned char)code);
4100 }
4101 else {
4102 long pos = RSTRING_LEN(str1);
4103 int cr = ENC_CODERANGE(str1);
4104 int len;
4105 char *buf;
4106
4107 switch (len = rb_enc_codelen(code, enc)) {
4108 case ONIGERR_INVALID_CODE_POINT_VALUE:
4109 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4110 break;
4111 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4112 case 0:
4113 rb_raise(rb_eRangeError, "%u out of char range", code);
4114 break;
4115 }
4116 buf = ALLOCA_N(char, len + 1);
4117 rb_enc_mbcput(code, buf, enc);
4118 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4119 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4120 }
4121 rb_str_resize(str1, pos+len);
4122 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4123 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4125 }
4126 else if (cr == ENC_CODERANGE_BROKEN) {
4128 }
4129 ENC_CODERANGE_SET(str1, cr);
4130 }
4131 return str1;
4132}
4133
4134int
4135rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4136{
4137 int encidx = rb_enc_to_index(enc);
4138
4139 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4140 /* US-ASCII automatically extended to ASCII-8BIT */
4141 if (code > 0xFF) {
4142 rb_raise(rb_eRangeError, "%u out of char range", code);
4143 }
4144 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4145 return ENCINDEX_ASCII_8BIT;
4146 }
4147 return encidx;
4148 }
4149 else {
4150 return -1;
4151 }
4152}
4153
4154/*
4155 * call-seq:
4156 * prepend(*other_strings) -> self
4157 *
4158 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4159 *
4160 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4161 *
4162 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4163 *
4164 */
4165
4166static VALUE
4167rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4168{
4169 str_modifiable(str);
4170
4171 if (argc == 1) {
4172 rb_str_update(str, 0L, 0L, argv[0]);
4173 }
4174 else if (argc > 1) {
4175 int i;
4176 VALUE arg_str = rb_str_tmp_new(0);
4177 rb_enc_copy(arg_str, str);
4178 for (i = 0; i < argc; i++) {
4179 rb_str_append(arg_str, argv[i]);
4180 }
4181 rb_str_update(str, 0L, 0L, arg_str);
4182 }
4183
4184 return str;
4185}
4186
4187st_index_t
4189{
4190 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4191 st_index_t precomputed_hash;
4192 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4193
4194 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4195 return precomputed_hash;
4196 }
4197
4198 return str_do_hash(str);
4199}
4200
4201int
4203{
4204 long len1, len2;
4205 const char *ptr1, *ptr2;
4206 RSTRING_GETMEM(str1, ptr1, len1);
4207 RSTRING_GETMEM(str2, ptr2, len2);
4208 return (len1 != len2 ||
4209 !rb_str_comparable(str1, str2) ||
4210 memcmp(ptr1, ptr2, len1) != 0);
4211}
4212
4213/*
4214 * call-seq:
4215 * hash -> integer
4216 *
4217 * :include: doc/string/hash.rdoc
4218 *
4219 */
4220
4221static VALUE
4222rb_str_hash_m(VALUE str)
4223{
4224 st_index_t hval = rb_str_hash(str);
4225 return ST2FIX(hval);
4226}
4227
/* Smaller of a and b.  Each argument may be evaluated twice, so only pass
 * expressions without side effects. */
4228#define lesser(a,b) (((a)>(b))?(b):(a))
4229
4230int
4232{
4233 int idx1, idx2;
4234 int rc1, rc2;
4235
4236 if (RSTRING_LEN(str1) == 0) return TRUE;
4237 if (RSTRING_LEN(str2) == 0) return TRUE;
4238 idx1 = ENCODING_GET(str1);
4239 idx2 = ENCODING_GET(str2);
4240 if (idx1 == idx2) return TRUE;
4241 rc1 = rb_enc_str_coderange(str1);
4242 rc2 = rb_enc_str_coderange(str2);
4243 if (rc1 == ENC_CODERANGE_7BIT) {
4244 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4245 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4246 return TRUE;
4247 }
4248 if (rc2 == ENC_CODERANGE_7BIT) {
4249 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4250 return TRUE;
4251 }
4252 return FALSE;
4253}
4254
4255int
4257{
4258 long len1, len2;
4259 const char *ptr1, *ptr2;
4260 int retval;
4261
4262 if (str1 == str2) return 0;
4263 RSTRING_GETMEM(str1, ptr1, len1);
4264 RSTRING_GETMEM(str2, ptr2, len2);
4265 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4266 if (len1 == len2) {
4267 if (!rb_str_comparable(str1, str2)) {
4268 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4269 return 1;
4270 return -1;
4271 }
4272 return 0;
4273 }
4274 if (len1 > len2) return 1;
4275 return -1;
4276 }
4277 if (retval > 0) return 1;
4278 return -1;
4279}
4280
4281/*
4282 * call-seq:
4283 * self == other -> true or false
4284 *
4285 * Returns whether +other+ is equal to +self+.
4286 *
4287 * When +other+ is a string, returns whether +other+ has the same length and content as +self+:
4288 *
4289 * s = 'foo'
4290 * s == 'foo' # => true
4291 * s == 'food' # => false
4292 * s == 'FOO' # => false
4293 *
4294 * Returns +false+ if the two strings' encodings are not compatible:
4295 *
4296 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4297 *
4298 * When +other+ is not a string:
4299 *
4300 * - If +other+ responds to method <tt>to_str</tt>,
4301 * <tt>other == self</tt> is called and its return value is returned.
4302 * - If +other+ does not respond to <tt>to_str</tt>,
4303 * +false+ is returned.
4304 *
4305 * Related: {Comparing}[rdoc-ref:String@Comparing].
4306 */
4307
4308VALUE
4310{
4311 if (str1 == str2) return Qtrue;
4312 if (!RB_TYPE_P(str2, T_STRING)) {
4313 if (!rb_respond_to(str2, idTo_str)) {
4314 return Qfalse;
4315 }
4316 return rb_equal(str2, str1);
4317 }
4318 return rb_str_eql_internal(str1, str2);
4319}
4320
4321/*
4322 * call-seq:
4323 * eql?(object) -> true or false
4324 *
4325 * :include: doc/string/eql_p.rdoc
4326 *
4327 */
4328
4329VALUE
4330rb_str_eql(VALUE str1, VALUE str2)
4331{
4332 if (str1 == str2) return Qtrue;
4333 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4334 return rb_str_eql_internal(str1, str2);
4335}
4336
4337/*
4338 * call-seq:
4339 * self <=> other -> -1, 0, 1, or nil
4340 *
4341 * Compares +self+ and +other+,
4342 * evaluating their _contents_, not their _lengths_.
4343 *
4344 * Returns:
4345 *
4346 * - +-1+, if +self+ is smaller.
4347 * - +0+, if the two are equal.
4348 * - +1+, if +self+ is larger.
4349 * - +nil+, if the two are incomparable.
4350 *
4351 * Examples:
4352 *
4353 * 'a' <=> 'b' # => -1
4354 * 'a' <=> 'ab' # => -1
4355 * 'a' <=> 'a' # => 0
4356 * 'b' <=> 'a' # => 1
4357 * 'ab' <=> 'a' # => 1
4358 * 'a' <=> :a # => nil
4359 *
4360 * \Class \String includes module Comparable,
4361 * each of whose methods uses String#<=> for comparison.
4362 *
4363 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4364 */
4365
4366static VALUE
4367rb_str_cmp_m(VALUE str1, VALUE str2)
4368{
4369 int result;
4370 VALUE s = rb_check_string_type(str2);
4371 if (NIL_P(s)) {
4372 return rb_invcmp(str1, str2);
4373 }
4374 result = rb_str_cmp(str1, s);
4375 return INT2FIX(result);
4376}
4377
/* Forward declarations: both helpers are defined after the method functions
 * that call them. */
4378static VALUE str_casecmp(VALUE str1, VALUE str2);
4379static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4380
4381/*
4382 * call-seq:
4383 * casecmp(other_string) -> -1, 0, 1, or nil
4384 *
4385 * Ignoring case, compares +self+ and +other_string+; returns:
4386 *
4387 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4388 * - 0 if the two are equal.
4389 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4390 * - +nil+ if the two are incomparable.
4391 *
4392 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4393 *
4394 * Examples:
4395 *
4396 * 'foo'.casecmp('goo') # => -1
4397 * 'goo'.casecmp('foo') # => 1
4398 * 'foo'.casecmp('food') # => -1
4399 * 'food'.casecmp('foo') # => 1
4400 * 'FOO'.casecmp('foo') # => 0
4401 * 'foo'.casecmp('FOO') # => 0
4402 * 'foo'.casecmp(1) # => nil
4403 *
4404 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4405 */
4406
4407static VALUE
4408rb_str_casecmp(VALUE str1, VALUE str2)
4409{
4410 VALUE s = rb_check_string_type(str2);
4411 if (NIL_P(s)) {
4412 return Qnil;
4413 }
4414 return str_casecmp(str1, s);
4415}
4416
4417static VALUE
4418str_casecmp(VALUE str1, VALUE str2)
4419{
4420 long len;
4421 rb_encoding *enc;
4422 const char *p1, *p1end, *p2, *p2end;
4423
4424 enc = rb_enc_compatible(str1, str2);
4425 if (!enc) {
4426 return Qnil;
4427 }
4428
4429 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4430 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4431 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4432 while (p1 < p1end && p2 < p2end) {
4433 if (*p1 != *p2) {
4434 unsigned int c1 = TOLOWER(*p1 & 0xff);
4435 unsigned int c2 = TOLOWER(*p2 & 0xff);
4436 if (c1 != c2)
4437 return INT2FIX(c1 < c2 ? -1 : 1);
4438 }
4439 p1++;
4440 p2++;
4441 }
4442 }
4443 else {
4444 while (p1 < p1end && p2 < p2end) {
4445 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4446 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4447
4448 if (0 <= c1 && 0 <= c2) {
4449 c1 = TOLOWER(c1);
4450 c2 = TOLOWER(c2);
4451 if (c1 != c2)
4452 return INT2FIX(c1 < c2 ? -1 : 1);
4453 }
4454 else {
4455 int r;
4456 l1 = rb_enc_mbclen(p1, p1end, enc);
4457 l2 = rb_enc_mbclen(p2, p2end, enc);
4458 len = l1 < l2 ? l1 : l2;
4459 r = memcmp(p1, p2, len);
4460 if (r != 0)
4461 return INT2FIX(r < 0 ? -1 : 1);
4462 if (l1 != l2)
4463 return INT2FIX(l1 < l2 ? -1 : 1);
4464 }
4465 p1 += l1;
4466 p2 += l2;
4467 }
4468 }
4469 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4470 if (p1 == p1end) return INT2FIX(-1);
4471 return INT2FIX(1);
4472}
4473
4474/*
4475 * call-seq:
4476 * casecmp?(other_string) -> true, false, or nil
4477 *
4478 * Returns +true+ if +self+ and +other_string+ are equal after
4479 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4480 *
4481 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4482 *
4483 * Examples:
4484 *
4485 * 'foo'.casecmp?('goo') # => false
4486 * 'goo'.casecmp?('foo') # => false
4487 * 'foo'.casecmp?('food') # => false
4488 * 'food'.casecmp?('foo') # => false
4489 * 'FOO'.casecmp?('foo') # => true
4490 * 'foo'.casecmp?('FOO') # => true
4491 * 'foo'.casecmp?(1) # => nil
4492 *
4493 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4494 */
4495
4496static VALUE
4497rb_str_casecmp_p(VALUE str1, VALUE str2)
4498{
4499 VALUE s = rb_check_string_type(str2);
4500 if (NIL_P(s)) {
4501 return Qnil;
4502 }
4503 return str_casecmp_p(str1, s);
4504}
4505
4506static VALUE
4507str_casecmp_p(VALUE str1, VALUE str2)
4508{
4509 rb_encoding *enc;
4510 VALUE folded_str1, folded_str2;
4511 VALUE fold_opt = sym_fold;
4512
4513 enc = rb_enc_compatible(str1, str2);
4514 if (!enc) {
4515 return Qnil;
4516 }
4517
4518 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4519 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4520
4521 return rb_str_eql(folded_str1, folded_str2);
4522}
4523
4524static long
4525strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4526 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4527{
4528 const char *search_start = str_ptr;
4529 long pos, search_len = str_len - offset;
4530
4531 for (;;) {
4532 const char *t;
4533 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4534 if (pos < 0) return pos;
4535 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4536 if (t == search_start + pos) break;
4537 search_len -= t - search_start;
4538 if (search_len <= 0) return -1;
4539 offset += t - search_start;
4540 search_start = t;
4541 }
4542 return pos + offset;
4543}
4544
4545/* Both return the found index in bytes; the last argument (in_byte) selects
 * whether `offset` is given in characters (0) or in bytes (1). */
4546#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4547#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4548
4549static long
4550rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4551{
4552 const char *str_ptr, *str_ptr_end, *sub_ptr;
4553 long str_len, sub_len;
4554 rb_encoding *enc;
4555
4556 enc = rb_enc_check(str, sub);
4557 if (is_broken_string(sub)) return -1;
4558
4559 str_ptr = RSTRING_PTR(str);
4560 str_ptr_end = RSTRING_END(str);
4561 str_len = RSTRING_LEN(str);
4562 sub_ptr = RSTRING_PTR(sub);
4563 sub_len = RSTRING_LEN(sub);
4564
4565 if (str_len < sub_len) return -1;
4566
4567 if (offset != 0) {
4568 long str_len_char, sub_len_char;
4569 int single_byte = single_byte_optimizable(str);
4570 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4571 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4572 if (offset < 0) {
4573 offset += str_len_char;
4574 if (offset < 0) return -1;
4575 }
4576 if (str_len_char - offset < sub_len_char) return -1;
4577 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4578 str_ptr += offset;
4579 }
4580 if (sub_len == 0) return offset;
4581
4582 /* need proceed one character at a time */
4583 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4584}
4585
4586
4587/*
4588 * call-seq:
4589 * index(pattern, offset = 0) -> integer or nil
4590 *
4591 * :include: doc/string/index.rdoc
4592 *
4593 */
4594
4595static VALUE
4596rb_str_index_m(int argc, VALUE *argv, VALUE str)
4597{
4598 VALUE sub;
4599 VALUE initpos;
4600 rb_encoding *enc = STR_ENC_GET(str);
4601 long pos;
4602
4603 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4604 long slen = str_strlen(str, enc); /* str's enc */
4605 pos = NUM2LONG(initpos);
4606 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4607 if (RB_TYPE_P(sub, T_REGEXP)) {
4609 }
4610 return Qnil;
4611 }
4612 }
4613 else {
4614 pos = 0;
4615 }
4616
4617 if (RB_TYPE_P(sub, T_REGEXP)) {
4618 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4619 enc, single_byte_optimizable(str));
4620
4621 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4622 VALUE match = rb_backref_get();
4623 struct re_registers *regs = RMATCH_REGS(match);
4624 pos = rb_str_sublen(str, BEG(0));
4625 return LONG2NUM(pos);
4626 }
4627 }
4628 else {
4629 StringValue(sub);
4630 pos = rb_str_index(str, sub, pos);
4631 if (pos >= 0) {
4632 pos = rb_str_sublen(str, pos);
4633 return LONG2NUM(pos);
4634 }
4635 }
4636 return Qnil;
4637}
4638
4639/* Ensure that the given pos is a valid character boundary.
4640 * Note that in this function, "character" means a code point
4641 * (Unicode scalar value), not a grapheme cluster.
4642 */
4643static void
4644str_ensure_byte_pos(VALUE str, long pos)
4645{
4646 if (!single_byte_optimizable(str)) {
4647 const char *s = RSTRING_PTR(str);
4648 const char *e = RSTRING_END(str);
4649 const char *p = s + pos;
4650 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4651 rb_raise(rb_eIndexError,
4652 "offset %ld does not land on character boundary", pos);
4653 }
4654 }
4655}
4656
4657/*
4658 * call-seq:
4659 * byteindex(object, offset = 0) -> integer or nil
4660 *
4661 * Returns the 0-based integer index of a substring of +self+
4662 * specified by +object+ (a string or Regexp) and +offset+,
4663 * or +nil+ if there is no such substring;
4664 * the returned index is the count of _bytes_ (not characters).
4665 *
4666 * When +object+ is a string,
4667 * returns the index of the first found substring equal to +object+:
4668 *
4669 * s = 'foo' # => "foo"
4670 * s.size # => 3 # Three 1-byte characters.
4671 * s.bytesize # => 3 # Three bytes.
4672 * s.byteindex('f') # => 0
4673 * s.byteindex('o') # => 1
4674 * s.byteindex('oo') # => 1
4675 * s.byteindex('ooo') # => nil
4676 *
4677 * When +object+ is a Regexp,
4678 * returns the index of the first found substring matching +object+;
4679 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4680 *
4681 * s = 'foo'
4682 * s.byteindex(/f/) # => 0
4683 * $~ # => #<MatchData "f">
4684 * s.byteindex(/o/) # => 1
4685 * s.byteindex(/oo/) # => 1
4686 * s.byteindex(/ooo/) # => nil
4687 * $~ # => nil
4688 *
4689 * \Integer argument +offset+, if given, specifies the 0-based index
4690 * of the byte where searching is to begin.
4691 *
4692 * When +offset+ is non-negative,
4693 * searching begins at byte position +offset+:
4694 *
4695 * s = 'foo'
4696 * s.byteindex('o', 1) # => 1
4697 * s.byteindex('o', 2) # => 2
4698 * s.byteindex('o', 3) # => nil
4699 *
4700 * When +offset+ is negative, counts backward from the end of +self+:
4701 *
4702 * s = 'foo'
4703 * s.byteindex('o', -1) # => 2
4704 * s.byteindex('o', -2) # => 1
4705 * s.byteindex('o', -3) # => 1
4706 * s.byteindex('o', -4) # => nil
4707 *
4708 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4709 *
4710 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4711 * s.size # => 2 # Two 3-byte characters.
4712 * s.bytesize # => 6 # Six bytes.
4713 * s.byteindex("\uFFFF") # => 0
4714 * s.byteindex("\uFFFF", 1) # Raises IndexError
4715 * s.byteindex("\uFFFF", 2) # Raises IndexError
4716 * s.byteindex("\uFFFF", 3) # => 3
4717 * s.byteindex("\uFFFF", 4) # Raises IndexError
4718 * s.byteindex("\uFFFF", 5) # Raises IndexError
4719 * s.byteindex("\uFFFF", 6) # => nil
4720 *
4721 * Related: see {Querying}[rdoc-ref:String@Querying].
4722 */
4723
4724static VALUE
4725rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4726{
4727 VALUE sub;
4728 VALUE initpos;
4729 long pos;
4730
4731 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4732 long slen = RSTRING_LEN(str);
4733 pos = NUM2LONG(initpos);
4734 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4735 if (RB_TYPE_P(sub, T_REGEXP)) {
4737 }
4738 return Qnil;
4739 }
4740 }
4741 else {
4742 pos = 0;
4743 }
4744
4745 str_ensure_byte_pos(str, pos);
4746
4747 if (RB_TYPE_P(sub, T_REGEXP)) {
4748 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4749 VALUE match = rb_backref_get();
4750 struct re_registers *regs = RMATCH_REGS(match);
4751 pos = BEG(0);
4752 return LONG2NUM(pos);
4753 }
4754 }
4755 else {
4756 StringValue(sub);
4757 pos = rb_str_byteindex(str, sub, pos);
4758 if (pos >= 0) return LONG2NUM(pos);
4759 }
4760 return Qnil;
4761}
4762
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr(3).
 *
 * Returns a pointer to the last byte equal to `chr` within the first
 * `search_len` bytes of `search_str`, or a null pointer when there is no
 * such byte or `search_len` is not positive.  As with the libc function,
 * `chr` is converted to unsigned char before comparing.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr;

    if (search_len <= 0) return ((void *)0);

    ptr = search_str + search_len;
    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return ((void *)0);
}
#endif
4775
4776static long
4777str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4778{
4779 char *hit, *adjusted;
4780 int c;
4781 long slen, searchlen;
4782 char *sbeg, *e, *t;
4783
4784 sbeg = RSTRING_PTR(str);
4785 slen = RSTRING_LEN(sub);
4786 if (slen == 0) return s - sbeg;
4787 e = RSTRING_END(str);
4788 t = RSTRING_PTR(sub);
4789 c = *t & 0xff;
4790 searchlen = s - sbeg + 1;
4791
4792 if (memcmp(s, t, slen) == 0) {
4793 return s - sbeg;
4794 }
4795
4796 do {
4797 hit = memrchr(sbeg, c, searchlen);
4798 if (!hit) break;
4799 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4800 if (hit != adjusted) {
4801 searchlen = adjusted - sbeg;
4802 continue;
4803 }
4804 if (memcmp(hit, t, slen) == 0)
4805 return hit - sbeg;
4806 searchlen = adjusted - sbeg;
4807 } while (searchlen > 0);
4808
4809 return -1;
4810}
4811
4812/* found index in byte */
4813static long
4814rb_str_rindex(VALUE str, VALUE sub, long pos)
4815{
4816 long len, slen;
4817 char *sbeg, *s;
4818 rb_encoding *enc;
4819 int singlebyte;
4820
4821 enc = rb_enc_check(str, sub);
4822 if (is_broken_string(sub)) return -1;
4823 singlebyte = single_byte_optimizable(str);
4824 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4825 slen = str_strlen(sub, enc); /* rb_enc_check */
4826
4827 /* substring longer than string */
4828 if (len < slen) return -1;
4829 if (len - pos < slen) pos = len - slen;
4830 if (len == 0) return pos;
4831
4832 sbeg = RSTRING_PTR(str);
4833
4834 if (pos == 0) {
4835 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4836 return 0;
4837 else
4838 return -1;
4839 }
4840
4841 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4842 return str_rindex(str, sub, s, enc);
4843}
4844
4845/*
4846 * call-seq:
4847 * rindex(pattern, offset = self.length) -> integer or nil
4848 *
4849 * :include:doc/string/rindex.rdoc
4850 *
4851 */
4852
4853static VALUE
4854rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4855{
4856 VALUE sub;
4857 VALUE initpos;
4858 rb_encoding *enc = STR_ENC_GET(str);
4859 long pos, len = str_strlen(str, enc); /* str's enc */
4860
4861 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4862 pos = NUM2LONG(initpos);
4863 if (pos < 0 && (pos += len) < 0) {
4864 if (RB_TYPE_P(sub, T_REGEXP)) {
4866 }
4867 return Qnil;
4868 }
4869 if (pos > len) pos = len;
4870 }
4871 else {
4872 pos = len;
4873 }
4874
4875 if (RB_TYPE_P(sub, T_REGEXP)) {
4876 /* enc = rb_enc_check(str, sub); */
4877 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4878 enc, single_byte_optimizable(str));
4879
4880 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4881 VALUE match = rb_backref_get();
4882 struct re_registers *regs = RMATCH_REGS(match);
4883 pos = rb_str_sublen(str, BEG(0));
4884 return LONG2NUM(pos);
4885 }
4886 }
4887 else {
4888 StringValue(sub);
4889 pos = rb_str_rindex(str, sub, pos);
4890 if (pos >= 0) {
4891 pos = rb_str_sublen(str, pos);
4892 return LONG2NUM(pos);
4893 }
4894 }
4895 return Qnil;
4896}
4897
4898static long
4899rb_str_byterindex(VALUE str, VALUE sub, long pos)
4900{
4901 long len, slen;
4902 char *sbeg, *s;
4903 rb_encoding *enc;
4904
4905 enc = rb_enc_check(str, sub);
4906 if (is_broken_string(sub)) return -1;
4907 len = RSTRING_LEN(str);
4908 slen = RSTRING_LEN(sub);
4909
4910 /* substring longer than string */
4911 if (len < slen) return -1;
4912 if (len - pos < slen) pos = len - slen;
4913 if (len == 0) return pos;
4914
4915 sbeg = RSTRING_PTR(str);
4916
4917 if (pos == 0) {
4918 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4919 return 0;
4920 else
4921 return -1;
4922 }
4923
4924 s = sbeg + pos;
4925 return str_rindex(str, sub, s, enc);
4926}
4927
4928/*
4929 * call-seq:
4930 * byterindex(object, offset = self.bytesize) -> integer or nil
4931 *
4932 * Returns the 0-based integer index of a substring of +self+
4933 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4934 * or +nil+ if there is no such substring;
4935 * the returned index is the count of _bytes_ (not characters).
4936 *
4937 * When +object+ is a string,
4938 * returns the index of the _last_ found substring equal to +object+:
4939 *
4940 * s = 'foo' # => "foo"
4941 * s.size # => 3 # Three 1-byte characters.
4942 * s.bytesize # => 3 # Three bytes.
4943 * s.byterindex('f') # => 0
4944 * s.byterindex('o') # => 2
4945 * s.byterindex('oo') # => 1
4946 * s.byterindex('ooo') # => nil
4947 *
4948 * When +object+ is a Regexp,
4949 * returns the index of the last found substring matching +object+;
4950 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4951 *
4952 * s = 'foo'
4953 * s.byterindex(/f/) # => 0
4954 * $~ # => #<MatchData "f">
4955 * s.byterindex(/o/) # => 2
4956 * s.byterindex(/oo/) # => 1
4957 * s.byterindex(/ooo/) # => nil
4958 * $~ # => nil
4959 *
4960 * The last match means starting at the possible last position,
4961 * not the last of the longest matches:
4962 *
4963 * s = 'foo'
4964 * s.byterindex(/o+/) # => 2
4965 * $~ #=> #<MatchData "o">
4966 *
4967 * To get the last longest match, use a negative lookbehind:
4968 *
4969 * s = 'foo'
4970 * s.byterindex(/(?<!o)o+/) # => 1
4971 * $~ # => #<MatchData "oo">
4972 *
4973 * Or use method #byteindex with negative lookahead:
4974 *
4975 * s = 'foo'
4976 * s.byteindex(/o+(?!.*o)/) # => 1
4977 * $~ #=> #<MatchData "oo">
4978 *
4979 * \Integer argument +offset+, if given, specifies the 0-based index
4980 * of the byte where searching is to end.
4981 *
4982 * When +offset+ is non-negative,
4983 * searching ends at byte position +offset+:
4984 *
4985 * s = 'foo'
4986 * s.byterindex('o', 0) # => nil
4987 * s.byterindex('o', 1) # => 1
4988 * s.byterindex('o', 2) # => 2
4989 * s.byterindex('o', 3) # => 2
4990 *
4991 * When +offset+ is negative, counts backward from the end of +self+:
4992 *
4993 * s = 'foo'
4994 * s.byterindex('o', -1) # => 2
4995 * s.byterindex('o', -2) # => 1
4996 * s.byterindex('o', -3) # => nil
4997 *
4998 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4999 *
5000 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
5001 * s.size # => 2 # Two 3-byte characters.
5002 * s.bytesize # => 6 # Six bytes.
5003 * s.byterindex("\uFFFF") # => 3
5004 * s.byterindex("\uFFFF", 1) # Raises IndexError
5005 * s.byterindex("\uFFFF", 2) # Raises IndexError
5006 * s.byterindex("\uFFFF", 3) # => 3
5007 * s.byterindex("\uFFFF", 4) # Raises IndexError
5008 * s.byterindex("\uFFFF", 5) # Raises IndexError
5009 * s.byterindex("\uFFFF", 6) # => nil
5010 *
5011 * Related: see {Querying}[rdoc-ref:String@Querying].
5012 */
5013
5014static VALUE
5015rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5016{
5017 VALUE sub;
5018 VALUE initpos;
5019 long pos, len = RSTRING_LEN(str);
5020
5021 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5022 pos = NUM2LONG(initpos);
5023 if (pos < 0 && (pos += len) < 0) {
5024 if (RB_TYPE_P(sub, T_REGEXP)) {
5026 }
5027 return Qnil;
5028 }
5029 if (pos > len) pos = len;
5030 }
5031 else {
5032 pos = len;
5033 }
5034
5035 str_ensure_byte_pos(str, pos);
5036
5037 if (RB_TYPE_P(sub, T_REGEXP)) {
5038 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5039 VALUE match = rb_backref_get();
5040 struct re_registers *regs = RMATCH_REGS(match);
5041 pos = BEG(0);
5042 return LONG2NUM(pos);
5043 }
5044 }
5045 else {
5046 StringValue(sub);
5047 pos = rb_str_byterindex(str, sub, pos);
5048 if (pos >= 0) return LONG2NUM(pos);
5049 }
5050 return Qnil;
5051}
5052
5053/*
5054 * call-seq:
5055 * self =~ other -> integer or nil
5056 *
5057 * When +other+ is a Regexp:
5058 *
5059 * - Returns the integer index (in characters) of the first match
5060 * for +self+ and +other+, or +nil+ if none;
5061 * - Updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables].
5062 *
5063 * Examples:
5064 *
5065 * 'foo' =~ /f/ # => 0
5066 * $~ # => #<MatchData "f">
5067 * 'foo' =~ /o/ # => 1
5068 * $~ # => #<MatchData "o">
5069 * 'foo' =~ /x/ # => nil
5070 * $~ # => nil
5071 *
5072 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5073 * (see Regexp#=~):
5074 *
5075 * number = nil
5076 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5077 * number # => nil # Not assigned.
5078 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5079 * number # => "9" # Assigned.
5080 *
5081 * When +other+ is not a Regexp, returns the value
5082 * returned by <tt>other =~ self</tt>.
5083 *
5084 * Related: see {Querying}[rdoc-ref:String@Querying].
5085 */
5086
5087static VALUE
5088rb_str_match(VALUE x, VALUE y)
5089{
5090 switch (OBJ_BUILTIN_TYPE(y)) {
5091 case T_STRING:
5092 rb_raise(rb_eTypeError, "type mismatch: String given");
5093
5094 case T_REGEXP:
5095 return rb_reg_match(y, x);
5096
5097 default:
5098 return rb_funcall(y, idEqTilde, 1, x);
5099 }
5100}
5101
5102
5103static VALUE get_pat(VALUE);
5104
5105
5106/*
5107 * call-seq:
5108 * match(pattern, offset = 0) -> matchdata or nil
5109 * match(pattern, offset = 0) {|matchdata| ... } -> object
5110 *
5111 * Creates a MatchData object based on +self+ and the given arguments;
5112 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5113 *
5114 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5115 *
5116 * regexp = Regexp.new(pattern)
5117 *
5118 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5119 * (see Regexp#match):
5120 *
5121 * matchdata = regexp.match(self[offset..])
5122 *
5123 * With no block given, returns the computed +matchdata+ or +nil+:
5124 *
5125 * 'foo'.match('f') # => #<MatchData "f">
5126 * 'foo'.match('o') # => #<MatchData "o">
5127 * 'foo'.match('x') # => nil
5128 * 'foo'.match('f', 1) # => nil
5129 * 'foo'.match('o', 1) # => #<MatchData "o">
5130 *
5131 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5132 * returns the block's return value:
5133 *
5134 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5135 *
5136 * With a block given and +nil+ +matchdata+, does not call the block:
5137 *
5138 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5139 *
5140 * Related: see {Querying}[rdoc-ref:String@Querying].
5141 */
5142
5143static VALUE
5144rb_str_match_m(int argc, VALUE *argv, VALUE str)
5145{
5146 VALUE re, result;
5147 if (argc < 1)
5148 rb_check_arity(argc, 1, 2);
5149 re = argv[0];
5150 argv[0] = str;
5151 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5152 if (!NIL_P(result) && rb_block_given_p()) {
5153 return rb_yield(result);
5154 }
5155 return result;
5156}
5157
5158/*
5159 * call-seq:
5160 * match?(pattern, offset = 0) -> true or false
5161 *
5162 * Returns whether a match is found for +self+ and the given arguments;
5163 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5164 *
5165 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5166 *
5167 * regexp = Regexp.new(pattern)
5168 *
5169 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5170 * +false+ otherwise:
5171 *
5172 * 'foo'.match?(/o/) # => true
5173 * 'foo'.match?('o') # => true
5174 * 'foo'.match?(/x/) # => false
5175 * 'foo'.match?('f', 1) # => false
5176 * 'foo'.match?('o', 1) # => true
5177 *
5178 * Related: see {Querying}[rdoc-ref:String@Querying].
5179 */
5180
5181static VALUE
5182rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5183{
5184 VALUE re;
5185 rb_check_arity(argc, 1, 2);
5186 re = get_pat(argv[0]);
5187 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5188}
5189
/* Result of stepping one character to its successor or predecessor
 * (returned by enc_succ_char(), enc_pred_char() and enc_succ_alnum_char()). */
5190 enum neighbor_char {
5191 NEIGHBOR_NOT_CHAR, /* the bytes are not a character that can be stepped */
5192 NEIGHBOR_FOUND, /* a neighbor of the same byte length was written in place */
5193 NEIGHBOR_WRAPPED /* no neighbor of that length exists; the value wrapped around */
5194 };
5195
/*
 * Overwrites the +len+ bytes at +p+ with the next character of the same
 * byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND when such a character was written, NEIGHBOR_WRAPPED
 * when none exists (in the byte-wise path every byte was 0xff and has been
 * reset to 0), and NEIGHBOR_NOT_CHAR when the wide-character path meets
 * bytes that do not form a valid character.
 */
5196 static enum neighbor_char
5197 enc_succ_char(char *p, long len, rb_encoding *enc)
5198 {
5199 long i;
5200 int l;
5201
5202 if (rb_enc_mbminlen(enc) > 1) {
5203 /* wchar, trivial case */
/* decode, add one to the code point, and re-encode in place */
5204 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5205 if (!MBCLEN_CHARFOUND_P(r)) {
5206 return NEIGHBOR_NOT_CHAR;
5207 }
5208 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5209 l = rb_enc_code_to_mbclen(c, enc);
5210 if (!l) return NEIGHBOR_NOT_CHAR;
/* the successor would need a different number of bytes */
5211 if (l != len) return NEIGHBOR_WRAPPED;
5212 rb_enc_mbcput(c, p, enc);
/* re-validate what was just written */
5213 r = rb_enc_precise_mbclen(p, p + len, enc);
5214 if (!MBCLEN_CHARFOUND_P(r)) {
5215 return NEIGHBOR_NOT_CHAR;
5216 }
5217 return NEIGHBOR_FOUND;
5218 }
/* byte-wise path: treat the bytes as a big-endian counter and keep
 * incrementing until they form a valid character of exactly len bytes */
5219 while (1) {
/* carry: trailing 0xff bytes roll over to 0 */
5220 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5221 p[i] = '\0';
5222 if (i < 0)
5223 return NEIGHBOR_WRAPPED;
5224 ++((unsigned char*)p)[i];
5225 l = rb_enc_precise_mbclen(p, p+len, enc);
5226 if (MBCLEN_CHARFOUND_P(l)) {
5227 l = MBCLEN_CHARFOUND_LEN(l);
5228 if (l == len) {
5229 return NEIGHBOR_FOUND;
5230 }
5231 else {
/* a shorter character was recognized: saturate the tail so the next
 * iteration carries into the byte before it */
5232 memset(p+l, 0xff, len-l);
5233 }
5234 }
5235 if (MBCLEN_INVALID_P(l) && i < len-1) {
/* invalid sequence: find the longest prefix that is not reported invalid
 * and saturate the bytes after it, skipping the values in between */
5236 long len2;
5237 int l2;
5238 for (len2 = len-1; 0 < len2; len2--) {
5239 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5240 if (!MBCLEN_INVALID_P(l2))
5241 break;
5242 }
5243 memset(p+len2+1, 0xff, len-(len2+1));
5244 }
5245 }
5246 }
5247
/*
 * Overwrites the +len+ bytes at +p+ with the previous character of the same
 * byte length in +enc+; the mirror image of enc_succ_char().
 *
 * Returns NEIGHBOR_FOUND when such a character was written, NEIGHBOR_WRAPPED
 * when none exists (in the byte-wise path every byte was 0 and has been
 * reset to 0xff), and NEIGHBOR_NOT_CHAR when the wide-character path meets
 * bytes that do not form a valid character or a code point of 0.
 */
5248 static enum neighbor_char
5249 enc_pred_char(char *p, long len, rb_encoding *enc)
5250 {
5251 long i;
5252 int l;
5253 if (rb_enc_mbminlen(enc) > 1) {
5254 /* wchar, trivial case */
/* decode, subtract one from the code point, and re-encode in place */
5255 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5256 if (!MBCLEN_CHARFOUND_P(r)) {
5257 return NEIGHBOR_NOT_CHAR;
5258 }
5259 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
/* code point 0 has no predecessor */
5260 if (!c) return NEIGHBOR_NOT_CHAR;
5261 --c;
5262 l = rb_enc_code_to_mbclen(c, enc);
5263 if (!l) return NEIGHBOR_NOT_CHAR;
/* the predecessor would need a different number of bytes */
5264 if (l != len) return NEIGHBOR_WRAPPED;
5265 rb_enc_mbcput(c, p, enc);
/* re-validate what was just written */
5266 r = rb_enc_precise_mbclen(p, p + len, enc);
5267 if (!MBCLEN_CHARFOUND_P(r)) {
5268 return NEIGHBOR_NOT_CHAR;
5269 }
5270 return NEIGHBOR_FOUND;
5271 }
/* byte-wise path: treat the bytes as a big-endian counter and keep
 * decrementing until they form a valid character of exactly len bytes */
5272 while (1) {
/* borrow: trailing 0 bytes roll over to 0xff */
5273 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5274 p[i] = '\xff';
5275 if (i < 0)
5276 return NEIGHBOR_WRAPPED;
5277 --((unsigned char*)p)[i];
5278 l = rb_enc_precise_mbclen(p, p+len, enc);
5279 if (MBCLEN_CHARFOUND_P(l)) {
5280 l = MBCLEN_CHARFOUND_LEN(l);
5281 if (l == len) {
5282 return NEIGHBOR_FOUND;
5283 }
5284 else {
/* a shorter character was recognized: zero the tail so the next iteration
 * borrows from the byte before it */
5285 memset(p+l, 0, len-l);
5286 }
5287 }
5288 if (MBCLEN_INVALID_P(l) && i < len-1) {
/* invalid sequence: find the longest prefix that is not reported invalid
 * and zero the bytes after it, skipping the values in between */
5289 long len2;
5290 int l2;
5291 for (len2 = len-1; 0 < len2; len2--) {
5292 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5293 if (!MBCLEN_INVALID_P(l2))
5294 break;
5295 }
5296 memset(p+len2+1, 0, len-(len2+1));
5297 }
5298 }
5299 }
5300
5301/*
5302 overwrite +p+ by succeeding letter in +enc+ and returns
5303 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5304 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5305 assuming each ranges are successive, and mbclen
5306 never change in each ranges.
5307 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5308 character.
5309 */
5310 static enum neighbor_char
5311 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5312 {
5313 enum neighbor_char ret;
5314 unsigned int c;
5315 int ctype;
5316 int range;
/* scratch copy of the character, used to undo a failed step */
5317 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5318
5319 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5320 int try;
5321 const int max_gaps = 1;
5322
/* classify the character: only digits and alphabetics are stepped */
5323 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5324 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5325 ctype = ONIGENC_CTYPE_DIGIT;
5326 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5327 ctype = ONIGENC_CTYPE_ALPHA;
5328 else
5329 return NEIGHBOR_NOT_CHAR;
5330
/* step forward, tolerating up to max_gaps characters that fall outside
 * the class, and stop at the first successor still in the same class */
5331 MEMCPY(save, p, char, len);
5332 for (try = 0; try <= max_gaps; ++try) {
5333 ret = enc_succ_char(p, len, enc);
5334 if (ret == NEIGHBOR_FOUND) {
5335 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5336 if (rb_enc_isctype(c, ctype, enc))
5337 return NEIGHBOR_FOUND;
5338 }
5339 }
/* no successor in the class: restore the character, then walk backwards
 * to the first member of its contiguous range, counting the members */
5340 MEMCPY(p, save, char, len);
5341 range = 1;
5342 while (1) {
5343 MEMCPY(save, p, char, len);
5344 ret = enc_pred_char(p, len, enc);
5345 if (ret == NEIGHBOR_FOUND) {
5346 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5347 if (!rb_enc_isctype(c, ctype, enc)) {
/* stepped out of the class: undo the last step and stop */
5348 MEMCPY(p, save, char, len);
5349 break;
5350 }
5351 }
5352 else {
5353 MEMCPY(p, save, char, len);
5354 break;
5355 }
5356 range++;
5357 }
/* a one-character range cannot wrap */
5358 if (range == 1) {
5359 return NEIGHBOR_NOT_CHAR;
5360 }
5361
/* p now holds the first character of the range; for non-digits the carry
 * is that same character */
5362 if (ctype != ONIGENC_CTYPE_DIGIT) {
5363 MEMCPY(carry, p, char, len);
5364 return NEIGHBOR_WRAPPED;
5365 }
5366
/* for digits the carry is the character after the first of the range */
5367 MEMCPY(carry, p, char, len);
5368 enc_succ_char(carry, len, enc);
5369 return NEIGHBOR_WRAPPED;
5370 }
5371
5372
5373static VALUE str_succ(VALUE str);
5374
5375/*
5376 * call-seq:
5377 * succ -> new_str
5378 *
5379 * :include: doc/string/succ.rdoc
5380 *
5381 */
5382
5383VALUE
5385{
5386 VALUE str;
5387 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5388 rb_enc_cr_str_copy_for_substr(str, orig);
5389 return str_succ(str);
5390}
5391
/*
 * Advances +str+ to its successor in place and returns +str+.
 *
 * The string is scanned from its last character towards the first.  Each
 * alphanumeric character is stepped with enc_succ_alnum_char(); the scan
 * stops as soon as one of them steps without wrapping.  When the string has
 * no alphanumerics at all, its characters are stepped with enc_succ_char()
 * instead.  If the leftmost stepped character wrapped, the carry character
 * is inserted in front of it, growing the string by carry_len bytes.
 */
5392 static VALUE
5393 str_succ(VALUE str)
5394 {
5395 rb_encoding *enc;
5396 char *sbeg, *s, *e, *last_alnum = 0;
5397 int found_alnum = 0;
5398 long l, slen;
/* default carry is the single byte 0x01 */
5399 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5400 long carry_pos = 0, carry_len = 1;
5401 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5402
5403 slen = RSTRING_LEN(str);
5404 if (slen == 0) return str;
5405
5406 enc = STR_ENC_GET(str);
5407 sbeg = RSTRING_PTR(str);
5408 s = e = sbeg + slen;
5409
/* first pass: step alphanumerics, right to left */
5410 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* after skipping a non-alphanumeric, stop when the character class changes
 * (letter vs digit) relative to the last alphanumeric that wrapped */
5411 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5412 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5413 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5414 break;
5415 }
5416 }
5417 l = rb_enc_precise_mbclen(s, e, enc);
5418 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5419 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5420 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5421 switch (neighbor) {
5422 case NEIGHBOR_NOT_CHAR:
5423 continue;
5424 case NEIGHBOR_FOUND:
5425 return str;
5426 case NEIGHBOR_WRAPPED:
5427 last_alnum = s;
5428 break;
5429 }
/* only reached when the character wrapped: remember where a carry would go */
5430 found_alnum = 1;
5431 carry_pos = s - sbeg;
5432 carry_len = l;
5433 }
5434 if (!found_alnum) { /* str contains no alnum */
/* second pass: step the characters themselves, right to left */
5435 s = e;
5436 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5437 enum neighbor_char neighbor;
5438 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5439 l = rb_enc_precise_mbclen(s, e, enc);
5440 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5441 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
/* step a copy so the original bytes survive a NEIGHBOR_NOT_CHAR result */
5442 MEMCPY(tmp, s, char, l);
5443 neighbor = enc_succ_char(tmp, l, enc);
5444 switch (neighbor) {
5445 case NEIGHBOR_FOUND:
5446 MEMCPY(s, tmp, char, l);
5447 return str;
5448 break;
5449 case NEIGHBOR_WRAPPED:
5450 MEMCPY(s, tmp, char, l);
5451 break;
5452 case NEIGHBOR_NOT_CHAR:
5453 break;
5454 }
5455 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5456 /* wrapped to \0...\0. search next valid char. */
5457 enc_succ_char(s, l, enc);
5458 }
/* for encodings that are not ASCII compatible the carry is a copy of the
 * wrapped character rather than the default byte */
5459 if (!rb_enc_asciicompat(enc)) {
5460 MEMCPY(carry, s, char, l);
5461 carry_len = l;
5462 }
5463 carry_pos = s - sbeg;
5464 }
/* NOTE(review): the embedded listing numbers skip 5465 here, so a statement
 * may be missing from this extract -- confirm against upstream */
5466 }
/* every stepped character wrapped: make room and insert the carry */
5467 RESIZE_CAPA(str, slen + carry_len);
5468 sbeg = RSTRING_PTR(str);
5469 s = sbeg + carry_pos;
5470 memmove(s + carry_len, s, slen - carry_pos);
5471 memmove(s, carry, carry_len);
5472 slen += carry_len;
5473 STR_SET_LEN(str, slen);
5474 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5475 rb_enc_str_coderange(str);
5476 return str;
5477 }
5478
5479
5480/*
5481 * call-seq:
5482 * succ! -> self
5483 *
5484 * Like String#succ, but modifies +self+ in place; returns +self+.
5485 *
5486 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5487 */
5488
5489static VALUE
5490rb_str_succ_bang(VALUE str)
5491{
5492 rb_str_modify(str);
5493 str_succ(str);
5494 return str;
5495}
5496
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT(),
 * 0 otherwise.  An empty range (len <= 0) yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    for (long i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5506
5507static int
5508str_upto_i(VALUE str, VALUE arg)
5509{
5510 rb_yield(str);
5511 return 0;
5512}
5513
5514/*
5515 * call-seq:
5516 * upto(other_string, exclusive = false) {|string| ... } -> self
5517 * upto(other_string, exclusive = false) -> new_enumerator
5518 *
5519 * :include: doc/string/upto.rdoc
5520 *
5521 */
5522
5523static VALUE
5524rb_str_upto(int argc, VALUE *argv, VALUE beg)
5525{
5526 VALUE end, exclusive;
5527
5528 rb_scan_args(argc, argv, "11", &end, &exclusive);
5529 RETURN_ENUMERATOR(beg, argc, argv);
5530 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5531}
5532
/*
 * Calls +each+ with every string from +beg+ up to +end+ (+end+ itself is
 * left out when +excl+ is non-zero), passing +arg+ as the second argument.
 * Iteration stops early when +each+ returns non-zero.  Always returns +beg+.
 *
 * Three strategies are used: a byte loop when both edges are single ASCII
 * characters, integer counting when both edges are all ASCII digits, and
 * repeated calls to the +succ+ method otherwise.
 */
5533 VALUE
5534 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5535 {
5536 VALUE current, after_end;
5537 ID succ;
5538 int n, ascii;
5539 rb_encoding *enc;
5540
5541 CONST_ID(succ, "succ");
5542 StringValue(end);
5543 enc = rb_enc_check(beg, end);
5544 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5545 /* single character */
5546 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5547 char c = RSTRING_PTR(beg)[0];
5548 char e = RSTRING_PTR(end)[0];
5549
/* empty range: beg is past end, or equal to an excluded end */
5550 if (c > e || (excl && c == e)) return beg;
5551 for (;;) {
5552 VALUE str = rb_enc_str_new(&c, 1, enc);
/* NOTE(review): the embedded listing numbers skip 5553 here, so a statement
 * may be missing from this extract -- confirm against upstream */
5554 if ((*each)(str, arg)) break;
5555 if (!excl && c == e) break;
5556 c++;
5557 if (excl && c == e) break;
5558 }
5559 return beg;
5560 }
5561 /* both edges are all digits */
5562 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5563 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5564 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5565 VALUE b, e;
5566 int width;
5567
/* the byte length of beg is used as the precision when formatting each
 * number (the "%.*ld" / "%.*d" formats below) */
5568 width = RSTRING_LENINT(beg);
5569 b = rb_str_to_inum(beg, 10, FALSE);
5570 e = rb_str_to_inum(end, 10, FALSE);
5571 if (FIXNUM_P(b) && FIXNUM_P(e)) {
/* both bounds are Fixnums: count with plain C longs */
5572 long bi = FIX2LONG(b);
5573 long ei = FIX2LONG(e);
5574 rb_encoding *usascii = rb_usascii_encoding();
5575
5576 while (bi <= ei) {
5577 if (excl && bi == ei) break;
5578 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5579 bi++;
5580 }
5581 }
5582 else {
/* at least one bound is not a Fixnum: compare and step through method calls */
5583 ID op = excl ? '<' : idLE;
5584 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5585
5586 args[0] = INT2FIX(width);
5587 while (rb_funcall(b, op, 1, e)) {
5588 args[1] = b;
5589 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5590 b = rb_funcallv(b, succ, 0, 0);
5591 }
5592 }
5593 return beg;
5594 }
5595 /* normal case */
5596 n = rb_str_cmp(beg, end);
5597 if (n > 0 || (excl && n == 0)) return beg;
5598
/* the loop also ends when current reaches the successor of end */
5599 after_end = rb_funcallv(end, succ, 0, 0);
5600 current = str_duplicate(rb_cString, beg);
5601 while (!rb_str_equal(current, after_end)) {
5602 VALUE next = Qnil;
/* compute the successor before calling each; it stays nil when current is
 * the (included) end, which terminates the loop below */
5603 if (excl || !rb_str_equal(current, end))
5604 next = rb_funcallv(current, succ, 0, 0);
5605 if ((*each)(current, arg)) break;
5606 if (NIL_P(next)) break;
5607 current = next;
5608 StringValue(current);
5609 if (excl && rb_str_equal(current, end)) break;
/* stop once the string is longer than end, or is empty */
5610 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5611 break;
5612 }
5613
5614 return beg;
5615 }
5616
5617VALUE
5618rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5619{
5620 VALUE current;
5621 ID succ;
5622
5623 CONST_ID(succ, "succ");
5624 /* both edges are all digits */
5625 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5626 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5627 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5628 int width = RSTRING_LENINT(beg);
5629 b = rb_str_to_inum(beg, 10, FALSE);
5630 if (FIXNUM_P(b)) {
5631 long bi = FIX2LONG(b);
5632 rb_encoding *usascii = rb_usascii_encoding();
5633
5634 while (FIXABLE(bi)) {
5635 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5636 bi++;
5637 }
5638 b = LONG2NUM(bi);
5639 }
5640 args[0] = INT2FIX(width);
5641 while (1) {
5642 args[1] = b;
5643 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5644 b = rb_funcallv(b, succ, 0, 0);
5645 }
5646 }
5647 /* normal case */
5648 current = str_duplicate(rb_cString, beg);
5649 while (1) {
5650 VALUE next = rb_funcallv(current, succ, 0, 0);
5651 if ((*each)(current, arg)) break;
5652 current = next;
5653 StringValue(current);
5654 if (RSTRING_LEN(current) == 0)
5655 break;
5656 }
5657
5658 return beg;
5659}
5660
5661static int
5662include_range_i(VALUE str, VALUE arg)
5663{
5664 VALUE *argp = (VALUE *)arg;
5665 if (!rb_equal(str, *argp)) return 0;
5666 *argp = Qnil;
5667 return 1;
5668}
5669
5670VALUE
5671rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5672{
5673 beg = rb_str_new_frozen(beg);
5674 StringValue(end);
5675 end = rb_str_new_frozen(end);
5676 if (NIL_P(val)) return Qfalse;
5677 val = rb_check_string_type(val);
5678 if (NIL_P(val)) return Qfalse;
5679 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5680 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5681 rb_enc_asciicompat(STR_ENC_GET(val))) {
5682 const char *bp = RSTRING_PTR(beg);
5683 const char *ep = RSTRING_PTR(end);
5684 const char *vp = RSTRING_PTR(val);
5685 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5686 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5687 return Qfalse;
5688 else {
5689 char b = *bp;
5690 char e = *ep;
5691 char v = *vp;
5692
5693 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5694 if (b <= v && v < e) return Qtrue;
5695 return RBOOL(!RTEST(exclusive) && v == e);
5696 }
5697 }
5698 }
5699#if 0
5700 /* both edges are all digits */
5701 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5702 all_digits_p(bp, RSTRING_LEN(beg)) &&
5703 all_digits_p(ep, RSTRING_LEN(end))) {
5704 /* TODO */
5705 }
5706#endif
5707 }
5708 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5709
5710 return RBOOL(NIL_P(val));
5711}
5712
5713static VALUE
5714rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5715{
5716 if (rb_reg_search(re, str, 0, 0) >= 0) {
5717 VALUE match = rb_backref_get();
5718 int nth = rb_reg_backref_number(match, backref);
5719 return rb_reg_nth_match(nth, match);
5720 }
5721 return Qnil;
5722}
5723
5724static VALUE
5725rb_str_aref(VALUE str, VALUE indx)
5726{
5727 long idx;
5728
5729 if (FIXNUM_P(indx)) {
5730 idx = FIX2LONG(indx);
5731 }
5732 else if (RB_TYPE_P(indx, T_REGEXP)) {
5733 return rb_str_subpat(str, indx, INT2FIX(0));
5734 }
5735 else if (RB_TYPE_P(indx, T_STRING)) {
5736 if (rb_str_index(str, indx, 0) != -1)
5737 return str_duplicate(rb_cString, indx);
5738 return Qnil;
5739 }
5740 else {
5741 /* check if indx is Range */
5742 long beg, len = str_strlen(str, NULL);
5743 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5744 case Qfalse:
5745 break;
5746 case Qnil:
5747 return Qnil;
5748 default:
5749 return rb_str_substr(str, beg, len);
5750 }
5751 idx = NUM2LONG(indx);
5752 }
5753
5754 return str_substr(str, idx, 1, FALSE);
5755}
5756
5757
5758/*
5759 * call-seq:
5760 * self[offset] -> new_string or nil
5761 * self[offset, size] -> new_string or nil
5762 * self[range] -> new_string or nil
5763 * self[regexp, capture = 0] -> new_string or nil
5764 * self[substring] -> new_string or nil
5765 *
5766 * :include: doc/string/aref.rdoc
5767 *
5768 */
5769
5770static VALUE
5771rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5772{
5773 if (argc == 2) {
5774 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5775 return rb_str_subpat(str, argv[0], argv[1]);
5776 }
5777 else {
5778 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5779 }
5780 }
5781 rb_check_arity(argc, 1, 2);
5782 return rb_str_aref(str, argv[0]);
5783}
5784
5785VALUE
5787{
5788 char *ptr = RSTRING_PTR(str);
5789 long olen = RSTRING_LEN(str), nlen;
5790
5791 str_modifiable(str);
5792 if (len > olen) len = olen;
5793 nlen = olen - len;
5794 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5795 char *oldptr = ptr;
5796 size_t old_capa = RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5797 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5798 STR_SET_EMBED(str);
5799 ptr = RSTRING(str)->as.embed.ary;
5800 memmove(ptr, oldptr + len, nlen);
5801 if (fl == STR_NOEMBED) {
5802 SIZED_FREE_N(oldptr, old_capa);
5803 }
5804 }
5805 else {
5806 if (!STR_SHARED_P(str)) {
5807 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5808 rb_enc_cr_str_exact_copy(shared, str);
5809 OBJ_FREEZE(shared);
5810 }
5811 ptr = RSTRING(str)->as.heap.ptr += len;
5812 }
5813 STR_SET_LEN(str, nlen);
5814
5815 if (!SHARABLE_MIDDLE_SUBSTRING) {
5816 TERM_FILL(ptr + nlen, TERM_LEN(str));
5817 }
5819 return str;
5820}
5821
5822static void
5823rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5824{
5825 char *sptr;
5826 long slen;
5827 int cr;
5828
5829 if (beg == 0 && vlen == 0) {
5830 rb_str_drop_bytes(str, len);
5831 return;
5832 }
5833
5834 str_modify_keep_cr(str);
5835 RSTRING_GETMEM(str, sptr, slen);
5836 if (len < vlen) {
5837 /* expand string */
5838 RESIZE_CAPA(str, slen + vlen - len);
5839 sptr = RSTRING_PTR(str);
5840 }
5841
5843 cr = rb_enc_str_coderange(val);
5844 else
5846
5847 if (vlen != len) {
5848 memmove(sptr + beg + vlen,
5849 sptr + beg + len,
5850 slen - (beg + len));
5851 }
5852 if (vlen < beg && len < 0) {
5853 MEMZERO(sptr + slen, char, -len);
5854 }
5855 if (vlen > 0) {
5856 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5857 }
5858 slen += vlen - len;
5859 STR_SET_LEN(str, slen);
5860 TERM_FILL(&sptr[slen], TERM_LEN(str));
5861 ENC_CODERANGE_SET(str, cr);
5862}
5863
5864static inline void
5865rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5866{
5867 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5868}
5869
5870void
5871rb_str_update(VALUE str, long beg, long len, VALUE val)
5872{
5873 long slen;
5874 char *p, *e;
5875 rb_encoding *enc;
5876 int singlebyte = single_byte_optimizable(str);
5877 int cr;
5878
5879 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5880
5881 StringValue(val);
5882 enc = rb_enc_check(str, val);
5883 slen = str_strlen(str, enc); /* rb_enc_check */
5884
5885 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5886 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5887 }
5888 if (beg < 0) {
5889 beg += slen;
5890 }
5891 RUBY_ASSERT(beg >= 0);
5892 RUBY_ASSERT(beg <= slen);
5893
5894 if (len > slen - beg) {
5895 len = slen - beg;
5896 }
5897 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5898 if (!p) p = RSTRING_END(str);
5899 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5900 if (!e) e = RSTRING_END(str);
5901 /* error check */
5902 beg = p - RSTRING_PTR(str); /* physical position */
5903 len = e - p; /* physical length */
5904 rb_str_update_0(str, beg, len, val);
5905 rb_enc_associate(str, enc);
5907 if (cr != ENC_CODERANGE_BROKEN)
5908 ENC_CODERANGE_SET(str, cr);
5909}
5910
5911static void
5912rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5913{
5914 int nth;
5915 VALUE match;
5916 long start, end, len;
5917 rb_encoding *enc;
5918 struct re_registers *regs;
5919
5920 if (rb_reg_search(re, str, 0, 0) < 0) {
5921 rb_raise(rb_eIndexError, "regexp not matched");
5922 }
5923 match = rb_backref_get();
5924 nth = rb_reg_backref_number(match, backref);
5925 regs = RMATCH_REGS(match);
5926 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5927 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5928 }
5929 if (nth < 0) {
5930 nth += regs->num_regs;
5931 }
5932
5933 start = BEG(nth);
5934 if (start == -1) {
5935 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5936 }
5937 end = END(nth);
5938 len = end - start;
5939 StringValue(val);
5940 enc = rb_enc_check_str(str, val);
5941 rb_str_update_0(str, start, len, val);
5942 rb_enc_associate(str, enc);
5943}
5944
5945static VALUE
5946rb_str_aset(VALUE str, VALUE indx, VALUE val)
5947{
5948 long idx, beg;
5949
5950 switch (TYPE(indx)) {
5951 case T_REGEXP:
5952 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5953 return val;
5954
5955 case T_STRING:
5956 beg = rb_str_index(str, indx, 0);
5957 if (beg < 0) {
5958 rb_raise(rb_eIndexError, "string not matched");
5959 }
5960 beg = rb_str_sublen(str, beg);
5961 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5962 return val;
5963
5964 default:
5965 /* check if indx is Range */
5966 {
5967 long beg, len;
5968 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5969 rb_str_update(str, beg, len, val);
5970 return val;
5971 }
5972 }
5973 /* FALLTHROUGH */
5974
5975 case T_FIXNUM:
5976 idx = NUM2LONG(indx);
5977 rb_str_update(str, idx, 1, val);
5978 return val;
5979 }
5980}
5981
5982/*
5983 * call-seq:
5984 * self[index] = other_string -> new_string
5985 * self[start, length] = other_string -> new_string
5986 * self[range] = other_string -> new_string
5987 * self[regexp, capture = 0] = other_string -> new_string
5988 * self[substring] = other_string -> new_string
5989 *
5990 * :include: doc/string/aset.rdoc
5991 *
5992 */
5993
5994static VALUE
5995rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5996{
5997 if (argc == 3) {
5998 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5999 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6000 }
6001 else {
6002 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6003 }
6004 return argv[2];
6005 }
6006 rb_check_arity(argc, 2, 3);
6007 return rb_str_aset(str, argv[0], argv[1]);
6008}
6009
6010/*
6011 * call-seq:
6012 * insert(offset, other_string) -> self
6013 *
6014 * :include: doc/string/insert.rdoc
6015 *
6016 */
6017
6018static VALUE
6019rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6020{
6021 long pos = NUM2LONG(idx);
6022
6023 if (pos == -1) {
6024 return rb_str_append(str, str2);
6025 }
6026 else if (pos < 0) {
6027 pos++;
6028 }
6029 rb_str_update(str, pos, 0, str2);
6030 return str;
6031}
6032
6033
6034/*
6035 * call-seq:
6036 * slice!(index) -> new_string or nil
6037 * slice!(start, length) -> new_string or nil
6038 * slice!(range) -> new_string or nil
6039 * slice!(regexp, capture = 0) -> new_string or nil
6040 * slice!(substring) -> new_string or nil
6041 *
6042 * Like String#[] (and its alias String#slice), except that:
6043 *
6044 * - Performs substitutions in +self+ (not in a copy of +self+).
6045 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6046 *
6047 * A few examples:
6048 *
6049 * s = 'hello'
6050 * s.slice!('e') # => "e"
6051 * s # => "hllo"
6052 * s.slice!('e') # => nil
6053 * s # => "hllo"
6054 *
6055 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6056 */
6057
6058static VALUE
6059rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6060{
6061 VALUE result = Qnil;
6062 VALUE indx;
6063 long beg, len = 1;
6064 char *p;
6065
6066 rb_check_arity(argc, 1, 2);
6067 str_modify_keep_cr(str);
6068 indx = argv[0];
6069 if (RB_TYPE_P(indx, T_REGEXP)) {
6070 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6071 VALUE match = rb_backref_get();
6072 struct re_registers *regs = RMATCH_REGS(match);
6073 int nth = 0;
6074 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6075 if ((nth += regs->num_regs) <= 0) return Qnil;
6076 }
6077 else if (nth >= regs->num_regs) return Qnil;
6078 beg = BEG(nth);
6079 len = END(nth) - beg;
6080 goto subseq;
6081 }
6082 else if (argc == 2) {
6083 beg = NUM2LONG(indx);
6084 len = NUM2LONG(argv[1]);
6085 goto num_index;
6086 }
6087 else if (FIXNUM_P(indx)) {
6088 beg = FIX2LONG(indx);
6089 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6090 if (!len) return Qnil;
6091 beg = p - RSTRING_PTR(str);
6092 goto subseq;
6093 }
6094 else if (RB_TYPE_P(indx, T_STRING)) {
6095 beg = rb_str_index(str, indx, 0);
6096 if (beg == -1) return Qnil;
6097 len = RSTRING_LEN(indx);
6098 result = str_duplicate(rb_cString, indx);
6099 goto squash;
6100 }
6101 else {
6102 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6103 case Qnil:
6104 return Qnil;
6105 case Qfalse:
6106 beg = NUM2LONG(indx);
6107 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6108 if (!len) return Qnil;
6109 beg = p - RSTRING_PTR(str);
6110 goto subseq;
6111 default:
6112 goto num_index;
6113 }
6114 }
6115
6116 num_index:
6117 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6118 beg = p - RSTRING_PTR(str);
6119
6120 subseq:
6121 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6122 rb_enc_cr_str_copy_for_substr(result, str);
6123
6124 squash:
6125 if (len > 0) {
6126 if (beg == 0) {
6127 rb_str_drop_bytes(str, len);
6128 }
6129 else {
6130 char *sptr = RSTRING_PTR(str);
6131 long slen = RSTRING_LEN(str);
6132 if (beg + len > slen) /* pathological check */
6133 len = slen - beg;
6134 memmove(sptr + beg,
6135 sptr + beg + len,
6136 slen - (beg + len));
6137 slen -= len;
6138 STR_SET_LEN(str, slen);
6139 TERM_FILL(&sptr[slen], TERM_LEN(str));
6140 }
6141 }
6142 return result;
6143}
6144
6145static VALUE
6146get_pat(VALUE pat)
6147{
6148 VALUE val;
6149
6150 switch (OBJ_BUILTIN_TYPE(pat)) {
6151 case T_REGEXP:
6152 return pat;
6153
6154 case T_STRING:
6155 break;
6156
6157 default:
6158 val = rb_check_string_type(pat);
6159 if (NIL_P(val)) {
6160 Check_Type(pat, T_REGEXP);
6161 }
6162 pat = val;
6163 }
6164
6165 return rb_reg_regcomp(pat);
6166}
6167
6168static VALUE
6169get_pat_quoted(VALUE pat, int check)
6170{
6171 VALUE val;
6172
6173 switch (OBJ_BUILTIN_TYPE(pat)) {
6174 case T_REGEXP:
6175 return pat;
6176
6177 case T_STRING:
6178 break;
6179
6180 default:
6181 val = rb_check_string_type(pat);
6182 if (NIL_P(val)) {
6183 Check_Type(pat, T_REGEXP);
6184 }
6185 pat = val;
6186 }
6187 if (check && is_broken_string(pat)) {
6188 rb_exc_raise(rb_reg_check_preprocess(pat));
6189 }
6190 return pat;
6191}
6192
6193static long
6194rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6195{
6196 if (BUILTIN_TYPE(pat) == T_STRING) {
6197 pos = rb_str_byteindex(str, pat, pos);
6198 if (set_backref_str) {
6199 if (pos >= 0) {
6200 str = rb_str_new_frozen_String(str);
6201 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6202 if (match) {
6203 *match = match_data;
6204 }
6205 }
6206 else {
6208 }
6209 }
6210 return pos;
6211 }
6212 else {
6213 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6214 }
6215}
6216
6217static long
6218rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6219{
6220 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6221}
6222
6223
6224/*
6225 * call-seq:
6226 * sub!(pattern, replacement) -> self or nil
6227 * sub!(pattern) {|match| ... } -> self or nil
6228 *
6229 * Like String#sub, except that:
6230 *
6231 * - Changes are made to +self+, not to a copy of +self+.
6232 * - Returns +self+ if any changes are made, +nil+ otherwise.
6233 *
6234 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6235 */
6236
6237static VALUE
6238rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6239{
6240 VALUE pat, repl, hash = Qnil;
6241 int iter = 0;
6242 long plen;
6243 int min_arity = rb_block_given_p() ? 1 : 2;
6244 long beg;
6245
6246 rb_check_arity(argc, min_arity, 2);
6247 if (argc == 1) {
6248 iter = 1;
6249 }
6250 else {
6251 repl = argv[1];
6252 if (!RB_TYPE_P(repl, T_STRING)) {
6253 hash = rb_check_hash_type(repl);
6254 if (NIL_P(hash)) {
6255 StringValue(repl);
6256 }
6257 }
6258 }
6259
6260 pat = get_pat_quoted(argv[0], 1);
6261
6262 str_modifiable(str);
6263 beg = rb_pat_search(pat, str, 0, 1);
6264 if (beg >= 0) {
6265 rb_encoding *enc;
6266 int cr = ENC_CODERANGE(str);
6267 long beg0, end0;
6268 VALUE match, match0 = Qnil;
6269 struct re_registers *regs;
6270 char *p, *rp;
6271 long len, rlen;
6272
6273 match = rb_backref_get();
6274 regs = RMATCH_REGS(match);
6275 if (RB_TYPE_P(pat, T_STRING)) {
6276 beg0 = beg;
6277 end0 = beg0 + RSTRING_LEN(pat);
6278 match0 = pat;
6279 }
6280 else {
6281 beg0 = BEG(0);
6282 end0 = END(0);
6283 if (iter) match0 = rb_reg_nth_match(0, match);
6284 }
6285
6286 if (iter || !NIL_P(hash)) {
6287 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6288
6289 if (iter) {
6290 repl = rb_obj_as_string(rb_yield(match0));
6291 }
6292 else {
6293 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6294 repl = rb_obj_as_string(repl);
6295 }
6296 str_mod_check(str, p, len);
6297 rb_check_frozen(str);
6298 }
6299 else {
6300 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6301 }
6302
6303 enc = rb_enc_compatible(str, repl);
6304 if (!enc) {
6305 rb_encoding *str_enc = STR_ENC_GET(str);
6306 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6307 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6308 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6309 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6310 rb_enc_inspect_name(str_enc),
6311 rb_enc_inspect_name(STR_ENC_GET(repl)));
6312 }
6313 enc = STR_ENC_GET(repl);
6314 }
6315 rb_str_modify(str);
6316 rb_enc_associate(str, enc);
6318 int cr2 = ENC_CODERANGE(repl);
6319 if (cr2 == ENC_CODERANGE_BROKEN ||
6320 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6322 else
6323 cr = cr2;
6324 }
6325 plen = end0 - beg0;
6326 rlen = RSTRING_LEN(repl);
6327 len = RSTRING_LEN(str);
6328 if (rlen > plen) {
6329 RESIZE_CAPA(str, len + rlen - plen);
6330 }
6331 p = RSTRING_PTR(str);
6332 if (rlen != plen) {
6333 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6334 }
6335 rp = RSTRING_PTR(repl);
6336 memmove(p + beg0, rp, rlen);
6337 len += rlen - plen;
6338 STR_SET_LEN(str, len);
6339 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6340 ENC_CODERANGE_SET(str, cr);
6341
6342 RB_GC_GUARD(match);
6343
6344 return str;
6345 }
6346 return Qnil;
6347}
6348
6349
6350/*
6351 * call-seq:
6352 * sub(pattern, replacement) -> new_string
6353 * sub(pattern) {|match| ... } -> new_string
6354 *
6355 * :include: doc/string/sub.rdoc
6356 */
6357
6358static VALUE
6359rb_str_sub(int argc, VALUE *argv, VALUE str)
6360{
6361 str = str_duplicate(rb_cString, str);
6362 rb_str_sub_bang(argc, argv, str);
6363 return str;
6364}
6365
6366static VALUE
6367str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6368{
6369 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6370 long beg, beg0, end0;
6371 long offset, blen, slen, len, last;
6372 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6373 char *sp, *cp;
6374 int need_backref_str = -1;
6375 rb_encoding *str_enc;
6376
6377 switch (argc) {
6378 case 1:
6379 RETURN_ENUMERATOR(str, argc, argv);
6380 mode = ITER;
6381 break;
6382 case 2:
6383 repl = argv[1];
6384 if (!RB_TYPE_P(repl, T_STRING)) {
6385 hash = rb_check_hash_type(repl);
6386 if (NIL_P(hash)) {
6387 StringValue(repl);
6388 }
6389 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6390 mode = FAST_MAP;
6391 }
6392 else {
6393 mode = MAP;
6394 }
6395 }
6396 break;
6397 default:
6398 rb_error_arity(argc, 1, 2);
6399 }
6400
6401 pat = get_pat_quoted(argv[0], 1);
6402 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6403
6404 if (beg < 0) {
6405 if (bang) return Qnil; /* no match, no substitution */
6406 return str_duplicate(rb_cString, str);
6407 }
6408
6409 offset = 0;
6410 blen = RSTRING_LEN(str) + 30; /* len + margin */
6411 dest = rb_str_buf_new(blen);
6412 sp = RSTRING_PTR(str);
6413 slen = RSTRING_LEN(str);
6414 cp = sp;
6415 str_enc = STR_ENC_GET(str);
6416 rb_enc_associate(dest, str_enc);
6417 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6418
6419 do {
6420 struct re_registers *regs = RMATCH_REGS(match);
6421 if (RB_TYPE_P(pat, T_STRING)) {
6422 beg0 = beg;
6423 end0 = beg0 + RSTRING_LEN(pat);
6424 match0 = pat;
6425 }
6426 else {
6427 beg0 = BEG(0);
6428 end0 = END(0);
6429 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6430 }
6431
6432 if (mode != STR) {
6433 if (mode == ITER) {
6434 val = rb_obj_as_string(rb_yield(match0));
6435 }
6436 else {
6437 struct RString fake_str = {RBASIC_INIT};
6438 VALUE key;
6439 if (mode == FAST_MAP) {
6440 // It is safe to use a fake_str here because we established that it won't escape,
6441 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6442 // default proc.
6443 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6444 }
6445 else {
6446 key = rb_str_subseq(str, beg0, end0 - beg0);
6447 }
6448 val = rb_hash_aref(hash, key);
6449 val = rb_obj_as_string(val);
6450 }
6451 str_mod_check(str, sp, slen);
6452 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6453 rb_raise(rb_eRuntimeError, "block should not cheat");
6454 }
6455 }
6456 else if (need_backref_str) {
6457 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6458 if (need_backref_str < 0) {
6459 need_backref_str = val != repl;
6460 }
6461 }
6462 else {
6463 val = repl;
6464 }
6465
6466 len = beg0 - offset; /* copy pre-match substr */
6467 if (len) {
6468 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6469 }
6470
6471 rb_str_buf_append(dest, val);
6472
6473 last = offset;
6474 offset = end0;
6475 if (beg0 == end0) {
6476 /*
6477 * Always consume at least one character of the input string
6478 * in order to prevent infinite loops.
6479 */
6480 if (RSTRING_LEN(str) <= end0) break;
6481 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6482 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6483 offset = end0 + len;
6484 }
6485 cp = RSTRING_PTR(str) + offset;
6486 if (offset > RSTRING_LEN(str)) break;
6487
6488 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6489 if (mode != FAST_MAP && mode != STR) {
6490 match = Qnil;
6491 }
6492 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6493
6494 RB_GC_GUARD(match);
6495 } while (beg >= 0);
6496
6497 if (RSTRING_LEN(str) > offset) {
6498 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6499 }
6500 rb_pat_search0(pat, str, last, 1, &match);
6501 if (bang) {
6502 str_shared_replace(str, dest);
6503 }
6504 else {
6505 str = dest;
6506 }
6507
6508 return str;
6509}
6510
6511
6512/*
6513 * call-seq:
6514 * gsub!(pattern, replacement) -> self or nil
6515 * gsub!(pattern) {|match| ... } -> self or nil
6516 * gsub!(pattern) -> an_enumerator
6517 *
6518 * Like String#gsub, except that:
6519 *
6520 * - Performs substitutions in +self+ (not in a copy of +self+).
6521 * - Returns +self+ if any substitutions are made, +nil+ otherwise.
6522 *
6523 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6524 */
6525
6526static VALUE
6527rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6528{
6529 str_modify_keep_cr(str);
6530 return str_gsub(argc, argv, str, 1);
6531}
6532
6533
6534/*
6535 * call-seq:
6536 * gsub(pattern, replacement) -> new_string
6537 * gsub(pattern) {|match| ... } -> new_string
6538 * gsub(pattern) -> enumerator
6539 *
6540 * Returns a copy of +self+ with zero or more substrings replaced.
6541 *
6542 * Argument +pattern+ may be a string or a Regexp;
6543 * argument +replacement+ may be a string or a Hash.
6544 * Varying types for the argument values makes this method very versatile.
6545 *
6546 * Below are some simple examples;
6547 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6548 *
6549 * With arguments +pattern+ and string +replacement+ given,
6550 * replaces each matching substring with the given +replacement+ string:
6551 *
6552 * s = 'abracadabra'
6553 * s.gsub('ab', 'AB') # => "ABracadABra"
6554 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6555 *
6556 * With arguments +pattern+ and hash +replacement+ given,
6557 * replaces each matching substring with a value from the given +replacement+ hash,
6558 * or removes it:
6559 *
6560 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6561 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6562 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6563 *
6564 * With argument +pattern+ and a block given,
6565 * calls the block with each matching substring;
6566 * replaces that substring with the block's return value:
6567 *
6568 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6569 * # => "ABrACADABrA"
6570 *
6571 * With argument +pattern+ and no block given,
6572 * returns a new Enumerator.
6573 *
6574 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6575 */
6576
6577static VALUE
6578rb_str_gsub(int argc, VALUE *argv, VALUE str)
6579{
6580 return str_gsub(argc, argv, str, 0);
6581}
6582
6583
6584/*
6585 * call-seq:
6586 * replace(other_string) -> self
6587 *
6588 * Replaces the contents of +self+ with the contents of +other_string+;
6589 * returns +self+:
6590 *
6591 * s = 'foo' # => "foo"
6592 * s.replace('bar') # => "bar"
6593 *
6594 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6595 */
6596
6597VALUE
6599{
6600 str_modifiable(str);
6601 if (str == str2) return str;
6602
6603 StringValue(str2);
6604 str_discard(str);
6605 return str_replace(str, str2);
6606}
6607
6608/*
6609 * call-seq:
6610 * clear -> self
6611 *
6612 * Removes the contents of +self+:
6613 *
6614 * s = 'foo'
6615 * s.clear # => ""
6616 * s # => ""
6617 *
6618 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6619 */
6620
6621static VALUE
6622rb_str_clear(VALUE str)
6623{
6624 str_discard(str);
6625 STR_SET_EMBED(str);
6626 STR_SET_LEN(str, 0);
6627 RSTRING_PTR(str)[0] = 0;
6628 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6630 else
6632 return str;
6633}
6634
6635/*
6636 * call-seq:
6637 * chr -> string
6638 *
6639 * :include: doc/string/chr.rdoc
6640 *
6641 */
6642
6643static VALUE
6644rb_str_chr(VALUE str)
6645{
6646 return rb_str_substr(str, 0, 1);
6647}
6648
6649/*
6650 * call-seq:
6651 * getbyte(index) -> integer or nil
6652 *
6653 * :include: doc/string/getbyte.rdoc
6654 *
6655 */
6656VALUE
6657rb_str_getbyte(VALUE str, VALUE index)
6658{
6659 long pos = NUM2LONG(index);
6660
6661 if (pos < 0)
6662 pos += RSTRING_LEN(str);
6663 if (pos < 0 || RSTRING_LEN(str) <= pos)
6664 return Qnil;
6665
6666 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6667}
6668
6669/*
6670 * call-seq:
6671 * setbyte(index, integer) -> integer
6672 *
6673 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6674 * returns +integer+:
6675 *
6676 * s = 'xyzzy'
6677 * s.setbyte(2, 129) # => 129
6678 * s # => "xy\x81zy"
6679 *
6680 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6681 */
6682VALUE
6683rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6684{
6685 long pos = NUM2LONG(index);
6686 long len = RSTRING_LEN(str);
6687 char *ptr, *head, *left = 0;
6688 rb_encoding *enc;
6689 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6690
6691 if (pos < -len || len <= pos)
6692 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6693 if (pos < 0)
6694 pos += len;
6695
6696 VALUE v = rb_to_int(value);
6697 VALUE w = rb_int_and(v, INT2FIX(0xff));
6698 char byte = (char)(NUM2INT(w) & 0xFF);
6699
6700 if (!str_independent(str))
6701 str_make_independent(str);
6702 enc = STR_ENC_GET(str);
6703 head = RSTRING_PTR(str);
6704 ptr = &head[pos];
6705 if (!STR_EMBED_P(str)) {
6706 cr = ENC_CODERANGE(str);
6707 switch (cr) {
6708 case ENC_CODERANGE_7BIT:
6709 left = ptr;
6710 *ptr = byte;
6711 if (ISASCII(byte)) goto end;
6712 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6713 if (!MBCLEN_CHARFOUND_P(nlen))
6715 else
6717 goto end;
6719 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6720 width = rb_enc_precise_mbclen(left, head+len, enc);
6721 *ptr = byte;
6722 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6723 if (!MBCLEN_CHARFOUND_P(nlen))
6725 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6727 goto end;
6728 }
6729 }
6731 *ptr = byte;
6732
6733 end:
6734 return value;
6735}
6736
6737static VALUE
6738str_byte_substr(VALUE str, long beg, long len, int empty)
6739{
6740 long n = RSTRING_LEN(str);
6741
6742 if (beg > n || len < 0) return Qnil;
6743 if (beg < 0) {
6744 beg += n;
6745 if (beg < 0) return Qnil;
6746 }
6747 if (len > n - beg)
6748 len = n - beg;
6749 if (len <= 0) {
6750 if (!empty) return Qnil;
6751 len = 0;
6752 }
6753
6754 VALUE str2 = str_subseq(str, beg, len);
6755
6756 str_enc_copy_direct(str2, str);
6757
6758 if (RSTRING_LEN(str2) == 0) {
6759 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6761 else
6763 }
6764 else {
6765 switch (ENC_CODERANGE(str)) {
6766 case ENC_CODERANGE_7BIT:
6768 break;
6769 default:
6771 break;
6772 }
6773 }
6774
6775 return str2;
6776}
6777
6778VALUE
6779rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6780{
6781 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6782}
6783
6784static VALUE
6785str_byte_aref(VALUE str, VALUE indx)
6786{
6787 long idx;
6788 if (FIXNUM_P(indx)) {
6789 idx = FIX2LONG(indx);
6790 }
6791 else {
6792 /* check if indx is Range */
6793 long beg, len = RSTRING_LEN(str);
6794
6795 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6796 case Qfalse:
6797 break;
6798 case Qnil:
6799 return Qnil;
6800 default:
6801 return str_byte_substr(str, beg, len, TRUE);
6802 }
6803
6804 idx = NUM2LONG(indx);
6805 }
6806 return str_byte_substr(str, idx, 1, FALSE);
6807}
6808
6809/*
6810 * call-seq:
6811 * byteslice(offset, length = 1) -> string or nil
6812 * byteslice(range) -> string or nil
6813 *
6814 * :include: doc/string/byteslice.rdoc
6815 */
6816
6817static VALUE
6818rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6819{
6820 if (argc == 2) {
6821 long beg = NUM2LONG(argv[0]);
6822 long len = NUM2LONG(argv[1]);
6823 return str_byte_substr(str, beg, len, TRUE);
6824 }
6825 rb_check_arity(argc, 1, 2);
6826 return str_byte_aref(str, argv[0]);
6827}
6828
6829static void
6830str_check_beg_len(VALUE str, long *beg, long *len)
6831{
6832 long end, slen = RSTRING_LEN(str);
6833
6834 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6835 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6836 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6837 }
6838 if (*beg < 0) {
6839 *beg += slen;
6840 }
6841 RUBY_ASSERT(*beg >= 0);
6842 RUBY_ASSERT(*beg <= slen);
6843
6844 if (*len > slen - *beg) {
6845 *len = slen - *beg;
6846 }
6847 end = *beg + *len;
6848 str_ensure_byte_pos(str, *beg);
6849 str_ensure_byte_pos(str, end);
6850}
6851
6852/*
6853 * call-seq:
6854 * bytesplice(offset, length, str) -> self
6855 * bytesplice(offset, length, str, str_offset, str_length) -> self
6856 * bytesplice(range, str) -> self
6857 * bytesplice(range, str, str_range) -> self
6858 *
6859 * :include: doc/string/bytesplice.rdoc
6860 */
6861
6862static VALUE
6863rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6864{
6865 long beg, len, vbeg, vlen;
6866 VALUE val;
6867 int cr;
6868
6869 rb_check_arity(argc, 2, 5);
6870 if (!(argc == 2 || argc == 3 || argc == 5)) {
6871 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6872 }
6873 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6874 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6875 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6876 rb_builtin_class_name(argv[0]));
6877 }
6878 val = argv[1];
6879 StringValue(val);
6880 if (argc == 2) {
6881 /* bytesplice(range, str) */
6882 vbeg = 0;
6883 vlen = RSTRING_LEN(val);
6884 }
6885 else {
6886 /* bytesplice(range, str, str_range) */
6887 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6888 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6889 rb_builtin_class_name(argv[2]));
6890 }
6891 }
6892 }
6893 else {
6894 beg = NUM2LONG(argv[0]);
6895 len = NUM2LONG(argv[1]);
6896 val = argv[2];
6897 StringValue(val);
6898 if (argc == 3) {
6899 /* bytesplice(index, length, str) */
6900 vbeg = 0;
6901 vlen = RSTRING_LEN(val);
6902 }
6903 else {
6904 /* bytesplice(index, length, str, str_index, str_length) */
6905 vbeg = NUM2LONG(argv[3]);
6906 vlen = NUM2LONG(argv[4]);
6907 }
6908 }
6909 str_check_beg_len(str, &beg, &len);
6910 str_check_beg_len(val, &vbeg, &vlen);
6911 str_modify_keep_cr(str);
6912
6913 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6914 rb_enc_associate(str, rb_enc_check(str, val));
6915 }
6916
6917 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6919 if (cr != ENC_CODERANGE_BROKEN)
6920 ENC_CODERANGE_SET(str, cr);
6921 return str;
6922}
6923
6924/*
6925 * call-seq:
6926 * reverse -> new_string
6927 *
6928 * Returns a new string with the characters from +self+ in reverse order.
6929 *
6930 * 'drawer'.reverse # => "reward"
6931 * 'reviled'.reverse # => "deliver"
6932 * 'stressed'.reverse # => "desserts"
6933 * 'semordnilaps'.reverse # => "spalindromes"
6934 *
6935 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6936 */
6937
6938static VALUE
6939rb_str_reverse(VALUE str)
6940{
6941 rb_encoding *enc;
6942 VALUE rev;
6943 char *s, *e, *p;
6944 int cr;
6945
6946 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6947 enc = STR_ENC_GET(str);
6948 rev = rb_str_new(0, RSTRING_LEN(str));
6949 s = RSTRING_PTR(str); e = RSTRING_END(str);
6950 p = RSTRING_END(rev);
6951 cr = ENC_CODERANGE(str);
6952
6953 if (RSTRING_LEN(str) > 1) {
6954 if (single_byte_optimizable(str)) {
6955 while (s < e) {
6956 *--p = *s++;
6957 }
6958 }
6959 else if (cr == ENC_CODERANGE_VALID) {
6960 while (s < e) {
6961 int clen = rb_enc_fast_mbclen(s, e, enc);
6962
6963 p -= clen;
6964 memcpy(p, s, clen);
6965 s += clen;
6966 }
6967 }
6968 else {
6969 cr = rb_enc_asciicompat(enc) ?
6971 while (s < e) {
6972 int clen = rb_enc_mbclen(s, e, enc);
6973
6974 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6975 p -= clen;
6976 memcpy(p, s, clen);
6977 s += clen;
6978 }
6979 }
6980 }
6981 STR_SET_LEN(rev, RSTRING_LEN(str));
6982 str_enc_copy_direct(rev, str);
6983 ENC_CODERANGE_SET(rev, cr);
6984
6985 return rev;
6986}
6987
6988
6989/*
6990 * call-seq:
6991 * reverse! -> self
6992 *
6993 * Returns +self+ with its characters reversed:
6994 *
6995 * 'drawer'.reverse! # => "reward"
6996 * 'reviled'.reverse! # => "deliver"
6997 * 'stressed'.reverse! # => "desserts"
6998 * 'semordnilaps'.reverse! # => "spalindromes"
6999 *
7000 * Related: see {Modifying}[rdoc-ref:String@Modifying].
7001 */
7002
7003static VALUE
7004rb_str_reverse_bang(VALUE str)
7005{
7006 if (RSTRING_LEN(str) > 1) {
7007 if (single_byte_optimizable(str)) {
7008 char *s, *e, c;
7009
7010 str_modify_keep_cr(str);
7011 s = RSTRING_PTR(str);
7012 e = RSTRING_END(str) - 1;
7013 while (s < e) {
7014 c = *s;
7015 *s++ = *e;
7016 *e-- = c;
7017 }
7018 }
7019 else {
7020 str_shared_replace(str, rb_str_reverse(str));
7021 }
7022 }
7023 else {
7024 str_modify_keep_cr(str);
7025 }
7026 return str;
7027}
7028
7029
7030/*
7031 * call-seq:
7032 * include?(other_string) -> true or false
7033 *
7034 * Returns whether +self+ contains +other_string+:
7035 *
7036 * s = 'bar'
7037 * s.include?('ba') # => true
7038 * s.include?('ar') # => true
7039 * s.include?('bar') # => true
7040 * s.include?('a') # => true
7041 * s.include?('') # => true
7042 * s.include?('foo') # => false
7043 *
7044 * Related: see {Querying}[rdoc-ref:String@Querying].
7045 */
7046
7047VALUE
7048rb_str_include(VALUE str, VALUE arg)
7049{
7050 long i;
7051
7052 StringValue(arg);
7053 i = rb_str_index(str, arg, 0);
7054
7055 return RBOOL(i != -1);
7056}
7057
7058
7059/*
7060 * call-seq:
7061 * to_i(base = 10) -> integer
7062 *
7063 * Returns the result of interpreting leading characters in +self+
7064 * as an integer in the given +base+;
7065 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7066 *
7067 * '123456'.to_i # => 123456
7068 * '123def'.to_i(16) # => 1195503
7069 *
7070 * With +base+ zero given, +self+ may contain leading characters
7071 * to specify the actual base:
7072 *
7073 * '123def'.to_i(0) # => 123
7074 * '0123def'.to_i(0) # => 83
7075 * '0b123def'.to_i(0) # => 1
7076 * '0o123def'.to_i(0) # => 83
7077 * '0d123def'.to_i(0) # => 123
7078 * '0x123def'.to_i(0) # => 1195503
7079 *
7080 * Characters past a leading valid number (in the given +base+) are ignored:
7081 *
7082 * '12.345'.to_i # => 12
7083 * '12345'.to_i(2) # => 1
7084 *
7085 * Returns zero if there is no leading valid number:
7086 *
7087 * 'abcdef'.to_i # => 0
7088 * '2'.to_i(2) # => 0
7089 *
7090 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7091 */
7092
7093static VALUE
7094rb_str_to_i(int argc, VALUE *argv, VALUE str)
7095{
7096 int base = 10;
7097
7098 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7099 rb_raise(rb_eArgError, "invalid radix %d", base);
7100 }
7101 return rb_str_to_inum(str, base, FALSE);
7102}
7103
7104
7105/*
7106 * call-seq:
7107 * to_f -> float
7108 *
7109 * Returns the result of interpreting leading characters in +self+ as a Float:
7110 *
7111 * '3.14159'.to_f # => 3.14159
7112 * '1.234e-2'.to_f # => 0.01234
7113 *
7114 * Characters past a leading valid number are ignored:
7115 *
7116 * '3.14 (pi to two places)'.to_f # => 3.14
7117 *
7118 * Returns zero if there is no leading valid number:
7119 *
7120 * 'abcdef'.to_f # => 0.0
7121 *
7122 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7123 */
7124
7125static VALUE
7126rb_str_to_f(VALUE str)
7127{
7128 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7129}
7130
7131
7132/*
7133 * call-seq:
7134 * to_s -> self or new_string
7135 *
7136 * Returns +self+ if +self+ is a +String+,
7137 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7138 *
7139 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7140 */
7141
7142static VALUE
7143rb_str_to_s(VALUE str)
7144{
7145 if (rb_obj_class(str) != rb_cString) {
7146 return str_duplicate(rb_cString, str);
7147 }
7148 return str;
7149}
7150
#if 0
/* Disabled helper: writes code point c in encoding enc into a scratch
 * buffer and appends those bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7162
7163#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7164
7165int
7166rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7167{
7168 char buf[CHAR_ESC_LEN + 1];
7169 int l;
7170
7171#if SIZEOF_INT > 4
7172 c &= 0xffffffff;
7173#endif
7174 if (unicode_p) {
7175 if (c < 0x7F && ISPRINT(c)) {
7176 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7177 }
7178 else if (c < 0x10000) {
7179 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7180 }
7181 else {
7182 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7183 }
7184 }
7185 else {
7186 if (c < 0x100) {
7187 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7188 }
7189 else {
7190 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7191 }
7192 }
7193 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7194 rb_str_buf_cat(result, buf, l);
7195 return l;
7196}
7197
/*
 * Returns the backslash escape used for the control character c, or NULL
 * when c has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].escape;
    }
    return NULL;
}
7215
7216VALUE
7217rb_str_escape(VALUE str)
7218{
7219 int encidx = ENCODING_GET(str);
7220 rb_encoding *enc = rb_enc_from_index(encidx);
7221 const char *p = RSTRING_PTR(str);
7222 const char *pend = RSTRING_END(str);
7223 const char *prev = p;
7224 char buf[CHAR_ESC_LEN + 1];
7225 VALUE result = rb_str_buf_new(0);
7226 int unicode_p = rb_enc_unicode_p(enc);
7227 int asciicompat = rb_enc_asciicompat(enc);
7228
7229 while (p < pend) {
7230 unsigned int c;
7231 const char *cc;
7232 int n = rb_enc_precise_mbclen(p, pend, enc);
7233 if (!MBCLEN_CHARFOUND_P(n)) {
7234 if (p > prev) str_buf_cat(result, prev, p - prev);
7235 n = rb_enc_mbminlen(enc);
7236 if (pend < p + n)
7237 n = (int)(pend - p);
7238 while (n--) {
7239 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7240 str_buf_cat(result, buf, strlen(buf));
7241 prev = ++p;
7242 }
7243 continue;
7244 }
7245 n = MBCLEN_CHARFOUND_LEN(n);
7246 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7247 p += n;
7248 cc = ruby_escaped_char(c);
7249 if (cc) {
7250 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7251 str_buf_cat(result, cc, strlen(cc));
7252 prev = p;
7253 }
7254 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7255 }
7256 else {
7257 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7258 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7259 prev = p;
7260 }
7261 }
7262 if (p > prev) str_buf_cat(result, prev, p - prev);
7263 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7264
7265 return result;
7266}
7267
7268/*
7269 * call-seq:
7270 * inspect -> string
7271 *
7272 * :include: doc/string/inspect.rdoc
7273 *
7274 */
7275
7276VALUE
7278{
    /*
     * Builds and returns a new string: a double-quoted, escaped rendering of
     * str, tagged with the encoding chosen in resenc below.
     *
     * NOTE(review): the listing numbers embedded in this text go from 7276
     * straight to 7278, so the line holding the function name and parameter
     * list is not visible here; confirm it against the upstream file.
     */
7279 int encidx = ENCODING_GET(str);
7280 rb_encoding *enc = rb_enc_from_index(encidx);
7281 const char *p, *pend, *prev; /* prev: start of the bytes not yet appended to result */
7282 char buf[CHAR_ESC_LEN + 1];
7283 VALUE result = rb_str_buf_new(0);
7284 rb_encoding *resenc = rb_default_internal_encoding();
7285 int unicode_p = rb_enc_unicode_p(enc);
7286 int asciicompat = rb_enc_asciicompat(enc);
7287
    /* Result encoding: default internal, else default external; US-ASCII
     * is used instead when that choice is not ASCII-compatible. */
7288 if (resenc == NULL) resenc = rb_default_external_encoding();
7289 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7290 rb_enc_associate(result, resenc);
7291 str_buf_cat2(result, "\"");
7292
7293 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7294 prev = p;
7295 while (p < pend) {
7296 unsigned int c, cc;
7297 int n;
7298
7299 n = rb_enc_precise_mbclen(p, pend, enc);
7300 if (!MBCLEN_CHARFOUND_P(n)) {
    /* No valid character at p: flush the pending run, then emit
     * up to mbminlen bytes (bounded by pend) as \xNN each. */
7301 if (p > prev) str_buf_cat(result, prev, p - prev);
7302 n = rb_enc_mbminlen(enc);
7303 if (pend < p + n)
7304 n = (int)(pend - p);
7305 while (n--) {
7306 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7307 str_buf_cat(result, buf, strlen(buf));
7308 prev = ++p;
7309 }
7310 continue;
7311 }
7312 n = MBCLEN_CHARFOUND_LEN(n);
7313 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7314 p += n;
    /* A double quote, a backslash, or a '#' that is directly followed by
     * '$', '@' or '{' gets a backslash in front of it. */
7315 if ((asciicompat || unicode_p) &&
7316 (c == '"'|| c == '\\' ||
7317 (c == '#' &&
7318 p < pend &&
7319 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7320 (cc = rb_enc_codepoint(p,pend,enc),
7321 (cc == '$' || cc == '@' || cc == '{'))))) {
7322 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7323 str_buf_cat2(result, "\\");
7324 if (asciicompat || enc == resenc) {
    /* Rewind prev to the character itself so it is copied
     * verbatim with the next flush. */
7325 prev = p - n;
7326 continue;
7327 }
    /* Otherwise control falls through to the generic handling below. */
7328 }
    /* Control characters that have a mnemonic escape. */
7329 switch (c) {
7330 case '\n': cc = 'n'; break;
7331 case '\r': cc = 'r'; break;
7332 case '\t': cc = 't'; break;
7333 case '\f': cc = 'f'; break;
7334 case '\013': cc = 'v'; break;
7335 case '\010': cc = 'b'; break;
7336 case '\007': cc = 'a'; break;
7337 case 033: cc = 'e'; break;
7338 default: cc = 0; break;
7339 }
7340 if (cc) {
7341 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7342 buf[0] = '\\';
7343 buf[1] = (char)cc;
7344 str_buf_cat(result, buf, 2);
7345 prev = p;
7346 continue;
7347 }
7348 /* The special casing of 0x85 (NEXT_LINE) here is because
7349 * Oniguruma historically treats it as printable, but it
7350 * doesn't match the print POSIX bracket class or character
7351 * property in regexps.
7352 *
7353 * See Ruby Bug #16842 for details:
7354 * https://bugs.ruby-lang.org/issues/16842
7355 */
7356 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7357 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
    /* Printable: prev is left untouched, so the character stays in
     * the pending run and is copied as-is later. */
7358 continue;
7359 }
7360 else {
    /* Anything else is written through rb_str_buf_cat_escaped_char(). */
7361 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7362 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7363 prev = p;
7364 continue;
7365 }
7366 }
    /* Flush whatever is still pending and close the quote. */
7367 if (p > prev) str_buf_cat(result, prev, p - prev);
7368 str_buf_cat2(result, "\"");
7369
7370 return result;
7371}
7372
/* True when p lies before e and points at '$', '@' or '{'. The dump code
 * below applies it to the byte after a '#' to decide whether that '#' needs
 * a backslash. */
7373#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7374
7375/*
7376 * call-seq:
7377 * dump -> new_string
7378 *
7379 * :include: doc/string/dump.rdoc
7380 *
7381 */
7382
7383VALUE
7385{
    /*
     * Produces the dumped form of str in two passes over its bytes: the
     * first pass computes the exact output length, the second writes the
     * escaped bytes into a string allocated with that length. Both passes
     * must classify every byte the same way.
     *
     * NOTE(review): the embedded listing numbers skip 7384 (the line with
     * the function name and parameters) and 7526 (a line just before the
     * final return); confirm both against the upstream file.
     */
7386 int encidx = rb_enc_get_index(str);
7387 rb_encoding *enc = rb_enc_from_index(encidx);
7388 long len;
7389 const char *p, *pend;
7390 char *q, *qend;
7391 VALUE result;
7392 int u8 = (encidx == rb_utf8_encindex());
7393 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7394
7395 len = 2; /* "" */
    /* For a non-ASCII-compatible encoding the suffix, with "%s" replaced
     * by the encoding name, is appended after the closing quote. */
7396 if (!rb_enc_asciicompat(enc)) {
7397 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7398 len += strlen(enc->name);
7399 }
7400
    /* Pass 1: add up the output size (clen) of every input byte/character. */
7401 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7402 while (p < pend) {
7403 int clen;
7404 unsigned char c = *p++;
7405
7406 switch (c) {
7407 case '"': case '\\':
7408 case '\n': case '\r':
7409 case '\t': case '\f':
7410 case '\013': case '\010': case '\007': case '\033':
7411 clen = 2;
7412 break;
7413
7414 case '#':
7415 clen = IS_EVSTR(p, pend) ? 2 : 1;
7416 break;
7417
7418 default:
7419 if (ISPRINT(c)) {
7420 clen = 1;
7421 }
7422 else {
7423 if (u8 && c > 0x7F) { /* \u notation */
7424 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7425 if (MBCLEN_CHARFOUND_P(n)) {
7426 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7427 if (cc <= 0xFFFF)
7428 clen = 6; /* \uXXXX */
7429 else if (cc <= 0xFFFFF)
7430 clen = 9; /* \u{XXXXX} */
7431 else
7432 clen = 10; /* \u{XXXXXX} */
    /* skip the remaining bytes of this character */
7433 p += MBCLEN_CHARFOUND_LEN(n)-1;
7434 break;
7435 }
7436 }
7437 clen = 4; /* \xNN */
7438 }
7439 break;
7440 }
7441
    /* Refuse to overflow the long length accumulator. */
7442 if (clen > LONG_MAX - len) {
7443 rb_raise(rb_eRuntimeError, "string size too big");
7444 }
7445 len += clen;
7446 }
7447
    /* Pass 2: write the escaped form. qend is the bound handed to snprintf. */
7448 result = rb_str_new(0, len);
7449 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7450 q = RSTRING_PTR(result); qend = q + len + 1;
7451
7452 *q++ = '"';
7453 while (p < pend) {
7454 unsigned char c = *p++;
7455
7456 if (c == '"' || c == '\\') {
7457 *q++ = '\\';
7458 *q++ = c;
7459 }
7460 else if (c == '#') {
7461 if (IS_EVSTR(p, pend)) *q++ = '\\';
7462 *q++ = '#';
7463 }
7464 else if (c == '\n') {
7465 *q++ = '\\';
7466 *q++ = 'n';
7467 }
7468 else if (c == '\r') {
7469 *q++ = '\\';
7470 *q++ = 'r';
7471 }
7472 else if (c == '\t') {
7473 *q++ = '\\';
7474 *q++ = 't';
7475 }
7476 else if (c == '\f') {
7477 *q++ = '\\';
7478 *q++ = 'f';
7479 }
7480 else if (c == '\013') {
7481 *q++ = '\\';
7482 *q++ = 'v';
7483 }
7484 else if (c == '\010') {
7485 *q++ = '\\';
7486 *q++ = 'b';
7487 }
7488 else if (c == '\007') {
7489 *q++ = '\\';
7490 *q++ = 'a';
7491 }
7492 else if (c == '\033') {
7493 *q++ = '\\';
7494 *q++ = 'e';
7495 }
7496 else if (ISPRINT(c)) {
7497 *q++ = c;
7498 }
7499 else {
7500 *q++ = '\\';
7501 if (u8) {
    /* n is the precise-mbclen result minus one. Presumably it is
     * positive only for a valid character of two or more bytes,
     * which would match the "c > 0x7F" test used in pass 1 —
     * NOTE(review): confirm against MBCLEN_CHARFOUND_P. */
7502 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7503 if (MBCLEN_CHARFOUND_P(n)) {
7504 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7505 p += n;
7506 if (cc <= 0xFFFF)
7507 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7508 else
7509 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7510 q += strlen(q);
7511 continue;
7512 }
7513 }
    /* 'x' plus two hex digits: three bytes after the backslash. */
7514 snprintf(q, qend-q, "x%02X", c);
7515 q += 3;
7516 }
7517 }
7518 *q++ = '"';
7519 *q = '\0';
    /* A non-ASCII-compatible source gets the force_encoding suffix and the
     * result is tagged with the ASCII-8BIT index instead of str's own. */
7520 if (!rb_enc_asciicompat(enc)) {
7521 snprintf(q, qend-q, nonascii_suffix, enc->name);
7522 encidx = rb_ascii8bit_encindex();
7523 }
7524 /* result from dump is ASCII */
7525 rb_enc_associate_index(result, encidx);
7527 return result;
7528}
7529
/*
 * Maps the letter of a single-character backslash escape to the byte it
 * denotes: n, r, t, f, v, b, a, e -> LF, CR, TAB, FF, VT, BS, BEL, ESC.
 *
 * Returns -1 for any other value. The only caller visible in this file,
 * undump_after_backslash(), passes one of the eight letters, so that path is
 * defensive: it keeps the function from running off its end without
 * returning a value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        break;
    }
    return -1;
}
7553
7554static void
7555undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7556{
7557 const char *s = *ss;
7558 unsigned int c;
7559 int codelen;
7560 size_t hexlen;
7561 unsigned char buf[6];
7562 static rb_encoding *enc_utf8 = NULL;
7563
7564 switch (*s) {
7565 case '\\':
7566 case '"':
7567 case '#':
7568 rb_str_cat(undumped, s, 1); /* cat itself */
7569 s++;
7570 break;
7571 case 'n':
7572 case 'r':
7573 case 't':
7574 case 'f':
7575 case 'v':
7576 case 'b':
7577 case 'a':
7578 case 'e':
7579 *buf = unescape_ascii(*s);
7580 rb_str_cat(undumped, (char *)buf, 1);
7581 s++;
7582 break;
7583 case 'u':
7584 if (*binary) {
7585 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7586 }
7587 *utf8 = true;
7588 if (++s >= s_end) {
7589 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7590 }
7591 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7592 if (*penc != enc_utf8) {
7593 *penc = enc_utf8;
7594 rb_enc_associate(undumped, enc_utf8);
7595 }
7596 if (*s == '{') { /* handle \u{...} form */
7597 s++;
7598 for (;;) {
7599 if (s >= s_end) {
7600 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7601 }
7602 if (*s == '}') {
7603 s++;
7604 break;
7605 }
7606 if (ISSPACE(*s)) {
7607 s++;
7608 continue;
7609 }
7610 c = scan_hex(s, s_end-s, &hexlen);
7611 if (hexlen == 0 || hexlen > 6) {
7612 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7613 }
7614 if (c > 0x10ffff) {
7615 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7616 }
7617 if (0xd800 <= c && c <= 0xdfff) {
7618 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7619 }
7620 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7621 rb_str_cat(undumped, (char *)buf, codelen);
7622 s += hexlen;
7623 }
7624 }
7625 else { /* handle \uXXXX form */
7626 c = scan_hex(s, 4, &hexlen);
7627 if (hexlen != 4) {
7628 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7629 }
7630 if (0xd800 <= c && c <= 0xdfff) {
7631 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7632 }
7633 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7634 rb_str_cat(undumped, (char *)buf, codelen);
7635 s += hexlen;
7636 }
7637 break;
7638 case 'x':
7639 if (++s >= s_end) {
7640 rb_raise(rb_eRuntimeError, "invalid hex escape");
7641 }
7642 *buf = scan_hex(s, 2, &hexlen);
7643 if (hexlen != 2) {
7644 rb_raise(rb_eRuntimeError, "invalid hex escape");
7645 }
7646 if (!ISASCII(*buf)) {
7647 if (*utf8) {
7648 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7649 }
7650 *binary = true;
7651 }
7652 rb_str_cat(undumped, (char *)buf, 1);
7653 s += hexlen;
7654 break;
7655 default:
7656 rb_str_cat(undumped, s-1, 2);
7657 s++;
7658 }
7659
7660 *ss = s;
7661}
7662
/* Forward declaration; str_undump() below calls it to reject non-ASCII input. */
7663static VALUE rb_str_is_ascii_only_p(VALUE str);
7664
7665/*
7666 * call-seq:
7667 * undump -> new_string
7668 *
7669 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7670 *
7671 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7672 */
7673
7674static VALUE
7675str_undump(VALUE str)
7676{
7677 const char *s = RSTRING_PTR(str);
7678 const char *s_end = RSTRING_END(str);
7679 rb_encoding *enc = rb_enc_get(str);
7680 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7681 bool utf8 = false;
7682 bool binary = false;
7683 int w;
7684
7686 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7687 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7688 }
7689 if (!str_null_check(str, &w)) {
7690 rb_raise(rb_eRuntimeError, "string contains null byte");
7691 }
7692 if (RSTRING_LEN(str) < 2) goto invalid_format;
7693 if (*s != '"') goto invalid_format;
7694
7695 /* strip '"' at the start */
7696 s++;
7697
7698 for (;;) {
7699 if (s >= s_end) {
7700 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7701 }
7702
7703 if (*s == '"') {
7704 /* epilogue */
7705 s++;
7706 if (s == s_end) {
7707 /* ascii compatible dumped string */
7708 break;
7709 }
7710 else {
7711 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7712 static const char dup_suffix[] = ".dup";
7713 const char *encname;
7714 int encidx;
7715 ptrdiff_t size;
7716
7717 /* check separately for strings dumped by older versions */
7718 size = sizeof(dup_suffix) - 1;
7719 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7720
7721 size = sizeof(force_encoding_suffix) - 1;
7722 if (s_end - s <= size) goto invalid_format;
7723 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7724 s += size;
7725
7726 if (utf8) {
7727 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7728 }
7729
7730 encname = s;
7731 s = memchr(s, '"', s_end-s);
7732 size = s - encname;
7733 if (!s) goto invalid_format;
7734 if (s_end - s != 2) goto invalid_format;
7735 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7736
7737 encidx = rb_enc_find_index2(encname, (long)size);
7738 if (encidx < 0) {
7739 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7740 }
7741 rb_enc_associate_index(undumped, encidx);
7742 }
7743 break;
7744 }
7745
7746 if (*s == '\\') {
7747 s++;
7748 if (s >= s_end) {
7749 rb_raise(rb_eRuntimeError, "invalid escape");
7750 }
7751 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7752 }
7753 else {
7754 rb_str_cat(undumped, s++, 1);
7755 }
7756 }
7757
7758 RB_GC_GUARD(str);
7759
7760 return undumped;
7761invalid_format:
7762 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7763}
7764
7765static void
7766rb_str_check_dummy_enc(rb_encoding *enc)
7767{
7768 if (rb_enc_dummy_p(enc)) {
7769 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7770 rb_enc_name(enc));
7771 }
7772}
7773
7774static rb_encoding *
7775str_true_enc(VALUE str)
7776{
7777 rb_encoding *enc = STR_ENC_GET(str);
7778 rb_str_check_dummy_enc(enc);
7779 return enc;
7780}
7781
7782static OnigCaseFoldType
7783check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7784{
7785 if (argc==0)
7786 return flags;
7787 if (argc>2)
7788 rb_raise(rb_eArgError, "too many options");
7789 if (argv[0]==sym_turkic) {
7790 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7791 if (argc==2) {
7792 if (argv[1]==sym_lithuanian)
7793 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7794 else
7795 rb_raise(rb_eArgError, "invalid second option");
7796 }
7797 }
7798 else if (argv[0]==sym_lithuanian) {
7799 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7800 if (argc==2) {
7801 if (argv[1]==sym_turkic)
7802 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7803 else
7804 rb_raise(rb_eArgError, "invalid second option");
7805 }
7806 }
7807 else if (argc>1)
7808 rb_raise(rb_eArgError, "too many options");
7809 else if (argv[0]==sym_ascii)
7810 flags |= ONIGENC_CASE_ASCII_ONLY;
7811 else if (argv[0]==sym_fold) {
7812 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7813 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7814 else
7815 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7816 }
7817 else
7818 rb_raise(rb_eArgError, "invalid option");
7819 return flags;
7820}
7821
7822static inline bool
7823case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7824{
7825 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7826 return true;
7827 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7828}
7829
7830/* Extra bytes added to the capacity of every mapping buffer. 16 should be long enough to absorb any kind of single character length increase; the value used here (20) is above that. */
7831#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When nonzero, rb_str_casemap() and rb_str_ascii_casemap() print tuning and
 * consistency diagnostics to stderr. */
7832#ifndef CASEMAP_DEBUG
7833# define CASEMAP_DEBUG 0
7834#endif
7835
7836struct mapping_buffer;
7837typedef struct mapping_buffer {
7838 size_t capa;
7839 size_t used;
7840 struct mapping_buffer *next;
7841 OnigUChar space[FLEX_ARY_LEN];
7843
7844static void
7845mapping_buffer_free(void *p)
7846{
7847 mapping_buffer *previous_buffer;
7848 mapping_buffer *current_buffer = p;
7849 while (current_buffer) {
7850 previous_buffer = current_buffer;
7851 current_buffer = current_buffer->next;
7852 ruby_xfree_sized(previous_buffer, offsetof(mapping_buffer, space) + previous_buffer->capa);
7853 }
7854}
7855
/* Typed-data descriptor for the object that anchors a mapping_buffer list in
 * rb_str_casemap(). mapping_buffer_free is installed in the second slot of
 * the function table (presumably the free callback — confirm against the
 * rb_data_type_t definition). */
7856static const rb_data_type_t mapping_buffer_type = {
7857 "mapping_buffer",
7858 {0, mapping_buffer_free,},
7859 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7860};
7861
/*
 * Case-maps source through enc->case_map and returns the result as a new
 * string carrying source's encoding. *flags is passed to the mapper by
 * pointer. An empty source is simply duplicated. Raises ArgumentError
 * ("input string invalid") when the mapper reports a negative length.
 *
 * Output is collected in a linked list of mapping_buffer chunks, because the
 * mapped length is not known in advance; the list hangs off a typed-data
 * object (buffer_anchor) while it is being built.
 */
7862static VALUE
7863rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7864{
7865 VALUE target;
7866
7867 const OnigUChar *source_current, *source_end;
    /* NOTE(review): the total is accumulated in an int while the chunk
     * capacities are size_t — confirm very long inputs cannot overflow it. */
7868 int target_length = 0;
7869 VALUE buffer_anchor;
7870 mapping_buffer *current_buffer = 0;
7871 mapping_buffer **pre_buffer; /* where the next chunk's address gets stored */
7872 size_t buffer_count = 0;
7873 int buffer_length_or_invalid;
7874
7875 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7876
7877 source_current = (OnigUChar*)RSTRING_PTR(source);
7878 source_end = (OnigUChar*)RSTRING_END(source);
7879
7880 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7881 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    /* One chunk per iteration; the mapper advances source_current, so the
     * loop ends once all input has been consumed. */
7882 while (source_current < source_end) {
7883 /* increase multiplier using buffer count to converge quickly */
7884 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7885 if (CASEMAP_DEBUG) {
7886 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7887 }
7888 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
    /* Link the new chunk in (the first one lands in the anchor's data
     * pointer) and remember where the following link goes. */
7889 *pre_buffer = current_buffer;
7890 pre_buffer = &current_buffer->next;
7891 current_buffer->next = NULL;
7892 current_buffer->capa = capa;
7893 buffer_length_or_invalid = enc->case_map(flags,
7894 &source_current, source_end,
7895 current_buffer->space,
7896 current_buffer->space+current_buffer->capa,
7897 enc);
7898 if (buffer_length_or_invalid < 0) {
    /* Detach the list from the anchor, free it, then raise. */
7899 current_buffer = DATA_PTR(buffer_anchor);
7900 DATA_PTR(buffer_anchor) = 0;
7901 mapping_buffer_free(current_buffer);
7902 rb_raise(rb_eArgError, "input string invalid");
7903 }
7904 target_length += current_buffer->used = buffer_length_or_invalid;
7905 }
7906 if (CASEMAP_DEBUG) {
7907 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7908 }
7909
7910 if (buffer_count==1) {
    /* Single chunk: build the result straight from it. */
7911 target = rb_str_new((const char*)current_buffer->space, target_length);
7912 }
7913 else {
    /* Several chunks: allocate the full length and concatenate them. */
7914 char *target_current;
7915
7916 target = rb_str_new(0, target_length);
7917 target_current = RSTRING_PTR(target);
7918 current_buffer = DATA_PTR(buffer_anchor);
7919 while (current_buffer) {
7920 memcpy(target_current, current_buffer->space, current_buffer->used);
7921 target_current += current_buffer->used;
7922 current_buffer = current_buffer->next;
7923 }
7924 }
    /* Clear the anchor's pointer before freeing, so the anchor no longer
     * refers to the released chunks. */
7925 current_buffer = DATA_PTR(buffer_anchor);
7926 DATA_PTR(buffer_anchor) = 0;
7927 mapping_buffer_free(current_buffer);
7928
7929 RB_GC_GUARD(buffer_anchor);
7930
7931 /* TODO: check about string terminator character */
7932 str_enc_copy_direct(target, source);
7933 /*ENC_CODERANGE_SET(mapped, cr);*/
7934
7935 return target;
7936}
7937
7938static VALUE
7939rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7940{
7941 const OnigUChar *source_current, *source_end;
7942 OnigUChar *target_current, *target_end;
7943 long old_length = RSTRING_LEN(source);
7944 int length_or_invalid;
7945
7946 if (old_length == 0) return Qnil;
7947
7948 source_current = (OnigUChar*)RSTRING_PTR(source);
7949 source_end = (OnigUChar*)RSTRING_END(source);
7950 if (source == target) {
7951 target_current = (OnigUChar*)source_current;
7952 target_end = (OnigUChar*)source_end;
7953 }
7954 else {
7955 target_current = (OnigUChar*)RSTRING_PTR(target);
7956 target_end = (OnigUChar*)RSTRING_END(target);
7957 }
7958
7959 length_or_invalid = onigenc_ascii_only_case_map(flags,
7960 &source_current, source_end,
7961 target_current, target_end, enc);
7962 if (length_or_invalid < 0)
7963 rb_raise(rb_eArgError, "input string invalid");
7964 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7965 fprintf(stderr, "problem with rb_str_ascii_casemap"
7966 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7967 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7968 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7969 }
7970
7971 str_enc_copy(target, source);
7972
7973 return target;
7974}
7975
7976static bool
7977upcase_single(VALUE str)
7978{
7979 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7980 bool modified = false;
7981
7982 while (s < send) {
7983 unsigned int c = *(unsigned char*)s;
7984
7985 if ('a' <= c && c <= 'z') {
7986 *s = 'A' + (c - 'a');
7987 modified = true;
7988 }
7989 s++;
7990 }
7991 return modified;
7992}
7993
7994/*
7995 * call-seq:
7996 * upcase!(mapping) -> self or nil
7997 *
7998 * Like String#upcase, except that:
7999 *
8000 * - Changes character casings in +self+ (not in a copy of +self+).
8001 * - Returns +self+ if any changes are made, +nil+ otherwise.
8002 *
8003 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8004 */
8005
8006static VALUE
8007rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8008{
8009 rb_encoding *enc;
8010 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8011
8012 flags = check_case_options(argc, argv, flags);
8013 str_modify_keep_cr(str);
8014 enc = str_true_enc(str);
8015 if (case_option_single_p(flags, enc, str)) {
8016 if (upcase_single(str))
8017 flags |= ONIGENC_CASE_MODIFIED;
8018 }
8019 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8020 rb_str_ascii_casemap(str, str, &flags, enc);
8021 else
8022 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8023
8024 if (ONIGENC_CASE_MODIFIED&flags) return str;
8025 return Qnil;
8026}
8027
8028
8029/*
8030 * call-seq:
8031 * upcase(mapping = :ascii) -> new_string
8032 *
8033 * :include: doc/string/upcase.rdoc
8034 */
8035
8036static VALUE
8037rb_str_upcase(int argc, VALUE *argv, VALUE str)
8038{
8039 rb_encoding *enc;
8040 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8041 VALUE ret;
8042
8043 flags = check_case_options(argc, argv, flags);
8044 enc = str_true_enc(str);
8045 if (case_option_single_p(flags, enc, str)) {
8046 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8047 str_enc_copy_direct(ret, str);
8048 upcase_single(ret);
8049 }
8050 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8051 ret = rb_str_new(0, RSTRING_LEN(str));
8052 rb_str_ascii_casemap(str, ret, &flags, enc);
8053 }
8054 else {
8055 ret = rb_str_casemap(str, &flags, enc);
8056 }
8057
8058 return ret;
8059}
8060
8061static bool
8062downcase_single(VALUE str)
8063{
8064 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8065 bool modified = false;
8066
8067 while (s < send) {
8068 unsigned int c = *(unsigned char*)s;
8069
8070 if ('A' <= c && c <= 'Z') {
8071 *s = 'a' + (c - 'A');
8072 modified = true;
8073 }
8074 s++;
8075 }
8076
8077 return modified;
8078}
8079
8080/*
8081 * call-seq:
8082 * downcase!(mapping) -> self or nil
8083 *
8084 * Like String#downcase, except that:
8085 *
8086 * - Changes character casings in +self+ (not in a copy of +self+).
8087 * - Returns +self+ if any changes are made, +nil+ otherwise.
8088 *
8089 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8090 */
8091
8092static VALUE
8093rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8094{
8095 rb_encoding *enc;
8096 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8097
8098 flags = check_case_options(argc, argv, flags);
8099 str_modify_keep_cr(str);
8100 enc = str_true_enc(str);
8101 if (case_option_single_p(flags, enc, str)) {
8102 if (downcase_single(str))
8103 flags |= ONIGENC_CASE_MODIFIED;
8104 }
8105 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8106 rb_str_ascii_casemap(str, str, &flags, enc);
8107 else
8108 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8109
8110 if (ONIGENC_CASE_MODIFIED&flags) return str;
8111 return Qnil;
8112}
8113
8114
8115/*
8116 * call-seq:
8117 * downcase(mapping = :ascii) -> new_string
8118 *
8119 * :include: doc/string/downcase.rdoc
8120 *
8121 */
8122
8123static VALUE
8124rb_str_downcase(int argc, VALUE *argv, VALUE str)
8125{
8126 rb_encoding *enc;
8127 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8128 VALUE ret;
8129
8130 flags = check_case_options(argc, argv, flags);
8131 enc = str_true_enc(str);
8132 if (case_option_single_p(flags, enc, str)) {
8133 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8134 str_enc_copy_direct(ret, str);
8135 downcase_single(ret);
8136 }
8137 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8138 ret = rb_str_new(0, RSTRING_LEN(str));
8139 rb_str_ascii_casemap(str, ret, &flags, enc);
8140 }
8141 else {
8142 ret = rb_str_casemap(str, &flags, enc);
8143 }
8144
8145 return ret;
8146}
8147
8148
8149/*
8150 * call-seq:
8151 * capitalize!(mapping = :ascii) -> self or nil
8152 *
8153 * Like String#capitalize, except that:
8154 *
8155 * - Changes character casings in +self+ (not in a copy of +self+).
8156 * - Returns +self+ if any changes are made, +nil+ otherwise.
8157 *
8158 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8159 */
8160
8161static VALUE
8162rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8163{
8164 rb_encoding *enc;
8165 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8166
8167 flags = check_case_options(argc, argv, flags);
8168 str_modify_keep_cr(str);
8169 enc = str_true_enc(str);
8170 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8171 if (flags&ONIGENC_CASE_ASCII_ONLY)
8172 rb_str_ascii_casemap(str, str, &flags, enc);
8173 else
8174 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8175
8176 if (ONIGENC_CASE_MODIFIED&flags) return str;
8177 return Qnil;
8178}
8179
8180
8181/*
8182 * call-seq:
8183 * capitalize(mapping = :ascii) -> new_string
8184 *
8185 * :include: doc/string/capitalize.rdoc
8186 *
8187 */
8188
8189static VALUE
8190rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8191{
8192 rb_encoding *enc;
8193 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8194 VALUE ret;
8195
8196 flags = check_case_options(argc, argv, flags);
8197 enc = str_true_enc(str);
8198 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8199 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8200 ret = rb_str_new(0, RSTRING_LEN(str));
8201 rb_str_ascii_casemap(str, ret, &flags, enc);
8202 }
8203 else {
8204 ret = rb_str_casemap(str, &flags, enc);
8205 }
8206 return ret;
8207}
8208
8209
8210/*
8211 * call-seq:
8212 * swapcase!(mapping) -> self or nil
8213 *
8214 * Like String#swapcase, except that:
8215 *
8216 * - Changes are made to +self+, not to copy of +self+.
8217 * - Returns +self+ if any changes are made, +nil+ otherwise.
8218 *
8219 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8220 */
8221
8222static VALUE
8223rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8224{
8225 rb_encoding *enc;
8226 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8227
8228 flags = check_case_options(argc, argv, flags);
8229 str_modify_keep_cr(str);
8230 enc = str_true_enc(str);
8231 if (flags&ONIGENC_CASE_ASCII_ONLY)
8232 rb_str_ascii_casemap(str, str, &flags, enc);
8233 else
8234 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8235
8236 if (ONIGENC_CASE_MODIFIED&flags) return str;
8237 return Qnil;
8238}
8239
8240
8241/*
8242 * call-seq:
8243 * swapcase(mapping = :ascii) -> new_string
8244 *
8245 * :include: doc/string/swapcase.rdoc
8246 *
8247 */
8248
8249static VALUE
8250rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8251{
8252 rb_encoding *enc;
8253 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8254 VALUE ret;
8255
8256 flags = check_case_options(argc, argv, flags);
8257 enc = str_true_enc(str);
8258 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8259 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8260 ret = rb_str_new(0, RSTRING_LEN(str));
8261 rb_str_ascii_casemap(str, ret, &flags, enc);
8262 }
8263 else {
8264 ret = rb_str_casemap(str, &flags, enc);
8265 }
8266 return ret;
8267}
8268
8269typedef unsigned char *USTR;
8270
/* Cursor over a tr-style character specification; trnext() advances it and
 * yields one code point per call. */
8271struct tr {
8272 int gen; /* nonzero while a range such as a-z is being expanded */
8273 unsigned int now, max; /* now: code point produced last; max: last code point of the current range */
8274 char *p, *pend; /* next unread byte of the specification, and its end */
8275};
8276
/*
 * Returns the next code point described by the specification in *t, or
 * (unsigned int)-1 once the specification is exhausted.
 *
 * Outside a range (t->gen == 0) one character is read, with a backslash
 * making the following character literal; a following '-' plus another
 * character starts a range. Inside a range (t->gen != 0) successive code
 * points up to t->max are produced. A range whose start is greater than its
 * end raises ArgumentError.
 */
8277static unsigned int
8278trnext(struct tr *t, rb_encoding *enc)
8279{
8280 int n;
8281
8282 for (;;) {
8283 nextpart:
8284 if (!t->gen) {
8285 if (t->p == t->pend) return -1;
    /* A backslash is skipped only when something follows it. */
8286 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8287 t->p += n;
8288 }
8289 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8290 t->p += n;
    /* A '-' is a range operator only when another character follows it;
     * otherwise it is left to be read as an ordinary character. */
8291 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8292 t->p += n;
8293 if (t->p < t->pend) {
8294 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8295 t->p += n;
8296 if (t->now > c) {
    /* The endpoints are printed only when both are below 0x80. */
8297 if (t->now < 0x80 && c < 0x80) {
8298 rb_raise(rb_eArgError,
8299 "invalid range \"%c-%c\" in string transliteration",
8300 t->now, c);
8301 }
8302 else {
8303 rb_raise(rb_eArgError, "invalid range in string transliteration");
8304 }
8305 continue; /* not reached */
8306 }
8307 else if (t->now < c) {
    /* Start expanding; equal endpoints need no expansion. */
8308 t->gen = 1;
8309 t->max = c;
8310 }
8311 }
8312 }
8313 return t->now;
8314 }
8315 else {
    /* Advance within the range, skipping code points for which
     * ONIGENC_CODE_TO_MBCLEN reports a length of zero or less. */
8316 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8317 if (t->now == t->max) {
8318 t->gen = 0;
8319 goto nextpart;
8320 }
8321 }
8322 if (t->now < t->max) {
8323 return t->now;
8324 }
8325 else {
    /* Reached the end of the range: leave range mode. */
8326 t->gen = 0;
8327 return t->max;
8328 }
8329 }
8330 }
8331}
8332
/* Forward declaration; tr_trans() below delegates to it when the replacement
 * string is empty. */
8333static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8334
/*
 * In-place transliteration of str: characters selected by src are replaced
 * by the corresponding characters of repl. A nonzero sflag additionally
 * collapses runs of the same replacement character into one.
 *
 * Returns str when something was changed and Qnil otherwise. An empty str
 * returns Qnil at once; an empty repl delegates to rb_str_delete_bang().
 * Raises ArgumentError on an invalid byte sequence in str.
 *
 * Mappings for code points below 256 live in trans[]; larger code points go
 * into a lazily created hash. errc ((unsigned int)-1) marks "no mapping".
 */
8335static VALUE
8336tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8337{
8338 const unsigned int errc = -1;
8339 unsigned int trans[256];
8340 rb_encoding *enc, *e1, *e2;
8341 struct tr trsrc, trrepl;
8342 int cflag = 0; /* set when src starts with '^' (complement) */
8343 unsigned int c, c0, last = 0;
8344 int modify = 0, i, l;
8345 unsigned char *s, *send;
8346 VALUE hash = 0;
8347 int singlebyte = single_byte_optimizable(str);
8348 int termlen;
8349 int cr;
8350
    /* Downgrades the tracked code range from 7BIT to VALID as soon as a
     * non-ASCII code point is seen. */
8351#define CHECK_IF_ASCII(c) \
8352 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8353 (cr = ENC_CODERANGE_VALID) : 0)
8354
8355 StringValue(src);
8356 StringValue(repl);
8357 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8358 if (RSTRING_LEN(repl) == 0) {
8359 return rb_str_delete_bang(1, &src, str);
8360 }
8361
    /* e1 is the encoding agreed between str and src; enc is the encoding
     * used for the specifications and for the output. */
8362 cr = ENC_CODERANGE(str);
8363 e1 = rb_enc_check(str, src);
8364 e2 = rb_enc_check(str, repl);
8365 if (e1 == e2) {
8366 enc = e1;
8367 }
8368 else {
8369 enc = rb_enc_check(src, repl);
8370 }
8371 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    /* A leading '^' means "complement", but only if more text follows it. */
8372 if (RSTRING_LEN(src) > 1 &&
8373 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8374 trsrc.p + l < trsrc.pend) {
8375 cflag = 1;
8376 trsrc.p += l;
8377 }
8378 trrepl.p = RSTRING_PTR(repl);
8379 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8380 trsrc.gen = trrepl.gen = 0;
8381 trsrc.now = trrepl.now = 0;
8382 trsrc.max = trrepl.max = 0;
8383
8384 if (cflag) {
    /* Complement mode: every code point NOT listed in src maps to the last
     * character of repl; listed ones are marked errc (left alone). */
8385 for (i=0; i<256; i++) {
8386 trans[i] = 1;
8387 }
8388 while ((c = trnext(&trsrc, enc)) != errc) {
8389 if (c < 256) {
8390 trans[c] = errc;
8391 }
8392 else {
8393 if (!hash) hash = rb_hash_new();
8394 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8395 }
8396 }
8397 while ((c = trnext(&trrepl, enc)) != errc)
8398 /* retrieve last replacer */;
8399 last = trrepl.now;
8400 for (i=0; i<256; i++) {
8401 if (trans[i] != errc) {
8402 trans[i] = last;
8403 }
8404 }
8405 }
8406 else {
    /* Normal mode: pair each src character with the next repl character;
     * once repl is exhausted its last character keeps being used. */
8407 unsigned int r;
8408
8409 for (i=0; i<256; i++) {
8410 trans[i] = errc;
8411 }
8412 while ((c = trnext(&trsrc, enc)) != errc) {
8413 r = trnext(&trrepl, enc);
8414 if (r == errc) r = trrepl.now;
8415 if (c < 256) {
8416 trans[c] = r;
    /* A multi-byte replacement rules out the in-place byte loop. */
8417 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8418 }
8419 else {
8420 if (!hash) hash = rb_hash_new();
8421 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8422 }
8423 }
8424 }
8425
    /* Start from 7BIT again; CHECK_IF_ASCII raises it back to VALID when a
     * non-ASCII character is encountered below. */
8426 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8427 cr = ENC_CODERANGE_7BIT;
8428 str_modify_keep_cr(str);
8429 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8430 termlen = rb_enc_mbminlen(enc);
8431 if (sflag) {
    /* Squeezing variant: the output is built in a separate buffer. */
8432 int clen, tlen;
8433 long offset, max = RSTRING_LEN(str);
8434 unsigned int save = -1; /* last replacement written, for squeezing */
8435 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8436
8437 while (s < send) {
8438 int may_modify = 0;
8439
8440 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8441 if (!MBCLEN_CHARFOUND_P(r)) {
8442 SIZED_FREE_N(buf, max + termlen);
8443 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8444 }
8445 clen = MBCLEN_CHARFOUND_LEN(r);
8446 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8447
8448 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8449
8450 s += clen;
    /* Look up the replacement: table below 256, hash above. */
8451 if (c < 256) {
8452 c = trans[c];
8453 }
8454 else if (hash) {
8455 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8456 if (NIL_P(tmp)) {
8457 if (cflag) c = last;
8458 else c = errc;
8459 }
8460 else if (cflag) c = errc;
8461 else c = NUM2INT(tmp);
8462 }
8463 else {
8464 c = errc;
8465 }
8466 if (c != (unsigned int)-1) {
    /* Same replacement as the previous one: drop it (squeeze). */
8467 if (save == c) {
8468 CHECK_IF_ASCII(c);
8469 continue;
8470 }
8471 save = c;
8472 tlen = rb_enc_codelen(c, enc);
8473 modify = 1;
8474 }
8475 else {
    /* Not translated: copy the original character through. */
8476 save = -1;
8477 c = c0;
8478 if (enc != e1) may_modify = 1;
8479 }
    /* Grow the buffer when the next character would not fit. */
8480 if ((offset = t - buf) + tlen > max) {
8481 size_t MAYBE_UNUSED(old) = max + termlen;
8482 max = offset + tlen + (send - s);
8483 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8484 t = buf + offset;
8485 }
8486 rb_enc_mbcput(c, t, enc);
    /* NOTE(review): s was already advanced past the current character
     * above (s += clen), so this compares t with the bytes that follow
     * it; the non-squeezing branch below compares before advancing.
     * Confirm which is intended. */
8487 if (may_modify && memcmp(s, t, tlen) != 0) {
8488 modify = 1;
8489 }
8490 CHECK_IF_ASCII(c);
8491 t += tlen;
8492 }
    /* Release the old heap buffer (if any) and install buf as str's
     * heap storage. */
8493 if (!STR_EMBED_P(str)) {
8494 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8495 }
8496 TERM_FILL((char *)t, termlen);
8497 RSTRING(str)->as.heap.ptr = (char *)buf;
8498 STR_SET_LEN(str, t - buf);
8499 STR_SET_NOEMBED(str);
8500 RSTRING(str)->as.heap.aux.capa = max;
8501 }
8502 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
    /* Byte-for-byte case: translate in place through trans[]. */
8503 while (s < send) {
8504 c = (unsigned char)*s;
8505 if (trans[c] != errc) {
8506 if (!cflag) {
8507 c = trans[c];
8508 *s = c;
8509 modify = 1;
8510 }
8511 else {
8512 *s = last;
8513 modify = 1;
8514 }
8515 }
8516 CHECK_IF_ASCII(c);
8517 s++;
8518 }
8519 }
8520 else {
    /* General multibyte case: build the output in a separate buffer,
     * sized at 1.2 times the input and grown on demand. */
8521 int clen, tlen;
8522 long offset, max = (long)((send - s) * 1.2);
8523 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8524
8525 while (s < send) {
8526 int may_modify = 0;
8527
8528 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8529 if (!MBCLEN_CHARFOUND_P(r)) {
8530 SIZED_FREE_N(buf, max + termlen);
8531 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8532 }
8533 clen = MBCLEN_CHARFOUND_LEN(r);
8534 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8535
8536 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8537
8538 if (c < 256) {
8539 c = trans[c];
8540 }
8541 else if (hash) {
8542 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8543 if (NIL_P(tmp)) {
8544 if (cflag) c = last;
8545 else c = errc;
8546 }
8547 else if (cflag) c = errc;
8548 else c = NUM2INT(tmp);
8549 }
8550 else {
8551 c = cflag ? last : errc;
8552 }
8553 if (c != errc) {
8554 tlen = rb_enc_codelen(c, enc);
8555 modify = 1;
8556 }
8557 else {
8558 c = c0;
8559 if (enc != e1) may_modify = 1;
8560 }
8561 if ((offset = t - buf) + tlen > max) {
8562 size_t MAYBE_UNUSED(old) = max + termlen;
8563 max = offset + tlen + (long)((send - s) * 1.2);
8564 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8565 t = buf + offset;
8566 }
8567 if (s != t) {
8568 rb_enc_mbcput(c, t, enc);
8569 if (may_modify && memcmp(s, t, tlen) != 0) {
8570 modify = 1;
8571 }
8572 }
8573 CHECK_IF_ASCII(c);
8574 s += clen;
8575 t += tlen;
8576 }
    /* Release the old heap buffer (if any) and install buf as str's
     * heap storage. */
8577 if (!STR_EMBED_P(str)) {
8578 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8579 }
8580 TERM_FILL((char *)t, termlen);
8581 RSTRING(str)->as.heap.ptr = (char *)buf;
8582 STR_SET_LEN(str, t - buf);
8583 STR_SET_NOEMBED(str);
8584 RSTRING(str)->as.heap.aux.capa = max;
8585 }
8586
    /* Only a modified string gets the recomputed code range and enc. */
8587 if (modify) {
8588 if (cr != ENC_CODERANGE_BROKEN)
8589 ENC_CODERANGE_SET(str, cr);
8590 rb_enc_associate(str, enc);
8591 return str;
8592 }
8593 return Qnil;
8594}
8595
8596
8597/*
8598 * call-seq:
8599 * tr!(selector, replacements) -> self or nil
8600 *
8601 * Like String#tr, except:
8602 *
8603 * - Performs substitutions in +self+ (not in a copy of +self+).
8604 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8605 *
8606 * Related: {Modifying}[rdoc-ref:String@Modifying].
8607 */
8608
8609static VALUE
8610rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8611{
8612 return tr_trans(str, src, repl, 0);
8613}
8614
8615
8616/*
8617 * call-seq:
8618 * tr(selector, replacements) -> new_string
8619 *
8620 * Returns a copy of +self+ with each character specified by string +selector+
8621 * translated to the corresponding character in string +replacements+.
8622 * The correspondence is _positional_:
8623 *
8624 * - Each occurrence of the first character specified by +selector+
8625 * is translated to the first character in +replacements+.
8626 * - Each occurrence of the second character specified by +selector+
8627 * is translated to the second character in +replacements+.
8628 * - And so on.
8629 *
8630 * Example:
8631 *
8632 * 'hello'.tr('el', 'ip') #=> "hippo"
8633 *
8634 * If +replacements+ is shorter than +selector+,
8635 * it is implicitly padded with its own last character:
8636 *
8637 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8638 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8639 *
8640 * Arguments +selector+ and +replacements+ must be valid character selectors
8641 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8642 * and may use any of its valid forms, including negation, ranges, and escapes:
8643 *
8644 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8645 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8646 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8647 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8648 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8649 *
8650 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8651 */
8652
8653static VALUE
8654rb_str_tr(VALUE str, VALUE src, VALUE repl)
8655{
8656 str = str_duplicate(rb_cString, str);
8657 tr_trans(str, src, repl, 0);
8658 return str;
8659}
8660
8661#define TR_TABLE_MAX (UCHAR_MAX+1)
8662#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8663static void
8664tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8665 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8666{
8667 const unsigned int errc = -1;
8668 char buf[TR_TABLE_MAX];
8669 struct tr tr;
8670 unsigned int c;
8671 VALUE table = 0, ptable = 0;
8672 int i, l, cflag = 0;
8673
8674 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8675 tr.gen = tr.now = tr.max = 0;
8676
8677 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8678 cflag = 1;
8679 tr.p += l;
8680 }
8681 if (first) {
8682 for (i=0; i<TR_TABLE_MAX; i++) {
8683 stable[i] = 1;
8684 }
8685 stable[TR_TABLE_MAX] = cflag;
8686 }
8687 else if (stable[TR_TABLE_MAX] && !cflag) {
8688 stable[TR_TABLE_MAX] = 0;
8689 }
8690 for (i=0; i<TR_TABLE_MAX; i++) {
8691 buf[i] = cflag;
8692 }
8693
8694 while ((c = trnext(&tr, enc)) != errc) {
8695 if (c < TR_TABLE_MAX) {
8696 buf[(unsigned char)c] = !cflag;
8697 }
8698 else {
8699 VALUE key = UINT2NUM(c);
8700
8701 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8702 if (cflag) {
8703 ptable = *ctablep;
8704 table = ptable ? ptable : rb_hash_new();
8705 *ctablep = table;
8706 }
8707 else {
8708 table = rb_hash_new();
8709 ptable = *tablep;
8710 *tablep = table;
8711 }
8712 }
8713 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8714 rb_hash_aset(table, key, Qtrue);
8715 }
8716 }
8717 }
8718 for (i=0; i<TR_TABLE_MAX; i++) {
8719 stable[i] = stable[i] && buf[i];
8720 }
8721 if (!table && !cflag) {
8722 *tablep = 0;
8723 }
8724}
8725
8726
8727static int
8728tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8729{
8730 if (c < TR_TABLE_MAX) {
8731 return table[c] != 0;
8732 }
8733 else {
8734 VALUE v = UINT2NUM(c);
8735
8736 if (del) {
8737 if (!NIL_P(rb_hash_lookup(del, v)) &&
8738 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8739 return TRUE;
8740 }
8741 }
8742 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8743 return FALSE;
8744 }
8745 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8746 }
8747}
8748
8749/*
8750 * call-seq:
8751 * delete!(*selectors) -> self or nil
8752 *
8753 * Like String#delete, but modifies +self+ in place;
8754 * returns +self+ if any characters were deleted, +nil+ otherwise.
8755 *
8756 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8757 */
8758
8759static VALUE
8760rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8761{
8762 char squeez[TR_TABLE_SIZE];
8763 rb_encoding *enc = 0;
8764 char *s, *send, *t;
8765 VALUE del = 0, nodel = 0;
8766 int modify = 0;
8767 int i, ascompat, cr;
8768
8769 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8771 for (i=0; i<argc; i++) {
8772 VALUE s = argv[i];
8773
8774 StringValue(s);
8775 enc = rb_enc_check(str, s);
8776 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8777 }
8778
8779 str_modify_keep_cr(str);
8780 ascompat = rb_enc_asciicompat(enc);
8781 s = t = RSTRING_PTR(str);
8782 send = RSTRING_END(str);
8783 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8784 while (s < send) {
8785 unsigned int c;
8786 int clen;
8787
8788 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8789 if (squeez[c]) {
8790 modify = 1;
8791 }
8792 else {
8793 if (t != s) *t = c;
8794 t++;
8795 }
8796 s++;
8797 }
8798 else {
8799 c = rb_enc_codepoint_len(s, send, &clen, enc);
8800
8801 if (tr_find(c, squeez, del, nodel)) {
8802 modify = 1;
8803 }
8804 else {
8805 if (t != s) rb_enc_mbcput(c, t, enc);
8806 t += clen;
8808 }
8809 s += clen;
8810 }
8811 }
8812 TERM_FILL(t, TERM_LEN(str));
8813 STR_SET_LEN(str, t - RSTRING_PTR(str));
8814 ENC_CODERANGE_SET(str, cr);
8815
8816 if (modify) return str;
8817 return Qnil;
8818}
8819
8820
8821/*
8822 * call-seq:
8823 * delete(*selectors) -> new_string
8824 *
8825 * :include: doc/string/delete.rdoc
8826 *
8827 */
8828
8829static VALUE
8830rb_str_delete(int argc, VALUE *argv, VALUE str)
8831{
8832 str = str_duplicate(rb_cString, str);
8833 rb_str_delete_bang(argc, argv, str);
8834 return str;
8835}
8836
8837
8838/*
8839 * call-seq:
8840 * squeeze!(*selectors) -> self or nil
8841 *
8842 * Like String#squeeze, except that:
8843 *
8844 * - Characters are squeezed in +self+ (not in a copy of +self+).
8845 * - Returns +self+ if any changes are made, +nil+ otherwise.
8846 *
8847 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8848 */
8849
8850static VALUE
8851rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8852{
8853 char squeez[TR_TABLE_SIZE];
8854 rb_encoding *enc = 0;
8855 VALUE del = 0, nodel = 0;
8856 unsigned char *s, *send, *t;
8857 int i, modify = 0;
8858 int ascompat, singlebyte = single_byte_optimizable(str);
8859 unsigned int save;
8860
8861 if (argc == 0) {
8862 enc = STR_ENC_GET(str);
8863 }
8864 else {
8865 for (i=0; i<argc; i++) {
8866 VALUE s = argv[i];
8867
8868 StringValue(s);
8869 enc = rb_enc_check(str, s);
8870 if (singlebyte && !single_byte_optimizable(s))
8871 singlebyte = 0;
8872 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8873 }
8874 }
8875
8876 str_modify_keep_cr(str);
8877 s = t = (unsigned char *)RSTRING_PTR(str);
8878 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8879 send = (unsigned char *)RSTRING_END(str);
8880 save = -1;
8881 ascompat = rb_enc_asciicompat(enc);
8882
8883 if (singlebyte) {
8884 while (s < send) {
8885 unsigned int c = *s++;
8886 if (c != save || (argc > 0 && !squeez[c])) {
8887 *t++ = save = c;
8888 }
8889 }
8890 }
8891 else {
8892 while (s < send) {
8893 unsigned int c;
8894 int clen;
8895
8896 if (ascompat && (c = *s) < 0x80) {
8897 if (c != save || (argc > 0 && !squeez[c])) {
8898 *t++ = save = c;
8899 }
8900 s++;
8901 }
8902 else {
8903 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8904
8905 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8906 if (t != s) rb_enc_mbcput(c, t, enc);
8907 save = c;
8908 t += clen;
8909 }
8910 s += clen;
8911 }
8912 }
8913 }
8914
8915 TERM_FILL((char *)t, TERM_LEN(str));
8916 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8917 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8918 modify = 1;
8919 }
8920
8921 if (modify) return str;
8922 return Qnil;
8923}
8924
8925
8926/*
8927 * call-seq:
8928 * squeeze(*selectors) -> new_string
8929 *
8930 * :include: doc/string/squeeze.rdoc
8931 *
8932 */
8933
8934static VALUE
8935rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8936{
8937 str = str_duplicate(rb_cString, str);
8938 rb_str_squeeze_bang(argc, argv, str);
8939 return str;
8940}
8941
8942
8943/*
8944 * call-seq:
8945 * tr_s!(selector, replacements) -> self or nil
8946 *
8947 * Like String#tr_s, except:
8948 *
8949 * - Modifies +self+ in place (not a copy of +self+).
8950 * - Returns +self+ if any changes were made, +nil+ otherwise.
8951 *
8952 * Related: {Modifying}[rdoc-ref:String@Modifying].
8953 */
8954
8955static VALUE
8956rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8957{
8958 return tr_trans(str, src, repl, 1);
8959}
8960
8961
8962/*
8963 * call-seq:
8964 * tr_s(selector, replacements) -> new_string
8965 *
8966 * Like String#tr, except:
8967 *
8968 * - Also squeezes the modified portions of the translated string;
8969 * see String#squeeze.
8970 * - Returns the translated and squeezed string.
8971 *
8972 * Examples:
8973 *
8974 * 'hello'.tr_s('l', 'r') #=> "hero"
8975 * 'hello'.tr_s('el', '-') #=> "h-o"
8976 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8977 *
8978 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8979 *
8980 */
8981
8982static VALUE
8983rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8984{
8985 str = str_duplicate(rb_cString, str);
8986 tr_trans(str, src, repl, 1);
8987 return str;
8988}
8989
8990
8991/*
8992 * call-seq:
8993 * count(*selectors) -> integer
8994 *
8995 * :include: doc/string/count.rdoc
8996 */
8997
8998static VALUE
8999rb_str_count(int argc, VALUE *argv, VALUE str)
9000{
9001 char table[TR_TABLE_SIZE];
9002 rb_encoding *enc = 0;
9003 VALUE del = 0, nodel = 0, tstr;
9004 char *s, *send;
9005 int i;
9006 int ascompat;
9007 size_t n = 0;
9008
9010
9011 tstr = argv[0];
9012 StringValue(tstr);
9013 enc = rb_enc_check(str, tstr);
9014 if (argc == 1) {
9015 const char *ptstr;
9016 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9017 (ptstr = RSTRING_PTR(tstr),
9018 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9019 !is_broken_string(str)) {
9020 int clen;
9021 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9022
9023 s = RSTRING_PTR(str);
9024 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9025 send = RSTRING_END(str);
9026 while (s < send) {
9027 if (*(unsigned char*)s++ == c) n++;
9028 }
9029 return SIZET2NUM(n);
9030 }
9031 }
9032
9033 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9034 for (i=1; i<argc; i++) {
9035 tstr = argv[i];
9036 StringValue(tstr);
9037 enc = rb_enc_check(str, tstr);
9038 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9039 }
9040
9041 s = RSTRING_PTR(str);
9042 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9043 send = RSTRING_END(str);
9044 ascompat = rb_enc_asciicompat(enc);
9045 while (s < send) {
9046 unsigned int c;
9047
9048 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9049 if (table[c]) {
9050 n++;
9051 }
9052 s++;
9053 }
9054 else {
9055 int clen;
9056 c = rb_enc_codepoint_len(s, send, &clen, enc);
9057 if (tr_find(c, table, del, nodel)) {
9058 n++;
9059 }
9060 s += clen;
9061 }
9062 }
9063
9064 return SIZET2NUM(n);
9065}
9066
9067static VALUE
9068rb_fs_check(VALUE val)
9069{
9070 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9071 val = rb_check_string_type(val);
9072 if (NIL_P(val)) return 0;
9073 }
9074 return val;
9075}
9076
/*
 * Byte-indexed whitespace table: non-zero for 0x09..0x0d and 0x20, zero
 * for every other value.  Slots not named below are zero-initialized.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* horizontal tab */
    [0x0a] = 1, /* line feed */
    [0x0b] = 1, /* vertical tab */
    [0x0c] = 1, /* form feed */
    [0x0d] = 1, /* carriage return */
    [0x20] = 1, /* space */
};

/* The cast keeps the index in 0..255 even when c is a negative char. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9097
9098static long
9099split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9100{
9101 if (empty_count >= 0 && len == 0) {
9102 return empty_count + 1;
9103 }
9104 if (empty_count > 0) {
9105 /* make different substrings */
9106 if (result) {
9107 do {
9108 rb_ary_push(result, str_new_empty_String(str));
9109 } while (--empty_count > 0);
9110 }
9111 else {
9112 do {
9113 rb_yield(str_new_empty_String(str));
9114 } while (--empty_count > 0);
9115 }
9116 }
9117 str = rb_str_subseq(str, beg, len);
9118 if (result) {
9119 rb_ary_push(result, str);
9120 }
9121 else {
9122 rb_yield(str);
9123 }
9124 return empty_count;
9125}
9126
/* Splitting strategy selected by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal separator string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty separator: split into characters */
} split_type_t;
9130
9131static split_type_t
9132literal_split_pattern(VALUE spat, split_type_t default_type)
9133{
9134 rb_encoding *enc = STR_ENC_GET(spat);
9135 const char *ptr;
9136 long len;
9137 RSTRING_GETMEM(spat, ptr, len);
9138 if (len == 0) {
9139 /* Special case - split into chars */
9140 return SPLIT_TYPE_CHARS;
9141 }
9142 else if (rb_enc_asciicompat(enc)) {
9143 if (len == 1 && ptr[0] == ' ') {
9144 return SPLIT_TYPE_AWK;
9145 }
9146 }
9147 else {
9148 int l;
9149 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9150 return SPLIT_TYPE_AWK;
9151 }
9152 }
9153 return default_type;
9154}
9155
9156/*
9157 * call-seq:
9158 * split(field_sep = $;, limit = 0) -> array_of_substrings
9159 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9160 *
9161 * :include: doc/string/split.rdoc
9162 *
9163 */
9164
9165static VALUE
9166rb_str_split_m(int argc, VALUE *argv, VALUE str)
9167{
9168 rb_encoding *enc;
9169 VALUE spat;
9170 VALUE limit;
9171 split_type_t split_type;
9172 long beg, end, i = 0, empty_count = -1;
9173 int lim = 0;
9174 VALUE result, tmp;
9175
9176 result = rb_block_given_p() ? Qfalse : Qnil;
9177 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9178 lim = NUM2INT(limit);
9179 if (lim <= 0) limit = Qnil;
9180 else if (lim == 1) {
9181 if (RSTRING_LEN(str) == 0)
9182 return result ? rb_ary_new2(0) : str;
9183 tmp = str_duplicate(rb_cString, str);
9184 if (!result) {
9185 rb_yield(tmp);
9186 return str;
9187 }
9188 return rb_ary_new3(1, tmp);
9189 }
9190 i = 1;
9191 }
9192 if (NIL_P(limit) && !lim) empty_count = 0;
9193
9194 enc = STR_ENC_GET(str);
9195 split_type = SPLIT_TYPE_REGEXP;
9196 if (!NIL_P(spat)) {
9197 spat = get_pat_quoted(spat, 0);
9198 }
9199 else if (NIL_P(spat = rb_fs)) {
9200 split_type = SPLIT_TYPE_AWK;
9201 }
9202 else if (!(spat = rb_fs_check(spat))) {
9203 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9204 }
9205 else {
9206 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9207 }
9208 if (split_type != SPLIT_TYPE_AWK) {
9209 switch (BUILTIN_TYPE(spat)) {
9210 case T_REGEXP:
9211 rb_reg_options(spat); /* check if uninitialized */
9212 tmp = RREGEXP_SRC(spat);
9213 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9214 if (split_type == SPLIT_TYPE_AWK) {
9215 spat = tmp;
9216 split_type = SPLIT_TYPE_STRING;
9217 }
9218 break;
9219
9220 case T_STRING:
9221 mustnot_broken(spat);
9222 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9223 break;
9224
9225 default:
9227 }
9228 }
9229
9230#define SPLIT_STR(beg, len) ( \
9231 empty_count = split_string(result, str, beg, len, empty_count), \
9232 str_mod_check(str, str_start, str_len))
9233
9234 beg = 0;
9235 char *ptr = RSTRING_PTR(str);
9236 char *const str_start = ptr;
9237 const long str_len = RSTRING_LEN(str);
9238 char *const eptr = str_start + str_len;
9239 if (split_type == SPLIT_TYPE_AWK) {
9240 char *bptr = ptr;
9241 int skip = 1;
9242 unsigned int c;
9243
9244 if (result) result = rb_ary_new();
9245 end = beg;
9246 if (is_ascii_string(str)) {
9247 while (ptr < eptr) {
9248 c = (unsigned char)*ptr++;
9249 if (skip) {
9250 if (ascii_isspace(c)) {
9251 beg = ptr - bptr;
9252 }
9253 else {
9254 end = ptr - bptr;
9255 skip = 0;
9256 if (!NIL_P(limit) && lim <= i) break;
9257 }
9258 }
9259 else if (ascii_isspace(c)) {
9260 SPLIT_STR(beg, end-beg);
9261 skip = 1;
9262 beg = ptr - bptr;
9263 if (!NIL_P(limit)) ++i;
9264 }
9265 else {
9266 end = ptr - bptr;
9267 }
9268 }
9269 }
9270 else {
9271 while (ptr < eptr) {
9272 int n;
9273
9274 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9275 ptr += n;
9276 if (skip) {
9277 if (rb_isspace(c)) {
9278 beg = ptr - bptr;
9279 }
9280 else {
9281 end = ptr - bptr;
9282 skip = 0;
9283 if (!NIL_P(limit) && lim <= i) break;
9284 }
9285 }
9286 else if (rb_isspace(c)) {
9287 SPLIT_STR(beg, end-beg);
9288 skip = 1;
9289 beg = ptr - bptr;
9290 if (!NIL_P(limit)) ++i;
9291 }
9292 else {
9293 end = ptr - bptr;
9294 }
9295 }
9296 }
9297 }
9298 else if (split_type == SPLIT_TYPE_STRING) {
9299 char *substr_start = ptr;
9300 char *sptr = RSTRING_PTR(spat);
9301 long slen = RSTRING_LEN(spat);
9302
9303 if (result) result = rb_ary_new();
9304 mustnot_broken(str);
9305 enc = rb_enc_check(str, spat);
9306 while (ptr < eptr &&
9307 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9308 /* Check we are at the start of a char */
9309 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9310 if (t != ptr + end) {
9311 ptr = t;
9312 continue;
9313 }
9314 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9315 str_mod_check(spat, sptr, slen);
9316 ptr += end + slen;
9317 substr_start = ptr;
9318 if (!NIL_P(limit) && lim <= ++i) break;
9319 }
9320 beg = ptr - str_start;
9321 }
9322 else if (split_type == SPLIT_TYPE_CHARS) {
9323 int n;
9324
9325 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9326 mustnot_broken(str);
9327 enc = rb_enc_get(str);
9328 while (ptr < eptr &&
9329 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9330 SPLIT_STR(ptr - str_start, n);
9331 ptr += n;
9332 if (!NIL_P(limit) && lim <= ++i) break;
9333 }
9334 beg = ptr - str_start;
9335 }
9336 else {
9337 if (result) result = rb_ary_new();
9338 long len = RSTRING_LEN(str);
9339 long start = beg;
9340 long idx;
9341 int last_null = 0;
9342 struct re_registers *regs;
9343 VALUE match = 0;
9344
9345 for (; rb_reg_search(spat, str, start, 0) >= 0;
9346 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9347 match = rb_backref_get();
9348 if (!result) rb_match_busy(match);
9349 regs = RMATCH_REGS(match);
9350 end = BEG(0);
9351 if (start == end && BEG(0) == END(0)) {
9352 if (!ptr) {
9353 SPLIT_STR(0, 0);
9354 break;
9355 }
9356 else if (last_null == 1) {
9357 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9358 beg = start;
9359 }
9360 else {
9361 if (start == len)
9362 start++;
9363 else
9364 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9365 last_null = 1;
9366 continue;
9367 }
9368 }
9369 else {
9370 SPLIT_STR(beg, end-beg);
9371 beg = start = END(0);
9372 }
9373 last_null = 0;
9374
9375 for (idx=1; idx < regs->num_regs; idx++) {
9376 if (BEG(idx) == -1) continue;
9377 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9378 }
9379 if (!NIL_P(limit) && lim <= ++i) break;
9380 }
9381 if (match) rb_match_unbusy(match);
9382 }
9383 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9384 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9385 }
9386
9387 return result ? result : str;
9388}
9389
9390VALUE
9391rb_str_split(VALUE str, const char *sep0)
9392{
9393 VALUE sep;
9394
9395 StringValue(str);
9396 sep = rb_str_new_cstr(sep0);
9397 return rb_str_split_m(1, &sep, str);
9398}
9399
9400#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9401
9402static inline int
9403enumerator_element(VALUE ary, VALUE e)
9404{
9405 if (ary) {
9406 rb_ary_push(ary, e);
9407 return 0;
9408 }
9409 else {
9410 rb_yield(e);
9411 return 1;
9412 }
9413}
9414
9415#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9416
9417static const char *
9418chomp_newline(const char *p, const char *e, rb_encoding *enc)
9419{
9420 const char *prev = rb_enc_prev_char(p, e, e, enc);
9421 if (rb_enc_is_newline(prev, e, enc)) {
9422 e = prev;
9423 prev = rb_enc_prev_char(p, e, e, enc);
9424 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9425 e = prev;
9426 }
9427 return e;
9428}
9429
9430static VALUE
9431get_rs(void)
9432{
9433 VALUE rs = rb_rs;
9434 if (!NIL_P(rs) &&
9435 (!RB_TYPE_P(rs, T_STRING) ||
9436 RSTRING_LEN(rs) != 1 ||
9437 RSTRING_PTR(rs)[0] != '\n')) {
9438 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9439 }
9440 return rs;
9441}
9442
9443#define rb_rs get_rs()
9444
9445static VALUE
9446rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9447{
9448 rb_encoding *enc;
9449 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9450 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9451 long pos, len, rslen;
9452 int rsnewline = 0;
9453
9454 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9455 rs = rb_rs;
9456 if (!NIL_P(opts)) {
9457 static ID keywords[1];
9458 if (!keywords[0]) {
9459 keywords[0] = rb_intern_const("chomp");
9460 }
9461 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9462 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9463 }
9464
9465 if (NIL_P(rs)) {
9466 if (!ENUM_ELEM(ary, str)) {
9467 return ary;
9468 }
9469 else {
9470 return orig;
9471 }
9472 }
9473
9474 if (!RSTRING_LEN(str)) goto end;
9475 str = rb_str_new_frozen(str);
9476 ptr = subptr = RSTRING_PTR(str);
9477 pend = RSTRING_END(str);
9478 len = RSTRING_LEN(str);
9479 StringValue(rs);
9480 rslen = RSTRING_LEN(rs);
9481
9482 if (rs == rb_default_rs)
9483 enc = rb_enc_get(str);
9484 else
9485 enc = rb_enc_check(str, rs);
9486
9487 if (rslen == 0) {
9488 /* paragraph mode */
9489 int n;
9490 const char *eol = NULL;
9491 subend = subptr;
9492 while (subend < pend) {
9493 long chomp_rslen = 0;
9494 do {
9495 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9496 n = 0;
9497 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9498 if (rb_enc_is_newline(subend + n, pend, enc)) {
9499 if (eol == subend) break;
9500 subend += rslen;
9501 if (subptr) {
9502 eol = subend;
9503 chomp_rslen = -rslen;
9504 }
9505 }
9506 else {
9507 if (!subptr) subptr = subend;
9508 subend += rslen;
9509 }
9510 rslen = 0;
9511 } while (subend < pend);
9512 if (!subptr) break;
9513 if (rslen == 0) chomp_rslen = 0;
9514 line = rb_str_subseq(str, subptr - ptr,
9515 subend - subptr + (chomp ? chomp_rslen : rslen));
9516 if (ENUM_ELEM(ary, line)) {
9517 str_mod_check(str, ptr, len);
9518 }
9519 subptr = eol = NULL;
9520 }
9521 goto end;
9522 }
9523 else {
9524 rsptr = RSTRING_PTR(rs);
9525 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9526 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9527 rsnewline = 1;
9528 }
9529 }
9530
9531 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9532 rs = rb_str_new(rsptr, rslen);
9533 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9534 rsptr = RSTRING_PTR(rs);
9535 rslen = RSTRING_LEN(rs);
9536 }
9537
9538 while (subptr < pend) {
9539 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9540 if (pos < 0) break;
9541 hit = subptr + pos;
9542 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9543 if (hit != adjusted) {
9544 subptr = adjusted;
9545 continue;
9546 }
9547 subend = hit += rslen;
9548 if (chomp) {
9549 if (rsnewline) {
9550 subend = chomp_newline(subptr, subend, enc);
9551 }
9552 else {
9553 subend -= rslen;
9554 }
9555 }
9556 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9557 if (ENUM_ELEM(ary, line)) {
9558 str_mod_check(str, ptr, len);
9559 }
9560 subptr = hit;
9561 }
9562
9563 if (subptr != pend) {
9564 if (chomp) {
9565 if (rsnewline) {
9566 pend = chomp_newline(subptr, pend, enc);
9567 }
9568 else if (pend - subptr >= rslen &&
9569 memcmp(pend - rslen, rsptr, rslen) == 0) {
9570 pend -= rslen;
9571 }
9572 }
9573 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9574 ENUM_ELEM(ary, line);
9575 RB_GC_GUARD(str);
9576 }
9577
9578 end:
9579 if (ary)
9580 return ary;
9581 else
9582 return orig;
9583}
9584
9585/*
9586 * call-seq:
9587 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9588 * each_line(record_separator = $/, chomp: false) -> enumerator
9589 *
9590 * :include: doc/string/each_line.rdoc
9591 *
9592 */
9593
9594static VALUE
9595rb_str_each_line(int argc, VALUE *argv, VALUE str)
9596{
9597 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9598 return rb_str_enumerate_lines(argc, argv, str, 0);
9599}
9600
9601/*
9602 * call-seq:
9603 * lines(record_separator = $/, chomp: false) -> array_of_strings
9604 *
9605 * Returns substrings ("lines") of +self+
9606 * according to the given arguments:
9607 *
9608 * s = <<~EOT
9609 * This is the first line.
9610 * This is line two.
9611 *
9612 * This is line four.
9613 * This is line five.
9614 * EOT
9615 *
9616 * With the default argument values:
9617 *
9618 * $/ # => "\n"
9619 * s.lines
9620 * # =>
9621 * ["This is the first line.\n",
9622 * "This is line two.\n",
9623 * "\n",
9624 * "This is line four.\n",
9625 * "This is line five.\n"]
9626 *
9627 * With a different +record_separator+:
9628 *
9629 * record_separator = ' is '
9630 * s.lines(record_separator)
9631 * # =>
9632 * ["This is ",
9633 * "the first line.\nThis is ",
9634 * "line two.\n\nThis is ",
9635 * "line four.\nThis is ",
9636 * "line five.\n"]
9637 *
9638 * With keyword argument +chomp+ as +true+,
9639 * removes the trailing newline from each line:
9640 *
9641 * s.lines(chomp: true)
9642 * # =>
9643 * ["This is the first line.",
9644 * "This is line two.",
9645 * "",
9646 * "This is line four.",
9647 * "This is line five."]
9648 *
9649 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
9650 */
9651
9652static VALUE
9653rb_str_lines(int argc, VALUE *argv, VALUE str)
9654{
9655 VALUE ary = WANTARRAY("lines", 0);
9656 return rb_str_enumerate_lines(argc, argv, str, ary);
9657}
9658
9659static VALUE
9660rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9661{
9662 return LONG2FIX(RSTRING_LEN(str));
9663}
9664
9665static VALUE
9666rb_str_enumerate_bytes(VALUE str, VALUE ary)
9667{
9668 long i;
9669
9670 for (i=0; i<RSTRING_LEN(str); i++) {
9671 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9672 }
9673 if (ary)
9674 return ary;
9675 else
9676 return str;
9677}
9678
9679/*
9680 * call-seq:
9681 * each_byte {|byte| ... } -> self
9682 * each_byte -> enumerator
9683 *
9684 * :include: doc/string/each_byte.rdoc
9685 *
9686 */
9687
9688static VALUE
9689rb_str_each_byte(VALUE str)
9690{
9691 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9692 return rb_str_enumerate_bytes(str, 0);
9693}
9694
9695/*
9696 * call-seq:
9697 * bytes -> array_of_bytes
9698 *
9699 * :include: doc/string/bytes.rdoc
9700 *
9701 */
9702
9703static VALUE
9704rb_str_bytes(VALUE str)
9705{
9706 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9707 return rb_str_enumerate_bytes(str, ary);
9708}
9709
9710static VALUE
9711rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9712{
9713 return rb_str_length(str);
9714}
9715
9716static VALUE
9717rb_str_enumerate_chars(VALUE str, VALUE ary)
9718{
9719 VALUE orig = str;
9720 long i, len, n;
9721 const char *ptr;
9722 rb_encoding *enc;
9723
9724 str = rb_str_new_frozen(str);
9725 ptr = RSTRING_PTR(str);
9726 len = RSTRING_LEN(str);
9727 enc = rb_enc_get(str);
9728
9730 for (i = 0; i < len; i += n) {
9731 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9732 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9733 }
9734 }
9735 else {
9736 for (i = 0; i < len; i += n) {
9737 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9738 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9739 }
9740 }
9741 RB_GC_GUARD(str);
9742 if (ary)
9743 return ary;
9744 else
9745 return orig;
9746}
9747
9748/*
9749 * call-seq:
9750 * each_char {|char| ... } -> self
9751 * each_char -> enumerator
9752 *
9753 * :include: doc/string/each_char.rdoc
9754 *
9755 */
9756
9757static VALUE
9758rb_str_each_char(VALUE str)
9759{
9760 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9761 return rb_str_enumerate_chars(str, 0);
9762}
9763
9764/*
9765 * call-seq:
9766 * chars -> array_of_characters
9767 *
9768 * :include: doc/string/chars.rdoc
9769 *
9770 */
9771
9772static VALUE
9773rb_str_chars(VALUE str)
9774{
9775 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9776 return rb_str_enumerate_chars(str, ary);
9777}
9778
9779static VALUE
9780rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9781{
9782 VALUE orig = str;
9783 int n;
9784 unsigned int c;
9785 const char *ptr, *end;
9786 rb_encoding *enc;
9787
9788 if (single_byte_optimizable(str))
9789 return rb_str_enumerate_bytes(str, ary);
9790
9791 str = rb_str_new_frozen(str);
9792 ptr = RSTRING_PTR(str);
9793 end = RSTRING_END(str);
9794 enc = STR_ENC_GET(str);
9795
9796 while (ptr < end) {
9797 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9798 ENUM_ELEM(ary, UINT2NUM(c));
9799 ptr += n;
9800 }
9801 RB_GC_GUARD(str);
9802 if (ary)
9803 return ary;
9804 else
9805 return orig;
9806}
9807
9808/*
9809 * call-seq:
9810 * each_codepoint {|codepoint| ... } -> self
9811 * each_codepoint -> enumerator
9812 *
9813 * :include: doc/string/each_codepoint.rdoc
9814 *
9815 */
9816
9817static VALUE
9818rb_str_each_codepoint(VALUE str)
9819{
9820 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9821 return rb_str_enumerate_codepoints(str, 0);
9822}
9823
9824/*
9825 * call-seq:
9826 * codepoints -> array_of_integers
9827 *
9828 * :include: doc/string/codepoints.rdoc
9829 *
9830 */
9831
9832static VALUE
9833rb_str_codepoints(VALUE str)
9834{
9835 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9836 return rb_str_enumerate_codepoints(str, ary);
9837}
9838
9839static regex_t *
9840get_reg_grapheme_cluster(rb_encoding *enc)
9841{
9842 int encidx = rb_enc_to_index(enc);
9843
9844 const OnigUChar source_ascii[] = "\\X";
9845 const OnigUChar *source = source_ascii;
9846 size_t source_len = sizeof(source_ascii) - 1;
9847
9848 switch (encidx) {
9849#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9850#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9851#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9852#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9853#define CASE_UTF(e) \
9854 case ENCINDEX_UTF_##e: { \
9855 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9856 source = source_UTF_##e; \
9857 source_len = sizeof(source_UTF_##e); \
9858 break; \
9859 }
9860 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9861#undef CASE_UTF
9862#undef CHARS_16BE
9863#undef CHARS_16LE
9864#undef CHARS_32BE
9865#undef CHARS_32LE
9866 }
9867
9868 regex_t *reg_grapheme_cluster;
9869 OnigErrorInfo einfo;
9870 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9871 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9872 if (r) {
9873 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9874 onig_error_code_to_str(message, r, &einfo);
9875 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9876 }
9877
9878 return reg_grapheme_cluster;
9879}
9880
9881static regex_t *
9882get_cached_reg_grapheme_cluster(rb_encoding *enc)
9883{
9884 int encidx = rb_enc_to_index(enc);
9885 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9886
9887 if (encidx == rb_utf8_encindex()) {
9888 if (!reg_grapheme_cluster_utf8) {
9889 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9890 }
9891
9892 return reg_grapheme_cluster_utf8;
9893 }
9894
9895 return NULL;
9896}
9897
9898static VALUE
9899rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9900{
9901 size_t grapheme_cluster_count = 0;
9902 rb_encoding *enc = get_encoding(str);
9903 const char *ptr, *end;
9904
9905 if (!rb_enc_unicode_p(enc)) {
9906 return rb_str_length(str);
9907 }
9908
9909 bool cached_reg_grapheme_cluster = true;
9910 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9911 if (!reg_grapheme_cluster) {
9912 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9913 cached_reg_grapheme_cluster = false;
9914 }
9915
9916 ptr = RSTRING_PTR(str);
9917 end = RSTRING_END(str);
9918
9919 while (ptr < end) {
9920 OnigPosition len = onig_match(reg_grapheme_cluster,
9921 (const OnigUChar *)ptr, (const OnigUChar *)end,
9922 (const OnigUChar *)ptr, NULL, 0);
9923 if (len <= 0) break;
9924 grapheme_cluster_count++;
9925 ptr += len;
9926 }
9927
9928 if (!cached_reg_grapheme_cluster) {
9929 onig_free(reg_grapheme_cluster);
9930 }
9931
9932 return SIZET2NUM(grapheme_cluster_count);
9933}
9934
9935static VALUE
9936rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9937{
9938 VALUE orig = str;
9939 rb_encoding *enc = get_encoding(str);
9940 const char *ptr0, *ptr, *end;
9941
9942 if (!rb_enc_unicode_p(enc)) {
9943 return rb_str_enumerate_chars(str, ary);
9944 }
9945
9946 if (!ary) str = rb_str_new_frozen(str);
9947
9948 bool cached_reg_grapheme_cluster = true;
9949 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9950 if (!reg_grapheme_cluster) {
9951 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9952 cached_reg_grapheme_cluster = false;
9953 }
9954
9955 ptr0 = ptr = RSTRING_PTR(str);
9956 end = RSTRING_END(str);
9957
9958 while (ptr < end) {
9959 OnigPosition len = onig_match(reg_grapheme_cluster,
9960 (const OnigUChar *)ptr, (const OnigUChar *)end,
9961 (const OnigUChar *)ptr, NULL, 0);
9962 if (len <= 0) break;
9963 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9964 ptr += len;
9965 }
9966
9967 if (!cached_reg_grapheme_cluster) {
9968 onig_free(reg_grapheme_cluster);
9969 }
9970
9971 RB_GC_GUARD(str);
9972 if (ary)
9973 return ary;
9974 else
9975 return orig;
9976}
9977
9978/*
9979 * call-seq:
9980 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9981 * each_grapheme_cluster -> enumerator
9982 *
9983 * :include: doc/string/each_grapheme_cluster.rdoc
9984 *
9985 */
9986
9987static VALUE
9988rb_str_each_grapheme_cluster(VALUE str)
9989{
9990 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9991 return rb_str_enumerate_grapheme_clusters(str, 0);
9992}
9993
9994/*
9995 * call-seq:
9996 * grapheme_clusters -> array_of_grapheme_clusters
9997 *
9998 * :include: doc/string/grapheme_clusters.rdoc
9999 *
10000 */
10001
10002static VALUE
10003rb_str_grapheme_clusters(VALUE str)
10004{
10005 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10006 return rb_str_enumerate_grapheme_clusters(str, ary);
10007}
10008
10009static long
10010chopped_length(VALUE str)
10011{
10012 rb_encoding *enc = STR_ENC_GET(str);
10013 const char *p, *p2, *beg, *end;
10014
10015 beg = RSTRING_PTR(str);
10016 end = beg + RSTRING_LEN(str);
10017 if (beg >= end) return 0;
10018 p = rb_enc_prev_char(beg, end, end, enc);
10019 if (!p) return 0;
10020 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10021 p2 = rb_enc_prev_char(beg, p, end, enc);
10022 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10023 }
10024 return p - beg;
10025}
10026
10027/*
10028 * call-seq:
10029 * chop! -> self or nil
10030 *
10031 * Like String#chop, except that:
10032 *
10033 * - Removes trailing characters from +self+ (not from a copy of +self+).
10034 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10035 *
10036 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10037 */
10038
10039static VALUE
10040rb_str_chop_bang(VALUE str)
10041{
10042 str_modify_keep_cr(str);
10043 if (RSTRING_LEN(str) > 0) {
10044 long len;
10045 len = chopped_length(str);
10046 STR_SET_LEN(str, len);
10047 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10048 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10050 }
10051 return str;
10052 }
10053 return Qnil;
10054}
10055
10056
10057/*
10058 * call-seq:
10059 * chop -> new_string
10060 *
10061 * :include: doc/string/chop.rdoc
10062 *
10063 */
10064
10065static VALUE
10066rb_str_chop(VALUE str)
10067{
10068 return rb_str_subseq(str, 0, chopped_length(str));
10069}
10070
10071static long
10072smart_chomp(VALUE str, const char *e, const char *p)
10073{
10074 rb_encoding *enc = rb_enc_get(str);
10075 if (rb_enc_mbminlen(enc) > 1) {
10076 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10077 if (rb_enc_is_newline(pp, e, enc)) {
10078 e = pp;
10079 }
10080 pp = e - rb_enc_mbminlen(enc);
10081 if (pp >= p) {
10082 pp = rb_enc_left_char_head(p, pp, e, enc);
10083 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10084 e = pp;
10085 }
10086 }
10087 }
10088 else {
10089 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10090 case '\n':
10091 if (--e > p && *(e-1) == '\r') {
10092 --e;
10093 }
10094 break;
10095 case '\r':
10096 --e;
10097 break;
10098 }
10099 }
10100 return e - p;
10101}
10102
10103static long
10104chompped_length(VALUE str, VALUE rs)
10105{
10106 rb_encoding *enc;
10107 int newline;
10108 char *pp, *e, *rsptr;
10109 long rslen;
10110 char *const p = RSTRING_PTR(str);
10111 long len = RSTRING_LEN(str);
10112
10113 if (len == 0) return 0;
10114 e = p + len;
10115 if (rs == rb_default_rs) {
10116 return smart_chomp(str, e, p);
10117 }
10118
10119 enc = rb_enc_get(str);
10120 RSTRING_GETMEM(rs, rsptr, rslen);
10121 if (rslen == 0) {
10122 if (rb_enc_mbminlen(enc) > 1) {
10123 while (e > p) {
10124 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10125 if (!rb_enc_is_newline(pp, e, enc)) break;
10126 e = pp;
10127 pp -= rb_enc_mbminlen(enc);
10128 if (pp >= p) {
10129 pp = rb_enc_left_char_head(p, pp, e, enc);
10130 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10131 e = pp;
10132 }
10133 }
10134 }
10135 }
10136 else {
10137 while (e > p && *(e-1) == '\n') {
10138 --e;
10139 if (e > p && *(e-1) == '\r')
10140 --e;
10141 }
10142 }
10143 return e - p;
10144 }
10145 if (rslen > len) return len;
10146
10147 enc = rb_enc_get(rs);
10148 newline = rsptr[rslen-1];
10149 if (rslen == rb_enc_mbminlen(enc)) {
10150 if (rslen == 1) {
10151 if (newline == '\n')
10152 return smart_chomp(str, e, p);
10153 }
10154 else {
10155 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10156 return smart_chomp(str, e, p);
10157 }
10158 }
10159
10160 enc = rb_enc_check(str, rs);
10161 if (is_broken_string(rs)) {
10162 return len;
10163 }
10164 pp = e - rslen;
10165 if (p[len-1] == newline &&
10166 (rslen <= 1 ||
10167 memcmp(rsptr, pp, rslen) == 0)) {
10168 if (at_char_boundary(p, pp, e, enc))
10169 return len - rslen;
10170 RB_GC_GUARD(rs);
10171 }
10172 return len;
10173}
10174
10180static VALUE
10181chomp_rs(int argc, const VALUE *argv)
10182{
10183 rb_check_arity(argc, 0, 1);
10184 if (argc > 0) {
10185 VALUE rs = argv[0];
10186 if (!NIL_P(rs)) StringValue(rs);
10187 return rs;
10188 }
10189 else {
10190 return rb_rs;
10191 }
10192}
10193
10194VALUE
10195rb_str_chomp_string(VALUE str, VALUE rs)
10196{
10197 long olen = RSTRING_LEN(str);
10198 long len = chompped_length(str, rs);
10199 if (len >= olen) return Qnil;
10200 str_modify_keep_cr(str);
10201 STR_SET_LEN(str, len);
10202 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10203 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10205 }
10206 return str;
10207}
10208
10209/*
10210 * call-seq:
10211 * chomp!(line_sep = $/) -> self or nil
10212 *
10213 * Like String#chomp, except that:
10214 *
10215 * - Removes trailing characters from +self+ (not from a copy of +self+).
10216 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10217 *
10218 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10219 */
10220
10221static VALUE
10222rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10223{
10224 VALUE rs;
10225 str_modifiable(str);
10226 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10227 rs = chomp_rs(argc, argv);
10228 if (NIL_P(rs)) return Qnil;
10229 return rb_str_chomp_string(str, rs);
10230}
10231
10232
10233/*
10234 * call-seq:
10235 * chomp(line_sep = $/) -> new_string
10236 *
10237 * :include: doc/string/chomp.rdoc
10238 *
10239 */
10240
10241static VALUE
10242rb_str_chomp(int argc, VALUE *argv, VALUE str)
10243{
10244 VALUE rs = chomp_rs(argc, argv);
10245 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10246 return rb_str_subseq(str, 0, chompped_length(str, rs));
10247}
10248
10249static void
10250tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10251 VALUE str, int num_selectors, VALUE *selectors)
10252{
10253 int i;
10254
10255 for (i=0; i<num_selectors; i++) {
10256 VALUE selector = selectors[i];
10257 rb_encoding *enc;
10258
10259 StringValue(selector);
10260 enc = rb_enc_check(str, selector);
10261 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10262 }
10263}
10264
10265static long
10266lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10267{
10268 const char *const start = s;
10269
10270 if (!s || s >= e) return 0;
10271
10272 /* remove spaces at head */
10273 if (single_byte_optimizable(str)) {
10274 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10275 }
10276 else {
10277 while (s < e) {
10278 int n;
10279 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10280
10281 if (cc && !rb_isspace(cc)) break;
10282 s += n;
10283 }
10284 }
10285 return s - start;
10286}
10287
10288static long
10289lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10290 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10291{
10292 const char *const start = s;
10293
10294 if (!s || s >= e) return 0;
10295
10296 /* remove leading characters in the table */
10297 while (s < e) {
10298 int n;
10299 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10300
10301 if (!tr_find(cc, table, del, nodel)) break;
10302 s += n;
10303 }
10304 return s - start;
10305}
10306
10307/*
10308 * call-seq:
10309 * lstrip!(*selectors) -> self or nil
10310 *
10311 * Like String#lstrip, except that:
10312 *
10313 * - Performs stripping in +self+ (not in a copy of +self+).
10314 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10315 *
10316 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10317 */
10318
10319static VALUE
10320rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10321{
10322 rb_encoding *enc;
10323 char *start, *s;
10324 long olen, loffset;
10325
10326 str_modify_keep_cr(str);
10327 enc = STR_ENC_GET(str);
10328 RSTRING_GETMEM(str, start, olen);
10329 if (argc > 0) {
10330 char table[TR_TABLE_SIZE];
10331 VALUE del = 0, nodel = 0;
10332
10333 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10334 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10335 }
10336 else {
10337 loffset = lstrip_offset(str, start, start+olen, enc);
10338 }
10339
10340 if (loffset > 0) {
10341 long len = olen-loffset;
10342 s = start + loffset;
10343 memmove(start, s, len);
10344 STR_SET_LEN(str, len);
10345 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10346 return str;
10347 }
10348 return Qnil;
10349}
10350
10351
10352/*
10353 * call-seq:
10354 * lstrip(*selectors) -> new_string
10355 *
10356 * Returns a copy of +self+ with leading whitespace removed;
10357 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10358 *
10359 * whitespace = "\x00\t\n\v\f\r "
10360 * s = whitespace + 'abc' + whitespace
10361 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10362 * s.lstrip
10363 * # => "abc\u0000\t\n\v\f\r "
10364 *
10365 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10366 *
10367 * s = "---abc+++"
10368 * s.lstrip("-") # => "abc+++"
10369 *
10370 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10371 * and may use any of its valid forms, including negation, ranges, and escapes:
10372 *
10373 * "01234abc56789".lstrip("0-9") # "abc56789"
10374 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10375 *
10376 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10377 */
10378
10379static VALUE
10380rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10381{
10382 char *start;
10383 long len, loffset;
10384
10385 RSTRING_GETMEM(str, start, len);
10386 if (argc > 0) {
10387 char table[TR_TABLE_SIZE];
10388 VALUE del = 0, nodel = 0;
10389
10390 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10391 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10392 }
10393 else {
10394 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10395 }
10396 if (loffset <= 0) return str_duplicate(rb_cString, str);
10397 return rb_str_subseq(str, loffset, len - loffset);
10398}
10399
10400static long
10401rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10402{
10403 const char *t;
10404
10405 rb_str_check_dummy_enc(enc);
10406 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
10407 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10408 }
10409 if (!s || s >= e) return 0;
10410 t = e;
10411
10412 /* remove trailing spaces or '\0's */
10413 if (single_byte_optimizable(str)) {
10414 unsigned char c;
10415 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10416 }
10417 else {
10418 char *tp;
10419
10420 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10421 unsigned int c = rb_enc_codepoint(tp, e, enc);
10422 if (c && !rb_isspace(c)) break;
10423 t = tp;
10424 }
10425 }
10426 return e - t;
10427}
10428
10429static long
10430rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10431 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10432{
10433 const char *t;
10434 char *tp;
10435
10436 rb_str_check_dummy_enc(enc);
10437 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
10438 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10439 }
10440 if (!s || s >= e) return 0;
10441 t = e;
10442
10443 /* remove trailing characters in the table */
10444 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10445 unsigned int c = rb_enc_codepoint(tp, e, enc);
10446 if (!tr_find(c, table, del, nodel)) break;
10447 t = tp;
10448 }
10449
10450 return e - t;
10451}
10452
10453/*
10454 * call-seq:
10455 * rstrip!(*selectors) -> self or nil
10456 *
10457 * Like String#rstrip, except that:
10458 *
10459 * - Performs stripping in +self+ (not in a copy of +self+).
10460 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10461 *
10462 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10463 */
10464
10465static VALUE
10466rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10467{
10468 rb_encoding *enc;
10469 char *start;
10470 long olen, roffset;
10471
10472 str_modify_keep_cr(str);
10473 enc = STR_ENC_GET(str);
10474 RSTRING_GETMEM(str, start, olen);
10475 if (argc > 0) {
10476 char table[TR_TABLE_SIZE];
10477 VALUE del = 0, nodel = 0;
10478
10479 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10480 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10481 }
10482 else {
10483 roffset = rstrip_offset(str, start, start+olen, enc);
10484 }
10485 if (roffset > 0) {
10486 long len = olen - roffset;
10487
10488 STR_SET_LEN(str, len);
10489 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10490 return str;
10491 }
10492 return Qnil;
10493}
10494
10495
10496/*
10497 * call-seq:
10498 * rstrip(*selectors) -> new_string
10499 *
10500 * Returns a copy of +self+ with trailing whitespace removed;
10501 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10502 *
10503 * whitespace = "\x00\t\n\v\f\r "
10504 * s = whitespace + 'abc' + whitespace
10505 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10506 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10507 *
10508 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10509 *
10510 * s = "---abc+++"
10511 * s.rstrip("+") # => "---abc"
10512 *
10513 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10514 * and may use any of its valid forms, including negation, ranges, and escapes:
10515 *
10516 * "01234abc56789".rstrip("0-9") # "01234abc"
10517 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10518 *
10519 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10520 */
10521
10522static VALUE
10523rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10524{
10525 rb_encoding *enc;
10526 char *start;
10527 long olen, roffset;
10528
10529 enc = STR_ENC_GET(str);
10530 RSTRING_GETMEM(str, start, olen);
10531 if (argc > 0) {
10532 char table[TR_TABLE_SIZE];
10533 VALUE del = 0, nodel = 0;
10534
10535 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10536 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10537 }
10538 else {
10539 roffset = rstrip_offset(str, start, start+olen, enc);
10540 }
10541 if (roffset <= 0) return str_duplicate(rb_cString, str);
10542 return rb_str_subseq(str, 0, olen-roffset);
10543}
10544
10545
10546/*
10547 * call-seq:
10548 * strip!(*selectors) -> self or nil
10549 *
10550 * Like String#strip, except that:
10551 *
10552 * - Any modifications are made to +self+.
10553 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10554 *
10555 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10556 */
10557
10558static VALUE
10559rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10560{
10561 char *start;
10562 long olen, loffset, roffset;
10563 rb_encoding *enc;
10564
10565 str_modify_keep_cr(str);
10566 enc = STR_ENC_GET(str);
10567 RSTRING_GETMEM(str, start, olen);
10568
10569 if (argc > 0) {
10570 char table[TR_TABLE_SIZE];
10571 VALUE del = 0, nodel = 0;
10572
10573 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10574 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10575 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10576 }
10577 else {
10578 loffset = lstrip_offset(str, start, start+olen, enc);
10579 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10580 }
10581
10582 if (loffset > 0 || roffset > 0) {
10583 long len = olen-roffset;
10584 if (loffset > 0) {
10585 len -= loffset;
10586 memmove(start, start + loffset, len);
10587 }
10588 STR_SET_LEN(str, len);
10589 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10590 return str;
10591 }
10592 return Qnil;
10593}
10594
10595
10596/*
10597 * call-seq:
10598 * strip(*selectors) -> new_string
10599 *
10600 * Returns a copy of +self+ with leading and trailing whitespace removed;
10601 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10602 *
10603 * whitespace = "\x00\t\n\v\f\r "
10604 * s = whitespace + 'abc' + whitespace
10605 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10606 * s.strip # => "abc"
10607 *
10608 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10609 *
10610 * s = "---abc+++"
10611 * s.strip("-+") # => "abc"
10612 * s.strip("+-") # => "abc"
10613 *
10614 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10615 * and may use any of its valid forms, including negation, ranges, and escapes:
10616 *
10617 * "01234abc56789".strip("0-9") # "abc"
10618 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10619 *
10620 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10621 */
10622
10623static VALUE
10624rb_str_strip(int argc, VALUE *argv, VALUE str)
10625{
10626 char *start;
10627 long olen, loffset, roffset;
10628 rb_encoding *enc = STR_ENC_GET(str);
10629
10630 RSTRING_GETMEM(str, start, olen);
10631
10632 if (argc > 0) {
10633 char table[TR_TABLE_SIZE];
10634 VALUE del = 0, nodel = 0;
10635
10636 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10637 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10638 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10639 }
10640 else {
10641 loffset = lstrip_offset(str, start, start+olen, enc);
10642 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10643 }
10644
10645 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10646 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10647}
10648
10649static VALUE
10650scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10651{
10652 VALUE result = Qnil;
10653 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10654 if (pos >= 0) {
10655 VALUE match;
10656 struct re_registers *regs;
10657 if (BUILTIN_TYPE(pat) == T_STRING) {
10658 regs = NULL;
10659 end = pos + RSTRING_LEN(pat);
10660 }
10661 else {
10662 match = rb_backref_get();
10663 regs = RMATCH_REGS(match);
10664 pos = BEG(0);
10665 end = END(0);
10666 }
10667
10668 if (pos == end) {
10669 rb_encoding *enc = STR_ENC_GET(str);
10670 /*
10671 * Always consume at least one character of the input string
10672 */
10673 if (RSTRING_LEN(str) > end)
10674 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10675 RSTRING_END(str), enc);
10676 else
10677 *start = end + 1;
10678 }
10679 else {
10680 *start = end;
10681 }
10682
10683 if (!regs || regs->num_regs == 1) {
10684 result = rb_str_subseq(str, pos, end - pos);
10685 return result;
10686 }
10687 else {
10688 result = rb_ary_new2(regs->num_regs);
10689 for (int i = 1; i < regs->num_regs; i++) {
10690 VALUE s = Qnil;
10691 if (BEG(i) >= 0) {
10692 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10693 }
10694
10695 rb_ary_push(result, s);
10696 }
10697 }
10698
10699 RB_GC_GUARD(match);
10700 }
10701
10702 return result;
10703}
10704
10705
10706/*
10707 * call-seq:
10708 * scan(pattern) -> array_of_results
10709 * scan(pattern) {|result| ... } -> self
10710 *
10711 * :include: doc/string/scan.rdoc
10712 *
10713 */
10714
10715static VALUE
10716rb_str_scan(VALUE str, VALUE pat)
10717{
10718 VALUE result;
10719 long start = 0;
10720 long last = -1, prev = 0;
10721 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10722
10723 pat = get_pat_quoted(pat, 1);
10724 mustnot_broken(str);
10725 if (!rb_block_given_p()) {
10726 VALUE ary = rb_ary_new();
10727
10728 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10729 last = prev;
10730 prev = start;
10731 rb_ary_push(ary, result);
10732 }
10733 if (last >= 0) rb_pat_search(pat, str, last, 1);
10734 else rb_backref_set(Qnil);
10735 return ary;
10736 }
10737
10738 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10739 last = prev;
10740 prev = start;
10741 rb_yield(result);
10742 str_mod_check(str, p, len);
10743 }
10744 if (last >= 0) rb_pat_search(pat, str, last, 1);
10745 return str;
10746}
10747
10748
10749/*
10750 * call-seq:
10751 * hex -> integer
10752 *
10753 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10754 * returns its value as an integer.
10755 *
10756 * The leading substring is interpreted as hexadecimal when it begins with:
10757 *
10758 * - One or more characters representing hexadecimal digits
10759 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10760 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10761 *
10762 * 'f'.hex # => 15
10763 * '11'.hex # => 17
10764 * 'FFF'.hex # => 4095
10765 * 'fffg'.hex # => 4095
10766 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10767 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10768 * 'deadbeef'.hex # => 3735928559
10769 *
10770 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10771 *
10772 * '0xfff'.hex # => 4095
10773 * '0xfffg'.hex # => 4095
10774 *
10775 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10776 *
10777 * '-fff'.hex # => -4095
10778 * '-0xFFF'.hex # => -4095
10779 *
10780 * For any substring not described above, returns zero:
10781 *
10782 * 'xxx'.hex # => 0
10783 * ''.hex # => 0
10784 *
10785 * Note that, unlike #oct, this method interprets only hexadecimal,
10786 * and not binary, octal, or decimal notations:
10787 *
10788 * '0b111'.hex # => 45329
10789 * '0o777'.hex # => 0
10790 * '0d999'.hex # => 55705
10791 *
10792 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10793 */
10794
10795static VALUE
10796rb_str_hex(VALUE str)
10797{
10798 return rb_str_to_inum(str, 16, FALSE);
10799}
10800
10801
10802/*
10803 * call-seq:
10804 * oct -> integer
10805 *
10806 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10807 * returns its value as an integer.
10808 *
10809 * In brief:
10810 *
10811 * # Interpreted as octal.
10812 * '777'.oct # => 511
10813 * '777x'.oct # => 511
10814 * '0777'.oct # => 511
10815 * '0o777'.oct # => 511
10816 * '-777'.oct # => -511
10817 * # Not interpreted as octal.
10818 * '0b111'.oct # => 7 # Interpreted as binary.
10819 * '0d999'.oct # => 999 # Interpreted as decimal.
10820 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10821 *
10822 * The leading substring is interpreted as octal when it begins with:
10823 *
10824 * - One or more characters representing octal digits
10825 * (each in the range <tt>'0'..'7'</tt>);
10826 * the string to be interpreted ends at the first character that does not represent an octal digit:
10827 *
10828 * '7'.oct # => 7
10829 * '11'.oct # => 9
10830 * '777'.oct # => 511
10831 * '0777'.oct # => 511
10832 * '7778'.oct # => 511
10833 * '777x'.oct # => 511
10834 *
10835 * - <tt>'0o'</tt>, followed by one or more octal digits:
10836 *
10837 * '0o777'.oct # => 511
10838 * '0o7778'.oct # => 511
10839 *
10840 * The leading substring is _not_ interpreted as octal when it begins with:
10841 *
10842 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10843 * (each in the range <tt>'0'..'1'</tt>);
10844 * the string to be interpreted ends at the first character that does not represent a binary digit.
10845 * The string is interpreted as binary digits (base 2):
10846 *
10847 * '0b111'.oct # => 7
10848 * '0b1112'.oct # => 7
10849 *
10850 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10851 * (each in the range <tt>'0'..'9'</tt>);
10852 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10853 * The string is interpreted as decimal digits (base 10):
10854 *
10855 * '0d999'.oct # => 999
10856 * '0d999x'.oct # => 999
10857 *
10858 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10859 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10860 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10861 * The string is interpreted as hexadecimal digits (base 16):
10862 *
10863 * '0xfff'.oct # => 4095
10864 * '0xfffg'.oct # => 4095
10865 *
10866 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10867 *
10868 * '-777'.oct # => -511
10869 * '-0777'.oct # => -511
10870 * '-0b111'.oct # => -7
10871 * '-0xfff'.oct # => -4095
10872 *
10873 * For any substring not described above, returns zero:
10874 *
10875 * 'foo'.oct # => 0
10876 * ''.oct # => 0
10877 *
10878 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10879 */
10880
10881static VALUE
10882rb_str_oct(VALUE str)
10883{
10884 return rb_str_to_inum(str, -8, FALSE);
10885}
10886
10887#ifndef HAVE_CRYPT_R
10888# include "ruby/thread_native.h"
10889# include "ruby/atomic.h"
10890
10891static struct {
10892 rb_nativethread_lock_t lock;
10893} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10894#endif
10895
10896/*
10897 * call-seq:
10898 * crypt(salt_str) -> new_string
10899 *
10900 * Returns the string generated by calling <code>crypt(3)</code>
10901 * standard library function with <code>str</code> and
10902 * <code>salt_str</code>, in this order, as its arguments. Please do
10903 * not use this method any longer. It is legacy; provided only for
10904 * backward compatibility with ruby scripts in earlier days. It is
10905 * bad to use in contemporary programs for several reasons:
10906 *
10907 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10908 * run. The generated string lacks data portability.
10909 *
10910 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10911 * (i.e. silently ends up in unexpected results).
10912 *
10913 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10914 * thread safe.
10915 *
10916 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10917 * very very weak. According to its manpage, Linux's traditional
10918 * <code>crypt(3)</code> output has only 2**56 variations; too
10919 * easy to brute force today. And this is the default behaviour.
10920 *
10921 * * In order to make things robust some OSes implement so-called
10922 * "modular" usage. To go through, you have to do a complex
10923 * build-up of the <code>salt_str</code> parameter, by hand.
10924 * Failure in generation of a proper salt string tends not to
10925 * yield any errors; typos in parameters are normally not
10926 * detectable.
10927 *
10928 * * For instance, in the following example, the second invocation
10929 * of String#crypt is wrong; it has a typo in "round=" (lacks
10930 * "s"). However the call does not fail and something unexpected
10931 * is generated.
10932 *
10933 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10934 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10935 *
10936 * * Even in the "modular" mode, some hash functions are considered
10937 * archaic and no longer recommended at all; for instance module
10938 * <code>$1$</code> is officially abandoned by its author: see
10939 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10940 * instance module <code>$3$</code> is considered completely
10941 * broken: see the manpage of FreeBSD.
10942 *
10943 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10944 * written above, <code>crypt(3)</code> on Mac OS never fails.
10945 * This means even if you build up a proper salt string it
10946 * generates a traditional DES hash anyways, and there is no way
10947 * for you to be aware of.
10948 *
10949 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10950 *
10951 * If for some reason you cannot migrate to other secure contemporary
10952 * password hashing algorithms, install the string-crypt gem and
10953 * <code>require 'string/crypt'</code> to continue using it.
10954 */
10955
10956static VALUE
10957rb_str_crypt(VALUE str, VALUE salt)
10958{
10959#ifdef HAVE_CRYPT_R
10960 VALUE databuf;
10961 struct crypt_data *data;
10962# define CRYPT_END() ALLOCV_END(databuf)
10963#else
10964 char *tmp_buf;
10965 extern char *crypt(const char *, const char *);
10966# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10967#endif
10968 VALUE result;
10969 const char *s, *saltp;
10970 char *res;
10971#ifdef BROKEN_CRYPT
10972 char salt_8bit_clean[3];
10973#endif
10974
10975 StringValue(salt);
10976 mustnot_wchar(str);
10977 mustnot_wchar(salt);
10978 s = StringValueCStr(str);
10979 saltp = RSTRING_PTR(salt);
10980 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10981 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10982 }
10983
10984#ifdef BROKEN_CRYPT
10985 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10986 salt_8bit_clean[0] = saltp[0] & 0x7f;
10987 salt_8bit_clean[1] = saltp[1] & 0x7f;
10988 salt_8bit_clean[2] = '\0';
10989 saltp = salt_8bit_clean;
10990 }
10991#endif
10992#ifdef HAVE_CRYPT_R
10993 data = ALLOCV(databuf, sizeof(struct crypt_data));
10994# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10995 data->initialized = 0;
10996# endif
10997 res = crypt_r(s, saltp, data);
10998#else
10999 rb_nativethread_lock_lock(&crypt_mutex.lock);
11000 res = crypt(s, saltp);
11001#endif
11002 if (!res) {
11003 int err = errno;
11004 CRYPT_END();
11005 rb_syserr_fail(err, "crypt");
11006 }
11007#ifdef HAVE_CRYPT_R
11008 result = rb_str_new_cstr(res);
11009 CRYPT_END();
11010#else
11011 // We need to copy this buffer because it's static and we need to unlock the mutex
11012 // before allocating a new object (the string to be returned). If we allocate while
11013 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
11014 // if other ractors are waiting on this lock.
11015 size_t res_size = strlen(res)+1;
11016 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
11017 memcpy(tmp_buf, res, res_size);
11018 res = tmp_buf;
11019 CRYPT_END();
11020 result = rb_str_new_cstr(res);
11021#endif
11022 return result;
11023}
11024
11025
11026/*
11027 * call-seq:
11028 * ord -> integer
11029 *
11030 * :include: doc/string/ord.rdoc
11031 *
11032 */
11033
11034static VALUE
11035rb_str_ord(VALUE s)
11036{
11037 unsigned int c;
11038
11039 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11040 return UINT2NUM(c);
11041}
11042/*
11043 * call-seq:
11044 * sum(n = 16) -> integer
11045 *
11046 * :include: doc/string/sum.rdoc
11047 *
11048 */
11049
11050static VALUE
11051rb_str_sum(int argc, VALUE *argv, VALUE str)
11052{
11053 int bits = 16;
11054 char *ptr, *p, *pend;
11055 long len;
11056 VALUE sum = INT2FIX(0);
11057 unsigned long sum0 = 0;
11058
11059 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11060 bits = 0;
11061 }
11062 ptr = p = RSTRING_PTR(str);
11063 len = RSTRING_LEN(str);
11064 pend = p + len;
11065
11066 while (p < pend) {
11067 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11068 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11069 str_mod_check(str, ptr, len);
11070 sum0 = 0;
11071 }
11072 sum0 += (unsigned char)*p;
11073 p++;
11074 }
11075
11076 if (bits == 0) {
11077 if (sum0) {
11078 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11079 }
11080 }
11081 else {
11082 if (sum == INT2FIX(0)) {
11083 if (bits < (int)sizeof(long)*CHAR_BIT) {
11084 sum0 &= (((unsigned long)1)<<bits)-1;
11085 }
11086 sum = LONG2FIX(sum0);
11087 }
11088 else {
11089 VALUE mod;
11090
11091 if (sum0) {
11092 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11093 }
11094
11095 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11096 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11097 sum = rb_funcall(sum, '&', 1, mod);
11098 }
11099 }
11100 return sum;
11101}
11102
11103static VALUE
11104rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11105{
11106 rb_encoding *enc;
11107 VALUE w;
11108 long width, len, flen = 1, fclen = 1;
11109 VALUE res;
11110 char *p;
11111 const char *f = " ";
11112 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11113 VALUE pad;
11114 int singlebyte = 1, cr;
11115 int termlen;
11116
11117 rb_scan_args(argc, argv, "11", &w, &pad);
11118 enc = STR_ENC_GET(str);
11119 termlen = rb_enc_mbminlen(enc);
11120 width = NUM2LONG(w);
11121 if (argc == 2) {
11122 StringValue(pad);
11123 enc = rb_enc_check(str, pad);
11124 f = RSTRING_PTR(pad);
11125 flen = RSTRING_LEN(pad);
11126 fclen = str_strlen(pad, enc); /* rb_enc_check */
11127 singlebyte = single_byte_optimizable(pad);
11128 if (flen == 0 || fclen == 0) {
11129 rb_raise(rb_eArgError, "zero width padding");
11130 }
11131 }
11132 len = str_strlen(str, enc); /* rb_enc_check */
11133 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11134 n = width - len;
11135 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11136 rlen = n - llen;
11137 cr = ENC_CODERANGE(str);
11138 if (flen > 1) {
11139 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11140 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11141 }
11142 size = RSTRING_LEN(str);
11143 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11144 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11145 (len += llen2 + rlen2) >= LONG_MAX - size) {
11146 rb_raise(rb_eArgError, "argument too big");
11147 }
11148 len += size;
11149 res = str_enc_new(rb_cString, 0, len, enc);
11150 p = RSTRING_PTR(res);
11151 if (flen <= 1) {
11152 memset(p, *f, llen);
11153 p += llen;
11154 }
11155 else {
11156 while (llen >= fclen) {
11157 memcpy(p,f,flen);
11158 p += flen;
11159 llen -= fclen;
11160 }
11161 if (llen > 0) {
11162 memcpy(p, f, llen2);
11163 p += llen2;
11164 }
11165 }
11166 memcpy(p, RSTRING_PTR(str), size);
11167 p += size;
11168 if (flen <= 1) {
11169 memset(p, *f, rlen);
11170 p += rlen;
11171 }
11172 else {
11173 while (rlen >= fclen) {
11174 memcpy(p,f,flen);
11175 p += flen;
11176 rlen -= fclen;
11177 }
11178 if (rlen > 0) {
11179 memcpy(p, f, rlen2);
11180 p += rlen2;
11181 }
11182 }
11183 TERM_FILL(p, termlen);
11184 STR_SET_LEN(res, p-RSTRING_PTR(res));
11185
11186 if (argc == 2)
11187 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11188 if (cr != ENC_CODERANGE_BROKEN)
11189 ENC_CODERANGE_SET(res, cr);
11190
11191 RB_GC_GUARD(pad);
11192 return res;
11193}
11194
11195
11196/*
11197 * call-seq:
11198 * ljust(width, pad_string = ' ') -> new_string
11199 *
11200 * :include: doc/string/ljust.rdoc
11201 *
11202 */
11203
11204static VALUE
11205rb_str_ljust(int argc, VALUE *argv, VALUE str)
11206{
11207 return rb_str_justify(argc, argv, str, 'l');
11208}
11209
11210/*
11211 * call-seq:
11212 * rjust(width, pad_string = ' ') -> new_string
11213 *
11214 * :include: doc/string/rjust.rdoc
11215 *
11216 */
11217
11218static VALUE
11219rb_str_rjust(int argc, VALUE *argv, VALUE str)
11220{
11221 return rb_str_justify(argc, argv, str, 'r');
11222}
11223
11224
11225/*
11226 * call-seq:
11227 * center(size, pad_string = ' ') -> new_string
11228 *
11229 * :include: doc/string/center.rdoc
11230 *
11231 */
11232
11233static VALUE
11234rb_str_center(int argc, VALUE *argv, VALUE str)
11235{
11236 return rb_str_justify(argc, argv, str, 'c');
11237}
11238
11239/*
11240 * call-seq:
11241 * partition(pattern) -> [pre_match, first_match, post_match]
11242 *
11243 * :include: doc/string/partition.rdoc
11244 *
11245 */
11246
11247static VALUE
11248rb_str_partition(VALUE str, VALUE sep)
11249{
11250 long pos;
11251
11252 sep = get_pat_quoted(sep, 0);
11253 if (RB_TYPE_P(sep, T_REGEXP)) {
11254 if (rb_reg_search(sep, str, 0, 0) < 0) {
11255 goto failed;
11256 }
11257 VALUE match = rb_backref_get();
11258 struct re_registers *regs = RMATCH_REGS(match);
11259
11260 pos = BEG(0);
11261 sep = rb_str_subseq(str, pos, END(0) - pos);
11262 }
11263 else {
11264 pos = rb_str_index(str, sep, 0);
11265 if (pos < 0) goto failed;
11266 }
11267 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11268 sep,
11269 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11270 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11271
11272 failed:
11273 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11274}
11275
11276/*
11277 * call-seq:
11278 * rpartition(pattern) -> [pre_match, last_match, post_match]
11279 *
11280 * :include: doc/string/rpartition.rdoc
11281 *
11282 */
11283
11284static VALUE
11285rb_str_rpartition(VALUE str, VALUE sep)
11286{
11287 long pos = RSTRING_LEN(str);
11288
11289 sep = get_pat_quoted(sep, 0);
11290 if (RB_TYPE_P(sep, T_REGEXP)) {
11291 if (rb_reg_search(sep, str, pos, 1) < 0) {
11292 goto failed;
11293 }
11294 VALUE match = rb_backref_get();
11295 struct re_registers *regs = RMATCH_REGS(match);
11296
11297 pos = BEG(0);
11298 sep = rb_str_subseq(str, pos, END(0) - pos);
11299 }
11300 else {
11301 pos = rb_str_sublen(str, pos);
11302 pos = rb_str_rindex(str, sep, pos);
11303 if (pos < 0) {
11304 goto failed;
11305 }
11306 }
11307
11308 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11309 sep,
11310 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11311 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11312 failed:
11313 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11314}
11315
11316/*
11317 * call-seq:
11318 * start_with?(*patterns) -> true or false
11319 *
11320 * :include: doc/string/start_with_p.rdoc
11321 *
11322 */
11323
11324static VALUE
11325rb_str_start_with(int argc, VALUE *argv, VALUE str)
11326{
11327 int i;
11328
11329 for (i=0; i<argc; i++) {
11330 VALUE tmp = argv[i];
11331 if (RB_TYPE_P(tmp, T_REGEXP)) {
11332 if (rb_reg_start_with_p(tmp, str))
11333 return Qtrue;
11334 }
11335 else {
11336 const char *p, *s, *e;
11337 long slen, tlen;
11338 rb_encoding *enc;
11339
11340 StringValue(tmp);
11341 enc = rb_enc_check(str, tmp);
11342 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11343 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11344 p = RSTRING_PTR(str);
11345 e = p + slen;
11346 s = p + tlen;
11347 if (!at_char_right_boundary(p, s, e, enc))
11348 continue;
11349 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11350 return Qtrue;
11351 }
11352 }
11353 return Qfalse;
11354}
11355
11356/*
11357 * call-seq:
11358 * end_with?(*strings) -> true or false
11359 *
11360 * :include: doc/string/end_with_p.rdoc
11361 *
11362 */
11363
11364static VALUE
11365rb_str_end_with(int argc, VALUE *argv, VALUE str)
11366{
11367 int i;
11368
11369 for (i=0; i<argc; i++) {
11370 VALUE tmp = argv[i];
11371 const char *p, *s, *e;
11372 long slen, tlen;
11373 rb_encoding *enc;
11374
11375 StringValue(tmp);
11376 enc = rb_enc_check(str, tmp);
11377 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11378 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11379 p = RSTRING_PTR(str);
11380 e = p + slen;
11381 s = e - tlen;
11382 if (!at_char_boundary(p, s, e, enc))
11383 continue;
11384 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11385 return Qtrue;
11386 }
11387 return Qfalse;
11388}
11389
11399static long
11400deleted_prefix_length(VALUE str, VALUE prefix)
11401{
11402 const char *strptr, *prefixptr;
11403 long olen, prefixlen;
11404 rb_encoding *enc = rb_enc_get(str);
11405
11406 StringValue(prefix);
11407
11408 if (!is_broken_string(prefix) ||
11409 !rb_enc_asciicompat(enc) ||
11410 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11411 enc = rb_enc_check(str, prefix);
11412 }
11413
11414 /* return 0 if not start with prefix */
11415 prefixlen = RSTRING_LEN(prefix);
11416 if (prefixlen <= 0) return 0;
11417 olen = RSTRING_LEN(str);
11418 if (olen < prefixlen) return 0;
11419 strptr = RSTRING_PTR(str);
11420 prefixptr = RSTRING_PTR(prefix);
11421 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11422 if (is_broken_string(prefix)) {
11423 if (!is_broken_string(str)) {
11424 /* prefix in a valid string cannot be broken */
11425 return 0;
11426 }
11427 const char *strend = strptr + olen;
11428 const char *after_prefix = strptr + prefixlen;
11429 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11430 /* prefix does not end at char-boundary */
11431 return 0;
11432 }
11433 }
11434 /* prefix part in `str` also should be valid. */
11435
11436 return prefixlen;
11437}
11438
11439/*
11440 * call-seq:
11441 * delete_prefix!(prefix) -> self or nil
11442 *
11443 * Like String#delete_prefix, except that +self+ is modified in place;
11444 * returns +self+ if the prefix is removed, +nil+ otherwise.
11445 *
11446 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11447 */
11448
11449static VALUE
11450rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11451{
11452 long prefixlen;
11453 str_modify_keep_cr(str);
11454
11455 prefixlen = deleted_prefix_length(str, prefix);
11456 if (prefixlen <= 0) return Qnil;
11457
11458 return rb_str_drop_bytes(str, prefixlen);
11459}
11460
11461/*
11462 * call-seq:
11463 * delete_prefix(prefix) -> new_string
11464 *
11465 * :include: doc/string/delete_prefix.rdoc
11466 *
11467 */
11468
11469static VALUE
11470rb_str_delete_prefix(VALUE str, VALUE prefix)
11471{
11472 long prefixlen;
11473
11474 prefixlen = deleted_prefix_length(str, prefix);
11475 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11476
11477 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11478}
11479
11489static long
11490deleted_suffix_length(VALUE str, VALUE suffix)
11491{
11492 const char *strptr, *suffixptr;
11493 long olen, suffixlen;
11494 rb_encoding *enc;
11495
11496 StringValue(suffix);
11497 if (is_broken_string(suffix)) return 0;
11498 enc = rb_enc_check(str, suffix);
11499
11500 /* return 0 if not start with suffix */
11501 suffixlen = RSTRING_LEN(suffix);
11502 if (suffixlen <= 0) return 0;
11503 olen = RSTRING_LEN(str);
11504 if (olen < suffixlen) return 0;
11505 strptr = RSTRING_PTR(str);
11506 suffixptr = RSTRING_PTR(suffix);
11507 const char *strend = strptr + olen;
11508 const char *before_suffix = strend - suffixlen;
11509 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11510 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11511
11512 return suffixlen;
11513}
11514
11515/*
11516 * call-seq:
11517 * delete_suffix!(suffix) -> self or nil
11518 *
11519 * Like String#delete_suffix, except that +self+ is modified in place;
11520 * returns +self+ if the suffix is removed, +nil+ otherwise.
11521 *
11522 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11523 */
11524
11525static VALUE
11526rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11527{
11528 long olen, suffixlen, len;
11529 str_modifiable(str);
11530
11531 suffixlen = deleted_suffix_length(str, suffix);
11532 if (suffixlen <= 0) return Qnil;
11533
11534 olen = RSTRING_LEN(str);
11535 str_modify_keep_cr(str);
11536 len = olen - suffixlen;
11537 STR_SET_LEN(str, len);
11538 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11539 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11541 }
11542 return str;
11543}
11544
11545/*
11546 * call-seq:
11547 * delete_suffix(suffix) -> new_string
11548 *
11549 * :include: doc/string/delete_suffix.rdoc
11550 *
11551 */
11552
11553static VALUE
11554rb_str_delete_suffix(VALUE str, VALUE suffix)
11555{
11556 long suffixlen;
11557
11558 suffixlen = deleted_suffix_length(str, suffix);
11559 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11560
11561 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11562}
11563
11564void
11565rb_str_setter(VALUE val, ID id, VALUE *var)
11566{
11567 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11568 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11569 }
11570 *var = val;
11571}
11572
11573static void
11574nil_setter_warning(ID id)
11575{
11576 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11577}
11578
11579void
11580rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11581{
11582 rb_str_setter(val, id, var);
11583 if (!NIL_P(*var)) {
11584 nil_setter_warning(id);
11585 }
11586}
11587
11588static void
11589rb_fs_setter(VALUE val, ID id, VALUE *var)
11590{
11591 val = rb_fs_check(val);
11592 if (!val) {
11593 rb_raise(rb_eTypeError,
11594 "value of %"PRIsVALUE" must be String or Regexp",
11595 rb_id2str(id));
11596 }
11597 if (!NIL_P(val)) {
11598 nil_setter_warning(id);
11599 }
11600 *var = val;
11601}
11602
11603
11604/*
11605 * call-seq:
11606 * force_encoding(encoding) -> self
11607 *
11608 * :include: doc/string/force_encoding.rdoc
11609 *
11610 */
11611
11612static VALUE
11613rb_str_force_encoding(VALUE str, VALUE enc)
11614{
11615 str_modifiable(str);
11616
11617 rb_encoding *encoding = rb_to_encoding(enc);
11618 int idx = rb_enc_to_index(encoding);
11619
11620 // If the encoding is unchanged, we do nothing.
11621 if (ENCODING_GET(str) == idx) {
11622 return str;
11623 }
11624
11625 rb_enc_associate_index(str, idx);
11626
11627 // If the coderange was 7bit and the new encoding is ASCII-compatible
11628 // we can keep the coderange.
11629 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11630 return str;
11631 }
11632
11634 return str;
11635}
11636
11637/*
11638 * call-seq:
11639 * b -> new_string
11640 *
11641 * :include: doc/string/b.rdoc
11642 *
11643 */
11644
11645static VALUE
11646rb_str_b(VALUE str)
11647{
11648 VALUE str2;
11649 if (STR_EMBED_P(str)) {
11650 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11651 }
11652 else {
11653 str2 = str_alloc_heap(rb_cString);
11654 }
11655 str_replace_shared_without_enc(str2, str);
11656
11657 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11658 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11659 // If we know the receiver's code range then we know the result's code range.
11660 int cr = ENC_CODERANGE(str);
11661 switch (cr) {
11662 case ENC_CODERANGE_7BIT:
11664 break;
11668 break;
11669 default:
11670 ENC_CODERANGE_CLEAR(str2);
11671 break;
11672 }
11673 }
11674
11675 return str2;
11676}
11677
11678/*
11679 * call-seq:
11680 * valid_encoding? -> true or false
11681 *
11682 * :include: doc/string/valid_encoding_p.rdoc
11683 *
11684 */
11685
11686static VALUE
11687rb_str_valid_encoding_p(VALUE str)
11688{
11689 int cr = rb_enc_str_coderange(str);
11690
11691 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11692}
11693
11694/*
11695 * call-seq:
11696 * ascii_only? -> true or false
11697 *
11698 * Returns whether +self+ contains only ASCII characters:
11699 *
11700 * 'abc'.ascii_only? # => true
11701 * "abc\u{6666}".ascii_only? # => false
11702 *
11703 * Related: see {Querying}[rdoc-ref:String@Querying].
11704 */
11705
11706static VALUE
11707rb_str_is_ascii_only_p(VALUE str)
11708{
11709 int cr = rb_enc_str_coderange(str);
11710
11711 return RBOOL(cr == ENC_CODERANGE_7BIT);
11712}
11713
11714VALUE
11716{
11717 static const char ellipsis[] = "...";
11718 const long ellipsislen = sizeof(ellipsis) - 1;
11719 rb_encoding *const enc = rb_enc_get(str);
11720 const long blen = RSTRING_LEN(str);
11721 const char *const p = RSTRING_PTR(str), *e = p + blen;
11722 VALUE estr, ret = 0;
11723
11724 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11725 if (len * rb_enc_mbminlen(enc) >= blen ||
11726 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11727 ret = str;
11728 }
11729 else if (len <= ellipsislen ||
11730 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11731 if (rb_enc_asciicompat(enc)) {
11732 ret = rb_str_new(ellipsis, len);
11733 rb_enc_associate(ret, enc);
11734 }
11735 else {
11736 estr = rb_usascii_str_new(ellipsis, len);
11737 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11738 }
11739 }
11740 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11741 rb_str_cat(ret, ellipsis, ellipsislen);
11742 }
11743 else {
11744 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11745 rb_enc_from_encoding(enc), 0, Qnil);
11746 rb_str_append(ret, estr);
11747 }
11748 return ret;
11749}
11750
11751static VALUE
11752str_compat_and_valid(VALUE str, rb_encoding *enc)
11753{
11754 int cr;
11755 str = StringValue(str);
11756 cr = rb_enc_str_coderange(str);
11757 if (cr == ENC_CODERANGE_BROKEN) {
11758 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11759 }
11760 else {
11761 rb_encoding *e = STR_ENC_GET(str);
11762 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11763 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11764 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11765 }
11766 }
11767 return str;
11768}
11769
11770static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11771
11772VALUE
11774{
11775 rb_encoding *enc = STR_ENC_GET(str);
11776 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11777}
11778
11779VALUE
11780rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11781{
11782 int cr = ENC_CODERANGE_UNKNOWN;
11783 if (enc == STR_ENC_GET(str)) {
11784 /* cached coderange makes sense only when enc equals the
11785 * actual encoding of str */
11786 cr = ENC_CODERANGE(str);
11787 }
11788 return enc_str_scrub(enc, str, repl, cr);
11789}
11790
11791static VALUE
11792enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11793{
11794 int encidx;
11795 VALUE buf = Qnil;
11796 const char *rep, *p, *e, *p1, *sp;
11797 long replen = -1;
11798 long slen;
11799
11800 if (rb_block_given_p()) {
11801 if (!NIL_P(repl))
11802 rb_raise(rb_eArgError, "both of block and replacement given");
11803 replen = 0;
11804 }
11805
11806 if (ENC_CODERANGE_CLEAN_P(cr))
11807 return Qnil;
11808
11809 if (!NIL_P(repl)) {
11810 repl = str_compat_and_valid(repl, enc);
11811 }
11812
11813 if (rb_enc_dummy_p(enc)) {
11814 return Qnil;
11815 }
11816 encidx = rb_enc_to_index(enc);
11817
11818#define DEFAULT_REPLACE_CHAR(str) do { \
11819 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11820 rep = replace; replen = (int)sizeof(replace); \
11821 } while (0)
11822
11823 slen = RSTRING_LEN(str);
11824 p = RSTRING_PTR(str);
11825 e = RSTRING_END(str);
11826 p1 = p;
11827 sp = p;
11828
11829 if (rb_enc_asciicompat(enc)) {
11830 int rep7bit_p;
11831 if (!replen) {
11832 rep = NULL;
11833 rep7bit_p = FALSE;
11834 }
11835 else if (!NIL_P(repl)) {
11836 rep = RSTRING_PTR(repl);
11837 replen = RSTRING_LEN(repl);
11838 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11839 }
11840 else if (encidx == rb_utf8_encindex()) {
11841 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11842 rep7bit_p = FALSE;
11843 }
11844 else {
11845 DEFAULT_REPLACE_CHAR("?");
11846 rep7bit_p = TRUE;
11847 }
11848 cr = ENC_CODERANGE_7BIT;
11849
11850 p = search_nonascii(p, e);
11851 if (!p) {
11852 p = e;
11853 }
11854 while (p < e) {
11855 int ret = rb_enc_precise_mbclen(p, e, enc);
11856 if (MBCLEN_NEEDMORE_P(ret)) {
11857 break;
11858 }
11859 else if (MBCLEN_CHARFOUND_P(ret)) {
11861 p += MBCLEN_CHARFOUND_LEN(ret);
11862 }
11863 else if (MBCLEN_INVALID_P(ret)) {
11864 /*
11865 * p1~p: valid ascii/multibyte chars
11866 * p ~e: invalid bytes + unknown bytes
11867 */
11868 long clen = rb_enc_mbmaxlen(enc);
11869 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11870 if (p > p1) {
11871 rb_str_buf_cat(buf, p1, p - p1);
11872 }
11873
11874 if (e - p < clen) clen = e - p;
11875 if (clen <= 2) {
11876 clen = 1;
11877 }
11878 else {
11879 const char *q = p;
11880 clen--;
11881 for (; clen > 1; clen--) {
11882 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11883 if (MBCLEN_NEEDMORE_P(ret)) break;
11884 if (MBCLEN_INVALID_P(ret)) continue;
11886 }
11887 }
11888 if (rep) {
11889 rb_str_buf_cat(buf, rep, replen);
11890 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11891 }
11892 else {
11893 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11894 str_mod_check(str, sp, slen);
11895 repl = str_compat_and_valid(repl, enc);
11896 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11899 }
11900 p += clen;
11901 p1 = p;
11902 p = search_nonascii(p, e);
11903 if (!p) {
11904 p = e;
11905 break;
11906 }
11907 }
11908 else {
11910 }
11911 }
11912 if (NIL_P(buf)) {
11913 if (p == e) {
11914 ENC_CODERANGE_SET(str, cr);
11915 return Qnil;
11916 }
11917 buf = rb_str_buf_new(RSTRING_LEN(str));
11918 }
11919 if (p1 < p) {
11920 rb_str_buf_cat(buf, p1, p - p1);
11921 }
11922 if (p < e) {
11923 if (rep) {
11924 rb_str_buf_cat(buf, rep, replen);
11925 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11926 }
11927 else {
11928 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11929 str_mod_check(str, sp, slen);
11930 repl = str_compat_and_valid(repl, enc);
11931 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11934 }
11935 }
11936 }
11937 else {
11938 /* ASCII incompatible */
11939 long mbminlen = rb_enc_mbminlen(enc);
11940 if (!replen) {
11941 rep = NULL;
11942 }
11943 else if (!NIL_P(repl)) {
11944 rep = RSTRING_PTR(repl);
11945 replen = RSTRING_LEN(repl);
11946 }
11947 else if (encidx == ENCINDEX_UTF_16BE) {
11948 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11949 }
11950 else if (encidx == ENCINDEX_UTF_16LE) {
11951 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11952 }
11953 else if (encidx == ENCINDEX_UTF_32BE) {
11954 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11955 }
11956 else if (encidx == ENCINDEX_UTF_32LE) {
11957 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11958 }
11959 else {
11960 DEFAULT_REPLACE_CHAR("?");
11961 }
11962
11963 while (p < e) {
11964 int ret = rb_enc_precise_mbclen(p, e, enc);
11965 if (MBCLEN_NEEDMORE_P(ret)) {
11966 break;
11967 }
11968 else if (MBCLEN_CHARFOUND_P(ret)) {
11969 p += MBCLEN_CHARFOUND_LEN(ret);
11970 }
11971 else if (MBCLEN_INVALID_P(ret)) {
11972 const char *q = p;
11973 long clen = rb_enc_mbmaxlen(enc);
11974 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11975 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11976
11977 if (e - p < clen) clen = e - p;
11978 if (clen <= mbminlen * 2) {
11979 clen = mbminlen;
11980 }
11981 else {
11982 clen -= mbminlen;
11983 for (; clen > mbminlen; clen-=mbminlen) {
11984 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11985 if (MBCLEN_NEEDMORE_P(ret)) break;
11986 if (MBCLEN_INVALID_P(ret)) continue;
11988 }
11989 }
11990 if (rep) {
11991 rb_str_buf_cat(buf, rep, replen);
11992 }
11993 else {
11994 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11995 str_mod_check(str, sp, slen);
11996 repl = str_compat_and_valid(repl, enc);
11997 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11998 }
11999 p += clen;
12000 p1 = p;
12001 }
12002 else {
12004 }
12005 }
12006 if (NIL_P(buf)) {
12007 if (p == e) {
12009 return Qnil;
12010 }
12011 buf = rb_str_buf_new(RSTRING_LEN(str));
12012 }
12013 if (p1 < p) {
12014 rb_str_buf_cat(buf, p1, p - p1);
12015 }
12016 if (p < e) {
12017 if (rep) {
12018 rb_str_buf_cat(buf, rep, replen);
12019 }
12020 else {
12021 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12022 str_mod_check(str, sp, slen);
12023 repl = str_compat_and_valid(repl, enc);
12024 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12025 }
12026 }
12028 }
12029 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12030 return buf;
12031}
12032
12033/*
12034 * call-seq:
12035 * scrub(replacement_string = default_replacement_string) -> new_string
12036 * scrub{|sequence| ... } -> new_string
12037 *
12038 * :include: doc/string/scrub.rdoc
12039 *
12040 */
12041static VALUE
12042str_scrub(int argc, VALUE *argv, VALUE str)
12043{
12044 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12045 VALUE new = rb_str_scrub(str, repl);
12046 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12047}
12048
12049/*
12050 * call-seq:
12051 * scrub!(replacement_string = default_replacement_string) -> self
12052 * scrub!{|sequence| ... } -> self
12053 *
12054 * Like String#scrub, except that:
12055 *
12056 * - Any replacements are made in +self+.
12057 * - Returns +self+.
12058 *
12059 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12060 *
12061 */
12062static VALUE
12063str_scrub_bang(int argc, VALUE *argv, VALUE str)
12064{
12065 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12066 VALUE new = rb_str_scrub(str, repl);
12067 if (!NIL_P(new)) rb_str_replace(str, new);
12068 return str;
12069}
12070
12071static ID id_normalize;
12072static ID id_normalized_p;
12073static VALUE mUnicodeNormalize;
12074
12075static VALUE
12076unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12077{
12078 static int UnicodeNormalizeRequired = 0;
12079 VALUE argv2[2];
12080
12081 if (!UnicodeNormalizeRequired) {
12082 rb_require("unicode_normalize/normalize.rb");
12083 UnicodeNormalizeRequired = 1;
12084 }
12085 argv2[0] = str;
12086 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12087 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12088}
12089
12090/*
12091 * call-seq:
12092 * unicode_normalize(form = :nfc) -> string
12093 *
12094 * :include: doc/string/unicode_normalize.rdoc
12095 *
12096 */
12097static VALUE
12098rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12099{
12100 return unicode_normalize_common(argc, argv, str, id_normalize);
12101}
12102
12103/*
12104 * call-seq:
12105 * unicode_normalize!(form = :nfc) -> self
12106 *
12107 * Like String#unicode_normalize, except that the normalization
12108 * is performed on +self+ (not on a copy of +self+).
12109 *
12110 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12111 *
12112 */
12113static VALUE
12114rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12115{
12116 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12117}
12118
12119/* call-seq:
12120 * unicode_normalized?(form = :nfc) -> true or false
12121 *
12122 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12123 * see String#unicode_normalize.
12124 *
12125 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12126 *
12127 * Examples:
12128 *
12129 * "a\u0300".unicode_normalized? # => false
12130 * "a\u0300".unicode_normalized?(:nfd) # => true
12131 * "\u00E0".unicode_normalized? # => true
12132 * "\u00E0".unicode_normalized?(:nfd) # => false
12133 *
12134 *
12135 * Raises an exception if +self+ is not in a Unicode encoding:
12136 *
12137 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12138 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12139 *
12140 * Related: see {Querying}[rdoc-ref:String@Querying].
12141 */
12142static VALUE
12143rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12144{
12145 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12146}
12147
12148/**********************************************************************
12149 * Document-class: Symbol
12150 *
12151 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12152 *
12153 * You can create a +Symbol+ object explicitly with:
12154 *
12155 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12156 *
12157 * The same +Symbol+ object will be
12158 * created for a given name or string for the duration of a program's
12159 * execution, regardless of the context or meaning of that name. Thus
12160 * if <code>Fred</code> is a constant in one context, a method in
12161 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12162 * will be the same object in all three contexts.
12163 *
12164 * module One
12165 * class Fred
12166 * end
12167 * $f1 = :Fred
12168 * end
12169 * module Two
12170 * Fred = 1
12171 * $f2 = :Fred
12172 * end
12173 * def Fred()
12174 * end
12175 * $f3 = :Fred
12176 * $f1.object_id #=> 2514190
12177 * $f2.object_id #=> 2514190
12178 * $f3.object_id #=> 2514190
12179 *
12180 * Constant, method, and variable names are returned as symbols:
12181 *
12182 * module One
12183 * Two = 2
12184 * def three; 3 end
12185 * @four = 4
12186 * @@five = 5
12187 * $six = 6
12188 * end
12189 * seven = 7
12190 *
12191 * One.constants
12192 * # => [:Two]
12193 * One.instance_methods(true)
12194 * # => [:three]
12195 * One.instance_variables
12196 * # => [:@four]
12197 * One.class_variables
12198 * # => [:@@five]
12199 * global_variables.grep(/six/)
12200 * # => [:$six]
12201 * local_variables
12202 * # => [:seven]
12203 *
12204 * A +Symbol+ object differs from a String object in that
12205 * a +Symbol+ object represents an identifier, while a String object
12206 * represents text or data.
12207 *
12208 * == What's Here
12209 *
12210 * First, what's elsewhere. Class +Symbol+:
12211 *
12212 * - Inherits from {class Object}[rdoc-ref:Object@Whats+Here].
12213 * - Includes {module Comparable}[rdoc-ref:Comparable@Whats+Here].
12214 *
12215 * Here, class +Symbol+ provides methods that are useful for:
12216 *
12217 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12218 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12219 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12220 *
12221 * === Methods for Querying
12222 *
12223 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12224 * - #=~: Returns the index of the first substring in symbol that matches a
12225 * given Regexp or other object; returns +nil+ if no match is found.
12226 * - #[], #slice : Returns a substring of symbol
12227 * determined by a given index, start/length, or range, or string.
12228 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12229 * - #encoding: Returns the Encoding object that represents the encoding
12230 * of symbol.
12231 * - #end_with?: Returns +true+ if symbol ends with
12232 * any of the given strings.
12233 * - #match: Returns a MatchData object if symbol
12234 * matches a given Regexp; +nil+ otherwise.
12235 * - #match?: Returns +true+ if symbol
12236 * matches a given Regexp; +false+ otherwise.
12237 * - #length, #size: Returns the number of characters in symbol.
12238 * - #start_with?: Returns +true+ if symbol starts with
12239 * any of the given strings.
12240 *
12241 * === Methods for Comparing
12242 *
12243 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12244 * or larger than symbol.
12245 * - #==, #===: Returns +true+ if a given symbol has the same content and
12246 * encoding.
12247 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12248 * symbol is smaller than, equal to, or larger than symbol.
12249 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12250 * after Unicode case folding; +false+ otherwise.
12251 *
12252 * === Methods for Converting
12253 *
12254 * - #capitalize: Returns symbol with the first character upcased
12255 * and all other characters downcased.
12256 * - #downcase: Returns symbol with all characters downcased.
12257 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12258 * - #name: Returns the frozen string corresponding to symbol.
12259 * - #succ, #next: Returns the symbol that is the successor to symbol.
12260 * - #swapcase: Returns symbol with all upcase characters downcased
12261 * and all downcase characters upcased.
12262 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12263 * - #to_s, #id2name: Returns the string corresponding to +self+.
12264 * - #to_sym, #intern: Returns +self+.
12265 * - #upcase: Returns symbol with all characters upcased.
12266 *
12267 */
12268
12269
12270/*
12271 * call-seq:
12272 * self == other -> true or false
12273 *
12274 * Returns whether +other+ is the same object as +self+.
12275 */
12276
#define sym_equal rb_obj_equal /* registered as both Symbol#== and Symbol#=== */
12278
12279static int
12280sym_printable(const char *s, const char *send, rb_encoding *enc)
12281{
12282 while (s < send) {
12283 int n;
12284 int c = rb_enc_precise_mbclen(s, send, enc);
12285
12286 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12287 n = MBCLEN_CHARFOUND_LEN(c);
12288 c = rb_enc_mbc_to_codepoint(s, send, enc);
12289 if (!rb_enc_isprint(c, enc)) return FALSE;
12290 s += n;
12291 }
12292 return TRUE;
12293}
12294
12295int
12296rb_str_symname_p(VALUE sym)
12297{
12298 rb_encoding *enc;
12299 const char *ptr;
12300 long len;
12301 rb_encoding *resenc = rb_default_internal_encoding();
12302
12303 if (resenc == NULL) resenc = rb_default_external_encoding();
12304 enc = STR_ENC_GET(sym);
12305 ptr = RSTRING_PTR(sym);
12306 len = RSTRING_LEN(sym);
12307 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12308 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12309 return FALSE;
12310 }
12311 return TRUE;
12312}
12313
12314VALUE
12315rb_str_quote_unprintable(VALUE str)
12316{
12317 rb_encoding *enc;
12318 const char *ptr;
12319 long len;
12320 rb_encoding *resenc;
12321
12322 Check_Type(str, T_STRING);
12323 resenc = rb_default_internal_encoding();
12324 if (resenc == NULL) resenc = rb_default_external_encoding();
12325 enc = STR_ENC_GET(str);
12326 ptr = RSTRING_PTR(str);
12327 len = RSTRING_LEN(str);
12328 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12329 !sym_printable(ptr, ptr + len, enc)) {
12330 return rb_str_escape(str);
12331 }
12332 return str;
12333}
12334
12335VALUE
12336rb_id_quote_unprintable(ID id)
12337{
12338 VALUE str = rb_id2str(id);
12339 if (!rb_str_symname_p(str)) {
12340 return rb_str_escape(str);
12341 }
12342 return str;
12343}
12344
12345/*
12346 * call-seq:
12347 * inspect -> string
12348 *
12349 * Returns a string representation of +self+ (including the leading colon):
12350 *
12351 * :foo.inspect # => ":foo"
12352 *
12353 * Related: Symbol#to_s, Symbol#name.
12354 *
12355 */
12356
12357static VALUE
12358sym_inspect(VALUE sym)
12359{
12360 VALUE str = rb_sym2str(sym);
12361 const char *ptr;
12362 long len;
12363 char *dest;
12364
12365 if (!rb_str_symname_p(str)) {
12366 str = rb_str_inspect(str);
12367 len = RSTRING_LEN(str);
12368 rb_str_resize(str, len + 1);
12369 dest = RSTRING_PTR(str);
12370 memmove(dest + 1, dest, len);
12371 }
12372 else {
12373 rb_encoding *enc = STR_ENC_GET(str);
12374 VALUE orig_str = str;
12375
12376 len = RSTRING_LEN(orig_str);
12377 str = rb_enc_str_new(0, len + 1, enc);
12378
12379 // Get data pointer after allocation
12380 ptr = RSTRING_PTR(orig_str);
12381 dest = RSTRING_PTR(str);
12382 memcpy(dest + 1, ptr, len);
12383
12384 RB_GC_GUARD(orig_str);
12385 }
12386 dest[0] = ':';
12387
12389
12390 return str;
12391}
12392
12393VALUE
12395{
12396 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12397 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12398 return str;
12399}
12400
12401VALUE
12402rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12403{
12404 VALUE obj;
12405
12406 if (argc < 1) {
12407 rb_raise(rb_eArgError, "no receiver given");
12408 }
12409 obj = argv[0];
12410 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12411}
12412
12413/*
12414 * call-seq:
12415 * succ
12416 *
12417 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12418 *
12419 * :foo.succ # => :fop
12420 *
12421 * Related: String#succ.
12422 */
12423
12424static VALUE
12425sym_succ(VALUE sym)
12426{
12427 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12428}
12429
12430/*
12431 * call-seq:
12432 * self <=> other -> -1, 0, 1, or nil
12433 *
12434 * Compares +self+ and +other+, using String#<=>.
12435 *
12436 * Returns:
12437 *
12438 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12439 * - +nil+, otherwise.
12440 *
12441 * Examples:
12442 *
12443 * :bar <=> :foo # => -1
12444 * :foo <=> :foo # => 0
12445 * :foo <=> :bar # => 1
12446 * :foo <=> 'bar' # => nil
12447 *
12448 * \Class \Symbol includes module Comparable,
12449 * each of whose methods uses Symbol#<=> for comparison.
12450 *
12451 * Related: String#<=>.
12452 */
12453
12454static VALUE
12455sym_cmp(VALUE sym, VALUE other)
12456{
12457 if (!SYMBOL_P(other)) {
12458 return Qnil;
12459 }
12460 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12461}
12462
12463/*
12464 * call-seq:
12465 * casecmp(object) -> -1, 0, 1, or nil
12466 *
12467 * :include: doc/symbol/casecmp.rdoc
12468 *
12469 */
12470
12471static VALUE
12472sym_casecmp(VALUE sym, VALUE other)
12473{
12474 if (!SYMBOL_P(other)) {
12475 return Qnil;
12476 }
12477 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12478}
12479
12480/*
12481 * call-seq:
12482 * casecmp?(object) -> true, false, or nil
12483 *
12484 * :include: doc/symbol/casecmp_p.rdoc
12485 *
12486 */
12487
12488static VALUE
12489sym_casecmp_p(VALUE sym, VALUE other)
12490{
12491 if (!SYMBOL_P(other)) {
12492 return Qnil;
12493 }
12494 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12495}
12496
12497/*
12498 * call-seq:
12499 * self =~ other -> integer or nil
12500 *
12501 * Equivalent to <tt>self.to_s =~ other</tt>,
12502 * including possible updates to global variables;
12503 * see String#=~.
12504 *
12505 */
12506
12507static VALUE
12508sym_match(VALUE sym, VALUE other)
12509{
12510 return rb_str_match(rb_sym2str(sym), other);
12511}
12512
12513/*
12514 * call-seq:
12515 * match(pattern, offset = 0) -> matchdata or nil
12516 * match(pattern, offset = 0) {|matchdata| } -> object
12517 *
12518 * Equivalent to <tt>self.to_s.match</tt>,
12519 * including possible updates to global variables;
12520 * see String#match.
12521 *
12522 */
12523
12524static VALUE
12525sym_match_m(int argc, VALUE *argv, VALUE sym)
12526{
12527 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12528}
12529
12530/*
12531 * call-seq:
12532 * match?(pattern, offset = 0) -> true or false
12533 *
12534 * Equivalent to <tt>self.to_s.match?</tt>;
12535 * see String#match?.
12536 *
12537 */
12538
12539static VALUE
12540sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12541{
12542 return rb_str_match_m_p(argc, argv, sym);
12543}
12544
12545/*
12546 * call-seq:
12547 * self[offset] -> string or nil
12548 * self[offset, size] -> string or nil
12549 * self[range] -> string or nil
12550 * self[regexp, capture = 0] -> string or nil
12551 * self[substring] -> string or nil
12552 *
12553 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12554 *
12555 */
12556
12557static VALUE
12558sym_aref(int argc, VALUE *argv, VALUE sym)
12559{
12560 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12561}
12562
12563/*
12564 * call-seq:
12565 * length -> integer
12566 *
12567 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12568 */
12569
12570static VALUE
12571sym_length(VALUE sym)
12572{
12573 return rb_str_length(rb_sym2str(sym));
12574}
12575
12576/*
12577 * call-seq:
12578 * empty? -> true or false
12579 *
12580 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12581 *
12582 */
12583
12584static VALUE
12585sym_empty(VALUE sym)
12586{
12587 return rb_str_empty(rb_sym2str(sym));
12588}
12589
12590/*
12591 * call-seq:
12592 * upcase(mapping) -> symbol
12593 *
12594 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12595 *
12596 * See String#upcase.
12597 *
12598 */
12599
12600static VALUE
12601sym_upcase(int argc, VALUE *argv, VALUE sym)
12602{
12603 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12604}
12605
12606/*
12607 * call-seq:
12608 * downcase(mapping) -> symbol
12609 *
12610 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12611 *
12612 * See String#downcase.
12613 *
12614 * Related: Symbol#upcase.
12615 *
12616 */
12617
12618static VALUE
12619sym_downcase(int argc, VALUE *argv, VALUE sym)
12620{
12621 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12622}
12623
12624/*
12625 * call-seq:
12626 * capitalize(mapping) -> symbol
12627 *
12628 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12629 *
12630 * See String#capitalize.
12631 *
12632 */
12633
12634static VALUE
12635sym_capitalize(int argc, VALUE *argv, VALUE sym)
12636{
12637 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12638}
12639
12640/*
12641 * call-seq:
12642 * swapcase(mapping) -> symbol
12643 *
12644 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12645 *
12646 * See String#swapcase.
12647 *
12648 */
12649
12650static VALUE
12651sym_swapcase(int argc, VALUE *argv, VALUE sym)
12652{
12653 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12654}
12655
12656/*
12657 * call-seq:
12658 * start_with?(*string_or_regexp) -> true or false
12659 *
12660 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12661 *
12662 */
12663
12664static VALUE
12665sym_start_with(int argc, VALUE *argv, VALUE sym)
12666{
12667 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12668}
12669
12670/*
12671 * call-seq:
12672 * end_with?(*strings) -> true or false
12673 *
12674 *
12675 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12676 *
12677 */
12678
12679static VALUE
12680sym_end_with(int argc, VALUE *argv, VALUE sym)
12681{
12682 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12683}
12684
12685/*
12686 * call-seq:
12687 * encoding -> encoding
12688 *
12689 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12690 *
12691 */
12692
12693static VALUE
12694sym_encoding(VALUE sym)
12695{
12696 return rb_obj_encoding(rb_sym2str(sym));
12697}
12698
12699static VALUE
12700string_for_symbol(VALUE name)
12701{
12702 if (!RB_TYPE_P(name, T_STRING)) {
12703 VALUE tmp = rb_check_string_type(name);
12704 if (NIL_P(tmp)) {
12705 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12706 name);
12707 }
12708 name = tmp;
12709 }
12710 return name;
12711}
12712
12713ID
12715{
12716 if (SYMBOL_P(name)) {
12717 return SYM2ID(name);
12718 }
12719 name = string_for_symbol(name);
12720 return rb_intern_str(name);
12721}
12722
12723VALUE
12725{
12726 if (SYMBOL_P(name)) {
12727 return name;
12728 }
12729 name = string_for_symbol(name);
12730 return rb_str_intern(name);
12731}
12732
12733/*
12734 * call-seq:
12735 * Symbol.all_symbols -> array_of_symbols
12736 *
12737 * Returns an array of all symbols currently in Ruby's symbol table:
12738 *
12739 * Symbol.all_symbols.size # => 9334
12740 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12741 *
12742 */
12743
12744static VALUE
12745sym_all_symbols(VALUE _)
12746{
12747 return rb_sym_all_symbols();
12748}
12749
12750VALUE
12751rb_str_to_interned_str(VALUE str)
12752{
12753 return rb_fstring(str);
12754}
12755
12756VALUE
12757rb_interned_str(const char *ptr, long len)
12758{
12759 struct RString fake_str = {RBASIC_INIT};
12760 int encidx = ENCINDEX_US_ASCII;
12761 int coderange = ENC_CODERANGE_7BIT;
12762 if (len > 0 && search_nonascii(ptr, ptr + len)) {
12763 encidx = ENCINDEX_ASCII_8BIT;
12764 coderange = ENC_CODERANGE_VALID;
12765 }
12766 VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
12767 ENC_CODERANGE_SET(str, coderange);
12768 return register_fstring(str, true, false);
12769}
12770
12771VALUE
12773{
12774 return rb_interned_str(ptr, strlen(ptr));
12775}
12776
12777VALUE
12778rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12779{
12780 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12781 rb_enc_autoload(enc);
12782 }
12783
12784 struct RString fake_str = {RBASIC_INIT};
12785 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12786}
12787
12788VALUE
12789rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12790{
12791 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12792 rb_enc_autoload(enc);
12793 }
12794
12795 struct RString fake_str = {RBASIC_INIT};
12796 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12797 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12798 return str;
12799}
12800
12801VALUE
12803{
12804 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12805}
12806
#if USE_YJIT || USE_ZJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * When the inlined encoding index of +str+ is the ASCII-8BIT one and the
 * integer value satisfies 0 <= value < 0xff, the value is appended as a
 * single byte with rb_str_buf_cat_byte().  Every other case is delegated
 * to rb_str_concat().
 *
 * NOTE(review): 0xff itself is excluded from the single-byte path by the
 * `< 0xff` bound and therefore goes through rb_str_concat(); confirm that
 * the exclusive bound is intended.
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int binary_p = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(binary_p)) {
        /* The integer conversion happens only on the binary path. */
        const ssize_t value = RB_NUM2SSIZE(codepoint);
        const int single_byte_p = (value >= 0 && value < 0xff);

        if (RB_LIKELY(single_byte_p)) {
            rb_str_buf_cat_byte(str, (char) value);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12823
12824static int
12825fstring_set_class_i(VALUE *str, void *data)
12826{
12827 RBASIC_SET_CLASS(*str, rb_cString);
12828
12829 return ST_CONTINUE;
12830}
12831
12832void
12833Init_String(void)
12834{
12836
12837 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12838
12840 rb_define_alloc_func(rb_cString, empty_str_alloc);
12841 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12842 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12843 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12845 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12846 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12849 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12850 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12851 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12852 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12855 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12856 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12857 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12858 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12861 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12862 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12863 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12864 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12865 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12867 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12869 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12870 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12871 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12872 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12873 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12874 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12875 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12876 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12877 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12878 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12879 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12880 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12881 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12882 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12884 rb_define_method(rb_cString, "+@", str_uplus, 0);
12885 rb_define_method(rb_cString, "-@", str_uminus, 0);
12886 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12887 rb_define_alias(rb_cString, "dedup", "-@");
12888
12889 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12890 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12891 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12892 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12895 rb_define_method(rb_cString, "undump", str_undump, 0);
12896
12897 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12898 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12899 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12900 sym_fold = ID2SYM(rb_intern_const("fold"));
12901
12902 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12903 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12904 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12905 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12906
12907 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12908 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12909 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12910 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12911
12912 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12913 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12914 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12915 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12916 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12917 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12918 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12919 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12920 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12921 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12922 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12923 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12925 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12926 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12927 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12928 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12929 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12930
12931 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12932 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12933 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12934
12935 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12936
12937 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12938 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12939 rb_define_method(rb_cString, "center", rb_str_center, -1);
12940
12941 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12942 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12943 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12944 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12945 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12946 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12947 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12948 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12949 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12950
12951 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12952 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12953 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12954 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12955 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12956 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12957 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12958 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12959 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12960
12961 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12962 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12963 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12964 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12965 rb_define_method(rb_cString, "count", rb_str_count, -1);
12966
12967 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12968 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12969 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12970 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12971
12972 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12973 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12974 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12975 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12976 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12977
12978 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12979
12980 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12981 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12982
12983 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12984 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12985
12986 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12987 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12988 rb_define_method(rb_cString, "b", rb_str_b, 0);
12989 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12990 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12991
12992 /* define UnicodeNormalize module here so that we don't have to look it up */
12993 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12994 id_normalize = rb_intern_const("normalize");
12995 id_normalized_p = rb_intern_const("normalized?");
12996
12997 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12998 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12999 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
13000
13001 rb_fs = Qnil;
13002 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
13003 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
13004 rb_gc_register_address(&rb_fs);
13005
13010 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
13011
13012 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
13013 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
13014 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
13015 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
13016 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
13017 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
13018
13019 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13020 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13021 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13022 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13023
13024 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13025 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13026 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13027 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13028 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13029 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13030 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13031
13032 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13033 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13034 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13035 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13036
13037 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13038 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13039
13040 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13041}
13042
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:696
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:404
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1704
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1497
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1610
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2864
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2674
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3154
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1018
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2943
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:130
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:133
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:131
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:128
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:125
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:122
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:127
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:129
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:126
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:134
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:477
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:661
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3967
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1431
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1427
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1434
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1425
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1429
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2254
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2272
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1325
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3650
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:235
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:141
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1313
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3334
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1347
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1212
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3057
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1231
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12778
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:255
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2337
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3774
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1160
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1452
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1353
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:973
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12802
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:829
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:755
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2714
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2977
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1742
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1120
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1207
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:712
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2043
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1091
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2049
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1949
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1230
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4287
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3779
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1484
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1910
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1757
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1517
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2490
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1584
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:946
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:940
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3839
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1428
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12394
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2563
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1404
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1751
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3085
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5384
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4202
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3192
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11715
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1791
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1793
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1682
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1194
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1533
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1008
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1523
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1999
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4188
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3607
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2426
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2017
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1640
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1568
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6598
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3200
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1147
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12772
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1434
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1605
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3805
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3132
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4309
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3426
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7277
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2795
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12757
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4256
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4076
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4231
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1693
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3781
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3317
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5871
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11773
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1626
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1707
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:632
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2979
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3289
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1657
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3408
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1206
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1550
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2749
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7384
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1416
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1723
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2440
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5786
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9391
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1200
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:968
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1855
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2047
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2126
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3474
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1731
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1024
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12724
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12714
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1849
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3489
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4531
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1375
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:131
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1446
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2956
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2814
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1440
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2827
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1784
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:122
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:531
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RString::@53::@55 embed
Embedded contents.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@53 as
String's specific fields.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@53::@54 heap
Strings that use separated memory region for contents use this pattern.
union RString::@53::@54::@56 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:229
Definition string.c:8271
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:308
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113