Ruby 3.5.0dev (2025-08-09 revision 2a6345e957c01f4495323723c7a3d7ac0d4ac339)
string.c (2a6345e957c01f4495323723c7a3d7ac0d4ac339)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
64
65#undef rb_str_new
66#undef rb_usascii_str_new
67#undef rb_utf8_str_new
68#undef rb_enc_str_new
69#undef rb_str_new_cstr
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
77#undef rb_str_buf_cat
78#undef rb_str_buf_cat2
79#undef rb_str_cat2
80#undef rb_str_cat_cstr
81#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents are
94 * on a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
#define RUBY_MAX_CHAR_LEN 16

/* RString flag bits defined by this file; the FL_USERn numbers match the
 * bit numbers listed in the "Flags of RString" table above. */
#define STR_PRECOMPUTED_HASH    FL_USER4
#define STR_SHARED_ROOT         FL_USER5
#define STR_BORROWED            FL_USER6
#define STR_TMPLOCK             FL_USER7
#define STR_NOFREE              FL_USER18
#define STR_FAKESTR             FL_USER19
137
/* Flag `str` as not embedded and clear its sharing-related flags. */
#define STR_SET_NOEMBED(str) do { \
    FL_SET((str), STR_NOEMBED); \
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED); \
} while (0)

/* Flag `str` as embedded by clearing the noembed, shared and nofree bits. */
#define STR_SET_EMBED(str) \
    FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` in the length field of `str`; nothing else is touched. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
/* Terminator length of `str`: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) \
     ? 1 \
     : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write `termlen` zero bytes at `ptr`.  The first byte is always stored
 * directly; memset() is used only when the terminator is wider than one
 * byte. */
#define TERM_FILL(ptr, termlen) do { \
    char *const tf_dst = (ptr); \
    const int tf_width = (termlen); \
    tf_dst[0] = '\0'; \
    if (UNLIKELY(tf_width > 1)) { \
        memset(tf_dst, 0, tf_width); \
    } \
} while (0)
176
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * of the string's current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: if the slot is too small, the contents are copied into
 *   a freshly allocated heap buffer and the string becomes non-embedded;
 *   otherwise nothing changes.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be shared (asserted).
 *
 * The macro arguments are parenthesized wherever they take part in
 * arithmetic, so expressions such as `cond ? a : b` expand correctly. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
/* Make `str` a shared string whose buffer belongs to `shared_str`.
 * Does nothing for fake strings.  Asserts that the pointer of `str` lies
 * within the buffer of `shared_str`, records the root through a write
 * barrier, flags `str` as shared and `shared_str` as a shared root, and
 * additionally flags a classless root as borrowed. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
212
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size: the recorded capacity plus the terminator length. */
#define STR_HEAP_SIZE(str) \
    ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif

/* Unless middle substrings are enabled, a substring is sharable only when
 * it extends to `end`. */
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/* True when an embedded object holding `len` content bytes plus `termlen`
 * terminator bytes has a size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
275
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/* Print a diagnostic to stderr naming `func`, the function that is about
 * to return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
/* Defined only when a long is as wide as a pointer; left undefined
 * otherwise. */
#if SIZEOF_LONG == SIZEOF_VOIDP
# define PRECOMPUTED_FAKESTR_HASH 1
#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
/* Return the fstring for `str`.
 *
 * - A string already flagged RSTRING_FSTR is returned unchanged.
 * - A non-bare string (see BARE_STRING_P) is not replaced by the table
 *   entry: it is frozen and returned itself, after being made to share the
 *   registered string's buffer when it reaches that point.
 * - A bare string yields the registered table entry.
 *
 * Raises TypeError through Check_Type() when `str` is not a T_STRING. */
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
/* Shared root that is not itself shared: returned as is.
 * NOTE(review): the listing jumps from line 421 to 423 here, so one
 * original source line is not visible. */
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
/* Resize to the current length unless the string is frozen, NOFREE or
 * chilled. */
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
/* Equality callback of the fstring table: two strings match when they have
 * the same byte length, the same encoding index and identical bytes. */
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
/* NOTE(review): the listing jumps from line 467 to 470 here, so two
 * original source lines are not visible. */
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* Options handed to fstring_concurrent_set_create() through its `data`
 * pointer.
 * NOTE(review): the opening line of this struct (listing line 478,
 * presumably `struct fstr_create_arg {`) is not visible in this listing. */
/* copy: for a fake string, build a copy of the bytes instead of a static
 * string. */
479 bool copy;
/* force_precompute_hash: for a copied fake string, prefer an embedded
 * layout that has room for the precomputed hash. */
480 bool force_precompute_hash;
481};
482
/* Create callback of the fstring table: build the string object that will
 * be stored in the table for `str`.  `data` points to a
 * struct fstr_create_arg.
 *
 * Fake strings are replaced by a real object: a copy when `arg->copy` is
 * set, otherwise a static string over the same pointer; the result is
 * frozen.  Other strings are replaced by a frozen copy when they are not
 * frozen or are chilled, made independent when shared, and copied again
 * when they are not bare strings.
 *
 * The result gets the coderange captured on entry and the RSTRING_FSTR
 * flag.
 *
 * NOTE(review): this listing skips original lines 531, 541-542 and 545,
 * so some statements of this function are not visible here. */
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
/* capacity that leaves room for the hash stored after the contents */
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
/* reuse the hash that register_fstring stored in capa, if it fits */
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
551static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
555 .free = NULL,
556};
557
558void
559Init_fstring_table(void)
560{
561 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
562 rb_gc_register_address(&fstring_table_obj);
563}
564
/* Find `str` in the fstring table, inserting a new entry when it is
 * absent, and return the table entry.
 *
 * `copy` and `force_precompute_hash` are forwarded to
 * fstring_concurrent_set_create() through a struct fstr_create_arg.
 *
 * NOTE(review): this listing skips original lines 584 and 587, so some
 * statements of this function are not visible here. */
565static VALUE
566register_fstring(VALUE str, bool copy, bool force_precompute_hash)
567{
568 struct fstr_create_arg args = {
569 .copy = copy,
570 .force_precompute_hash = force_precompute_hash
571 };
572
/* NOTE(review): this condition is the same one that defines
 * PRECOMPUTED_FAKESTR_HASH, which the consumers of the stored hash test
 * with #ifdef. */
573#if SIZEOF_VOIDP == SIZEOF_LONG
574 if (FL_TEST_RAW(str, STR_FAKESTR)) {
575 // if the string hasn't been interned, we'll need the hash twice, so we
576 // compute it once and store it in capa
577 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
578 }
579#endif
580
581 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
582
583 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
585 RUBY_ASSERT(OBJ_FROZEN(result));
586 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
588
589 return result;
590}
591
592bool
593rb_obj_is_fstring_table(VALUE obj)
594{
595 ASSERT_vm_locking();
596
597 return obj == fstring_table_obj;
598}
599
600void
601rb_gc_free_fstring(VALUE obj)
602{
603 // Assume locking and barrier (which there is no assert for)
604 ASSERT_vm_locking();
605
606 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
607
608 RB_DEBUG_COUNTER_INC(obj_str_fstr);
609
610 FL_UNSET(obj, RSTRING_FSTR);
611}
612
613void
614rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
615{
616 if (fstring_table_obj) {
617 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
618 }
619}
620
/* Initialize the caller-provided `fake_str` as a non-embedded, NOFREE,
 * FAKESTR String of class rb_cString that points at `name` with length
 * `len` and encoding index `encidx`.  A NULL `name` is replaced by "".
 * Returns `fake_str` cast to VALUE.
 *
 * NOTE(review): the listing jumps from line 627 to 629 inside the
 * `if (!name)` body, so one original source line is not visible. */
621static VALUE
622setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
623{
624 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
625 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
626
627 if (!name) {
629 name = "";
630 }
631
632 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
633
634 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
635 fake_str->len = len;
636 fake_str->as.heap.ptr = (char *)name;
/* capa starts out equal to len */
637 fake_str->as.heap.aux.capa = len;
638 return (VALUE)fake_str;
639}
640
641/*
642 * set up a fake string which refers a static string literal.
643 */
644VALUE
645rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
646{
647 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
648}
649
650/*
651 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
652 * shared string which refers a static string literal. `ptr` must
653 * point a constant string.
654 */
655VALUE
656rb_fstring_new(const char *ptr, long len)
657{
658 struct RString fake_str;
659 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
660}
661
662VALUE
663rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
664{
665 struct RString fake_str;
666 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
667}
668
669VALUE
670rb_fstring_cstr(const char *ptr)
671{
672 return rb_fstring_new(ptr, strlen(ptr));
673}
674
/* Conservative test for whether every character of `str` occupies exactly
 * one byte.  Returns true for ASCII-8BIT and US-ASCII, for strings whose
 * cached coderange is 7bit, and for encodings whose maximum character
 * length is 1; otherwise false.
 *
 * NOTE(review): the listing jumps from line 684 to 686 inside the UTF-8
 * case, so the statement that handles UTF-8 is not visible here. */
675static inline bool
676single_byte_optimizable(VALUE str)
677{
678 int encindex = ENCODING_GET(str);
679 switch (encindex) {
680 case ENCINDEX_ASCII_8BIT:
681 case ENCINDEX_US_ASCII:
682 return true;
683 case ENCINDEX_UTF_8:
684 // For UTF-8 it's worth scanning the string coderange when unknown.
686 }
687 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
688 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
689 return true;
690 }
691
692 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
693 return true;
694 }
695
696 /* Conservative. Possibly single byte.
697 * "\xa1" in Shift_JIS for example. */
698 return false;
699}
700
702
/* Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when there is none.
 *
 * Whole machine words are tested against NONASCII_MASK where possible.
 * Bytes before the first aligned word (only when unaligned word access is
 * not available) and the bytes left over after the last whole word are
 * checked one at a time by switches that fall through from case to case. */
703static inline const char *
704search_nonascii(const char *p, const char *e)
705{
706 const uintptr_t *s, *t;
707
/* NONASCII_MASK has 0x80 in every byte of a uintptr_t. */
708#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
709# if SIZEOF_UINTPTR_T == 8
710# define NONASCII_MASK UINT64_C(0x8080808080808080)
711# elif SIZEOF_UINTPTR_T == 4
712# define NONASCII_MASK UINT32_C(0x80808080)
713# else
714# error "don't know what to do."
715# endif
716#else
717# if SIZEOF_UINTPTR_T == 8
718# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
719# elif SIZEOF_UINTPTR_T == 4
720# define NONASCII_MASK 0x80808080UL /* or...? */
721# else
722# error "don't know what to do."
723# endif
724#endif
725
726 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
727#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, then check the `l` skipped bytes
 * through negative offsets; each case falls through to the next. */
728 if ((uintptr_t)p % SIZEOF_VOIDP) {
729 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
730 p += l;
731 switch (l) {
732 default: UNREACHABLE;
733#if SIZEOF_VOIDP > 4
734 case 7: if (p[-7]&0x80) return p-7;
735 case 6: if (p[-6]&0x80) return p-6;
736 case 5: if (p[-5]&0x80) return p-5;
737 case 4: if (p[-4]&0x80) return p-4;
738#endif
739 case 3: if (p[-3]&0x80) return p-3;
740 case 2: if (p[-2]&0x80) return p-2;
741 case 1: if (p[-1]&0x80) return p-1;
742 case 0: break;
743 }
744 }
745#endif
746#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
747#define aligned_ptr(value) \
748 __builtin_assume_aligned((value), sizeof(uintptr_t))
749#else
750#define aligned_ptr(value) (uintptr_t *)(value)
751#endif
752 s = aligned_ptr(p);
/* t bounds the word loop so that every word read ends at or before e */
753 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
754#undef aligned_ptr
755 for (;s < t; s++) {
756 if (*s & NONASCII_MASK) {
/* Turn the position of the first flagged bit into a byte offset (>>3);
 * which end is counted from depends on the byte order. */
757#ifdef WORDS_BIGENDIAN
758 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
759#else
760 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
761#endif
762 }
763 }
764 p = (const char *)s;
765 }
766
/* Check the remaining tail bytes relative to e, falling through from the
 * largest remaining count down to zero. */
767 switch (e - p) {
768 default: UNREACHABLE;
769#if SIZEOF_VOIDP > 4
770 case 7: if (e[-7]&0x80) return e-7;
771 case 6: if (e[-6]&0x80) return e-6;
772 case 5: if (e[-5]&0x80) return e-5;
773 case 4: if (e[-4]&0x80) return e-4;
774#endif
775 case 3: if (e[-3]&0x80) return e-3;
776 case 2: if (e[-2]&0x80) return e-2;
777 case 1: if (e[-1]&0x80) return e-1;
778 case 0: return NULL;
779 }
780}
781
/* Scan the `len` bytes at `p` in encoding `enc` and return a coderange
 * constant.  For ASCII-compatible encodings, runs of ASCII bytes are
 * skipped with search_nonascii() and only non-ASCII positions are checked
 * with rb_enc_precise_mbclen(); other encodings are walked character by
 * character.
 *
 * NOTE(review): this listing skips original lines 790, 798 and 808, so the
 * statement completing the ASCII-8BIT branch and the statements following
 * each rb_enc_precise_mbclen() call are not visible here. */
782static int
783coderange_scan(const char *p, long len, rb_encoding *enc)
784{
785 const char *e = p + len;
786
787 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
788 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
789 p = search_nonascii(p, e);
791 }
792
793 if (rb_enc_asciicompat(enc)) {
794 p = search_nonascii(p, e);
795 if (!p) return ENC_CODERANGE_7BIT;
796 for (;;) {
797 int ret = rb_enc_precise_mbclen(p, e, enc);
799 p += MBCLEN_CHARFOUND_LEN(ret);
800 if (p == e) break;
801 p = search_nonascii(p, e);
802 if (!p) break;
803 }
804 }
805 else {
806 while (p < e) {
807 int ret = rb_enc_precise_mbclen(p, e, enc);
809 p += MBCLEN_CHARFOUND_LEN(ret);
810 }
811 }
812 return ENC_CODERANGE_VALID;
813}
814
/* Coderange scan over [s, e) that can be resumed: `*cr` carries the
 * coderange found so far.  Returns the number of bytes consumed, which is
 * `p - s` when scanning stops at a sequence that is not a complete
 * character and `e - s` otherwise.
 *
 * If `*cr` is already ENC_CODERANGE_BROKEN the whole range is reported as
 * consumed without scanning.
 *
 * NOTE(review): this listing skips original lines 827, 839, 852 and 858,
 * so several statements (presumably updates of `*cr`) are not visible
 * here. */
815long
816rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
817{
818 const char *p = s;
819
820 if (*cr == ENC_CODERANGE_BROKEN)
821 return e - s;
822
823 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
824 /* enc is ASCII-8BIT. An ASCII-8BIT string can never be broken. */
825 if (*cr == ENC_CODERANGE_VALID) return e - s;
826 p = search_nonascii(p, e);
828 return e - s;
829 }
830 else if (rb_enc_asciicompat(enc)) {
831 p = search_nonascii(p, e);
832 if (!p) {
/* all ASCII: keep an earlier VALID result, otherwise record 7bit */
833 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
834 return e - s;
835 }
836 for (;;) {
837 int ret = rb_enc_precise_mbclen(p, e, enc);
838 if (!MBCLEN_CHARFOUND_P(ret)) {
840 return p - s;
841 }
842 p += MBCLEN_CHARFOUND_LEN(ret);
843 if (p == e) break;
844 p = search_nonascii(p, e);
845 if (!p) break;
846 }
847 }
848 else {
849 while (p < e) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 }
857 }
859 return e - s;
860}
861
862static inline void
863str_enc_copy(VALUE str1, VALUE str2)
864{
865 rb_enc_set_index(str1, ENCODING_GET(str2));
866}
867
868/* Like str_enc_copy, but does not check frozen status of str1.
869 * You should use this only if you're certain that str1 is not frozen. */
870static inline void
871str_enc_copy_direct(VALUE str1, VALUE str2)
872{
873 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
874 if (inlined_encoding == ENCODING_INLINE_MAX) {
875 rb_enc_set_index(str1, rb_enc_get_index(str2));
876 }
877 else {
878 ENCODING_SET_INLINED(str1, inlined_encoding);
879 }
880}
881
/* Copy the encoding, and derive a coderange, for `dest`, a new string made
 * from a part of `src`.
 *
 * NOTE(review): this listing skips original lines 891, 893, 897-898, 900,
 * 903 and 905, so the case labels and the statements that set the
 * coderange are not visible here. */
882static void
883rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
884{
885 /* this function is designed for copying encoding and coderange
886 * from src to new string "dest" which is made from the part of src.
887 */
888 str_enc_copy(dest, src);
/* empty destination: handled separately, depending on whether the source
 * encoding is ASCII-compatible */
889 if (RSTRING_LEN(dest) == 0) {
890 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
892 else
894 return;
895 }
896 switch (ENC_CODERANGE(src)) {
899 break;
901 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
902 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
904 else
906 break;
907 default:
908 break;
909 }
910}
911
/* Copy the encoding of `src` onto `dest`.
 * NOTE(review): the listing jumps from line 915 to 917, so one original
 * statement (presumably the coderange copy, going by the function name)
 * is not visible here. */
912static void
913rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
914{
915 str_enc_copy(dest, src);
917}
918
919static int
920enc_coderange_scan(VALUE str, rb_encoding *enc)
921{
922 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
923}
924
925int
926rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
927{
928 return enc_coderange_scan(str, enc);
929}
930
/* Return the coderange of `str`, scanning the string and caching the
 * result in its flags when the cached value is ENC_CODERANGE_UNKNOWN.
 * NOTE(review): the declarator line (listing line 932) is not visible in
 * this listing; presumably `rb_enc_str_coderange(VALUE str)` - confirm. */
931int
933{
934 int cr = ENC_CODERANGE(str);
935
936 if (cr == ENC_CODERANGE_UNKNOWN) {
937 cr = enc_coderange_scan(str, get_encoding(str));
938 ENC_CODERANGE_SET(str, cr);
939 }
940 return cr;
941}
942
943static inline bool
944rb_enc_str_asciicompat(VALUE str)
945{
946 int encindex = ENCODING_GET_INLINED(str);
947 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
948}
949
/* Dispatches on the cached coderange of `str` to decide an ASCII-related
 * predicate.
 * NOTE(review): the declarator line (listing line 951) and the case labels
 * (lines 954 and 956) are not visible in this listing; presumably this is
 * `rb_enc_str_asciionly_p(VALUE str)` - confirm. */
950int
952{
953 switch(ENC_CODERANGE(str)) {
955 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
957 return true;
958 default:
959 return false;
960 }
961}
962
963static inline void
964str_mod_check(VALUE s, const char *p, long len)
965{
966 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
967 rb_raise(rb_eRuntimeError, "string modified");
968 }
969}
970
971static size_t
972str_capacity(VALUE str, const int termlen)
973{
974 if (STR_EMBED_P(str)) {
975 return str_embed_capa(str) - termlen;
976 }
977 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
978 return RSTRING(str)->len;
979 }
980 else {
981 return RSTRING(str)->as.heap.aux.capa;
982 }
983}
984
/* str_capacity() using the terminator length of the string's encoding.
 * NOTE(review): the declarator line (listing line 986) is not visible in
 * this listing; presumably `rb_str_capacity(VALUE str)` - confirm. */
985size_t
987{
988 return str_capacity(str, TERM_LEN(str));
989}
990
991static inline void
992must_not_null(const char *ptr)
993{
994 if (!ptr) {
995 rb_raise(rb_eArgError, "NULL pointer given");
996 }
997}
998
/* Allocate a String object of class `klass` sized to embed `capa` bytes.
 * Asserts that the computed object size is positive and allocatable by the
 * GC.
 * NOTE(review): the listing jumps from line 1006 to 1008, so the remaining
 * arguments of the NEWOBJ_OF() call are not visible here. */
999static inline VALUE
1000str_alloc_embed(VALUE klass, size_t capa)
1001{
1002 size_t size = rb_str_embed_size(capa);
1003 RUBY_ASSERT(size > 0);
1004 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1005
1006 NEWOBJ_OF(str, struct RString, klass,
1008
1009 return (VALUE)str;
1010}
1011
1012static inline VALUE
1013str_alloc_heap(VALUE klass)
1014{
1015 NEWOBJ_OF(str, struct RString, klass,
1016 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1017
1018 return (VALUE)str;
1019}
1020
/* Allocate an embedded string of class `klass` with zero requested
 * capacity and zero-fill its whole embedded area.
 * NOTE(review): the listing jumps from line 1026 to 1028, so one original
 * statement is not visible here. */
1021static inline VALUE
1022empty_str_alloc(VALUE klass)
1023{
1024 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1025 VALUE str = str_alloc_embed(klass, 0);
1026 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1028 return str;
1029}
1030
1031static VALUE
1032str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1033{
1034 VALUE str;
1035
1036 if (len < 0) {
1037 rb_raise(rb_eArgError, "negative string size (or size too big)");
1038 }
1039
1040 if (enc == NULL) {
1041 enc = rb_ascii8bit_encoding();
1042 }
1043
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045
1046 int termlen = rb_enc_mbminlen(enc);
1047
1048 if (STR_EMBEDDABLE_P(len, termlen)) {
1049 str = str_alloc_embed(klass, len + termlen);
1050 if (len == 0) {
1051 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1052 }
1053 }
1054 else {
1055 str = str_alloc_heap(klass);
1056 RSTRING(str)->as.heap.aux.capa = len;
1057 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1058 * integer overflow. If we can STATIC_ASSERT that, the following
1059 * mul_add_mul can be reverted to a simple ALLOC_N. */
1060 RSTRING(str)->as.heap.ptr =
1061 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1062 }
1063
1064 rb_enc_raw_set(str, enc);
1065
1066 if (ptr) {
1067 memcpy(RSTRING_PTR(str), ptr, len);
1068 }
1069
1070 STR_SET_LEN(str, len);
1071 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1072 return str;
1073}
1074
1075static VALUE
1076str_new(VALUE klass, const char *ptr, long len)
1077{
1078 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1079}
1080
1081VALUE
1082rb_str_new(const char *ptr, long len)
1083{
1084 return str_new(rb_cString, ptr, len);
1085}
1086
1087VALUE
1088rb_usascii_str_new(const char *ptr, long len)
1089{
1090 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1091}
1092
1093VALUE
1094rb_utf8_str_new(const char *ptr, long len)
1095{
1096 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1097}
1098
1099VALUE
1100rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1101{
1102 return str_enc_new(rb_cString, ptr, len, enc);
1103}
1104
/* Create a string from the NUL-terminated C string `ptr`; a NULL `ptr`
 * raises ArgumentError through must_not_null().
 * NOTE(review): the declarator line (listing line 1106) is not visible in
 * this listing; presumably `rb_str_new_cstr(const char *ptr)` - confirm. */
1105VALUE
1107{
1108 must_not_null(ptr);
1109 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1110 * memory regions, and that cannot be detected by the MSAN. Just
1111 * trust the programmer that the argument passed here is a sane C
1112 * string. */
1113 __msan_unpoison_string(ptr);
1114 return rb_str_new(ptr, strlen(ptr));
1115}
1116
/* Delegates to rb_enc_str_new_cstr() with the US-ASCII encoding.
 * NOTE(review): the declarator line (listing line 1118) is not visible in
 * this listing; presumably `rb_usascii_str_new_cstr(const char *ptr)` -
 * confirm. */
1117VALUE
1119{
1120 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1121}
1122
/* Delegates to rb_enc_str_new_cstr() with the UTF-8 encoding.
 * NOTE(review): the declarator line (listing line 1124) is not visible in
 * this listing; presumably `rb_utf8_str_new_cstr(const char *ptr)` -
 * confirm. */
1123VALUE
1125{
1126 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1127}
1128
/* Create a string in encoding `enc` from the NUL-terminated C string
 * `ptr`.  Raises ArgumentError when `ptr` is NULL or when the minimum
 * character length of `enc` is not 1, since strlen() is used to find the
 * length.
 * NOTE(review): the declarator line (listing line 1130) is not visible in
 * this listing; presumably
 * `rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)` - confirm. */
1129VALUE
1131{
1132 must_not_null(ptr);
1133 if (rb_enc_mbminlen(enc) != 1) {
1134 rb_raise(rb_eArgError, "wchar encoding given");
1135 }
1136 return rb_enc_str_new(ptr, strlen(ptr), enc);
1137}
1138
1139static VALUE
1140str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1141{
1142 VALUE str;
1143
1144 if (len < 0) {
1145 rb_raise(rb_eArgError, "negative string size (or size too big)");
1146 }
1147
1148 if (!ptr) {
1149 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1150 }
1151 else {
1152 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1153 str = str_alloc_heap(klass);
1154 RSTRING(str)->len = len;
1155 RSTRING(str)->as.heap.ptr = (char *)ptr;
1156 RSTRING(str)->as.heap.aux.capa = len;
1157 RBASIC(str)->flags |= STR_NOFREE;
1158 rb_enc_associate_index(str, encindex);
1159 }
1160 return str;
1161}
1162
1163VALUE
1164rb_str_new_static(const char *ptr, long len)
1165{
1166 return str_new_static(rb_cString, ptr, len, 0);
1167}
1168
/* str_new_static() for class rb_cString with the US-ASCII encoding index.
 * NOTE(review): the declarator line (listing line 1170) is not visible in
 * this listing; presumably
 * `rb_usascii_str_new_static(const char *ptr, long len)` - confirm. */
1169VALUE
1171{
1172 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1173}
1174
/* str_new_static() for class rb_cString with the UTF-8 encoding index.
 * NOTE(review): the declarator line (listing line 1176) is not visible in
 * this listing; presumably
 * `rb_utf8_str_new_static(const char *ptr, long len)` - confirm. */
1175VALUE
1177{
1178 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1179}
1180
/* str_new_static() for class rb_cString with the index of `enc`.
 * NOTE(review): the declarator line (listing line 1182) is not visible in
 * this listing; presumably
 * `rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)` -
 * confirm. */
1181VALUE
1183{
1184 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1185}
1186
1187static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1188 rb_encoding *from, rb_encoding *to,
1189 int ecflags, VALUE ecopts);
1190
1191static inline bool
1192is_enc_ascii_string(VALUE str, rb_encoding *enc)
1193{
1194 int encidx = rb_enc_to_index(enc);
1195 if (rb_enc_get_index(str) == encidx)
1196 return is_ascii_string(str);
1197 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1198}
1199
1200VALUE
1201rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1202{
1203 long len;
1204 const char *ptr;
1205 VALUE newstr;
1206
1207 if (!to) return str;
1208 if (!from) from = rb_enc_get(str);
1209 if (from == to) return str;
1210 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1211 rb_is_ascii8bit_enc(to)) {
1212 if (STR_ENC_GET(str) != to) {
1213 str = rb_str_dup(str);
1214 rb_enc_associate(str, to);
1215 }
1216 return str;
1217 }
1218
1219 RSTRING_GETMEM(str, ptr, len);
1220 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1221 from, to, ecflags, ecopts);
1222 if (NIL_P(newstr)) {
1223 /* some error, return original */
1224 return str;
1225 }
1226 return newstr;
1227}
1228
1229VALUE
1230rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1231 rb_encoding *from, int ecflags, VALUE ecopts)
1232{
1233 long olen;
1234
1235 olen = RSTRING_LEN(newstr);
1236 if (ofs < -olen || olen < ofs)
1237 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1238 if (ofs < 0) ofs += olen;
1239 if (!from) {
1240 STR_SET_LEN(newstr, ofs);
1241 return rb_str_cat(newstr, ptr, len);
1242 }
1243
1244 rb_str_modify(newstr);
1245 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1246 rb_enc_get(newstr),
1247 ecflags, ecopts);
1248}
1249
/* Resets +str+ to length 0, associates it with +enc+, then appends +len+
 * bytes from +ptr+ with rb_str_cat().  Returns +str+. */
1250VALUE
1251rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1252{
1253 STR_SET_LEN(str, 0);
1254 rb_enc_associate(str, enc);
1255 rb_str_cat(str, ptr, len);
1256 return str;
1257}
1258
/*
 * Transcodes +len+ bytes at +ptr+ from +from+ to +to+ and stores the output
 * in +newstr+ starting at byte offset +ofs+, growing +newstr+ as needed.
 *
 * Returns +newstr+ (length set to the end of the converted output and
 * associated with +to+) when the converter reports econv_finished, and Qnil
 * when the converter cannot be opened or finishes with any other result.
 *
 * NOTE(review): two lines of this definition are absent from this listing:
 * the declaration of `ret` and the final operand of the `while` condition.
 */
1259static VALUE
1260str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1261 rb_encoding *from, rb_encoding *to,
1262 int ecflags, VALUE ecopts)
1263{
1264 rb_econv_t *ec;
1266 long olen;
1267 VALUE econv_wrapper;
1268 const unsigned char *start, *sp;
1269 unsigned char *dest, *dp;
1270 size_t converted_output = (size_t)ofs;
1271
1272 olen = rb_str_capacity(newstr);
1273
    /* The converter pointer is parked in a class-less wrapper object while
     * the conversion runs; it is detached again before rb_econv_close(). */
1274 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1275 RBASIC_CLEAR_CLASS(econv_wrapper);
1276 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1277 if (!ec) return Qnil;
1278 DATA_PTR(econv_wrapper) = ec;
1279
1280 sp = (unsigned char*)ptr;
1281 start = sp;
    /* dest/dp are recomputed on every iteration because rb_str_resize()
     * below may change the buffer address of newstr. */
1282 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1283 (dp = dest + converted_output),
1284 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1286 /* destination buffer short */
1287 size_t converted_input = sp - start;
1288 size_t rest = len - converted_input;
1289 converted_output = dp - dest;
1290 rb_str_set_len(newstr, converted_output);
    /* Estimate the space still needed from the output/input ratio seen so
     * far; fall back to doubling when no ratio is available or the
     * multiplication could exceed LONG_MAX. */
1291 if (converted_input && converted_output &&
1292 rest < (LONG_MAX / converted_output)) {
1293 rest = (rest * converted_output) / converted_input;
1294 }
1295 else {
1296 rest = olen;
1297 }
1298 olen += rest < 2 ? 2 : rest;
1299 rb_str_resize(newstr, olen);
1300 }
1301 DATA_PTR(econv_wrapper) = 0;
1302 RB_GC_GUARD(econv_wrapper);
1303 rb_econv_close(ec);
1304 switch (ret) {
1305 case econv_finished:
1306 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1307 rb_str_set_len(newstr, len);
1308 rb_enc_associate(newstr, to);
1309 return newstr;
1310
1311 default:
1312 return Qnil;
1313 }
1314}
1315
/* Calls rb_str_conv_enc_opts() with ecflags 0 and ecopts Qnil.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`, `from` and `to`. */
1316VALUE
1318{
1319 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1320}
1321
/*
 * Builds a string from +len+ bytes at `ptr` that are in encoding `eenc`,
 * converting to the default internal encoding when one is set and a
 * conversion is required; each early return below covers a case where the
 * bytes are used unconverted.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `ptr`, `len` and `eenc`.
 */
1322VALUE
1324{
1325 rb_encoding *ienc;
1326 VALUE str;
1327 const int eidx = rb_enc_to_index(eenc);
1328
1329 if (!ptr) {
1330 return rb_enc_str_new(ptr, len, eenc);
1331 }
1332
1333 /* ASCII-8BIT case, no conversion */
1334 if ((eidx == rb_ascii8bit_encindex()) ||
1335 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1336 return rb_str_new(ptr, len);
1337 }
1338 /* no default_internal or same encoding, no conversion */
1339 ienc = rb_default_internal_encoding();
1340 if (!ienc || eenc == ienc) {
1341 return rb_enc_str_new(ptr, len, eenc);
1342 }
1343 /* ASCII compatible, and ASCII only string, no conversion in
1344 * default_internal */
1345 if ((eidx == rb_ascii8bit_encindex()) ||
1346 (eidx == rb_usascii_encindex()) ||
1347 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1348 return rb_enc_str_new(ptr, len, ienc);
1349 }
1350 /* convert from the given encoding to default_internal */
1351 str = rb_enc_str_new(NULL, 0, ienc);
1352 /* when the conversion failed for some reason, just ignore the
1353 * default_internal and result in the given encoding as-is. */
1354 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1355 rb_str_initialize(str, ptr, len, eenc);
1356 }
1357 return str;
1358}
1359
1360VALUE
1361rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1362{
1363 int eidx = rb_enc_to_index(eenc);
1364 if (eidx == rb_usascii_encindex() &&
1365 !is_ascii_string(str)) {
1366 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1367 return str;
1368 }
1369 rb_enc_associate_index(str, eidx);
1370 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1371}
1372
/* Same as rb_external_str_new_with_enc() with the default external
 * encoding as the source encoding. */
1373VALUE
1374rb_external_str_new(const char *ptr, long len)
1375{
1376 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1377}
1378
/* Passes `ptr` with its strlen() to rb_external_str_new_with_enc() using the
 * default external encoding; `ptr` is handed to strlen() unchecked.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `ptr`. */
1379VALUE
1381{
1382 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1383}
1384
/* Same as rb_external_str_new_with_enc() with the locale encoding as the
 * source encoding. */
1385VALUE
1386rb_locale_str_new(const char *ptr, long len)
1387{
1388 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1389}
1390
/* Passes `ptr` with its strlen() to rb_external_str_new_with_enc() using the
 * locale encoding; `ptr` is handed to strlen() unchecked.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `ptr`. */
1391VALUE
1393{
1394 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1395}
1396
/* Calls rb_external_str_new_with_enc() with the filesystem encoding.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `ptr` and `len`. */
1397VALUE
1399{
1400 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1401}
1402
/* Passes `ptr` with its strlen() to rb_external_str_new_with_enc() using the
 * filesystem encoding; `ptr` is handed to strlen() unchecked.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `ptr`. */
1403VALUE
1405{
1406 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1407}
1408
/* Calls rb_str_export_to_enc() with the default external encoding.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
1409VALUE
1411{
1412 return rb_str_export_to_enc(str, rb_default_external_encoding());
1413}
1414
/* Calls rb_str_export_to_enc() with the locale encoding.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
1415VALUE
1417{
1418 return rb_str_export_to_enc(str, rb_locale_encoding());
1419}
1420
/* Converts `str` from its own encoding (STR_ENC_GET) to `enc` through
 * rb_str_conv_enc().
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str` and `enc`. */
1421VALUE
1423{
1424 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1425}
1426
/*
 * Makes +str2+ hold the contents of +str+ and returns +str2+.  Encoding
 * information is not touched here.
 *
 * If the bytes of +str+ plus its terminator fit in the embed capacity of
 * +str2+, they are copied.  Otherwise +str2+ is pointed at the buffer of a
 * frozen root: the root +str+ already shares, or a frozen string obtained
 * from rb_str_new_frozen(str).
 */
1427static VALUE
1428str_replace_shared_without_enc(VALUE str2, VALUE str)
1429{
1430 const int termlen = TERM_LEN(str);
1431 char *ptr;
1432 long len;
1433
1434 RSTRING_GETMEM(str, ptr, len);
1435 if (str_embed_capa(str2) >= len + termlen) {
1436 char *ptr2 = RSTRING(str2)->as.embed.ary;
1437 STR_SET_EMBED(str2);
1438 memcpy(ptr2, RSTRING_PTR(str), len);
1439 TERM_FILL(ptr2+len, termlen);
1440 }
1441 else {
1442 VALUE root;
1443 if (STR_SHARED_P(str)) {
1444 root = RSTRING(str)->as.heap.aux.shared;
1445 RSTRING_GETMEM(str, ptr, len);
1446 }
1447 else {
1448 root = rb_str_new_frozen(str);
1449 RSTRING_GETMEM(root, ptr, len);
1450 }
1451 RUBY_ASSERT(OBJ_FROZEN(root));
1452
    /* Release the heap buffer str2 currently owns, unless it is shared or
     * STR_NOFREE, or it is the very buffer about to be installed. */
1453 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1454 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1455 rb_fatal("about to free a possible shared root");
1456 }
1457 char *ptr2 = STR_HEAP_PTR(str2);
1458 if (ptr2 != ptr) {
1459 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1460 }
1461 }
1462 FL_SET(str2, STR_NOEMBED);
1463 RSTRING(str2)->as.heap.ptr = ptr;
1464 STR_SET_SHARED(str2, root);
1465 }
1466
1467 STR_SET_LEN(str2, len);
1468
1469 return str2;
1470}
1471
/* Like str_replace_shared_without_enc(), then copies encoding and code
 * range from +str+ to +str2+ with rb_enc_cr_str_exact_copy().
 * Returns +str2+. */
1472static VALUE
1473str_replace_shared(VALUE str2, VALUE str)
1474{
1475 str_replace_shared_without_enc(str2, str);
1476 rb_enc_cr_str_exact_copy(str2, str);
1477 return str2;
1478}
1479
/* Allocates a fresh heap string of class +klass+ and fills it from +str+
 * through str_replace_shared(). */
1480static VALUE
1481str_new_shared(VALUE klass, VALUE str)
1482{
1483 return str_replace_shared(str_alloc_heap(klass), str);
1484}
1485
/* Calls str_new_shared() with the class of `str`.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
1486VALUE
1488{
1489 return str_new_shared(rb_obj_class(str), str);
1490}
1491
/* Returns `orig` itself when, of the FL_FREEZE and STR_CHILLED bits, exactly
 * FL_FREEZE is set; otherwise returns str_new_frozen() for the class of
 * `orig`.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `orig`. */
1492VALUE
1494{
1495 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1496 return str_new_frozen(rb_obj_class(orig), orig);
1497}
1498
1499static VALUE
1500rb_str_new_frozen_String(VALUE orig)
1501{
1502 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1503 return str_new_frozen(rb_cString, orig);
1504}
1505
1506
/* Returns +orig+ itself when BARE_STRING_P(orig) holds and it is already
 * frozen (the expected common case); otherwise returns a frozen rb_cString
 * built by str_new_frozen(). */
1507VALUE
1508rb_str_frozen_bare_string(VALUE orig)
1509{
1510 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1511 return str_new_frozen(rb_cString, orig);
1512}
1513
/* Returns +orig+ when it is already frozen; otherwise returns a frozen
 * string with class 0 from str_new_frozen_buffer(), without copying the
 * encoding (copy_encoding is FALSE). */
1514VALUE
1515rb_str_tmp_frozen_acquire(VALUE orig)
1516{
1517 if (OBJ_FROZEN_RAW(orig)) return orig;
1518 return str_new_frozen_buffer(0, orig, FALSE);
1519}
1520
/*
 * Returns a frozen string for +orig+ that is never embedded.
 *
 * +orig+ is returned as is when it is frozen, not embedded and
 * rb_str_reembeddable_p() is false.  A shared +orig+ whose root is not
 * embedded is delegated to rb_str_tmp_frozen_acquire().  Otherwise a new
 * class-less, frozen heap string flagged STR_SHARED_ROOT is built: either
 * with a fresh copy of the bytes, or by taking over the heap buffer of
 * +orig+ and turning +orig+ into a string shared with the new one.
 */
1521VALUE
1522rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1523{
1524 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1525 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1526
1527 VALUE str = str_alloc_heap(0);
1528 OBJ_FREEZE(str);
1529 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1530 FL_SET(str, STR_SHARED_ROOT);
1531
1532 size_t capa = str_capacity(orig, TERM_LEN(orig));
1533
1534 /* If the string is embedded then we want to create a copy that is heap
1535 * allocated. If the string is shared then the shared root must be
1536 * embedded, so we want to create a copy. If the string is a shared root
1537 * then it must be embedded, so we want to create a copy. */
1538 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1539 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1540 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1541 }
1542 else {
1543 /* orig must be heap allocated and not shared, so we can safely transfer
1544 * the pointer to str. */
1545 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1546 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1547 RBASIC(orig)->flags &= ~STR_NOFREE;
1548 STR_SET_SHARED(orig, str);
1549 }
1550
1551 RSTRING(str)->len = RSTRING(orig)->len;
1552 RSTRING(str)->as.heap.aux.capa = capa;
1553
1554 return str;
1555}
1556
1557void
1558rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1559{
1560 if (RBASIC_CLASS(tmp) != 0)
1561 return;
1562
1563 if (STR_EMBED_P(tmp)) {
1565 }
1566 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1567 !OBJ_FROZEN_RAW(orig)) {
1568 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1569
1570 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1571 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1572 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1573
1574 /* Unshare orig since the root (tmp) only has this one child. */
1575 FL_UNSET_RAW(orig, STR_SHARED);
1576 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1577 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1579
1580 /* Make tmp embedded and empty so it is safe for sweeping. */
1581 STR_SET_EMBED(tmp);
1582 STR_SET_LEN(tmp, 0);
1583 }
1584 }
1585}
1586
/* Frozen copy of +orig+ with class +klass+, encoding included
 * (str_new_frozen_buffer() with copy_encoding TRUE). */
1587static VALUE
1588str_new_frozen(VALUE klass, VALUE orig)
1589{
1590 return str_new_frozen_buffer(klass, orig, TRUE);
1591}
1592
/*
 * Creates a heap string of class +klass+ that takes over the buffer of
 * +orig+ (pointer, capacity and the STR_NOFREE flag), then turns +orig+
 * into a string shared with the new one via STR_SET_SHARED().
 *
 * +orig+ must be neither embedded nor already shared (asserted).  When
 * +klass+ is 0, STR_BORROWED is cleared on the new string.
 */
1593static VALUE
1594heap_str_make_shared(VALUE klass, VALUE orig)
1595{
1596 RUBY_ASSERT(!STR_EMBED_P(orig));
1597 RUBY_ASSERT(!STR_SHARED_P(orig));
1598
1599 VALUE str = str_alloc_heap(klass);
1600 STR_SET_LEN(str, RSTRING_LEN(orig));
1601 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1602 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
    /* STR_NOFREE moves with the buffer: set on str, cleared on orig. */
1603 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1604 RBASIC(orig)->flags &= ~STR_NOFREE;
1605 STR_SET_SHARED(orig, str);
1606 if (klass == 0)
1607 FL_UNSET_RAW(str, STR_BORROWED);
1608 return str;
1609}
1610
/*
 * Returns a frozen string of class +klass+ with the contents of +orig+.
 *
 * With +copy_encoding+ false the result is created as ASCII-8BIT with a
 * one-byte terminator and the encoding of +orig+ is not copied.  Short or
 * embedded contents are copied; a shared +orig+ reuses its root (returning
 * the root itself when it covers exactly the same bytes with the same class
 * and encoding); any other heap string hands its buffer to the new string
 * through heap_str_make_shared().
 *
 * NOTE(review): one line after the three range assertions is absent from
 * this listing.
 */
1611static VALUE
1612str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1613{
1614 VALUE str;
1615
1616 long len = RSTRING_LEN(orig);
1617 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1618 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1619
1620 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1621 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1622 RUBY_ASSERT(STR_EMBED_P(str));
1623 }
1624 else {
1625 if (FL_TEST_RAW(orig, STR_SHARED)) {
1626 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
    /* ofs/rest: bytes of the root before and after the slice held by orig. */
1627 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1628 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1629 RUBY_ASSERT(ofs >= 0);
1630 RUBY_ASSERT(rest >= 0);
1631 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1633
1634 if ((ofs > 0) || (rest > 0) ||
1635 (klass != RBASIC(shared)->klass) ||
1636 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1637 str = str_new_shared(klass, shared);
1638 RUBY_ASSERT(!STR_EMBED_P(str));
1639 RSTRING(str)->as.heap.ptr += ofs;
1640 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1641 }
1642 else {
    /* The root already matches; a class-less root handed out this way is
     * flagged STR_BORROWED. */
1643 if (RBASIC_CLASS(shared) == 0)
1644 FL_SET_RAW(shared, STR_BORROWED);
1645 return shared;
1646 }
1647 }
1648 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1649 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1650 STR_SET_EMBED(str);
1651 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1652 STR_SET_LEN(str, RSTRING_LEN(orig));
1653 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1654 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1655 }
1656 else {
1657 str = heap_str_make_shared(klass, orig);
1658 }
1659 }
1660
1661 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1662 OBJ_FREEZE(str);
1663 return str;
1664}
1665
/* Creates a string from +len+ bytes at +ptr+ that has the class and the
 * encoding of +obj+. */
1666VALUE
1667rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1668{
1669 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1670}
1671
/* Returns a new empty string (rb_str_new(0, 0)) carrying the encoding of
 * +str+, copied with rb_enc_copy(). */
1672static VALUE
1673str_new_empty_String(VALUE str)
1674{
1675 VALUE v = rb_str_new(0, 0);
1676 rb_enc_copy(v, str);
1677 return v;
1678}
1679
1680#define STR_BUF_MIN_SIZE 63
1681
/*
 * Creates an rb_cString with room for `capa` bytes plus a one-byte
 * terminator: embedded when STR_EMBEDDABLE_P(capa, 1) holds, otherwise
 * backed by a heap buffer of capa + 1 bytes whose first byte is set to NUL.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `capa`.
 */
1682VALUE
1684{
1685 if (STR_EMBEDDABLE_P(capa, 1)) {
1686 return str_alloc_embed(rb_cString, capa + 1);
1687 }
1688
1689 VALUE str = str_alloc_heap(rb_cString);
1690
1691 RSTRING(str)->as.heap.aux.capa = capa;
1692 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1693 RSTRING(str)->as.heap.ptr[0] = '\0';
1694
1695 return str;
1696}
1697
/* Creates a buffer string sized to strlen(ptr) with rb_str_buf_new() and
 * appends the bytes of `ptr` to it; `ptr` is handed to strlen() unchecked.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `ptr`. */
1698VALUE
1700{
1701 VALUE str;
1702 long len = strlen(ptr);
1703
1704 str = rb_str_buf_new(len);
1705 rb_str_buf_cat(str, ptr, len);
1706
1707 return str;
1708}
1709
/* Calls str_new() with class 0, a null pointer and `len`.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `len`. */
1710VALUE
1712{
1713 return str_new(0, 0, len);
1714}
1715
/*
 * Frees the heap buffer of `str` when it has one of its own: embedded
 * strings and strings flagged STR_SHARED or STR_NOFREE only bump debug
 * counters.
 *
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`.
 * NOTE(review): both counter lines in the middle branch increment
 * obj_str_shared, including the one conditioned on STR_NOFREE -- confirm
 * that is intended.
 */
1716void
1718{
1719 if (STR_EMBED_P(str)) {
1720 RB_DEBUG_COUNTER_INC(obj_str_embed);
1721 }
1722 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1724 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1725 }
1726 else {
1727 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1728 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1729 }
1730}
1731
1732size_t
1733rb_str_memsize(VALUE str)
1734{
1735 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1736 return STR_HEAP_SIZE(str);
1737 }
1738 else {
1739 return 0;
1740 }
1741}
1742
/* Converts `str` to T_STRING through rb_convert_type_with_id() using the
 * idTo_str method id.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
1743VALUE
1745{
1746 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1747}
1748
1749static inline void str_discard(VALUE str);
1750static void str_shared_replace(VALUE str, VALUE str2);
1751
/* Calls str_shared_replace() unless both arguments are the same object.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str` and `str2`. */
1752void
1754{
1755 if (str != str2) str_shared_replace(str, str2);
1756}
1757
/*
 * Moves the contents, encoding and code range of +str2+ into +str+.
 *
 * The old contents of +str+ are discarded first.  If the bytes of +str2+
 * plus terminator fit in the embed capacity of +str+ they are copied and
 * +str2+ is left as it was.  Otherwise +str+ takes over the heap buffer of
 * +str2+ (or its shared root reference) and +str2+ is reset to an empty
 * embedded string.  The two arguments must be different objects (asserted).
 */
1758static void
1759str_shared_replace(VALUE str, VALUE str2)
1760{
1761 rb_encoding *enc;
1762 int cr;
1763 int termlen;
1764
1765 RUBY_ASSERT(str2 != str);
1766 enc = STR_ENC_GET(str2);
1767 cr = ENC_CODERANGE(str2);
1768 str_discard(str);
1769 termlen = rb_enc_mbminlen(enc);
1770
1771 STR_SET_LEN(str, RSTRING_LEN(str2));
1772
1773 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1774 STR_SET_EMBED(str);
1775 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1776 rb_enc_associate(str, enc);
1777 ENC_CODERANGE_SET(str, cr);
1778 }
1779 else {
    /* An embedded str2 is first converted to a heap string so that there is
     * a heap pointer to hand over to str. */
1780 if (STR_EMBED_P(str2)) {
1781 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1782 long len = RSTRING_LEN(str2);
1783 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1784
1785 char *new_ptr = ALLOC_N(char, len + termlen);
1786 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1787 RSTRING(str2)->as.heap.ptr = new_ptr;
1788 STR_SET_LEN(str2, len);
1789 RSTRING(str2)->as.heap.aux.capa = len;
1790 STR_SET_NOEMBED(str2);
1791 }
1792
1793 STR_SET_NOEMBED(str);
1794 FL_UNSET(str, STR_SHARED);
1795 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1796
1797 if (FL_TEST(str2, STR_SHARED)) {
1798 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1799 STR_SET_SHARED(str, shared);
1800 }
1801 else {
1802 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1803 }
1804
1805 /* abandon str2 */
1806 STR_SET_EMBED(str2);
1807 RSTRING_PTR(str2)[0] = 0;
1808 STR_SET_LEN(str2, 0);
1809 rb_enc_associate(str, enc);
1810 ENC_CODERANGE_SET(str, cr);
1811 }
1812}
1813
/* Returns `obj` itself when it is a T_STRING; otherwise calls its idTo_s
 * method and passes the result through rb_obj_as_string_result().
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `obj`. */
1814VALUE
1816{
1817 VALUE str;
1818
1819 if (RB_TYPE_P(obj, T_STRING)) {
1820 return obj;
1821 }
1822 str = rb_funcall(obj, idTo_s, 0);
1823 return rb_obj_as_string_result(str, obj);
1824}
1825
1826VALUE
1827rb_obj_as_string_result(VALUE str, VALUE obj)
1828{
1829 if (!RB_TYPE_P(str, T_STRING))
1830 return rb_any_to_s(obj);
1831 return str;
1832}
1833
/*
 * Makes +str+ hold the contents, encoding and code range of +str2+ and
 * returns +str+.  When +str2+ is shared, +str+ is pointed at the same
 * buffer and attached to the same shared root; otherwise the work is
 * delegated to str_replace_shared().
 *
 * NOTE(review): one line following the `shared` declaration is absent from
 * this listing.
 */
1834static VALUE
1835str_replace(VALUE str, VALUE str2)
1836{
1837 long len;
1838
1839 len = RSTRING_LEN(str2);
1840 if (STR_SHARED_P(str2)) {
1841 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1843 STR_SET_NOEMBED(str);
1844 STR_SET_LEN(str, len);
1845 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1846 STR_SET_SHARED(str, shared);
1847 rb_enc_cr_str_exact_copy(str, str2);
1848 }
1849 else {
1850 str_replace_shared(str, str2);
1851 }
1852
1853 return str;
1854}
1855
/*
 * Allocates a string object of class +klass+ sized by
 * rb_str_embed_size(capa), using NEWOBJ_OF.
 *
 * NOTE(review): the line carrying the remaining NEWOBJ_OF arguments is
 * absent from this listing, so the flags and size passed are not visible
 * here.
 */
1856static inline VALUE
1857ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1858{
1859 size_t size = rb_str_embed_size(capa);
1860 RUBY_ASSERT(size > 0);
1861 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1862
1863 NEWOBJ_OF(str, struct RString, klass,
1865
1866 return (VALUE)str;
1867}
1868
/* Allocates a T_STRING object of class +klass+ flagged STR_NOEMBED, of size
 * sizeof(struct RString), passing +ec+ to NEWOBJ_OF.  FL_WB_PROTECTED is
 * added when RGENGC_WB_PROTECTED_STRING is nonzero.  No buffer fields are
 * assigned here. */
1869static inline VALUE
1870ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1871{
1872 NEWOBJ_OF(str, struct RString, klass,
1873 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1874
1875 return (VALUE)str;
1876}
1877
1878static inline VALUE
1879str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1880{
1881 int encidx = 0;
1882 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1883 encidx = rb_enc_get_index(str);
1884 flags &= ~ENCODING_MASK;
1885 }
1886 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1887 if (encidx) rb_enc_associate_index(dup, encidx);
1888 return dup;
1889}
1890
/* Flag bits read from the source string when duplicating it: code range,
 * encoding bits and FL_FREEZE (str_duplicate_setup_encoding() strips
 * FL_FREEZE again before applying them to the copy). */
1891static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1892
1893static inline VALUE
1894str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1895{
1896 VALUE flags = FL_TEST_RAW(str, flag_mask);
1897 long len = RSTRING_LEN(str);
1898
1899 RUBY_ASSERT(STR_EMBED_P(dup));
1900 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1901 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1902 STR_SET_LEN(dup, RSTRING_LEN(str));
1903 return str_duplicate_setup_encoding(str, dup, flags);
1904}
1905
/*
 * Makes the heap string +dup+ share the buffer of +str+ and returns +dup+.
 *
 * The shared root is the root +str+ already uses when it is shared, +str+
 * itself when it is frozen, or otherwise a frozen string created with
 * str_new_frozen() (in which case the flags are re-read from that string).
 * The root is flagged STR_SHARED_ROOT and stored in +dup+ with a write
 * barrier (RB_OBJ_WRITE).
 *
 * NOTE(review): one line following the !STR_SHARED_P(root) assertion is
 * absent from this listing.
 */
1906static inline VALUE
1907str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1908{
1909 VALUE flags = FL_TEST_RAW(str, flag_mask);
1910 VALUE root = str;
1911 if (FL_TEST_RAW(str, STR_SHARED)) {
1912 root = RSTRING(str)->as.heap.aux.shared;
1913 }
1914 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1915 root = str = str_new_frozen(klass, str);
1916 flags = FL_TEST_RAW(str, flag_mask);
1917 }
1918 RUBY_ASSERT(!STR_SHARED_P(root));
1920
1921 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1922 FL_SET(root, STR_SHARED_ROOT);
1923 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1924 flags |= RSTRING_NOEMBED | STR_SHARED;
1925
1926 STR_SET_LEN(dup, RSTRING_LEN(str));
1927 return str_duplicate_setup_encoding(str, dup, flags);
1928}
1929
1930static inline VALUE
1931str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1932{
1933 if (STR_EMBED_P(str)) {
1934 return str_duplicate_setup_embed(klass, str, dup);
1935 }
1936 else {
1937 return str_duplicate_setup_heap(klass, str, dup);
1938 }
1939}
1940
1941static inline VALUE
1942str_duplicate(VALUE klass, VALUE str)
1943{
1944 VALUE dup;
1945 if (STR_EMBED_P(str)) {
1946 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1947 }
1948 else {
1949 dup = str_alloc_heap(klass);
1950 }
1951
1952 return str_duplicate_setup(klass, str, dup);
1953}
1954
/* Duplicates `str` into a string of the same class via str_duplicate().
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
1955VALUE
1957{
1958 return str_duplicate(rb_obj_class(str), str);
1959}
1960
1961/* :nodoc: */
1962VALUE
1963rb_str_dup_m(VALUE str)
1964{
1965 if (LIKELY(BARE_STRING_P(str))) {
1966 return str_duplicate(rb_cString, str);
1967 }
1968 else {
1969 return rb_obj_dup(str);
1970 }
1971}
1972
/* Fires the string-creation DTrace hook with the length of `str`, then
 * duplicates `str` as an rb_cString.
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
1973VALUE
1975{
1976 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1977 return str_duplicate(rb_cString, str);
1978}
1979
/*
 * Creates a new rb_cString duplicating +str+, allocating through +ec+.
 *
 * An embedded +str+ yields an embedded copy, except when +chilled+ is set
 * and +str+ has the id_debug_created_info instance variable; in that case,
 * and for every non-embedded +str+, the copy is a heap string set up by
 * str_duplicate_setup_heap().  When +chilled+ is set the result is flagged
 * STR_CHILLED_LITERAL.
 */
1980VALUE
1981rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1982{
1983 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1984 VALUE new_str, klass = rb_cString;
1985
1986 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1987 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1988 str_duplicate_setup_embed(klass, str, new_str);
1989 }
1990 else {
1991 new_str = ec_str_alloc_heap(ec, klass);
1992 str_duplicate_setup_heap(klass, str, new_str);
1993 }
1994 if (chilled) {
1995 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1996 }
1997 return new_str;
1998}
1999
/*
 * Records where a string was created: stores the frozen array
 * [path, line] in the id_debug_created_info instance variable, flags the
 * string STR_CHILLED_LITERAL and returns it frozen via rb_str_freeze().
 * A +str+ that is already frozen is duplicated first, so the original is
 * not modified.
 */
2000VALUE
2001rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2002{
2003 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2004 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2005 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2006 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2007 return rb_str_freeze(str);
2008}
2009
2010/*
2011 * The documentation block below uses an include (instead of inline text)
2012 * because the included text has non-ASCII characters (which are not allowed in a C file).
2013 */
2014
2015/*
2016 *
2017 * call-seq:
2018 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2019 *
2020 * :include: doc/string/new.rdoc
2021 *
2022 */
2023
2024static VALUE
2025rb_str_init(int argc, VALUE *argv, VALUE str)
2026{
    /* Accepts an optional source string plus the keywords `encoding` and
     * `capacity`.  With a capacity, +str+ is always turned into a heap
     * string of at least STR_BUF_MIN_SIZE bytes (and at least the length of
     * the source) before the source bytes are copied in; without one, the
     * source simply replaces +str+.  A given encoding is associated last.
     * NOTE(review): one line inside the final `if (enc)` block is absent
     * from this listing. */
2027 static ID keyword_ids[2];
2028 VALUE orig, opt, venc, vcapa;
2029 VALUE kwargs[2];
2030 rb_encoding *enc = 0;
2031 int n;
2032
2033 if (!keyword_ids[0]) {
2034 keyword_ids[0] = rb_id_encoding();
2035 CONST_ID(keyword_ids[1], "capacity");
2036 }
2037
2038 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2039 if (!NIL_P(opt)) {
2040 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2041 venc = kwargs[0];
2042 vcapa = kwargs[1];
2043 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2044 enc = rb_to_encoding(venc);
2045 }
2046 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2047 long capa = NUM2LONG(vcapa);
2048 long len = 0;
2049 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2050
2051 if (capa < STR_BUF_MIN_SIZE) {
2052 capa = STR_BUF_MIN_SIZE;
2053 }
2054 if (n == 1) {
2055 StringValue(orig);
2056 len = RSTRING_LEN(orig);
2057 if (capa < len) {
2058 capa = len;
2059 }
    /* Initializing a string from itself: keep len but skip the copy below. */
2060 if (orig == str) n = 0;
2061 }
2062 str_modifiable(str);
2063 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2064 /* make noembed always */
2065 const size_t size = (size_t)capa + termlen;
2066 const char *const old_ptr = RSTRING_PTR(str);
2067 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2068 char *new_ptr = ALLOC_N(char, size);
2069 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2070 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2071 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2072 RSTRING(str)->as.heap.ptr = new_ptr;
2073 }
2074 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2075 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2076 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2077 }
2078 STR_SET_LEN(str, len);
2079 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2080 if (n == 1) {
2081 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2082 rb_enc_cr_str_exact_copy(str, orig);
2083 }
2084 FL_SET(str, STR_NOEMBED);
2085 RSTRING(str)->as.heap.aux.capa = capa;
2086 }
2087 else if (n == 1) {
2088 rb_str_replace(str, orig);
2089 }
2090 if (enc) {
2091 rb_enc_associate(str, enc);
2093 }
2094 }
2095 else if (n == 1) {
2096 rb_str_replace(str, orig);
2097 }
2098 return str;
2099}
2100
2101/* :nodoc: */
2102static VALUE
2103rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2104{
    /* Fast path for rb_cString called with keyword arguments.  Any other
     * class, and calls without a keyword hash, go through the generic
     * rb_class_new_instance_pass_kw(). */
2105 if (klass != rb_cString) {
2106 return rb_class_new_instance_pass_kw(argc, argv, klass);
2107 }
2108
2109 static ID keyword_ids[2];
2110 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2111 VALUE kwargs[2];
2112 rb_encoding *enc = NULL;
2113
2114 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2115 if (NIL_P(opt)) {
2116 return rb_class_new_instance_pass_kw(argc, argv, klass);
2117 }
2118
2119 keyword_ids[0] = rb_id_encoding();
2120 CONST_ID(keyword_ids[1], "capacity");
2121 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2122 encoding = kwargs[0];
2123 capacity = kwargs[1];
2124
2125 if (n == 1) {
2126 orig = StringValue(orig);
2127 }
2128 else {
2129 orig = Qnil;
2130 }
2131
    /* Without an explicit encoding keyword, inherit the source's encoding. */
2132 if (UNDEF_P(encoding)) {
2133 if (!NIL_P(orig)) {
2134 encoding = rb_obj_encoding(orig);
2135 }
2136 }
2137
2138 if (!UNDEF_P(encoding)) {
2139 enc = rb_to_encoding(encoding);
2140 }
2141
2142 // If capacity is nil, we're basically just duping `orig`.
2143 if (UNDEF_P(capacity)) {
2144 if (NIL_P(orig)) {
2145 VALUE empty_str = str_new(klass, "", 0);
2146 if (enc) {
2147 rb_enc_associate(empty_str, enc);
2148 }
2149 return empty_str;
2150 }
2151 VALUE copy = str_duplicate(klass, orig);
2152 rb_enc_associate(copy, enc);
2153 ENC_CODERANGE_CLEAR(copy);
2154 return copy;
2155 }
2156
    /* A negative capacity is treated as 0, and the capacity is never
     * smaller than that of the source string. */
2157 long capa = 0;
2158 capa = NUM2LONG(capacity);
2159 if (capa < 0) {
2160 capa = 0;
2161 }
2162
2163 if (!NIL_P(orig)) {
2164 long orig_capa = rb_str_capacity(orig);
2165 if (orig_capa > capa) {
2166 capa = orig_capa;
2167 }
2168 }
2169
2170 VALUE str = str_enc_new(klass, NULL, capa, enc);
2171 STR_SET_LEN(str, 0);
    /* NOTE(review): the terminator length here is rb_enc_mbmaxlen(enc),
     * whereas rb_str_init() uses rb_enc_mbminlen(enc) -- confirm intended. */
2172 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2173
2174 if (!NIL_P(orig)) {
2175 rb_str_buf_append(str, orig);
2176 }
2177
2178 return str;
2179}
2180
2181#ifdef NONASCII_MASK
2182#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2183
2184/*
2185 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2186 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2187 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2188 *
2189 * if (!(byte & 0x80))
2190 * byte |= 0x40; // turn on bit6
2191 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2192 *
2193 * This function calculates whether a byte is leading or not for all bytes
2194 * in the argument word by concurrently using the above logic, and then
2195 * adds up the number of leading bytes in the word.
2196 */
2197static inline uintptr_t
2198count_utf8_lead_bytes_with_word(const uintptr_t *s)
2199{
    /* Reads one machine word at +s+ and returns how many of its bytes are
     * not of the form 10xxxxxx. */
2200 uintptr_t d = *s;
2201
2202 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2203 d = (d>>6) | (~d>>7);
2204 d &= NONASCII_MASK >> 7;
2205
2206 /* Gather all bytes. */
2207#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2208 /* use only if it can use POPCNT */
2209 return rb_popcount_intptr(d);
2210#else
    /* Fold the per-byte 0/1 values down into the lowest byte; the low four
     * bits of the result are returned. */
2211 d += (d>>8);
2212 d += (d>>16);
2213# if SIZEOF_VOIDP == 8
2214 d += (d>>32);
2215# endif
2216 return (d&0xF);
2217#endif
2218}
2219#endif
2220
/*
 * Counts the characters between +p+ and +e+ in encoding +enc+, given the
 * code range +cr+ of the data.
 *
 * Strategies, in order:
 *  - fixed-width encodings (mbmaxlen == mbminlen): byte count divided by the
 *    width, rounded up;
 *  - valid UTF-8 (when NONASCII_MASK is defined): count the bytes that are
 *    not continuation bytes, a machine word at a time where possible;
 *  - other ASCII-compatible encodings: skip ASCII runs with
 *    search_nonascii() and step over one multibyte character at a time;
 *  - anything else: step with rb_enc_mbclen().
 */
2221static inline long
2222enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2223{
2224 long c;
2225 const char *q;
2226
2227 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2228 long diff = (long)(e - p);
2229 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2230 }
2231#ifdef NONASCII_MASK
2232 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2233 uintptr_t len = 0;
2234 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2235 const uintptr_t *s, *t;
2236 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
    /* s: p rounded up to a word boundary; t: e rounded down to one. */
2237 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2238 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2239 while (p < (const char *)s) {
2240 if (is_utf8_lead_byte(*p)) len++;
2241 p++;
2242 }
2243 while (s < t) {
2244 len += count_utf8_lead_bytes_with_word(s);
2245 s++;
2246 }
2247 p = (const char *)s;
2248 }
2249 while (p < e) {
2250 if (is_utf8_lead_byte(*p)) len++;
2251 p++;
2252 }
2253 return (long)len;
2254 }
2255#endif
2256 else if (rb_enc_asciicompat(enc)) {
2257 c = 0;
    /* Clean code range: rb_enc_fast_mbclen() is used to step; otherwise
     * rb_enc_mbclen() is used. */
2258 if (ENC_CODERANGE_CLEAN_P(cr)) {
2259 while (p < e) {
2260 if (ISASCII(*p)) {
2261 q = search_nonascii(p, e);
2262 if (!q)
2263 return c + (e - p);
2264 c += q - p;
2265 p = q;
2266 }
2267 p += rb_enc_fast_mbclen(p, e, enc);
2268 c++;
2269 }
2270 }
2271 else {
2272 while (p < e) {
2273 if (ISASCII(*p)) {
2274 q = search_nonascii(p, e);
2275 if (!q)
2276 return c + (e - p);
2277 c += q - p;
2278 p = q;
2279 }
2280 p += rb_enc_mbclen(p, e, enc);
2281 c++;
2282 }
2283 }
2284 return c;
2285 }
2286
2287 for (c=0; p<e; c++) {
2288 p += rb_enc_mbclen(p, e, enc);
2289 }
2290 return c;
2291}
2292
/* Counts the characters between +p+ and +e+ in +enc+ without any code range
 * knowledge (enc_strlen() with ENC_CODERANGE_UNKNOWN). */
2293long
2294rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2295{
2296 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2297}
2298
2299/* To get strlen with cr
2300 * Note that given cr is not used.
2301 */
/*
 * Counts the characters between +p+ and +e+ in +enc+ and reports the code
 * range found through +cr+.  The value *cr holds on entry is overwritten
 * with 0 before counting; ENC_CODERANGE_VALID is or-ed in for every
 * well-formed multibyte character, and ENC_CODERANGE_7BIT is stored when
 * nothing else was recorded.
 *
 * NOTE(review): the first line of each of the two `else` branches handling
 * a character that rb_enc_precise_mbclen() did not accept is absent from
 * this listing.
 */
2302long
2303rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2304{
2305 long c;
2306 const char *q;
2307 int ret;
2308
2309 *cr = 0;
2310 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2311 long diff = (long)(e - p);
2312 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2313 }
2314 else if (rb_enc_asciicompat(enc)) {
2315 c = 0;
2316 while (p < e) {
2317 if (ISASCII(*p)) {
2318 q = search_nonascii(p, e);
2319 if (!q) {
2320 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2321 return c + (e - p);
2322 }
2323 c += q - p;
2324 p = q;
2325 }
2326 ret = rb_enc_precise_mbclen(p, e, enc);
2327 if (MBCLEN_CHARFOUND_P(ret)) {
2328 *cr |= ENC_CODERANGE_VALID;
2329 p += MBCLEN_CHARFOUND_LEN(ret);
2330 }
2331 else {
2333 p++;
2334 }
2335 c++;
2336 }
2337 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2338 return c;
2339 }
2340
2341 for (c=0; p<e; c++) {
2342 ret = rb_enc_precise_mbclen(p, e, enc);
2343 if (MBCLEN_CHARFOUND_P(ret)) {
2344 *cr |= ENC_CODERANGE_VALID;
2345 p += MBCLEN_CHARFOUND_LEN(ret);
2346 }
2347 else {
    /* Skip one minimum-width unit, or stop at the end if less remains. */
2349 if (p + rb_enc_mbminlen(enc) <= e)
2350 p += rb_enc_mbminlen(enc);
2351 else
2352 p = e;
2353 }
2354 }
2355 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2356 return c;
2357}
2358
2359/* enc must be str's enc or rb_enc_check(str, str2) */
2360static long
2361str_strlen(VALUE str, rb_encoding *enc)
2362{
2363 const char *p, *e;
2364 int cr;
2365
2366 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2367 if (!enc) enc = STR_ENC_GET(str);
2368 p = RSTRING_PTR(str);
2369 e = RSTRING_END(str);
2370 cr = ENC_CODERANGE(str);
2371
2372 if (cr == ENC_CODERANGE_UNKNOWN) {
2373 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2374 if (cr) ENC_CODERANGE_SET(str, cr);
2375 return n;
2376 }
2377 else {
2378 return enc_strlen(p, e, enc, cr);
2379 }
2380}
2381
/* Character length of `str` in its own encoding (str_strlen() with a NULL
 * encoding).
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing; the body refers to `str`. */
2382long
2384{
2385 return str_strlen(str, NULL);
2386}
2387
2388/*
2389 * call-seq:
2390 * length -> integer
2391 *
2392 * :include: doc/string/length.rdoc
2393 *
2394 */
2395
2396VALUE
2398{
    /* Character length of `str` in its own encoding, boxed with LONG2NUM.
     * NOTE(review): the declarator line (function name and parameter list)
     * is absent from this listing; the body refers to `str`. */
2399 return LONG2NUM(str_strlen(str, NULL));
2400}
2401
2402/*
2403 * call-seq:
2404 * bytesize -> integer
2405 *
2406 * :include: doc/string/bytesize.rdoc
2407 *
2408 */
2409
2410VALUE
2411rb_str_bytesize(VALUE str)
2412{
    /* The byte length (RSTRING_LEN), boxed with LONG2NUM. */
2413 return LONG2NUM(RSTRING_LEN(str));
2414}
2415
2416/*
2417 * call-seq:
2418 * empty? -> true or false
2419 *
2420 * Returns whether the length of +self+ is zero:
2421 *
2422 * 'hello'.empty? # => false
2423 * ' '.empty? # => false
2424 * ''.empty? # => true
2425 *
2426 * Related: see {Querying}[rdoc-ref:String@Querying].
2427 */
2428
2429static VALUE
2430rb_str_empty(VALUE str)
2431{
    /* RBOOL of "byte length is zero". */
2432 return RBOOL(RSTRING_LEN(str) == 0);
2433}
2434
2435/*
2436 * call-seq:
2437 * self + other_string -> new_string
2438 *
2439 * Returns a new string containing +other_string+ concatenated to +self+:
2440 *
2441 * 'Hello from ' + self.to_s # => "Hello from main"
2442 *
2443 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2444 */
2445
2446VALUE
2448{
2449 VALUE str3;
2450 rb_encoding *enc;
2451 char *ptr1, *ptr2, *ptr3;
2452 long len1, len2;
2453 int termlen;
2454
2455 StringValue(str2);
2456 enc = rb_enc_check_str(str1, str2);
2457 RSTRING_GETMEM(str1, ptr1, len1);
2458 RSTRING_GETMEM(str2, ptr2, len2);
2459 termlen = rb_enc_mbminlen(enc);
2460 if (len1 > LONG_MAX - len2) {
2461 rb_raise(rb_eArgError, "string size too big");
2462 }
2463 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2464 ptr3 = RSTRING_PTR(str3);
2465 memcpy(ptr3, ptr1, len1);
2466 memcpy(ptr3+len1, ptr2, len2);
2467 TERM_FILL(&ptr3[len1+len2], termlen);
2468
2469 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2471 RB_GC_GUARD(str1);
2472 RB_GC_GUARD(str2);
2473 return str3;
2474}
2475
2476/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2477VALUE
2478rb_str_opt_plus(VALUE str1, VALUE str2)
2479{
2482 long len1, len2;
2483 MAYBE_UNUSED(char) *ptr1, *ptr2;
2484 RSTRING_GETMEM(str1, ptr1, len1);
2485 RSTRING_GETMEM(str2, ptr2, len2);
2486 int enc1 = rb_enc_get_index(str1);
2487 int enc2 = rb_enc_get_index(str2);
2488
2489 if (enc1 < 0) {
2490 return Qundef;
2491 }
2492 else if (enc2 < 0) {
2493 return Qundef;
2494 }
2495 else if (enc1 != enc2) {
2496 return Qundef;
2497 }
2498 else if (len1 > LONG_MAX - len2) {
2499 return Qundef;
2500 }
2501 else {
2502 return rb_str_plus(str1, str2);
2503 }
2504
2505}
2506
2507/*
2508 * call-seq:
2509 * self * n -> new_string
2510 *
2511 * Returns a new string containing +n+ copies of +self+:
2512 *
2513 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2514 * 'No!' * 0 # => ""
2515 *
2516 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2517 */
2518
2519VALUE
2521{
2522 VALUE str2;
2523 long n, len;
2524 char *ptr2;
2525 int termlen;
2526
2527 if (times == INT2FIX(1)) {
2528 return str_duplicate(rb_cString, str);
2529 }
2530 if (times == INT2FIX(0)) {
2531 str2 = str_alloc_embed(rb_cString, 0);
2532 rb_enc_copy(str2, str);
2533 return str2;
2534 }
2535 len = NUM2LONG(times);
2536 if (len < 0) {
2537 rb_raise(rb_eArgError, "negative argument");
2538 }
2539 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2540 if (STR_EMBEDDABLE_P(len, 1)) {
2541 str2 = str_alloc_embed(rb_cString, len + 1);
2542 memset(RSTRING_PTR(str2), 0, len + 1);
2543 }
2544 else {
2545 str2 = str_alloc_heap(rb_cString);
2546 RSTRING(str2)->as.heap.aux.capa = len;
2547 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2548 }
2549 STR_SET_LEN(str2, len);
2550 rb_enc_copy(str2, str);
2551 return str2;
2552 }
2553 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2554 rb_raise(rb_eArgError, "argument too big");
2555 }
2556
2557 len *= RSTRING_LEN(str);
2558 termlen = TERM_LEN(str);
2559 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2560 ptr2 = RSTRING_PTR(str2);
2561 if (len) {
2562 n = RSTRING_LEN(str);
2563 memcpy(ptr2, RSTRING_PTR(str), n);
2564 while (n <= len/2) {
2565 memcpy(ptr2 + n, ptr2, n);
2566 n *= 2;
2567 }
2568 memcpy(ptr2 + n, ptr2, len-n);
2569 }
2570 STR_SET_LEN(str2, len);
2571 TERM_FILL(&ptr2[len], termlen);
2572 rb_enc_cr_str_copy_for_substr(str2, str);
2573
2574 return str2;
2575}
2576
2577/*
2578 * call-seq:
2579 * self % object -> new_string
2580 *
2581 * Returns the result of formatting +object+ into the format specifications
2582 * contained in +self+
2583 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2584 *
2585 * '%05d' % 123 # => "00123"
2586 *
2587 * If +self+ contains multiple format specifications,
2588 * +object+ must be an array or hash containing the objects to be formatted:
2589 *
2590 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2591 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2592 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2593 *
2594 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2595 */
2596
2597static VALUE
2598rb_str_format_m(VALUE str, VALUE arg)
2599{
2600 VALUE tmp = rb_check_array_type(arg);
2601
2602 if (!NIL_P(tmp)) {
2603 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2604 }
2605 return rb_str_format(1, &arg, str);
2606}
2607
2608static inline void
2609rb_check_lockedtmp(VALUE str)
2610{
2611 if (FL_TEST(str, STR_TMPLOCK)) {
2612 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2613 }
2614}
2615
2616// If none of these flags are set, we know we have an modifiable string.
2617// If any is set, we need to do more detailed checks.
2618#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2619static inline void
2620str_modifiable(VALUE str)
2621{
2622 RUBY_ASSERT(ruby_thread_has_gvl_p());
2623
2624 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2625 if (CHILLED_STRING_P(str)) {
2626 CHILLED_STRING_MUTATED(str);
2627 }
2628 rb_check_lockedtmp(str);
2629 rb_check_frozen(str);
2630 }
2631}
2632
2633static inline int
2634str_dependent_p(VALUE str)
2635{
2636 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2637 return FALSE;
2638 }
2639 else {
2640 return TRUE;
2641 }
2642}
2643
2644// If none of these flags are set, we know we have an independent string.
2645// If any is set, we need to do more detailed checks.
2646#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2647static inline int
2648str_independent(VALUE str)
2649{
2650 RUBY_ASSERT(ruby_thread_has_gvl_p());
2651
2652 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2653 str_modifiable(str);
2654 return !str_dependent_p(str);
2655 }
2656 return TRUE;
2657}
2658
2659static void
2660str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2661{
2662 RUBY_ASSERT(ruby_thread_has_gvl_p());
2663
2664 char *ptr;
2665 char *oldptr;
2666 long capa = len + expand;
2667
2668 if (len > capa) len = capa;
2669
2670 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2671 ptr = RSTRING(str)->as.heap.ptr;
2672 STR_SET_EMBED(str);
2673 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2674 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2675 STR_SET_LEN(str, len);
2676 return;
2677 }
2678
2679 ptr = ALLOC_N(char, (size_t)capa + termlen);
2680 oldptr = RSTRING_PTR(str);
2681 if (oldptr) {
2682 memcpy(ptr, oldptr, len);
2683 }
2684 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2685 xfree(oldptr);
2686 }
2687 STR_SET_NOEMBED(str);
2688 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2689 TERM_FILL(ptr + len, termlen);
2690 RSTRING(str)->as.heap.ptr = ptr;
2691 STR_SET_LEN(str, len);
2692 RSTRING(str)->as.heap.aux.capa = capa;
2693}
2694
2695void
2696rb_str_modify(VALUE str)
2697{
2698 if (!str_independent(str))
2699 str_make_independent(str);
2701}
2702
2703void
2705{
2706 RUBY_ASSERT(ruby_thread_has_gvl_p());
2707
2708 int termlen = TERM_LEN(str);
2709 long len = RSTRING_LEN(str);
2710
2711 if (expand < 0) {
2712 rb_raise(rb_eArgError, "negative expanding string size");
2713 }
2714 if (expand >= LONG_MAX - len) {
2715 rb_raise(rb_eArgError, "string size too big");
2716 }
2717
2718 if (!str_independent(str)) {
2719 str_make_independent_expand(str, len, expand, termlen);
2720 }
2721 else if (expand > 0) {
2722 RESIZE_CAPA_TERM(str, len + expand, termlen);
2723 }
2725}
2726
2727/* As rb_str_modify(), but don't clear coderange */
2728static void
2729str_modify_keep_cr(VALUE str)
2730{
2731 if (!str_independent(str))
2732 str_make_independent(str);
2734 /* Force re-scan later */
2736}
2737
2738static inline void
2739str_discard(VALUE str)
2740{
2741 str_modifiable(str);
2742 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2743 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2744 RSTRING(str)->as.heap.ptr = 0;
2745 STR_SET_LEN(str, 0);
2746 }
2747}
2748
2749void
2751{
2752 int encindex = rb_enc_get_index(str);
2753
2754 if (RB_UNLIKELY(encindex == -1)) {
2755 rb_raise(rb_eTypeError, "not encoding capable object");
2756 }
2757
2758 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2759 return;
2760 }
2761
2762 rb_encoding *enc = rb_enc_from_index(encindex);
2763 if (!rb_enc_asciicompat(enc)) {
2764 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2765 }
2766}
2767
2768VALUE
2770{
2771 RUBY_ASSERT(ruby_thread_has_gvl_p());
2772
2773 VALUE s = *ptr;
2774 if (!RB_TYPE_P(s, T_STRING)) {
2775 s = rb_str_to_str(s);
2776 *ptr = s;
2777 }
2778 return s;
2779}
2780
2781char *
2783{
2784 VALUE str = rb_string_value(ptr);
2785 return RSTRING_PTR(str);
2786}
2787
/* Returns 1 when the first +n+ bytes at +s+ are all zero (also for n <= 0), 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2796
2797static const char *
2798str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2799{
2800 const char *e = s + len;
2801
2802 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2803 if (zero_filled(s, minlen)) return s;
2804 }
2805 return 0;
2806}
2807
2808static char *
2809str_fill_term(VALUE str, char *s, long len, int termlen)
2810{
2811 /* This function assumes that (capa + termlen) bytes of memory
2812 * is allocated, like many other functions in this file.
2813 */
2814 if (str_dependent_p(str)) {
2815 if (!zero_filled(s + len, termlen))
2816 str_make_independent_expand(str, len, 0L, termlen);
2817 }
2818 else {
2819 TERM_FILL(s + len, termlen);
2820 return s;
2821 }
2822 return RSTRING_PTR(str);
2823}
2824
2825void
2826rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2827{
2828 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2829 long len = RSTRING_LEN(str);
2830
2831 RUBY_ASSERT(capa >= len);
2832 if (capa - len < termlen) {
2833 rb_check_lockedtmp(str);
2834 str_make_independent_expand(str, len, 0L, termlen);
2835 }
2836 else if (str_dependent_p(str)) {
2837 if (termlen > oldtermlen)
2838 str_make_independent_expand(str, len, 0L, termlen);
2839 }
2840 else {
2841 if (!STR_EMBED_P(str)) {
2842 /* modify capa instead of realloc */
2843 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2844 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2845 }
2846 if (termlen > oldtermlen) {
2847 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2848 }
2849 }
2850
2851 return;
2852}
2853
2854static char *
2855str_null_check(VALUE str, int *w)
2856{
2857 char *s = RSTRING_PTR(str);
2858 long len = RSTRING_LEN(str);
2859 rb_encoding *enc = rb_enc_get(str);
2860 const int minlen = rb_enc_mbminlen(enc);
2861
2862 if (minlen > 1) {
2863 *w = 1;
2864 if (str_null_char(s, len, minlen, enc)) {
2865 return NULL;
2866 }
2867 return str_fill_term(str, s, len, minlen);
2868 }
2869 *w = 0;
2870 if (!s || memchr(s, 0, len)) {
2871 return NULL;
2872 }
2873 if (s[len]) {
2874 s = str_fill_term(str, s, len, minlen);
2875 }
2876 return s;
2877}
2878
2879char *
2880rb_str_to_cstr(VALUE str)
2881{
2882 int w;
2883 return str_null_check(str, &w);
2884}
2885
2886char *
2888{
2889 VALUE str = rb_string_value(ptr);
2890 int w;
2891 char *s = str_null_check(str, &w);
2892 if (!s) {
2893 if (w) {
2894 rb_raise(rb_eArgError, "string contains null char");
2895 }
2896 rb_raise(rb_eArgError, "string contains null byte");
2897 }
2898 return s;
2899}
2900
2901char *
2902rb_str_fill_terminator(VALUE str, const int newminlen)
2903{
2904 char *s = RSTRING_PTR(str);
2905 long len = RSTRING_LEN(str);
2906 return str_fill_term(str, s, len, newminlen);
2907}
2908
2909VALUE
2911{
2912 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2913 return str;
2914}
2915
2916/*
2917 * call-seq:
2918 * String.try_convert(object) -> object, new_string, or nil
2919 *
2920 * Attempts to convert the given +object+ to a string.
2921 *
2922 * If +object+ is already a string, returns +object+, unmodified.
2923 *
2924 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2925 * calls <tt>object.to_str</tt> and returns the result.
2926 *
2927 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2928 *
2929 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2930 */
2931static VALUE
2932rb_str_s_try_convert(VALUE dummy, VALUE str)
2933{
2934 return rb_check_string_type(str);
2935}
2936
/*
 * Advances from +p+ by up to <tt>*nthp</tt> characters of encoding +enc+
 * and returns the resulting position, never beyond +e+.  The working
 * counter is written back through +nthp+ on every exit path:
 *  - fixed-width encodings: the counter is written back unchanged;
 *  - ASCII-compatible encodings: the number of characters still to skip;
 *  - other encodings: the counter after the post-decrementing loop.
 */
2937static char*
2938str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2939{
2940 long nth = *nthp;
     /* single-byte encoding: character index equals byte index */
2941 if (rb_enc_mbmaxlen(enc) == 1) {
2942 p += nth;
2943 }
     /* fixed-width encoding: scale by the character width */
2944 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2945 p += nth * rb_enc_mbmaxlen(enc);
2946 }
2947 else if (rb_enc_asciicompat(enc)) {
2948 const char *p2, *e2;
2949 int n;
2950
2951 while (p < e && 0 < nth) {
     /* nth characters need at least nth bytes; give up if fewer remain */
2952 e2 = p + nth;
2953 if (e < e2) {
2954 *nthp = nth;
2955 return (char *)e;
2956 }
     /* skip a run of ASCII bytes in one step */
2957 if (ISASCII(*p)) {
2958 p2 = search_nonascii(p, e2);
2959 if (!p2) {
     /* everything up to e2 is ASCII: each byte was one character */
2960 nth -= e2 - p;
2961 *nthp = nth;
2962 return (char *)e2;
2963 }
2964 nth -= p2 - p;
2965 p = p2;
2966 }
     /* step over one character starting at p */
2967 n = rb_enc_mbclen(p, e, enc);
2968 p += n;
2969 nth--;
2970 }
2971 *nthp = nth;
     /* input ran out before nth characters were consumed */
2972 if (nth != 0) {
2973 return (char *)e;
2974 }
2975 return (char *)p;
2976 }
2977 else {
     /* note: nth-- is evaluated only while p < e, and leaves nth at -1
      * when the loop stops because the count reached zero */
2978 while (p < e && nth--) {
2979 p += rb_enc_mbclen(p, e, enc);
2980 }
2981 }
     /* the pointer arithmetic above may have overshot the end */
2982 if (p > e) p = e;
2983 *nthp = nth;
2984 return (char*)p;
2985}
2986
2987char*
2988rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2989{
2990 return str_nth_len(p, e, &nth, enc);
2991}
2992
2993static char*
2994str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2995{
2996 if (singlebyte)
2997 p += nth;
2998 else {
2999 p = str_nth_len(p, e, &nth, enc);
3000 }
3001 if (!p) return 0;
3002 if (p > e) p = e;
3003 return (char *)p;
3004}
3005
3006/* char offset to byte offset */
3007static long
3008str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3009{
3010 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3011 if (!pp) return e - p;
3012 return pp - p;
3013}
3014
3015long
3016rb_str_offset(VALUE str, long pos)
3017{
3018 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3019 STR_ENC_GET(str), single_byte_optimizable(str));
3020}
3021
3022#ifdef NONASCII_MASK
/*
 * UTF-8 specific counterpart of str_nth_len(): advances from +p+ until the
 * lead byte that follows <tt>*nthp</tt> earlier lead bytes, or until +e+.
 * The count of lead bytes still outstanding is written back to +nthp+.
 */
3023static char *
3024str_utf8_nth(const char *p, const char *e, long *nthp)
3025{
3026 long nth = *nthp;
     /* word-at-a-time path, taken only when both the input and the count
      * exceed two pointer-sized words */
3027 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3028 const uintptr_t *s, *t;
3029 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
     /* s: p rounded up to a word boundary; t: e rounded down to one */
3030 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3031 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
     /* count lead bytes one at a time up to the first aligned word */
3032 while (p < (const char *)s) {
3033 if (is_utf8_lead_byte(*p)) nth--;
3034 p++;
3035 }
     /* whole words; stops while nth is still at least one word's worth
      * of bytes, so the count cannot be overshot here */
3036 do {
3037 nth -= count_utf8_lead_bytes_with_word(s);
3038 s++;
3039 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3040 p = (char *)s;
3041 }
     /* byte-wise tail: stop on the lead byte reached with nth == 0 */
3042 while (p < e) {
3043 if (is_utf8_lead_byte(*p)) {
3044 if (nth == 0) break;
3045 nth--;
3046 }
3047 p++;
3048 }
3049 *nthp = nth;
3050 return (char *)p;
3051}
3052
/* Byte distance from +p+ to the position str_utf8_nth() finds for +nth+. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *hit = str_utf8_nth(p, e, &remaining);

    return hit - p;
}
3059#endif
3060
3061/* byte offset to char offset */
3062long
3063rb_str_sublen(VALUE str, long pos)
3064{
3065 if (single_byte_optimizable(str) || pos < 0)
3066 return pos;
3067 else {
3068 char *p = RSTRING_PTR(str);
3069 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3070 }
3071}
3072
3073static VALUE
3074str_subseq(VALUE str, long beg, long len)
3075{
3076 VALUE str2;
3077
3078 RUBY_ASSERT(beg >= 0);
3079 RUBY_ASSERT(len >= 0);
3080 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3081
3082 const int termlen = TERM_LEN(str);
3083 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3084 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3085 RB_GC_GUARD(str);
3086 return str2;
3087 }
3088
3089 str2 = str_alloc_heap(rb_cString);
3090 if (str_embed_capa(str2) >= len + termlen) {
3091 char *ptr2 = RSTRING(str2)->as.embed.ary;
3092 STR_SET_EMBED(str2);
3093 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3094 TERM_FILL(ptr2+len, termlen);
3095
3096 STR_SET_LEN(str2, len);
3097 RB_GC_GUARD(str);
3098 }
3099 else {
3100 str_replace_shared(str2, str);
3101 RUBY_ASSERT(!STR_EMBED_P(str2));
3102 ENC_CODERANGE_CLEAR(str2);
3103 RSTRING(str2)->as.heap.ptr += beg;
3104 if (RSTRING_LEN(str2) > len) {
3105 STR_SET_LEN(str2, len);
3106 }
3107 }
3108
3109 return str2;
3110}
3111
3112VALUE
3113rb_str_subseq(VALUE str, long beg, long len)
3114{
3115 VALUE str2 = str_subseq(str, beg, len);
3116 rb_enc_cr_str_copy_for_substr(str2, str);
3117 return str2;
3118}
3119
/*
 * Translates a character-based (+beg+, <tt>*lenp</tt>) window on +str+
 * into bytes.  Returns the pointer to the first byte of the window and
 * stores its byte length in <tt>*lenp</tt>; returns 0 when the window is
 * rejected (negative length, start out of range).  A negative +beg+
 * counts back from the end of the string.
 */
3120char *
3121rb_str_subpos(VALUE str, long beg, long *lenp)
3122{
3123 long len = *lenp;
3124 long slen = -1L;
3125 const long blen = RSTRING_LEN(str);
3126 rb_encoding *enc = STR_ENC_GET(str);
3127 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3128
3129 if (len < 0) return 0;
     /* rejects a beg whose negation is still negative */
3130 if (beg < 0 && -beg < 0) return 0;
3131 if (!blen) {
3132 len = 0;
3133 }
     /* single-byte optimizable: characters and bytes coincide */
3134 if (single_byte_optimizable(str)) {
3135 if (beg > blen) return 0;
3136 if (beg < 0) {
3137 beg += blen;
3138 if (beg < 0) return 0;
3139 }
3140 if (len > blen - beg)
3141 len = blen - beg;
3142 if (len < 0) return 0;
3143 p = s + beg;
3144 goto end;
3145 }
3146 if (beg < 0) {
     /* the window cannot extend past the end of the string */
3147 if (len > -beg) len = -beg;
     /* valid coderange and a start close to the end: walk backwards
      * from the end instead of counting the whole string */
3148 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3149 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3150 beg = -beg;
     /* move e back to the end of the window ... */
3151 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3152 p = e;
3153 if (!p) return 0;
     /* ... then p back by len characters to its start */
3154 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3155 if (!p) return 0;
3156 len = e - p;
3157 goto end;
3158 }
3159 else {
     /* otherwise convert to a non-negative character index */
3160 slen = str_strlen(str, enc);
3161 beg += slen;
3162 if (beg < 0) return 0;
3163 p = s + beg;
3164 if (len == 0) goto end;
3165 }
3166 }
     /* a character index can never exceed the byte length */
3167 else if (beg > 0 && beg > blen) {
3168 return 0;
3169 }
3170 if (len == 0) {
3171 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3172 p = s + beg;
3173 }
3174#ifdef NONASCII_MASK
     /* valid UTF-8: use the lead-byte counting helpers */
3175 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3176 enc == rb_utf8_encoding()) {
3177 p = str_utf8_nth(s, e, &beg);
3178 if (beg > 0) return 0;
3179 len = str_utf8_offset(p, e, len);
3180 }
3181#endif
     /* fixed-width encoding: scale indexes by the character width */
3182 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3183 int char_sz = rb_enc_mbmaxlen(enc);
3184
3185 p = s + beg * char_sz;
3186 if (p > e) {
3187 return 0;
3188 }
3189 else if (len * char_sz > e - p)
3190 len = e - p;
3191 else
3192 len *= char_sz;
3193 }
     /* general case: start landed on the end of the string */
3194 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3195 if (beg > 0) return 0;
3196 len = 0;
3197 }
3198 else {
3199 len = str_offset(p, e, len, enc, 0);
3200 }
3201 end:
3202 *lenp = len;
3203 RB_GC_GUARD(str);
3204 return p;
3205}
3206
3207static VALUE str_substr(VALUE str, long beg, long len, int empty);
3208
3209VALUE
3210rb_str_substr(VALUE str, long beg, long len)
3211{
3212 return str_substr(str, beg, len, TRUE);
3213}
3214
3215VALUE
3216rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3217{
3218 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3219}
3220
3221static VALUE
3222str_substr(VALUE str, long beg, long len, int empty)
3223{
3224 char *p = rb_str_subpos(str, beg, &len);
3225
3226 if (!p) return Qnil;
3227 if (!len && !empty) return Qnil;
3228
3229 beg = p - RSTRING_PTR(str);
3230
3231 VALUE str2 = str_subseq(str, beg, len);
3232 rb_enc_cr_str_copy_for_substr(str2, str);
3233 return str2;
3234}
3235
3236/* :nodoc: */
3237VALUE
3239{
3240 if (CHILLED_STRING_P(str)) {
3241 FL_UNSET_RAW(str, STR_CHILLED);
3242 }
3243
3244 if (OBJ_FROZEN(str)) return str;
3245 rb_str_resize(str, RSTRING_LEN(str));
3246 return rb_obj_freeze(str);
3247}
3248
3249/*
3250 * call-seq:
3251 * +string -> new_string or self
3252 *
3253 * Returns +self+ if +self+ is not frozen and can be mutated
3254 * without warning issuance.
3255 *
3256 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3257 *
3258 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3259 */
3260static VALUE
3261str_uplus(VALUE str)
3262{
3263 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3264 return rb_str_dup(str);
3265 }
3266 else {
3267 return str;
3268 }
3269}
3270
3271/*
3272 * call-seq:
3273 * -self -> frozen_string
3274 *
3275 * Returns a frozen string equal to +self+.
3276 *
3277 * The returned string is +self+ if and only if all of the following are true:
3278 *
3279 * - +self+ is already frozen.
3280 * - +self+ is an instance of \String (rather than of a subclass of \String)
3281 * - +self+ has no instance variables set on it.
3282 *
3283 * Otherwise, the returned string is a frozen copy of +self+.
3284 *
3285 * Returning +self+, when possible, saves duplicating +self+;
3286 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3287 *
3288 * It may also save duplicating other, already-existing, strings:
3289 *
3290 * s0 = 'foo'
3291 * s1 = 'foo'
3292 * s0.object_id == s1.object_id # => false
3293 * (-s0).object_id == (-s1).object_id # => true
3294 *
3295 * Note that method #-@ is convenient for defining a constant:
3296 *
3297 * FileName = -'config/database.yml'
3298 *
3299 * While its alias #dedup is better suited for chaining:
3300 *
3301 * 'foo'.dedup.gsub!('o')
3302 *
3303 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3304 */
3305static VALUE
3306str_uminus(VALUE str)
3307{
3308 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3309 str = rb_str_dup(str);
3310 }
3311 return rb_fstring(str);
3312}
3313
/*
 * rb_str_dup_frozen(str) is kept as an alias that forwards to
 * rb_str_new_frozen(str); the #define makes later uses of the old name in
 * this file refer to rb_str_new_frozen.
 * NOTE(review): how RUBY_ALIAS_FUNCTION emits the alias is defined
 * elsewhere -- confirm against its definition.
 */
3314RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3315#define rb_str_dup_frozen rb_str_new_frozen
3316
3317VALUE
3319{
3320 rb_check_frozen(str);
3321 if (FL_TEST(str, STR_TMPLOCK)) {
3322 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3323 }
3324 FL_SET(str, STR_TMPLOCK);
3325 return str;
3326}
3327
3328VALUE
3330{
3331 rb_check_frozen(str);
3332 if (!FL_TEST(str, STR_TMPLOCK)) {
3333 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3334 }
3335 FL_UNSET(str, STR_TMPLOCK);
3336 return str;
3337}
3338
3339VALUE
3340rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3341{
3342 rb_str_locktmp(str);
3343 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3344}
3345
3346void
3348{
3349 RUBY_ASSERT(ruby_thread_has_gvl_p());
3350
3351 long capa;
3352 const int termlen = TERM_LEN(str);
3353
3354 str_modifiable(str);
3355 if (STR_SHARED_P(str)) {
3356 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3357 }
3358 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3359 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3360 }
3361
3362 int cr = ENC_CODERANGE(str);
3363 if (len == 0) {
3364 /* Empty string does not contain non-ASCII */
3366 }
3367 else if (cr == ENC_CODERANGE_UNKNOWN) {
3368 /* Leave unknown. */
3369 }
3370 else if (len > RSTRING_LEN(str)) {
3371 if (ENC_CODERANGE_CLEAN_P(cr)) {
3372 /* Update the coderange regarding the extended part. */
3373 const char *const prev_end = RSTRING_END(str);
3374 const char *const new_end = RSTRING_PTR(str) + len;
3375 rb_encoding *enc = rb_enc_get(str);
3376 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3377 ENC_CODERANGE_SET(str, cr);
3378 }
3379 else if (cr == ENC_CODERANGE_BROKEN) {
3380 /* May be valid now, by appended part. */
3382 }
3383 }
3384 else if (len < RSTRING_LEN(str)) {
3385 if (cr != ENC_CODERANGE_7BIT) {
3386 /* ASCII-only string is keeping after truncated. Valid
3387 * and broken may be invalid or valid, leave unknown. */
3389 }
3390 }
3391
3392 STR_SET_LEN(str, len);
3393 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3394}
3395
3396VALUE
3397rb_str_resize(VALUE str, long len)
3398{
3399 if (len < 0) {
3400 rb_raise(rb_eArgError, "negative string size (or size too big)");
3401 }
3402
3403 int independent = str_independent(str);
3404 long slen = RSTRING_LEN(str);
3405 const int termlen = TERM_LEN(str);
3406
3407 if (slen > len || (termlen != 1 && slen < len)) {
3409 }
3410
3411 {
3412 long capa;
3413 if (STR_EMBED_P(str)) {
3414 if (len == slen) return str;
3415 if (str_embed_capa(str) >= len + termlen) {
3416 STR_SET_LEN(str, len);
3417 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3418 return str;
3419 }
3420 str_make_independent_expand(str, slen, len - slen, termlen);
3421 }
3422 else if (str_embed_capa(str) >= len + termlen) {
3423 char *ptr = STR_HEAP_PTR(str);
3424 STR_SET_EMBED(str);
3425 if (slen > len) slen = len;
3426 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3427 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3428 STR_SET_LEN(str, len);
3429 if (independent) ruby_xfree(ptr);
3430 return str;
3431 }
3432 else if (!independent) {
3433 if (len == slen) return str;
3434 str_make_independent_expand(str, slen, len - slen, termlen);
3435 }
3436 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3437 (capa - len) > (len < 1024 ? len : 1024)) {
3438 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3439 (size_t)len + termlen, STR_HEAP_SIZE(str));
3440 RSTRING(str)->as.heap.aux.capa = len;
3441 }
3442 else if (len == slen) return str;
3443 STR_SET_LEN(str, len);
3444 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3445 }
3446 return str;
3447}
3448
3449static void
3450str_ensure_available_capa(VALUE str, long len)
3451{
3452 str_modify_keep_cr(str);
3453
3454 const int termlen = TERM_LEN(str);
3455 long olen = RSTRING_LEN(str);
3456
3457 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3458 rb_raise(rb_eArgError, "string sizes too big");
3459 }
3460
3461 long total = olen + len;
3462 long capa = str_capacity(str, termlen);
3463
3464 if (capa < total) {
3465 if (total >= LONG_MAX / 2) {
3466 capa = total;
3467 }
3468 while (total > capa) {
3469 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3470 }
3471 RESIZE_CAPA_TERM(str, capa, termlen);
3472 }
3473}
3474
3475static VALUE
3476str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3477{
3478 if (keep_cr) {
3479 str_modify_keep_cr(str);
3480 }
3481 else {
3482 rb_str_modify(str);
3483 }
3484 if (len == 0) return 0;
3485
3486 long total, olen, off = -1;
3487 char *sptr;
3488 const int termlen = TERM_LEN(str);
3489
3490 RSTRING_GETMEM(str, sptr, olen);
3491 if (ptr >= sptr && ptr <= sptr + olen) {
3492 off = ptr - sptr;
3493 }
3494
3495 long capa = str_capacity(str, termlen);
3496
3497 if (olen > LONG_MAX - len) {
3498 rb_raise(rb_eArgError, "string sizes too big");
3499 }
3500 total = olen + len;
3501 if (capa < total) {
3502 if (total >= LONG_MAX / 2) {
3503 capa = total;
3504 }
3505 while (total > capa) {
3506 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3507 }
3508 RESIZE_CAPA_TERM(str, capa, termlen);
3509 sptr = RSTRING_PTR(str);
3510 }
3511 if (off != -1) {
3512 ptr = sptr + off;
3513 }
3514 memcpy(sptr + olen, ptr, len);
3515 STR_SET_LEN(str, total);
3516 TERM_FILL(sptr + total, termlen); /* sentinel */
3517
3518 return str;
3519}
3520
/*
 * Shorthands for str_buf_cat4() with keep_cr == false.  str_buf_cat2()
 * takes the length from rb_strlen_lit(ptr) -- presumably intended for
 * string literals only; confirm against rb_strlen_lit's definition.
 */
3521#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3522#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3523
3524VALUE
3525rb_str_cat(VALUE str, const char *ptr, long len)
3526{
3527 if (len == 0) return str;
3528 if (len < 0) {
3529 rb_raise(rb_eArgError, "negative string size (or size too big)");
3530 }
3531 return str_buf_cat(str, ptr, len);
3532}
3533
3534VALUE
3535rb_str_cat_cstr(VALUE str, const char *ptr)
3536{
3537 must_not_null(ptr);
3538 return rb_str_buf_cat(str, ptr, strlen(ptr));
3539}
3540
3541static void
3542rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3543{
3544 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3545
3546 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3547 if (UNLIKELY(!str_independent(str))) {
3548 str_make_independent(str);
3549 }
3550
3551 long string_length = -1;
3552 const int null_terminator_length = 1;
3553 char *sptr;
3554 RSTRING_GETMEM(str, sptr, string_length);
3555
3556 // Ensure the resulting string wouldn't be too long.
3557 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3558 rb_raise(rb_eArgError, "string sizes too big");
3559 }
3560
3561 long string_capacity = str_capacity(str, null_terminator_length);
3562
3563 // Get the code range before any modifications since those might clear the code range.
3564 int cr = ENC_CODERANGE(str);
3565
3566 // Check if the string has spare string_capacity to write the new byte.
3567 if (LIKELY(string_capacity >= string_length + 1)) {
3568 // In fast path we can write the new byte and note the string's new length.
3569 sptr[string_length] = byte;
3570 STR_SET_LEN(str, string_length + 1);
3571 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3572 }
3573 else {
3574 // If there's not enough string_capacity, make a call into the general string concatenation function.
3575 str_buf_cat(str, (char *)&byte, 1);
3576 }
3577
3578 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3579 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3580 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3581 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3582 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3583 if (ISASCII(byte)) {
3585 }
3586 else {
3588
3589 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3590 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3591 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3592 }
3593 }
3594 }
3595}
3596
/*
 * Alternate names that forward to the functions above:
 *   rb_str_buf_cat  -> rb_str_cat
 *   rb_str_buf_cat2 -> rb_str_cat_cstr
 *   rb_str_cat2     -> rb_str_cat_cstr
 * NOTE(review): how RUBY_ALIAS_FUNCTION emits each alias is defined
 * elsewhere -- confirm against its definition.
 */
3597RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3598RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3599RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3600
/*
 * Appends the +len+ bytes at +ptr+, tagged with encoding index
 * +ptr_encindex+ and coderange +ptr_cr+, to +str+, and sets the encoding
 * and coderange of the result.  When +ptr_cr_ret+ is non-NULL it receives
 * the (possibly freshly scanned) coderange of the appended bytes.
 * Raises Encoding::CompatibilityError for incompatible encodings and
 * ArgumentError for a negative +len+.  Returns +str+.
 */
3601static VALUE
3602rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3603 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3604{
3605 int str_encindex = ENCODING_GET(str);
3606 int res_encindex;
3607 int str_cr, res_cr;
3608 rb_encoding *str_enc, *ptr_enc;
3609
     /* an empty receiver is treated as 7bit whatever its flags say */
3610 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3611
3612 if (str_encindex == ptr_encindex) {
     /* same encoding: scan the source only if the receiver's coderange
      * is known, since otherwise the result stays unknown anyway */
3613 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3614 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3615 }
3616 }
3617 else {
3618 str_enc = rb_enc_from_index(str_encindex);
3619 ptr_enc = rb_enc_from_index(ptr_encindex);
     /* an ASCII-incompatible side mixes only with an empty operand */
3620 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3621 if (len == 0)
3622 return str;
3623 if (RSTRING_LEN(str) == 0) {
     /* empty receiver adopts the source's encoding and terminator width */
3624 rb_str_buf_cat(str, ptr, len);
3625 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3626 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3627 return str;
3628 }
3629 goto incompatible;
3630 }
3631 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3632 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3633 }
3634 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3635 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3636 str_cr = rb_enc_str_coderange(str);
3637 }
3638 }
3639 }
3640 if (ptr_cr_ret)
3641 *ptr_cr_ret = ptr_cr;
3642
     /* different encodings are compatible only if one side is 7bit */
3643 if (str_encindex != ptr_encindex &&
3644 str_cr != ENC_CODERANGE_7BIT &&
3645 ptr_cr != ENC_CODERANGE_7BIT) {
3646 str_enc = rb_enc_from_index(str_encindex);
3647 ptr_enc = rb_enc_from_index(ptr_encindex);
3648 goto incompatible;
3649 }
3650
     /* choose the encoding and coderange of the result */
3651 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3652 res_encindex = str_encindex;
3653 res_cr = ENC_CODERANGE_UNKNOWN;
3654 }
3655 else if (str_cr == ENC_CODERANGE_7BIT) {
3656 if (ptr_cr == ENC_CODERANGE_7BIT) {
3657 res_encindex = str_encindex;
3658 res_cr = ENC_CODERANGE_7BIT;
3659 }
3660 else {
     /* 7bit receiver takes over the source's encoding and coderange */
3661 res_encindex = ptr_encindex;
3662 res_cr = ptr_cr;
3663 }
3664 }
3665 else if (str_cr == ENC_CODERANGE_VALID) {
3666 res_encindex = str_encindex;
3667 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3668 res_cr = str_cr;
3669 else
3670 res_cr = ptr_cr;
3671 }
3672 else { /* str_cr == ENC_CODERANGE_BROKEN */
3673 res_encindex = str_encindex;
3674 res_cr = str_cr;
     /* appended bytes may repair the broken sequence: force a rescan */
3675 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3676 }
3677
3678 if (len < 0) {
3679 rb_raise(rb_eArgError, "negative string size (or size too big)");
3680 }
3681 str_buf_cat(str, ptr, len);
3682 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3683 return str;
3684
3685 incompatible:
3686 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3687 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3689}
3690
3691VALUE
3692rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3693{
3694 return rb_enc_cr_str_buf_cat(str, ptr, len,
3695 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3696}
3697
3698VALUE
3700{
3701 /* ptr must reference NUL terminated ASCII string. */
3702 int encindex = ENCODING_GET(str);
3703 rb_encoding *enc = rb_enc_from_index(encindex);
3704 if (rb_enc_asciicompat(enc)) {
3705 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3706 encindex, ENC_CODERANGE_7BIT, 0);
3707 }
3708 else {
3709 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3710 while (*ptr) {
3711 unsigned int c = (unsigned char)*ptr;
3712 int len = rb_enc_codelen(c, enc);
3713 rb_enc_mbcput(c, buf, enc);
3714 rb_enc_cr_str_buf_cat(str, buf, len,
3715 encindex, ENC_CODERANGE_VALID, 0);
3716 ptr++;
3717 }
3718 return str;
3719 }
3720}
3721
3722VALUE
3724{
3725 int str2_cr = rb_enc_str_coderange(str2);
3726
3727 if (str_enc_fastpath(str)) {
3728 switch (str2_cr) {
3729 case ENC_CODERANGE_7BIT:
3730 // If RHS is 7bit we can do simple concatenation
3731 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3732 RB_GC_GUARD(str2);
3733 return str;
3735 // If RHS is valid, we can do simple concatenation if encodings are the same
3736 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3737 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3738 int str_cr = ENC_CODERANGE(str);
3739 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3740 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3741 }
3742 RB_GC_GUARD(str2);
3743 return str;
3744 }
3745 }
3746 }
3747
3748 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3749 ENCODING_GET(str2), str2_cr, &str2_cr);
3750
3751 ENC_CODERANGE_SET(str2, str2_cr);
3752
3753 return str;
3754}
3755
3756VALUE
3758{
3759 StringValue(str2);
3760 return rb_str_buf_append(str, str2);
3761}
3762
3763VALUE
3764rb_str_concat_literals(size_t num, const VALUE *strary)
3765{
3766 VALUE str;
3767 size_t i, s = 0;
3768 unsigned long len = 1;
3769
3770 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3771 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3772
3773 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3774 str = rb_str_buf_new(len);
3775 str_enc_copy_direct(str, strary[0]);
3776
3777 for (i = s; i < num; ++i) {
3778 const VALUE v = strary[i];
3779 int encidx = ENCODING_GET(v);
3780
3781 rb_str_buf_append(str, v);
3782 if (encidx != ENCINDEX_US_ASCII) {
3783 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3784 rb_enc_set_index(str, encidx);
3785 }
3786 }
3787 return str;
3788}
3789
3790/*
3791 * call-seq:
3792 * concat(*objects) -> string
3793 *
3794 * :include: doc/string/concat.rdoc
3795 */
3796static VALUE
3797rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3798{
3799 str_modifiable(str);
3800
3801 if (argc == 1) {
3802 return rb_str_concat(str, argv[0]);
3803 }
3804 else if (argc > 1) {
3805 int i;
3806 VALUE arg_str = rb_str_tmp_new(0);
3807 rb_enc_copy(arg_str, str);
3808 for (i = 0; i < argc; i++) {
3809 rb_str_concat(arg_str, argv[i]);
3810 }
3811 rb_str_buf_append(str, arg_str);
3812 }
3813
3814 return str;
3815}
3816
3817/*
3818 * call-seq:
3819 * append_as_bytes(*objects) -> self
3820 *
3821 * Concatenates each object in +objects+ into +self+; returns +self+;
3822 * performs no encoding validation or conversion:
3823 *
3824 * s = 'foo'
3825 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3826 * s.valid_encoding? # => false
3827 * s.append_as_bytes("\xAC 12")
3828 * s.valid_encoding? # => true
3829 *
3830 * When a given object is an integer,
3831 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3833 * appends only the low-order byte (similar to String#setbyte):
3834 *
3835 * s = ""
3836 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3837 * s.bytesize # => 2
3838 *
3839 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3840 */
3841
3842VALUE
3843rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3844{
3845 long needed_capacity = 0;
3846 volatile VALUE t0;
3847 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3848
3849 for (int index = 0; index < argc; index++) {
3850 VALUE obj = argv[index];
3851 enum ruby_value_type type = types[index] = rb_type(obj);
3852 switch (type) {
3853 case T_FIXNUM:
3854 case T_BIGNUM:
3855 needed_capacity++;
3856 break;
3857 case T_STRING:
3858 needed_capacity += RSTRING_LEN(obj);
3859 break;
3860 default:
3861 rb_raise(
3863 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3864 rb_obj_class(obj)
3865 );
3866 break;
3867 }
3868 }
3869
3870 str_ensure_available_capa(str, needed_capacity);
3871 char *sptr = RSTRING_END(str);
3872
3873 for (int index = 0; index < argc; index++) {
3874 VALUE obj = argv[index];
3875 enum ruby_value_type type = types[index];
3876 switch (type) {
3877 case T_FIXNUM:
3878 case T_BIGNUM: {
3879 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3880 char byte = (char)(NUM2INT(obj) & 0xFF);
3881 *sptr = byte;
3882 sptr++;
3883 break;
3884 }
3885 case T_STRING: {
3886 const char *ptr;
3887 long len;
3888 RSTRING_GETMEM(obj, ptr, len);
3889 memcpy(sptr, ptr, len);
3890 sptr += len;
3891 break;
3892 }
3893 default:
3894 rb_bug("append_as_bytes arguments should have been validated");
3895 }
3896 }
3897
3898 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3899 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3900
3901 int cr = ENC_CODERANGE(str);
3902 switch (cr) {
3903 case ENC_CODERANGE_7BIT: {
3904 for (int index = 0; index < argc; index++) {
3905 VALUE obj = argv[index];
3906 enum ruby_value_type type = types[index];
3907 switch (type) {
3908 case T_FIXNUM:
3909 case T_BIGNUM: {
3910 if (!ISASCII(NUM2INT(obj))) {
3911 goto clear_cr;
3912 }
3913 break;
3914 }
3915 case T_STRING: {
3916 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3917 goto clear_cr;
3918 }
3919 break;
3920 }
3921 default:
3922 rb_bug("append_as_bytes arguments should have been validated");
3923 }
3924 }
3925 break;
3926 }
3928 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3929 goto keep_cr;
3930 }
3931 else {
3932 goto clear_cr;
3933 }
3934 break;
3935 default:
3936 goto clear_cr;
3937 break;
3938 }
3939
3940 RB_GC_GUARD(t0);
3941
3942 clear_cr:
3943 // If no fast path was hit, we clear the coderange.
3944 // append_as_bytes is predominently meant to be used in
3945 // buffering situation, hence it's likely the coderange
3946 // will never be scanned, so it's not worth spending time
3947 // precomputing the coderange except for simple and common
3948 // situations.
3950 keep_cr:
3951 return str;
3952}
3953
3954/*
3955 * call-seq:
3956 * self << object -> self
3957 *
3958 * Appends a string representation of +object+ to +self+;
3959 * returns +self+.
3960 *
3961 * If +object+ is a string, appends it to +self+:
3962 *
3963 * s = 'foo'
3964 * s << 'bar' # => "foobar"
3965 * s # => "foobar"
3966 *
3967 * If +object+ is an integer,
3968 * its value is considered a codepoint;
3969 * converts the value to a character before concatenating:
3970 *
3971 * s = 'foo'
3972 * s << 33 # => "foo!"
3973 *
3974 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3975 * and the encoding of +self+ is Encoding::US_ASCII,
3976 * changes the encoding to Encoding::ASCII_8BIT:
3977 *
3978 * s = 'foo'.encode(Encoding::US_ASCII)
3979 * s.encoding # => #<Encoding:US-ASCII>
3980 * s << 0xff # => "foo\xFF"
3981 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3982 *
3983 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3984 *
3985 * s = 'foo'
3986 * s.encoding # => <Encoding:UTF-8>
3987 * s << 0x00110000 # 1114112 out of char range (RangeError)
3988 * s = 'foo'.encode(Encoding::EUC_JP)
3989 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3990 *
3991 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3992 */
3993VALUE
3995{
3996 unsigned int code;
3997 rb_encoding *enc = STR_ENC_GET(str1);
3998 int encidx;
3999
4000 if (RB_INTEGER_TYPE_P(str2)) {
4001 if (rb_num_to_uint(str2, &code) == 0) {
4002 }
4003 else if (FIXNUM_P(str2)) {
4004 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4005 }
4006 else {
4007 rb_raise(rb_eRangeError, "bignum out of char range");
4008 }
4009 }
4010 else {
4011 return rb_str_append(str1, str2);
4012 }
4013
4014 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4015
4016 if (encidx >= 0) {
4017 rb_str_buf_cat_byte(str1, (unsigned char)code);
4018 }
4019 else {
4020 long pos = RSTRING_LEN(str1);
4021 int cr = ENC_CODERANGE(str1);
4022 int len;
4023 char *buf;
4024
4025 switch (len = rb_enc_codelen(code, enc)) {
4026 case ONIGERR_INVALID_CODE_POINT_VALUE:
4027 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4028 break;
4029 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4030 case 0:
4031 rb_raise(rb_eRangeError, "%u out of char range", code);
4032 break;
4033 }
4034 buf = ALLOCA_N(char, len + 1);
4035 rb_enc_mbcput(code, buf, enc);
4036 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4037 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4038 }
4039 rb_str_resize(str1, pos+len);
4040 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4041 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4043 }
4044 else if (cr == ENC_CODERANGE_BROKEN) {
4046 }
4047 ENC_CODERANGE_SET(str1, cr);
4048 }
4049 return str1;
4050}
4051
4052int
4053rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4054{
4055 int encidx = rb_enc_to_index(enc);
4056
4057 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4058 /* US-ASCII automatically extended to ASCII-8BIT */
4059 if (code > 0xFF) {
4060 rb_raise(rb_eRangeError, "%u out of char range", code);
4061 }
4062 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4063 return ENCINDEX_ASCII_8BIT;
4064 }
4065 return encidx;
4066 }
4067 else {
4068 return -1;
4069 }
4070}
4071
4072/*
4073 * call-seq:
4074 * prepend(*other_strings) -> string
4075 *
4076 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4077 *
4078 * s = 'foo'
4079 * s.prepend('bar', 'baz') # => "barbazfoo"
4080 * s # => "barbazfoo"
4081 *
4082 * Related: String#concat.
4083 */
4084
4085static VALUE
4086rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4087{
4088 str_modifiable(str);
4089
4090 if (argc == 1) {
4091 rb_str_update(str, 0L, 0L, argv[0]);
4092 }
4093 else if (argc > 1) {
4094 int i;
4095 VALUE arg_str = rb_str_tmp_new(0);
4096 rb_enc_copy(arg_str, str);
4097 for (i = 0; i < argc; i++) {
4098 rb_str_append(arg_str, argv[i]);
4099 }
4100 rb_str_update(str, 0L, 0L, arg_str);
4101 }
4102
4103 return str;
4104}
4105
4106st_index_t
4108{
4109 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4110 st_index_t precomputed_hash;
4111 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4112
4113 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4114 return precomputed_hash;
4115 }
4116
4117 return str_do_hash(str);
4118}
4119
4120int
4122{
4123 long len1, len2;
4124 const char *ptr1, *ptr2;
4125 RSTRING_GETMEM(str1, ptr1, len1);
4126 RSTRING_GETMEM(str2, ptr2, len2);
4127 return (len1 != len2 ||
4128 !rb_str_comparable(str1, str2) ||
4129 memcmp(ptr1, ptr2, len1) != 0);
4130}
4131
4132/*
4133 * call-seq:
4134 * hash -> integer
4135 *
4136 * Returns the integer hash value for +self+.
4137 * The value is based on the length, content and encoding of +self+.
4138 *
4139 * Related: Object#hash.
4140 */
4141
4142static VALUE
4143rb_str_hash_m(VALUE str)
4144{
4145 st_index_t hval = rb_str_hash(str);
4146 return ST2FIX(hval);
4147}
4148
/* Yields the smaller of a and b; an argument is expanded more than once, so
 * pass expressions without side effects. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4150
4151int
4153{
4154 int idx1, idx2;
4155 int rc1, rc2;
4156
4157 if (RSTRING_LEN(str1) == 0) return TRUE;
4158 if (RSTRING_LEN(str2) == 0) return TRUE;
4159 idx1 = ENCODING_GET(str1);
4160 idx2 = ENCODING_GET(str2);
4161 if (idx1 == idx2) return TRUE;
4162 rc1 = rb_enc_str_coderange(str1);
4163 rc2 = rb_enc_str_coderange(str2);
4164 if (rc1 == ENC_CODERANGE_7BIT) {
4165 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4166 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4167 return TRUE;
4168 }
4169 if (rc2 == ENC_CODERANGE_7BIT) {
4170 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4171 return TRUE;
4172 }
4173 return FALSE;
4174}
4175
4176int
4178{
4179 long len1, len2;
4180 const char *ptr1, *ptr2;
4181 int retval;
4182
4183 if (str1 == str2) return 0;
4184 RSTRING_GETMEM(str1, ptr1, len1);
4185 RSTRING_GETMEM(str2, ptr2, len2);
4186 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4187 if (len1 == len2) {
4188 if (!rb_str_comparable(str1, str2)) {
4189 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4190 return 1;
4191 return -1;
4192 }
4193 return 0;
4194 }
4195 if (len1 > len2) return 1;
4196 return -1;
4197 }
4198 if (retval > 0) return 1;
4199 return -1;
4200}
4201
4202/*
4203 * call-seq:
4204 * self == object -> true or false
4205 *
4206 * Returns whether +object+ is equal to +self+.
4207 *
4208 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4209 *
4210 * s = 'foo'
4211 * s == 'foo' # => true
4212 * s == 'food' # => false
4213 * s == 'FOO' # => false
4214 *
4215 * Returns +false+ if the two strings' encodings are not compatible:
4216 *
4217 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4218 *
4219 * When +object+ is not a string:
4220 *
4221 * - If +object+ responds to method <tt>to_str</tt>,
4222 * <tt>object == self</tt> is called and its return value is returned.
4223 * - If +object+ does not respond to <tt>to_str</tt>,
4224 * +false+ is returned.
4225 *
4226 * Related: {Comparing}[rdoc-ref:String@Comparing].
4227 */
4228
4229VALUE
4231{
4232 if (str1 == str2) return Qtrue;
4233 if (!RB_TYPE_P(str2, T_STRING)) {
4234 if (!rb_respond_to(str2, idTo_str)) {
4235 return Qfalse;
4236 }
4237 return rb_equal(str2, str1);
4238 }
4239 return rb_str_eql_internal(str1, str2);
4240}
4241
4242/*
4243 * call-seq:
4244 * eql?(object) -> true or false
4245 *
4246 * :include: doc/string/eql_p.rdoc
4247 *
4248 */
4249
4250VALUE
4251rb_str_eql(VALUE str1, VALUE str2)
4252{
4253 if (str1 == str2) return Qtrue;
4254 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4255 return rb_str_eql_internal(str1, str2);
4256}
4257
4258/*
4259 * call-seq:
4260 * self <=> other_string -> -1, 0, 1, or nil
4261 *
4262 * Compares +self+ and +other_string+, returning:
4263 *
4264 * - -1 if +other_string+ is larger.
4265 * - 0 if the two are equal.
4266 * - 1 if +other_string+ is smaller.
4267 * - +nil+ if the two are incomparable.
4268 *
4269 * Examples:
4270 *
4271 * 'foo' <=> 'foo' # => 0
4272 * 'foo' <=> 'food' # => -1
4273 * 'food' <=> 'foo' # => 1
4274 * 'FOO' <=> 'foo' # => -1
4275 * 'foo' <=> 'FOO' # => 1
4276 * 'foo' <=> 1 # => nil
4277 *
4278 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4279 */
4280
4281static VALUE
4282rb_str_cmp_m(VALUE str1, VALUE str2)
4283{
4284 int result;
4285 VALUE s = rb_check_string_type(str2);
4286 if (NIL_P(s)) {
4287 return rb_invcmp(str1, str2);
4288 }
4289 result = rb_str_cmp(str1, s);
4290 return INT2FIX(result);
4291}
4292
4293static VALUE str_casecmp(VALUE str1, VALUE str2);
4294static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4295
4296/*
4297 * call-seq:
4298 * casecmp(other_string) -> -1, 0, 1, or nil
4299 *
4300 * Ignoring case, compares +self+ and +other_string+; returns:
4301 *
4302 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4303 * - 0 if the two are equal.
4304 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4305 * - +nil+ if the two are incomparable.
4306 *
4307 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4308 *
4309 * Examples:
4310 *
4311 * 'foo'.casecmp('goo') # => -1
4312 * 'goo'.casecmp('foo') # => 1
4313 * 'foo'.casecmp('food') # => -1
4314 * 'food'.casecmp('foo') # => 1
4315 * 'FOO'.casecmp('foo') # => 0
4316 * 'foo'.casecmp('FOO') # => 0
4317 * 'foo'.casecmp(1) # => nil
4318 *
4319 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4320 */
4321
4322static VALUE
4323rb_str_casecmp(VALUE str1, VALUE str2)
4324{
4325 VALUE s = rb_check_string_type(str2);
4326 if (NIL_P(s)) {
4327 return Qnil;
4328 }
4329 return str_casecmp(str1, s);
4330}
4331
4332static VALUE
4333str_casecmp(VALUE str1, VALUE str2)
4334{
4335 long len;
4336 rb_encoding *enc;
4337 const char *p1, *p1end, *p2, *p2end;
4338
4339 enc = rb_enc_compatible(str1, str2);
4340 if (!enc) {
4341 return Qnil;
4342 }
4343
4344 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4345 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4346 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4347 while (p1 < p1end && p2 < p2end) {
4348 if (*p1 != *p2) {
4349 unsigned int c1 = TOLOWER(*p1 & 0xff);
4350 unsigned int c2 = TOLOWER(*p2 & 0xff);
4351 if (c1 != c2)
4352 return INT2FIX(c1 < c2 ? -1 : 1);
4353 }
4354 p1++;
4355 p2++;
4356 }
4357 }
4358 else {
4359 while (p1 < p1end && p2 < p2end) {
4360 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4361 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4362
4363 if (0 <= c1 && 0 <= c2) {
4364 c1 = TOLOWER(c1);
4365 c2 = TOLOWER(c2);
4366 if (c1 != c2)
4367 return INT2FIX(c1 < c2 ? -1 : 1);
4368 }
4369 else {
4370 int r;
4371 l1 = rb_enc_mbclen(p1, p1end, enc);
4372 l2 = rb_enc_mbclen(p2, p2end, enc);
4373 len = l1 < l2 ? l1 : l2;
4374 r = memcmp(p1, p2, len);
4375 if (r != 0)
4376 return INT2FIX(r < 0 ? -1 : 1);
4377 if (l1 != l2)
4378 return INT2FIX(l1 < l2 ? -1 : 1);
4379 }
4380 p1 += l1;
4381 p2 += l2;
4382 }
4383 }
4384 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4385 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4386 return INT2FIX(-1);
4387}
4388
4389/*
4390 * call-seq:
4391 * casecmp?(other_string) -> true, false, or nil
4392 *
4393 * Returns +true+ if +self+ and +other_string+ are equal after
4394 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4395 *
4396 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4397 *
4398 * Examples:
4399 *
4400 * 'foo'.casecmp?('goo') # => false
4401 * 'goo'.casecmp?('foo') # => false
4402 * 'foo'.casecmp?('food') # => false
4403 * 'food'.casecmp?('foo') # => false
4404 * 'FOO'.casecmp?('foo') # => true
4405 * 'foo'.casecmp?('FOO') # => true
4406 * 'foo'.casecmp?(1) # => nil
4407 *
4408 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4409 */
4410
4411static VALUE
4412rb_str_casecmp_p(VALUE str1, VALUE str2)
4413{
4414 VALUE s = rb_check_string_type(str2);
4415 if (NIL_P(s)) {
4416 return Qnil;
4417 }
4418 return str_casecmp_p(str1, s);
4419}
4420
4421static VALUE
4422str_casecmp_p(VALUE str1, VALUE str2)
4423{
4424 rb_encoding *enc;
4425 VALUE folded_str1, folded_str2;
4426 VALUE fold_opt = sym_fold;
4427
4428 enc = rb_enc_compatible(str1, str2);
4429 if (!enc) {
4430 return Qnil;
4431 }
4432
4433 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4434 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4435
4436 return rb_str_eql(folded_str1, folded_str2);
4437}
4438
4439static long
4440strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4441 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4442{
4443 const char *search_start = str_ptr;
4444 long pos, search_len = str_len - offset;
4445
4446 for (;;) {
4447 const char *t;
4448 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4449 if (pos < 0) return pos;
4450 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4451 if (t == search_start + pos) break;
4452 search_len -= t - search_start;
4453 if (search_len <= 0) return -1;
4454 offset += t - search_start;
4455 search_start = t;
4456 }
4457 return pos + offset;
4458}
4459
/* the index found is returned as a byte offset */
4461#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4462#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4463
4464static long
4465rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4466{
4467 const char *str_ptr, *str_ptr_end, *sub_ptr;
4468 long str_len, sub_len;
4469 rb_encoding *enc;
4470
4471 enc = rb_enc_check(str, sub);
4472 if (is_broken_string(sub)) return -1;
4473
4474 str_ptr = RSTRING_PTR(str);
4475 str_ptr_end = RSTRING_END(str);
4476 str_len = RSTRING_LEN(str);
4477 sub_ptr = RSTRING_PTR(sub);
4478 sub_len = RSTRING_LEN(sub);
4479
4480 if (str_len < sub_len) return -1;
4481
4482 if (offset != 0) {
4483 long str_len_char, sub_len_char;
4484 int single_byte = single_byte_optimizable(str);
4485 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4486 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4487 if (offset < 0) {
4488 offset += str_len_char;
4489 if (offset < 0) return -1;
4490 }
4491 if (str_len_char - offset < sub_len_char) return -1;
4492 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4493 str_ptr += offset;
4494 }
4495 if (sub_len == 0) return offset;
4496
4497 /* need proceed one character at a time */
4498 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4499}
4500
4501
4502/*
4503 * call-seq:
4504 * index(substring, offset = 0) -> integer or nil
4505 * index(regexp, offset = 0) -> integer or nil
4506 *
4507 * :include: doc/string/index.rdoc
4508 *
4509 */
4510
4511static VALUE
4512rb_str_index_m(int argc, VALUE *argv, VALUE str)
4513{
4514 VALUE sub;
4515 VALUE initpos;
4516 rb_encoding *enc = STR_ENC_GET(str);
4517 long pos;
4518
4519 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4520 long slen = str_strlen(str, enc); /* str's enc */
4521 pos = NUM2LONG(initpos);
4522 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4523 if (RB_TYPE_P(sub, T_REGEXP)) {
4525 }
4526 return Qnil;
4527 }
4528 }
4529 else {
4530 pos = 0;
4531 }
4532
4533 if (RB_TYPE_P(sub, T_REGEXP)) {
4534 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4535 enc, single_byte_optimizable(str));
4536
4537 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4538 VALUE match = rb_backref_get();
4539 struct re_registers *regs = RMATCH_REGS(match);
4540 pos = rb_str_sublen(str, BEG(0));
4541 return LONG2NUM(pos);
4542 }
4543 }
4544 else {
4545 StringValue(sub);
4546 pos = rb_str_index(str, sub, pos);
4547 if (pos >= 0) {
4548 pos = rb_str_sublen(str, pos);
4549 return LONG2NUM(pos);
4550 }
4551 }
4552 return Qnil;
4553}
4554
4555/* Ensure that the given pos is a valid character boundary.
4556 * Note that in this function, "character" means a code point
4557 * (Unicode scalar value), not a grapheme cluster.
4558 */
4559static void
4560str_ensure_byte_pos(VALUE str, long pos)
4561{
4562 if (!single_byte_optimizable(str)) {
4563 const char *s = RSTRING_PTR(str);
4564 const char *e = RSTRING_END(str);
4565 const char *p = s + pos;
4566 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4567 rb_raise(rb_eIndexError,
4568 "offset %ld does not land on character boundary", pos);
4569 }
4570 }
4571}
4572
4573/*
4574 * call-seq:
4575 * byteindex(object, offset = 0) -> integer or nil
4576 *
4577 * Returns the 0-based integer index of a substring of +self+
4578 * specified by +object+ (a string or Regexp) and +offset+,
4579 * or +nil+ if there is no such substring;
4580 * the returned index is the count of _bytes_ (not characters).
4581 *
4582 * When +object+ is a string,
4583 * returns the index of the first found substring equal to +object+:
4584 *
4585 * s = 'foo' # => "foo"
4586 * s.size # => 3 # Three 1-byte characters.
4587 * s.bytesize # => 3 # Three bytes.
4588 * s.byteindex('f') # => 0
4589 * s.byteindex('o') # => 1
4590 * s.byteindex('oo') # => 1
4591 * s.byteindex('ooo') # => nil
4592 *
4593 * When +object+ is a Regexp,
4594 * returns the index of the first found substring matching +object+;
4595 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4596 *
4597 * s = 'foo'
4598 * s.byteindex(/f/) # => 0
4599 * $~ # => #<MatchData "f">
4600 * s.byteindex(/o/) # => 1
4601 * s.byteindex(/oo/) # => 1
4602 * s.byteindex(/ooo/) # => nil
4603 * $~ # => nil
4604 *
4605 * \Integer argument +offset+, if given, specifies the 0-based index
4606 * of the byte where searching is to begin.
4607 *
4608 * When +offset+ is non-negative,
4609 * searching begins at byte position +offset+:
4610 *
4611 * s = 'foo'
4612 * s.byteindex('o', 1) # => 1
4613 * s.byteindex('o', 2) # => 2
4614 * s.byteindex('o', 3) # => nil
4615 *
4616 * When +offset+ is negative, counts backward from the end of +self+:
4617 *
4618 * s = 'foo'
4619 * s.byteindex('o', -1) # => 2
4620 * s.byteindex('o', -2) # => 1
4621 * s.byteindex('o', -3) # => 1
4622 * s.byteindex('o', -4) # => nil
4623 *
4624 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4625 *
4626 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4627 * s.size # => 2 # Two 3-byte characters.
4628 * s.bytesize # => 6 # Six bytes.
4629 * s.byteindex("\uFFFF") # => 0
4630 * s.byteindex("\uFFFF", 1) # Raises IndexError
4631 * s.byteindex("\uFFFF", 2) # Raises IndexError
4632 * s.byteindex("\uFFFF", 3) # => 3
4633 * s.byteindex("\uFFFF", 4) # Raises IndexError
4634 * s.byteindex("\uFFFF", 5) # Raises IndexError
4635 * s.byteindex("\uFFFF", 6) # => nil
4636 *
4637 * Related: see {Querying}[rdoc-ref:String@Querying].
4638 */
4639
4640static VALUE
4641rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4642{
4643 VALUE sub;
4644 VALUE initpos;
4645 long pos;
4646
4647 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4648 long slen = RSTRING_LEN(str);
4649 pos = NUM2LONG(initpos);
4650 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4651 if (RB_TYPE_P(sub, T_REGEXP)) {
4653 }
4654 return Qnil;
4655 }
4656 }
4657 else {
4658 pos = 0;
4659 }
4660
4661 str_ensure_byte_pos(str, pos);
4662
4663 if (RB_TYPE_P(sub, T_REGEXP)) {
4664 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4665 VALUE match = rb_backref_get();
4666 struct re_registers *regs = RMATCH_REGS(match);
4667 pos = BEG(0);
4668 return LONG2NUM(pos);
4669 }
4670 }
4671 else {
4672 StringValue(sub);
4673 pos = rb_str_byteindex(str, sub, pos);
4674 if (pos >= 0) return LONG2NUM(pos);
4675 }
4676 return Qnil;
4677}
4678
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: scans the search_len bytes
 * at search_str from the last one backwards and returns a pointer to the
 * last byte whose unsigned value equals chr, or a null pointer when no byte
 * matches (including when search_len is not positive).
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long left = search_len; left > 0; left--) {
        const char *cand = search_str + (left - 1);
        if ((unsigned char)*cand == chr) {
            return (void *)cand;
        }
    }

    return ((void *)0);
}
#endif
4691
4692static long
4693str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4694{
4695 char *hit, *adjusted;
4696 int c;
4697 long slen, searchlen;
4698 char *sbeg, *e, *t;
4699
4700 sbeg = RSTRING_PTR(str);
4701 slen = RSTRING_LEN(sub);
4702 if (slen == 0) return s - sbeg;
4703 e = RSTRING_END(str);
4704 t = RSTRING_PTR(sub);
4705 c = *t & 0xff;
4706 searchlen = s - sbeg + 1;
4707
4708 if (memcmp(s, t, slen) == 0) {
4709 return s - sbeg;
4710 }
4711
4712 do {
4713 hit = memrchr(sbeg, c, searchlen);
4714 if (!hit) break;
4715 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4716 if (hit != adjusted) {
4717 searchlen = adjusted - sbeg;
4718 continue;
4719 }
4720 if (memcmp(hit, t, slen) == 0)
4721 return hit - sbeg;
4722 searchlen = adjusted - sbeg;
4723 } while (searchlen > 0);
4724
4725 return -1;
4726}
4727
/* the index found is returned as a byte offset */
4729static long
4730rb_str_rindex(VALUE str, VALUE sub, long pos)
4731{
4732 long len, slen;
4733 char *sbeg, *s;
4734 rb_encoding *enc;
4735 int singlebyte;
4736
4737 enc = rb_enc_check(str, sub);
4738 if (is_broken_string(sub)) return -1;
4739 singlebyte = single_byte_optimizable(str);
4740 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4741 slen = str_strlen(sub, enc); /* rb_enc_check */
4742
4743 /* substring longer than string */
4744 if (len < slen) return -1;
4745 if (len - pos < slen) pos = len - slen;
4746 if (len == 0) return pos;
4747
4748 sbeg = RSTRING_PTR(str);
4749
4750 if (pos == 0) {
4751 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4752 return 0;
4753 else
4754 return -1;
4755 }
4756
4757 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4758 return str_rindex(str, sub, s, enc);
4759}
4760
4761/*
4762 * call-seq:
4763 * rindex(substring, offset = self.length) -> integer or nil
4764 * rindex(regexp, offset = self.length) -> integer or nil
4765 *
4766 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4767 * or +nil+ if none found:
4768 *
4769 * 'foo'.rindex('f') # => 0
4770 * 'foo'.rindex('o') # => 2
4771 * 'foo'.rindex('oo') # => 1
4772 * 'foo'.rindex('ooo') # => nil
4773 *
4774 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4775 * or +nil+ if none found:
4776 *
4777 * 'foo'.rindex(/f/) # => 0
4778 * 'foo'.rindex(/o/) # => 2
4779 * 'foo'.rindex(/oo/) # => 1
4780 * 'foo'.rindex(/ooo/) # => nil
4781 *
4782 * The _last_ match means starting at the possible last position, not
4783 * the last of longest matches.
4784 *
4785 * 'foo'.rindex(/o+/) # => 2
4786 * $~ #=> #<MatchData "o">
4787 *
 *  To get the last of the longest matches instead, combine the pattern with a
 *  negative lookbehind:
4790 *
4791 * 'foo'.rindex(/(?<!o)o+/) # => 1
4792 * $~ #=> #<MatchData "oo">
4793 *
 *  Or use String#index with a negative lookahead:
4795 *
4796 * 'foo'.index(/o+(?!.*o)/) # => 1
4797 * $~ #=> #<MatchData "oo">
4798 *
4799 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4800 * string to _end_ the search:
4801 *
4802 * 'foo'.rindex('o', 0) # => nil
4803 * 'foo'.rindex('o', 1) # => 1
4804 * 'foo'.rindex('o', 2) # => 2
4805 * 'foo'.rindex('o', 3) # => 2
4806 *
4807 * If +offset+ is a negative Integer, the maximum starting position in the
4808 * string to _end_ the search is the sum of the string's length and +offset+:
4809 *
4810 * 'foo'.rindex('o', -1) # => 2
4811 * 'foo'.rindex('o', -2) # => 1
4812 * 'foo'.rindex('o', -3) # => nil
4813 * 'foo'.rindex('o', -4) # => nil
4814 *
4815 * Related: String#index.
4816 */
4817
4818static VALUE
4819rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4820{
4821 VALUE sub;
4822 VALUE initpos;
4823 rb_encoding *enc = STR_ENC_GET(str);
4824 long pos, len = str_strlen(str, enc); /* str's enc */
4825
4826 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4827 pos = NUM2LONG(initpos);
4828 if (pos < 0 && (pos += len) < 0) {
4829 if (RB_TYPE_P(sub, T_REGEXP)) {
4831 }
4832 return Qnil;
4833 }
4834 if (pos > len) pos = len;
4835 }
4836 else {
4837 pos = len;
4838 }
4839
4840 if (RB_TYPE_P(sub, T_REGEXP)) {
4841 /* enc = rb_enc_check(str, sub); */
4842 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4843 enc, single_byte_optimizable(str));
4844
4845 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4846 VALUE match = rb_backref_get();
4847 struct re_registers *regs = RMATCH_REGS(match);
4848 pos = rb_str_sublen(str, BEG(0));
4849 return LONG2NUM(pos);
4850 }
4851 }
4852 else {
4853 StringValue(sub);
4854 pos = rb_str_rindex(str, sub, pos);
4855 if (pos >= 0) {
4856 pos = rb_str_sublen(str, pos);
4857 return LONG2NUM(pos);
4858 }
4859 }
4860 return Qnil;
4861}
4862
4863static long
4864rb_str_byterindex(VALUE str, VALUE sub, long pos)
4865{
4866 long len, slen;
4867 char *sbeg, *s;
4868 rb_encoding *enc;
4869
4870 enc = rb_enc_check(str, sub);
4871 if (is_broken_string(sub)) return -1;
4872 len = RSTRING_LEN(str);
4873 slen = RSTRING_LEN(sub);
4874
4875 /* substring longer than string */
4876 if (len < slen) return -1;
4877 if (len - pos < slen) pos = len - slen;
4878 if (len == 0) return pos;
4879
4880 sbeg = RSTRING_PTR(str);
4881
4882 if (pos == 0) {
4883 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4884 return 0;
4885 else
4886 return -1;
4887 }
4888
4889 s = sbeg + pos;
4890 return str_rindex(str, sub, s, enc);
4891}
4892
4893/*
4894 * call-seq:
4895 * byterindex(object, offset = self.bytesize) -> integer or nil
4896 *
4897 * Returns the 0-based integer index of a substring of +self+
4898 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4899 * or +nil+ if there is no such substring;
4900 * the returned index is the count of _bytes_ (not characters).
4901 *
4902 * When +object+ is a string,
4903 * returns the index of the _last_ found substring equal to +object+:
4904 *
4905 * s = 'foo' # => "foo"
4906 * s.size # => 3 # Three 1-byte characters.
4907 * s.bytesize # => 3 # Three bytes.
4908 * s.byterindex('f') # => 0
 *   s.byterindex('o')   # => 2
 *   s.byterindex('oo')  # => 1
 *   s.byterindex('ooo') # => nil
4912 *
4913 * When +object+ is a Regexp,
4914 * returns the index of the last found substring matching +object+;
4915 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4916 *
4917 * s = 'foo'
4918 * s.byterindex(/f/) # => 0
4919 * $~ # => #<MatchData "f">
4920 * s.byterindex(/o/) # => 2
4921 * s.byterindex(/oo/) # => 1
4922 * s.byterindex(/ooo/) # => nil
4923 * $~ # => nil
4924 *
4925 * The last match means starting at the possible last position,
4926 * not the last of the longest matches:
4927 *
4928 * s = 'foo'
4929 * s.byterindex(/o+/) # => 2
4930 * $~ #=> #<MatchData "o">
4931 *
4932 * To get the last longest match, use a negative lookbehind:
4933 *
4934 * s = 'foo'
4935 * s.byterindex(/(?<!o)o+/) # => 1
4936 * $~ # => #<MatchData "oo">
4937 *
4938 * Or use method #byteindex with negative lookahead:
4939 *
4940 * s = 'foo'
4941 * s.byteindex(/o+(?!.*o)/) # => 1
4942 * $~ #=> #<MatchData "oo">
4943 *
4944 * \Integer argument +offset+, if given, specifies the 0-based index
4945 * of the byte where searching is to end.
4946 *
4947 * When +offset+ is non-negative,
4948 * searching ends at byte position +offset+:
4949 *
4950 * s = 'foo'
4951 * s.byterindex('o', 0) # => nil
4952 * s.byterindex('o', 1) # => 1
4953 * s.byterindex('o', 2) # => 2
4954 * s.byterindex('o', 3) # => 2
4955 *
4956 * When +offset+ is negative, counts backward from the end of +self+:
4957 *
4958 * s = 'foo'
4959 * s.byterindex('o', -1) # => 2
4960 * s.byterindex('o', -2) # => 1
4961 * s.byterindex('o', -3) # => nil
4962 *
4963 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4964 *
4965 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4966 * s.size # => 2 # Two 3-byte characters.
4967 * s.bytesize # => 6 # Six bytes.
4968 * s.byterindex("\uFFFF") # => 3
4969 * s.byterindex("\uFFFF", 1) # Raises IndexError
4970 * s.byterindex("\uFFFF", 2) # Raises IndexError
4971 * s.byterindex("\uFFFF", 3) # => 3
4972 * s.byterindex("\uFFFF", 4) # Raises IndexError
4973 * s.byterindex("\uFFFF", 5) # Raises IndexError
4974 * s.byterindex("\uFFFF", 6) # => nil
4975 *
4976 * Related: see {Querying}[rdoc-ref:String@Querying].
4977 */
4978
4979static VALUE
4980rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4981{
4982 VALUE sub;
4983 VALUE initpos;
4984 long pos, len = RSTRING_LEN(str);
4985
4986 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4987 pos = NUM2LONG(initpos);
4988 if (pos < 0 && (pos += len) < 0) {
4989 if (RB_TYPE_P(sub, T_REGEXP)) {
4991 }
4992 return Qnil;
4993 }
4994 if (pos > len) pos = len;
4995 }
4996 else {
4997 pos = len;
4998 }
4999
5000 str_ensure_byte_pos(str, pos);
5001
5002 if (RB_TYPE_P(sub, T_REGEXP)) {
5003 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5004 VALUE match = rb_backref_get();
5005 struct re_registers *regs = RMATCH_REGS(match);
5006 pos = BEG(0);
5007 return LONG2NUM(pos);
5008 }
5009 }
5010 else {
5011 StringValue(sub);
5012 pos = rb_str_byterindex(str, sub, pos);
5013 if (pos >= 0) return LONG2NUM(pos);
5014 }
5015 return Qnil;
5016}
5017
5018/*
5019 * call-seq:
5020 * self =~ object -> integer or nil
5021 *
5022 * When +object+ is a Regexp, returns the index of the first substring in +self+
5023 * matched by +object+,
5024 * or +nil+ if no match is found;
5025 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5026 *
5027 * 'foo' =~ /f/ # => 0
5028 * $~ # => #<MatchData "f">
5029 * 'foo' =~ /o/ # => 1
5030 * $~ # => #<MatchData "o">
5031 * 'foo' =~ /x/ # => nil
5032 * $~ # => nil
5033 *
5034 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5035 * (see Regexp#=~):
5036 *
5037 * number = nil
5038 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5039 * number # => nil # Not assigned.
5040 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5041 * number # => "9" # Assigned.
5042 *
5043 * If +object+ is not a Regexp, returns the value
5044 * returned by <tt>object =~ self</tt>.
5045 *
5046 * Related: see {Querying}[rdoc-ref:String@Querying].
5047 */
5048
5049static VALUE
5050rb_str_match(VALUE x, VALUE y)
5051{
5052 switch (OBJ_BUILTIN_TYPE(y)) {
5053 case T_STRING:
5054 rb_raise(rb_eTypeError, "type mismatch: String given");
5055
5056 case T_REGEXP:
5057 return rb_reg_match(y, x);
5058
5059 default:
5060 return rb_funcall(y, idEqTilde, 1, x);
5061 }
5062}
5063
5064
5065static VALUE get_pat(VALUE);
5066
5067
5068/*
5069 * call-seq:
5070 * match(pattern, offset = 0) -> matchdata or nil
5071 * match(pattern, offset = 0) {|matchdata| ... } -> object
5072 *
5073 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5074 *
5075 * Note: also updates Regexp@Global+Variables.
5076 *
5077 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5078 * regexp = Regexp.new(pattern)
5079 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5080 * (see Regexp#match):
5081 * matchdata = regexp.match(self)
5082 *
5083 * With no block given, returns the computed +matchdata+:
5084 *
5085 * 'foo'.match('f') # => #<MatchData "f">
5086 * 'foo'.match('o') # => #<MatchData "o">
5087 * 'foo'.match('x') # => nil
5088 *
5089 * If Integer argument +offset+ is given, the search begins at index +offset+:
5090 *
5091 * 'foo'.match('f', 1) # => nil
5092 * 'foo'.match('o', 1) # => #<MatchData "o">
5093 *
5094 * With a block given, calls the block with the computed +matchdata+
5095 * and returns the block's return value:
5096 *
5097 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5098 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5099 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5100 *
5101 */
5102
5103static VALUE
5104rb_str_match_m(int argc, VALUE *argv, VALUE str)
5105{
5106 VALUE re, result;
5107 if (argc < 1)
5108 rb_check_arity(argc, 1, 2);
5109 re = argv[0];
5110 argv[0] = str;
5111 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5112 if (!NIL_P(result) && rb_block_given_p()) {
5113 return rb_yield(result);
5114 }
5115 return result;
5116}
5117
5118/*
5119 * call-seq:
5120 * match?(pattern, offset = 0) -> true or false
5121 *
5122 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5123 *
5124 * Note: does not update Regexp@Global+Variables.
5125 *
5126 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5127 * regexp = Regexp.new(pattern)
5128 *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5130 * +false+ otherwise:
5131 *
5132 * 'foo'.match?(/o/) # => true
5133 * 'foo'.match?('o') # => true
5134 * 'foo'.match?(/x/) # => false
5135 *
5136 * If Integer argument +offset+ is given, the search begins at index +offset+:
5137 * 'foo'.match?('f', 1) # => false
5138 * 'foo'.match?('o', 1) # => true
5139 *
5140 */
5141
5142static VALUE
5143rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5144{
5145 VALUE re;
5146 rb_check_arity(argc, 1, 2);
5147 re = get_pat(argv[0]);
5148 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5149}
5150
/* Result of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* input or result is not a valid character */
    NEIGHBOR_FOUND,     /* neighbor of the same byte length was written in place */
    NEIGHBOR_WRAPPED    /* stepping overflowed or changed the byte length */
};
5156
/*
 * Overwrites the +len+-byte character at +p+ with the next valid
 * character of the same byte length in +enc+.
 *
 * Returns NEIGHBOR_FOUND on success, NEIGHBOR_WRAPPED when no such
 * character of that length exists (the bytes at +p+ may have been
 * modified), or NEIGHBOR_NOT_CHAR when the wide-char path meets an
 * invalid character.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        /* step the code point itself and re-encode it */
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* the re-encoded bytes must still form a valid character */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as a big-endian counter and keep
     * incrementing until a valid character of exactly len bytes appears. */
    while (1) {
        /* carry: trailing 0xff bytes roll over to 0x00 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: saturate the tail so the
                 * next iteration carries into the prefix */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* find the longest prefix that is not invalid and saturate
             * everything after it, skipping the invalid range */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
5208
/*
 * Overwrites the +len+-byte character at +p+ with the previous valid
 * character of the same byte length in +enc+ (mirror image of
 * enc_succ_char()).
 *
 * Returns NEIGHBOR_FOUND on success, NEIGHBOR_WRAPPED when no such
 * character of that length exists (the bytes at +p+ may have been
 * modified), or NEIGHBOR_NOT_CHAR when the wide-char path meets an
 * invalid character or code point 0.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        /* code point 0 has no predecessor */
        if (!c) return NEIGHBOR_NOT_CHAR;
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* the re-encoded bytes must still form a valid character */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* Otherwise treat the bytes as a big-endian counter and keep
     * decrementing until a valid character of exactly len bytes appears. */
    while (1) {
        /* borrow: trailing 0x00 bytes roll under to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was found: zero the tail so the
                 * next iteration borrows from the prefix */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* find the longest prefix that is not invalid and zero
             * everything after it, skipping the invalid range */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
5261
/*
  Overwrites the +len+-byte character at +p+ with its succeeding letter
  or digit in +enc+ and returns NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
  When NEIGHBOR_WRAPPED is returned, +p+ holds the first character of
  the range and the carried-out character is stored into +carry+.
  Assumes that each range is contiguous and that the byte length never
  changes within a range.
  NEIGHBOR_NOT_CHAR is returned if the character is neither a digit nor
  a letter, or if its range consists of only one character.
 */
static enum neighbor_char
enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
{
    enum neighbor_char ret;
    unsigned int c;
    int ctype;
    int range;
    char save[ONIGENC_CODE_TO_MBC_MAXLEN];

    /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
    int try;
    const int max_gaps = 1;

    /* classify the character; the successor must stay in the same class */
    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
    if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
        ctype = ONIGENC_CTYPE_DIGIT;
    else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
        ctype = ONIGENC_CTYPE_ALPHA;
    else
        return NEIGHBOR_NOT_CHAR;

    /* try the next character, tolerating up to max_gaps non-members */
    MEMCPY(save, p, char, len);
    for (try = 0; try <= max_gaps; ++try) {
        ret = enc_succ_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (rb_enc_isctype(c, ctype, enc))
                return NEIGHBOR_FOUND;
        }
    }
    /* no successor in the class: rewind p to the first character of the
     * range by stepping backwards until the class is left */
    MEMCPY(p, save, char, len);
    range = 1;
    while (1) {
        MEMCPY(save, p, char, len);
        ret = enc_pred_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (!rb_enc_isctype(c, ctype, enc)) {
                MEMCPY(p, save, char, len);
                break;
            }
        }
        else {
            MEMCPY(p, save, char, len);
            break;
        }
        range++;
    }
    if (range == 1) {
        return NEIGHBOR_NOT_CHAR;
    }

    if (ctype != ONIGENC_CTYPE_DIGIT) {
        /* letters: the carry is the first character of the range */
        MEMCPY(carry, p, char, len);
        return NEIGHBOR_WRAPPED;
    }

    /* digits: the carry is the successor of the first character of the range */
    MEMCPY(carry, p, char, len);
    enc_succ_char(carry, len, enc);
    return NEIGHBOR_WRAPPED;
}
5332
5333
5334static VALUE str_succ(VALUE str);
5335
5336/*
5337 * call-seq:
5338 * succ -> new_str
5339 *
5340 * Returns the successor to +self+. The successor is calculated by
5341 * incrementing characters.
5342 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if no alphanumerics, the rightmost character:
5345 *
5346 * 'THX1138'.succ # => "THX1139"
5347 * '<<koala>>'.succ # => "<<koalb>>"
5348 * '***'.succ # => '**+'
5349 *
5350 * The successor to a digit is another digit, "carrying" to the next-left
5351 * character for a "rollover" from 9 to 0, and prepending another digit
5352 * if necessary:
5353 *
5354 * '00'.succ # => "01"
5355 * '09'.succ # => "10"
5356 * '99'.succ # => "100"
5357 *
5358 * The successor to a letter is another letter of the same case,
5359 * carrying to the next-left character for a rollover,
5360 * and prepending another same-case letter if necessary:
5361 *
5362 * 'aa'.succ # => "ab"
5363 * 'az'.succ # => "ba"
5364 * 'zz'.succ # => "aaa"
5365 * 'AA'.succ # => "AB"
5366 * 'AZ'.succ # => "BA"
5367 * 'ZZ'.succ # => "AAA"
5368 *
5369 * The successor to a non-alphanumeric character is the next character
5370 * in the underlying character set's collating sequence,
5371 * carrying to the next-left character for a rollover,
5372 * and prepending another character if necessary:
5373 *
5374 * s = 0.chr * 3
5375 * s # => "\x00\x00\x00"
5376 * s.succ # => "\x00\x00\x01"
5377 * s = 255.chr * 3
5378 * s # => "\xFF\xFF\xFF"
5379 * s.succ # => "\x01\x00\x00\x00"
5380 *
5381 * Carrying can occur between and among mixtures of alphanumeric characters:
5382 *
5383 * s = 'zz99zz99'
5384 * s.succ # => "aaa00aa00"
5385 * s = '99zz99zz'
5386 * s.succ # => "100aa00aa"
5387 *
5388 * The successor to an empty +String+ is a new empty +String+:
5389 *
5390 * ''.succ # => ""
5391 *
5392 */
5393
5394VALUE
5396{
5397 VALUE str;
5398 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5399 rb_enc_cr_str_copy_for_substr(str, orig);
5400 return str_succ(str);
5401}
5402
5403static VALUE
5404str_succ(VALUE str)
5405{
5406 rb_encoding *enc;
5407 char *sbeg, *s, *e, *last_alnum = 0;
5408 int found_alnum = 0;
5409 long l, slen;
5410 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5411 long carry_pos = 0, carry_len = 1;
5412 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5413
5414 slen = RSTRING_LEN(str);
5415 if (slen == 0) return str;
5416
5417 enc = STR_ENC_GET(str);
5418 sbeg = RSTRING_PTR(str);
5419 s = e = sbeg + slen;
5420
5421 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5422 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5423 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5424 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5425 break;
5426 }
5427 }
5428 l = rb_enc_precise_mbclen(s, e, enc);
5429 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5430 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5431 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5432 switch (neighbor) {
5433 case NEIGHBOR_NOT_CHAR:
5434 continue;
5435 case NEIGHBOR_FOUND:
5436 return str;
5437 case NEIGHBOR_WRAPPED:
5438 last_alnum = s;
5439 break;
5440 }
5441 found_alnum = 1;
5442 carry_pos = s - sbeg;
5443 carry_len = l;
5444 }
5445 if (!found_alnum) { /* str contains no alnum */
5446 s = e;
5447 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5448 enum neighbor_char neighbor;
5449 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5450 l = rb_enc_precise_mbclen(s, e, enc);
5451 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5452 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5453 MEMCPY(tmp, s, char, l);
5454 neighbor = enc_succ_char(tmp, l, enc);
5455 switch (neighbor) {
5456 case NEIGHBOR_FOUND:
5457 MEMCPY(s, tmp, char, l);
5458 return str;
5459 break;
5460 case NEIGHBOR_WRAPPED:
5461 MEMCPY(s, tmp, char, l);
5462 break;
5463 case NEIGHBOR_NOT_CHAR:
5464 break;
5465 }
5466 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5467 /* wrapped to \0...\0. search next valid char. */
5468 enc_succ_char(s, l, enc);
5469 }
5470 if (!rb_enc_asciicompat(enc)) {
5471 MEMCPY(carry, s, char, l);
5472 carry_len = l;
5473 }
5474 carry_pos = s - sbeg;
5475 }
5477 }
5478 RESIZE_CAPA(str, slen + carry_len);
5479 sbeg = RSTRING_PTR(str);
5480 s = sbeg + carry_pos;
5481 memmove(s + carry_len, s, slen - carry_pos);
5482 memmove(s, carry, carry_len);
5483 slen += carry_len;
5484 STR_SET_LEN(str, slen);
5485 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5487 return str;
5488}
5489
5490
5491/*
5492 * call-seq:
5493 * succ! -> self
5494 *
5495 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5496 */
5497
5498static VALUE
5499rb_str_succ_bang(VALUE str)
5500{
5501 rb_str_modify(str);
5502 str_succ(str);
5503 return str;
5504}
5505
/* Returns 1 if each of the first +len+ bytes of +s+ satisfies ISDIGIT
 * (vacuously true for len <= 0), otherwise 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5515
5516static int
5517str_upto_i(VALUE str, VALUE arg)
5518{
5519 rb_yield(str);
5520 return 0;
5521}
5522
5523/*
5524 * call-seq:
5525 * upto(other_string, exclusive = false) {|string| ... } -> self
5526 * upto(other_string, exclusive = false) -> new_enumerator
5527 *
5528 * With a block given, calls the block with each +String+ value
5529 * returned by successive calls to String#succ;
5530 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5531 * the sequence terminates when value +other_string+ is reached;
5532 * returns +self+:
5533 *
5534 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5535 * Output:
5536 *
5537 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5538 *
5539 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5540 *
5541 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5542 *
5543 * Output:
5544 *
5545 * a8 a9 b0 b1 b2 b3 b4 b5
5546 *
5547 * If +other_string+ would not be reached, does not call the block:
5548 *
5549 * '25'.upto('5') {|s| fail s }
5550 * 'aa'.upto('a') {|s| fail s }
5551 *
5552 * With no block given, returns a new Enumerator:
5553 *
5554 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5555 *
5556 */
5557
5558static VALUE
5559rb_str_upto(int argc, VALUE *argv, VALUE beg)
5560{
5561 VALUE end, exclusive;
5562
5563 rb_scan_args(argc, argv, "11", &end, &exclusive);
5564 RETURN_ENUMERATOR(beg, argc, argv);
5565 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5566}
5567
/*
 * Calls +each+(string, +arg+) for every string from +beg+ up to +end+
 * (excluding +end+ itself when +excl+ is non-zero), stepping with #succ.
 * Iteration stops early as soon as +each+ returns non-zero.
 * Always returns +beg+.
 *
 * Two fast paths avoid generic #succ calls: single ASCII characters and
 * all-digit ASCII strings.
 */
VALUE
rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
{
    VALUE current, after_end;
    ID succ;
    int n, ascii;
    rb_encoding *enc;

    CONST_ID(succ, "succ");
    StringValue(end);
    enc = rb_enc_check(beg, end);
    ascii = (is_ascii_string(beg) && is_ascii_string(end));
    /* single character */
    if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
        char c = RSTRING_PTR(beg)[0];
        char e = RSTRING_PTR(end)[0];

        if (c > e || (excl && c == e)) return beg;
        for (;;) {
            VALUE str = rb_enc_str_new(&c, 1, enc);
            if ((*each)(str, arg)) break;
            /* check the inclusive bound before incrementing, the
             * exclusive bound after */
            if (!excl && c == e) break;
            c++;
            if (excl && c == e) break;
        }
        return beg;
    }
    /* both edges are all digits */
    if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
        all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
        all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
        VALUE b, e;
        int width;

        /* zero-pad every generated number to the width of beg */
        width = RSTRING_LENINT(beg);
        b = rb_str_to_inum(beg, 10, FALSE);
        e = rb_str_to_inum(end, 10, FALSE);
        if (FIXNUM_P(b) && FIXNUM_P(e)) {
            /* both bounds are Fixnums: count with native longs */
            long bi = FIX2LONG(b);
            long ei = FIX2LONG(e);
            rb_encoding *usascii = rb_usascii_encoding();

            while (bi <= ei) {
                if (excl && bi == ei) break;
                if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
                bi++;
            }
        }
        else {
            /* at least one bound is not a Fixnum: compare and step via
             * method calls on the Integer objects */
            ID op = excl ? '<' : idLE;
            VALUE args[2], fmt = rb_fstring_lit("%.*d");

            args[0] = INT2FIX(width);
            while (rb_funcall(b, op, 1, e)) {
                args[1] = b;
                if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
                b = rb_funcallv(b, succ, 0, 0);
            }
        }
        return beg;
    }
    /* normal case */
    n = rb_str_cmp(beg, end);
    if (n > 0 || (excl && n == 0)) return beg;

    /* the loop ends when the value after end is reached */
    after_end = rb_funcallv(end, succ, 0, 0);
    current = str_duplicate(rb_cString, beg);
    while (!rb_str_equal(current, after_end)) {
        VALUE next = Qnil;
        /* compute the successor before the callback runs; next stays nil
         * when current is the (inclusive) final value */
        if (excl || !rb_str_equal(current, end))
            next = rb_funcallv(current, succ, 0, 0);
        if ((*each)(current, arg)) break;
        if (NIL_P(next)) break;
        current = next;
        StringValue(current);
        if (excl && rb_str_equal(current, end)) break;
        /* stop once the value is longer than end, or empty */
        if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
            break;
    }

    return beg;
}
5651
5652VALUE
5653rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5654{
5655 VALUE current;
5656 ID succ;
5657
5658 CONST_ID(succ, "succ");
5659 /* both edges are all digits */
5660 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5661 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5662 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5663 int width = RSTRING_LENINT(beg);
5664 b = rb_str_to_inum(beg, 10, FALSE);
5665 if (FIXNUM_P(b)) {
5666 long bi = FIX2LONG(b);
5667 rb_encoding *usascii = rb_usascii_encoding();
5668
5669 while (FIXABLE(bi)) {
5670 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5671 bi++;
5672 }
5673 b = LONG2NUM(bi);
5674 }
5675 args[0] = INT2FIX(width);
5676 while (1) {
5677 args[1] = b;
5678 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5679 b = rb_funcallv(b, succ, 0, 0);
5680 }
5681 }
5682 /* normal case */
5683 current = str_duplicate(rb_cString, beg);
5684 while (1) {
5685 VALUE next = rb_funcallv(current, succ, 0, 0);
5686 if ((*each)(current, arg)) break;
5687 current = next;
5688 StringValue(current);
5689 if (RSTRING_LEN(current) == 0)
5690 break;
5691 }
5692
5693 return beg;
5694}
5695
5696static int
5697include_range_i(VALUE str, VALUE arg)
5698{
5699 VALUE *argp = (VALUE *)arg;
5700 if (!rb_equal(str, *argp)) return 0;
5701 *argp = Qnil;
5702 return 1;
5703}
5704
5705VALUE
5706rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5707{
5708 beg = rb_str_new_frozen(beg);
5709 StringValue(end);
5710 end = rb_str_new_frozen(end);
5711 if (NIL_P(val)) return Qfalse;
5712 val = rb_check_string_type(val);
5713 if (NIL_P(val)) return Qfalse;
5714 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5715 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5716 rb_enc_asciicompat(STR_ENC_GET(val))) {
5717 const char *bp = RSTRING_PTR(beg);
5718 const char *ep = RSTRING_PTR(end);
5719 const char *vp = RSTRING_PTR(val);
5720 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5721 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5722 return Qfalse;
5723 else {
5724 char b = *bp;
5725 char e = *ep;
5726 char v = *vp;
5727
5728 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5729 if (b <= v && v < e) return Qtrue;
5730 return RBOOL(!RTEST(exclusive) && v == e);
5731 }
5732 }
5733 }
5734#if 0
5735 /* both edges are all digits */
5736 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5737 all_digits_p(bp, RSTRING_LEN(beg)) &&
5738 all_digits_p(ep, RSTRING_LEN(end))) {
5739 /* TODO */
5740 }
5741#endif
5742 }
5743 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5744
5745 return RBOOL(NIL_P(val));
5746}
5747
5748static VALUE
5749rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5750{
5751 if (rb_reg_search(re, str, 0, 0) >= 0) {
5752 VALUE match = rb_backref_get();
5753 int nth = rb_reg_backref_number(match, backref);
5754 return rb_reg_nth_match(nth, match);
5755 }
5756 return Qnil;
5757}
5758
5759static VALUE
5760rb_str_aref(VALUE str, VALUE indx)
5761{
5762 long idx;
5763
5764 if (FIXNUM_P(indx)) {
5765 idx = FIX2LONG(indx);
5766 }
5767 else if (RB_TYPE_P(indx, T_REGEXP)) {
5768 return rb_str_subpat(str, indx, INT2FIX(0));
5769 }
5770 else if (RB_TYPE_P(indx, T_STRING)) {
5771 if (rb_str_index(str, indx, 0) != -1)
5772 return str_duplicate(rb_cString, indx);
5773 return Qnil;
5774 }
5775 else {
5776 /* check if indx is Range */
5777 long beg, len = str_strlen(str, NULL);
5778 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5779 case Qfalse:
5780 break;
5781 case Qnil:
5782 return Qnil;
5783 default:
5784 return rb_str_substr(str, beg, len);
5785 }
5786 idx = NUM2LONG(indx);
5787 }
5788
5789 return str_substr(str, idx, 1, FALSE);
5790}
5791
5792
5793/*
5794 * call-seq:
5795 * self[index] -> new_string or nil
5796 * self[start, length] -> new_string or nil
5797 * self[range] -> new_string or nil
5798 * self[regexp, capture = 0] -> new_string or nil
5799 * self[substring] -> new_string or nil
5800 *
5801 * Returns the substring of +self+ specified by the arguments.
5802 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5803 *
5804 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5805 */
5806
5807static VALUE
5808rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5809{
5810 if (argc == 2) {
5811 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5812 return rb_str_subpat(str, argv[0], argv[1]);
5813 }
5814 else {
5815 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5816 }
5817 }
5818 rb_check_arity(argc, 1, 2);
5819 return rb_str_aref(str, argv[0]);
5820}
5821
5822VALUE
5824{
5825 char *ptr = RSTRING_PTR(str);
5826 long olen = RSTRING_LEN(str), nlen;
5827
5828 str_modifiable(str);
5829 if (len > olen) len = olen;
5830 nlen = olen - len;
5831 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5832 char *oldptr = ptr;
5833 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5834 STR_SET_EMBED(str);
5835 ptr = RSTRING(str)->as.embed.ary;
5836 memmove(ptr, oldptr + len, nlen);
5837 if (fl == STR_NOEMBED) xfree(oldptr);
5838 }
5839 else {
5840 if (!STR_SHARED_P(str)) {
5841 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5842 rb_enc_cr_str_exact_copy(shared, str);
5843 OBJ_FREEZE(shared);
5844 }
5845 ptr = RSTRING(str)->as.heap.ptr += len;
5846 }
5847 STR_SET_LEN(str, nlen);
5848
5849 if (!SHARABLE_MIDDLE_SUBSTRING) {
5850 TERM_FILL(ptr + nlen, TERM_LEN(str));
5851 }
5853 return str;
5854}
5855
5856static void
5857rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5858{
5859 char *sptr;
5860 long slen;
5861 int cr;
5862
5863 if (beg == 0 && vlen == 0) {
5864 rb_str_drop_bytes(str, len);
5865 return;
5866 }
5867
5868 str_modify_keep_cr(str);
5869 RSTRING_GETMEM(str, sptr, slen);
5870 if (len < vlen) {
5871 /* expand string */
5872 RESIZE_CAPA(str, slen + vlen - len);
5873 sptr = RSTRING_PTR(str);
5874 }
5875
5877 cr = rb_enc_str_coderange(val);
5878 else
5880
5881 if (vlen != len) {
5882 memmove(sptr + beg + vlen,
5883 sptr + beg + len,
5884 slen - (beg + len));
5885 }
5886 if (vlen < beg && len < 0) {
5887 MEMZERO(sptr + slen, char, -len);
5888 }
5889 if (vlen > 0) {
5890 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5891 }
5892 slen += vlen - len;
5893 STR_SET_LEN(str, slen);
5894 TERM_FILL(&sptr[slen], TERM_LEN(str));
5895 ENC_CODERANGE_SET(str, cr);
5896}
5897
5898static inline void
5899rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5900{
5901 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5902}
5903
5904void
5905rb_str_update(VALUE str, long beg, long len, VALUE val)
5906{
5907 long slen;
5908 char *p, *e;
5909 rb_encoding *enc;
5910 int singlebyte = single_byte_optimizable(str);
5911 int cr;
5912
5913 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5914
5915 StringValue(val);
5916 enc = rb_enc_check(str, val);
5917 slen = str_strlen(str, enc); /* rb_enc_check */
5918
5919 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5920 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5921 }
5922 if (beg < 0) {
5923 beg += slen;
5924 }
5925 RUBY_ASSERT(beg >= 0);
5926 RUBY_ASSERT(beg <= slen);
5927
5928 if (len > slen - beg) {
5929 len = slen - beg;
5930 }
5931 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5932 if (!p) p = RSTRING_END(str);
5933 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5934 if (!e) e = RSTRING_END(str);
5935 /* error check */
5936 beg = p - RSTRING_PTR(str); /* physical position */
5937 len = e - p; /* physical length */
5938 rb_str_update_0(str, beg, len, val);
5939 rb_enc_associate(str, enc);
5941 if (cr != ENC_CODERANGE_BROKEN)
5942 ENC_CODERANGE_SET(str, cr);
5943}
5944
5945static void
5946rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5947{
5948 int nth;
5949 VALUE match;
5950 long start, end, len;
5951 rb_encoding *enc;
5952 struct re_registers *regs;
5953
5954 if (rb_reg_search(re, str, 0, 0) < 0) {
5955 rb_raise(rb_eIndexError, "regexp not matched");
5956 }
5957 match = rb_backref_get();
5958 nth = rb_reg_backref_number(match, backref);
5959 regs = RMATCH_REGS(match);
5960 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5961 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5962 }
5963 if (nth < 0) {
5964 nth += regs->num_regs;
5965 }
5966
5967 start = BEG(nth);
5968 if (start == -1) {
5969 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5970 }
5971 end = END(nth);
5972 len = end - start;
5973 StringValue(val);
5974 enc = rb_enc_check_str(str, val);
5975 rb_str_update_0(str, start, len, val);
5976 rb_enc_associate(str, enc);
5977}
5978
5979static VALUE
5980rb_str_aset(VALUE str, VALUE indx, VALUE val)
5981{
5982 long idx, beg;
5983
5984 switch (TYPE(indx)) {
5985 case T_REGEXP:
5986 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5987 return val;
5988
5989 case T_STRING:
5990 beg = rb_str_index(str, indx, 0);
5991 if (beg < 0) {
5992 rb_raise(rb_eIndexError, "string not matched");
5993 }
5994 beg = rb_str_sublen(str, beg);
5995 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5996 return val;
5997
5998 default:
5999 /* check if indx is Range */
6000 {
6001 long beg, len;
6002 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6003 rb_str_update(str, beg, len, val);
6004 return val;
6005 }
6006 }
6007 /* FALLTHROUGH */
6008
6009 case T_FIXNUM:
6010 idx = NUM2LONG(indx);
6011 rb_str_update(str, idx, 1, val);
6012 return val;
6013 }
6014}
6015
6016/*
6017 * call-seq:
6018 * self[index] = new_string
6019 * self[start, length] = new_string
6020 * self[range] = new_string
6021 * self[regexp, capture = 0] = new_string
6022 * self[substring] = new_string
6023 *
6024 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6025 * See {String Slices}[rdoc-ref:String@String+Slices].
6026 *
6027 * A few examples:
6028 *
6029 * s = 'foo'
6030 * s[2] = 'rtune' # => "rtune"
6031 * s # => "fortune"
6032 * s[1, 5] = 'init' # => "init"
6033 * s # => "finite"
6034 * s[3..4] = 'al' # => "al"
6035 * s # => "finale"
6036 * s[/e$/] = 'ly' # => "ly"
6037 * s # => "finally"
6038 * s['lly'] = 'ncial' # => "ncial"
6039 * s # => "financial"
6040 *
6041 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6042 */
6043
6044static VALUE
6045rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6046{
6047 if (argc == 3) {
6048 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6049 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6050 }
6051 else {
6052 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6053 }
6054 return argv[2];
6055 }
6056 rb_check_arity(argc, 2, 3);
6057 return rb_str_aset(str, argv[0], argv[1]);
6058}
6059
6060/*
6061 * call-seq:
6062 * insert(index, other_string) -> self
6063 *
6064 * Inserts the given +other_string+ into +self+; returns +self+.
6065 *
6066 * If the Integer +index+ is non-negative, inserts +other_string+ at offset +index+:
6067 *
6068 * 'foo'.insert(1, 'bar') # => "fbaroo"
6069 *
6070 * If the Integer +index+ is negative, counts backward from the end of +self+
6071 * and inserts +other_string+ at offset <tt>index+1</tt>
6072 * (that is, _after_ <tt>self[index]</tt>):
6073 *
6074 * 'foo'.insert(-2, 'bar') # => "fobaro"
6075 *
6076 */
6077
6078static VALUE
6079rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6080{
6081 long pos = NUM2LONG(idx);
6082
6083 if (pos == -1) {
6084 return rb_str_append(str, str2);
6085 }
6086 else if (pos < 0) {
6087 pos++;
6088 }
6089 rb_str_update(str, pos, 0, str2);
6090 return str;
6091}
6092
6093
6094/*
6095 * call-seq:
6096 * slice!(index) -> new_string or nil
6097 * slice!(start, length) -> new_string or nil
6098 * slice!(range) -> new_string or nil
6099 * slice!(regexp, capture = 0) -> new_string or nil
6100 * slice!(substring) -> new_string or nil
6101 *
6102 * Removes and returns the substring of +self+ specified by the arguments.
6103 * See {String Slices}[rdoc-ref:String@String+Slices].
6104 *
6105 * A few examples:
6106 *
6107 * string = "This is a string"
6108 * string.slice!(2) #=> "i"
6109 * string.slice!(3..6) #=> " is "
6110 * string.slice!(/s.*t/) #=> "sa st"
6111 * string.slice!("r") #=> "r"
6112 * string #=> "Thing"
6113 *
6114 */
6115
6116static VALUE
6117rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6118{
/*
 * Implementation outline: every branch below resolves the request to a
 * byte offset `beg` and byte length `len`, then jumps to one of three
 * labels:
 *   num_index - beg/len still need resolving through rb_str_subpos()
 *   subseq    - beg/len are final; build `result` from those bytes
 *   squash    - `result` is already set; only remove the bytes from str
 * Returns Qnil whenever the requested part cannot be located.
 */
6119 VALUE result = Qnil;
6120 VALUE indx;
6121 long beg, len = 1;
6122 char *p;
6123
6124 rb_check_arity(argc, 1, 2);
6125 str_modify_keep_cr(str);
6126 indx = argv[0];
6127 if (RB_TYPE_P(indx, T_REGEXP)) {
6128 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6129 VALUE match = rb_backref_get();
6130 struct re_registers *regs = RMATCH_REGS(match);
6131 int nth = 0;
/* A negative capture number counts back from num_regs; out-of-range gives nil. */
6132 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6133 if ((nth += regs->num_regs) <= 0) return Qnil;
6134 }
6135 else if (nth >= regs->num_regs) return Qnil;
6136 beg = BEG(nth);
6137 len = END(nth) - beg;
6138 goto subseq;
6139 }
6140 else if (argc == 2) {
6141 beg = NUM2LONG(indx);
6142 len = NUM2LONG(argv[1]);
6143 goto num_index;
6144 }
6145 else if (FIXNUM_P(indx)) {
/* Single index: len starts at 1 and rb_str_subpos() adjusts it. */
6146 beg = FIX2LONG(indx);
6147 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6148 if (!len) return Qnil;
6149 beg = p - RSTRING_PTR(str);
6150 goto subseq;
6151 }
6152 else if (RB_TYPE_P(indx, T_STRING)) {
/* Substring form: the result is a copy of the needle itself. */
6153 beg = rb_str_index(str, indx, 0);
6154 if (beg == -1) return Qnil;
6155 len = RSTRING_LEN(indx);
6156 result = str_duplicate(rb_cString, indx);
6157 goto squash;
6158 }
6159 else {
/* Qnil: a Range that is out of bounds; Qfalse: not a Range at all. */
6160 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6161 case Qnil:
6162 return Qnil;
6163 case Qfalse:
6164 beg = NUM2LONG(indx);
6165 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6166 if (!len) return Qnil;
6167 beg = p - RSTRING_PTR(str);
6168 goto subseq;
6169 default:
6170 goto num_index;
6171 }
6172 }
6173
6174 num_index:
6175 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6176 beg = p - RSTRING_PTR(str);
6177
6178 subseq:
6179 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6180 rb_enc_cr_str_copy_for_substr(result, str);
6181
6182 squash:
/* Remove len bytes at offset beg: drop a prefix, or close the gap with memmove. */
6183 if (len > 0) {
6184 if (beg == 0) {
6185 rb_str_drop_bytes(str, len);
6186 }
6187 else {
6188 char *sptr = RSTRING_PTR(str);
6189 long slen = RSTRING_LEN(str);
6190 if (beg + len > slen) /* pathological check */
6191 len = slen - beg;
6192 memmove(sptr + beg,
6193 sptr + beg + len,
6194 slen - (beg + len));
6195 slen -= len;
6196 STR_SET_LEN(str, slen);
6197 TERM_FILL(&sptr[slen], TERM_LEN(str));
6198 }
6199 }
6200 return result;
6201}
6202
6203static VALUE
6204get_pat(VALUE pat)
6205{
6206 VALUE val;
6207
6208 switch (OBJ_BUILTIN_TYPE(pat)) {
6209 case T_REGEXP:
6210 return pat;
6211
6212 case T_STRING:
6213 break;
6214
6215 default:
6216 val = rb_check_string_type(pat);
6217 if (NIL_P(val)) {
6218 Check_Type(pat, T_REGEXP);
6219 }
6220 pat = val;
6221 }
6222
6223 return rb_reg_regcomp(pat);
6224}
6225
6226static VALUE
6227get_pat_quoted(VALUE pat, int check)
6228{
6229 VALUE val;
6230
6231 switch (OBJ_BUILTIN_TYPE(pat)) {
6232 case T_REGEXP:
6233 return pat;
6234
6235 case T_STRING:
6236 break;
6237
6238 default:
6239 val = rb_check_string_type(pat);
6240 if (NIL_P(val)) {
6241 Check_Type(pat, T_REGEXP);
6242 }
6243 pat = val;
6244 }
6245 if (check && is_broken_string(pat)) {
6246 rb_exc_raise(rb_reg_check_preprocess(pat));
6247 }
6248 return pat;
6249}
6250
6251static long
6252rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6253{
6254 if (BUILTIN_TYPE(pat) == T_STRING) {
6255 pos = rb_str_byteindex(str, pat, pos);
6256 if (set_backref_str) {
6257 if (pos >= 0) {
6258 str = rb_str_new_frozen_String(str);
6259 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6260 if (match) {
6261 *match = match_data;
6262 }
6263 }
6264 else {
6266 }
6267 }
6268 return pos;
6269 }
6270 else {
6271 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6272 }
6273}
6274
6275static long
6276rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6277{
6278 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6279}
6280
6281
6282/*
6283 * call-seq:
6284 * sub!(pattern, replacement) -> self or nil
6285 * sub!(pattern) {|match| ... } -> self or nil
6286 *
6287 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6288 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6289 *
6290 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6291 *
6292 * Related: String#sub, String#gsub, String#gsub!.
6293 *
6294 */
6295
6296static VALUE
6297rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6298{
/*
 * Replaces the first match of argv[0] in str in place.  The replacement
 * comes from a block (one argument), from a Hash lookup, or from a
 * replacement String expanded by rb_reg_regsub().  Returns str when a
 * match was replaced and Qnil when nothing matched.
 */
6299 VALUE pat, repl, hash = Qnil;
6300 int iter = 0;
6301 long plen;
/* With a block, the replacement argument is optional. */
6302 int min_arity = rb_block_given_p() ? 1 : 2;
6303 long beg;
6304
6305 rb_check_arity(argc, min_arity, 2);
6306 if (argc == 1) {
6307 iter = 1;
6308 }
6309 else {
6310 repl = argv[1];
6311 hash = rb_check_hash_type(argv[1]);
6312 if (NIL_P(hash)) {
6313 StringValue(repl);
6314 }
6315 }
6316
6317 pat = get_pat_quoted(argv[0], 1);
6318
6319 str_modifiable(str);
6320 beg = rb_pat_search(pat, str, 0, 1);
6321 if (beg >= 0) {
6322 rb_encoding *enc;
6323 int cr = ENC_CODERANGE(str);
6324 long beg0, end0;
6325 VALUE match, match0 = Qnil;
6326 struct re_registers *regs;
6327 char *p, *rp;
6328 long len, rlen;
6329
6330 match = rb_backref_get();
6331 regs = RMATCH_REGS(match);
/* beg0/end0 delimit the matched bytes for both pattern kinds. */
6332 if (RB_TYPE_P(pat, T_STRING)) {
6333 beg0 = beg;
6334 end0 = beg0 + RSTRING_LEN(pat);
6335 match0 = pat;
6336 }
6337 else {
6338 beg0 = BEG(0);
6339 end0 = END(0);
6340 if (iter) match0 = rb_reg_nth_match(0, match);
6341 }
6342
6343 if (iter || !NIL_P(hash)) {
/* Remember pointer and length so str_mod_check() can detect changes made by user code. */
6344 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6345
6346 if (iter) {
6347 repl = rb_obj_as_string(rb_yield(match0));
6348 }
6349 else {
6350 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6351 repl = rb_obj_as_string(repl);
6352 }
6353 str_mod_check(str, p, len);
6354 rb_check_frozen(str);
6355 }
6356 else {
6357 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6358 }
6359
6360 enc = rb_enc_compatible(str, repl);
6361 if (!enc) {
/* Incompatible encodings are accepted only when the text around the match is 7bit. */
6362 rb_encoding *str_enc = STR_ENC_GET(str);
6363 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6364 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6365 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6366 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6367 rb_enc_inspect_name(str_enc),
6368 rb_enc_inspect_name(STR_ENC_GET(repl)));
6369 }
6370 enc = STR_ENC_GET(repl);
6371 }
6372 rb_str_modify(str);
6373 rb_enc_associate(str, enc);
/*
 * NOTE(review): source lines appear to be missing here.  The closing
 * brace after `cr = cr2;` has no visible opening, and the `if` below has
 * no statement before `else`.  Restore the opening condition and the
 * missing assignment from the upstream file before building - TODO confirm.
 */
6375 int cr2 = ENC_CODERANGE(repl);
6376 if (cr2 == ENC_CODERANGE_BROKEN ||
6377 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6379 else
6380 cr = cr2;
6381 }
/* Splice: grow if needed, shift the tail, then copy the replacement in. */
6382 plen = end0 - beg0;
6383 rlen = RSTRING_LEN(repl);
6384 len = RSTRING_LEN(str);
6385 if (rlen > plen) {
6386 RESIZE_CAPA(str, len + rlen - plen);
6387 }
6388 p = RSTRING_PTR(str);
6389 if (rlen != plen) {
6390 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6391 }
6392 rp = RSTRING_PTR(repl);
6393 memmove(p + beg0, rp, rlen);
6394 len += rlen - plen;
6395 STR_SET_LEN(str, len);
6396 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6397 ENC_CODERANGE_SET(str, cr);
6398
6399 RB_GC_GUARD(match);
6400
6401 return str;
6402 }
6403 return Qnil;
6404}
6405
6406
6407/*
6408 * call-seq:
6409 * sub(pattern, replacement) -> new_string
6410 * sub(pattern) {|match| ... } -> new_string
6411 *
6412 * Returns a copy of +self+ with only the first occurrence
6413 * (not all occurrences) of the given +pattern+ replaced.
6414 *
6415 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6416 *
6417 * Related: String#sub!, String#gsub, String#gsub!.
6418 *
6419 */
6420
6421static VALUE
6422rb_str_sub(int argc, VALUE *argv, VALUE str)
6423{
6424 str = str_duplicate(rb_cString, str);
6425 rb_str_sub_bang(argc, argv, str);
6426 return str;
6427}
6428
/*
 * Shared implementation of gsub and gsub!.
 *
 * Builds the substituted text in a fresh buffer (dest) by walking every
 * match of argv[0] in str.  The replacement for each match comes from a
 * block (ITER), a Hash lookup (FAST_MAP / MAP) or a replacement String
 * (STR).  With bang != 0 the buffer replaces the contents of str and
 * Qnil is returned when nothing matched; otherwise the buffer itself (or
 * a duplicate of str when nothing matched) is returned.
 */
6429static VALUE
6430str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6431{
6432 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6433 long beg, beg0, end0;
6434 long offset, blen, slen, len, last;
6435 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6436 char *sp, *cp;
/* -1 = not yet known whether the replacement String needs expanding; settled after the first rb_reg_regsub(). */
6437 int need_backref_str = -1;
6438 rb_encoding *str_enc;
6439
6440 switch (argc) {
6441 case 1:
6442 RETURN_ENUMERATOR(str, argc, argv);
6443 mode = ITER;
6444 break;
6445 case 2:
6446 repl = argv[1];
6447 hash = rb_check_hash_type(argv[1]);
6448 if (NIL_P(hash)) {
6449 StringValue(repl);
6450 }
6451 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6452 mode = FAST_MAP;
6453 }
6454 else {
6455 mode = MAP;
6456 }
6457 break;
6458 default:
6459 rb_error_arity(argc, 1, 2);
6460 }
6461
6462 pat = get_pat_quoted(argv[0], 1);
6463 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6464
6465 if (beg < 0) {
6466 if (bang) return Qnil; /* no match, no substitution */
6467 return str_duplicate(rb_cString, str);
6468 }
6469
6470 offset = 0;
6471 blen = RSTRING_LEN(str) + 30; /* len + margin */
6472 dest = rb_str_buf_new(blen);
6473 sp = RSTRING_PTR(str);
6474 slen = RSTRING_LEN(str);
6475 cp = sp;
6476 str_enc = STR_ENC_GET(str);
6477 rb_enc_associate(dest, str_enc);
/* dest starts empty: 7bit for ASCII-compatible encodings, valid otherwise. */
6478 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6479
6480 do {
6481 struct re_registers *regs = RMATCH_REGS(match);
6482 if (RB_TYPE_P(pat, T_STRING)) {
6483 beg0 = beg;
6484 end0 = beg0 + RSTRING_LEN(pat);
6485 match0 = pat;
6486 }
6487 else {
6488 beg0 = BEG(0);
6489 end0 = END(0);
6490 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6491 }
6492
6493 if (mode != STR) {
6494 if (mode == ITER) {
6495 val = rb_obj_as_string(rb_yield(match0));
6496 }
6497 else {
6498 struct RString fake_str;
6499 VALUE key;
6500 if (mode == FAST_MAP) {
6501 // It is safe to use a fake_str here because we established that it won't escape,
6502 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6503 // default proc.
6504 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6505 }
6506 else {
6507 key = rb_str_subseq(str, beg0, end0 - beg0);
6508 }
6509 val = rb_hash_aref(hash, key);
6510 val = rb_obj_as_string(val);
6511 }
/* User code ran above: make sure str still has the same pointer and length. */
6512 str_mod_check(str, sp, slen);
6513 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6514 rb_raise(rb_eRuntimeError, "block should not cheat");
6515 }
6516 }
6517 else if (need_backref_str) {
6518 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6519 if (need_backref_str < 0) {
/* If expansion returned repl itself, later matches can reuse repl directly. */
6520 need_backref_str = val != repl;
6521 }
6522 }
6523 else {
6524 val = repl;
6525 }
6526
6527 len = beg0 - offset; /* copy pre-match substr */
6528 if (len) {
6529 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6530 }
6531
6532 rb_str_buf_append(dest, val);
6533
6534 last = offset;
6535 offset = end0;
6536 if (beg0 == end0) {
6537 /*
6538 * Always consume at least one character of the input string
6539 * in order to prevent infinite loops.
6540 */
6541 if (RSTRING_LEN(str) <= end0) break;
6542 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6543 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6544 offset = end0 + len;
6545 }
6546 cp = RSTRING_PTR(str) + offset;
6547 if (offset > RSTRING_LEN(str)) break;
6548
6549 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6550 if (mode != FAST_MAP && mode != STR) {
6551 match = Qnil;
6552 }
6553 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6554
6555 RB_GC_GUARD(match);
6556 } while (beg >= 0);
6557
/* Copy whatever follows the last match. */
6558 if (RSTRING_LEN(str) > offset) {
6559 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6560 }
/* Search again from `last` with backref updating forced on; presumably this leaves the backref on the final successful match - TODO confirm. */
6561 rb_pat_search0(pat, str, last, 1, &match);
6562 if (bang) {
6563 str_shared_replace(str, dest);
6564 }
6565 else {
6566 str = dest;
6567 }
6568
6569 return str;
6570}
6571
6572
6573/*
6574 * call-seq:
6575 * gsub!(pattern, replacement) -> self or nil
6576 * gsub!(pattern) {|match| ... } -> self or nil
6577 * gsub!(pattern) -> an_enumerator
6578 *
6579 * Like String#gsub, except that:
6580 *
6581 * - Performs substitutions in +self+ (not in a copy of +self+).
6582 * - Returns +self+ if any substitutions were performed, +nil+ otherwise.
6583 *
6584 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6585 */
6586
6587static VALUE
6588rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6589{
6590 str_modify_keep_cr(str);
6591 return str_gsub(argc, argv, str, 1);
6592}
6593
6594
6595/*
6596 * call-seq:
6597 * gsub(pattern, replacement) -> new_string
6598 * gsub(pattern) {|match| ... } -> new_string
6599 * gsub(pattern) -> enumerator
6600 *
6601 * Returns a copy of +self+ with zero or more substrings replaced.
6602 *
6603 * Argument +pattern+ may be a string or a Regexp;
6604 * argument +replacement+ may be a string or a Hash.
6605 * Varying types for the argument values makes this method very versatile.
6606 *
6607 * Below are some simple examples;
6608 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6609 *
6610 * With arguments +pattern+ and string +replacement+ given,
6611 * replaces each matching substring with the given +replacement+ string:
6612 *
6613 * s = 'abracadabra'
6614 * s.gsub('ab', 'AB') # => "ABracadABra"
6615 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6616 *
6617 * With arguments +pattern+ and hash +replacement+ given,
6618 * replaces each matching substring with a value from the given +replacement+ hash,
6619 * or removes it:
6620 *
6621 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6622 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6623 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6624 *
6625 * With argument +pattern+ and a block given,
6626 * calls the block with each matching substring;
6627 * replaces that substring with the block's return value:
6628 *
6629 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6630 * # => "ABrACADABrA"
6631 *
6632 * With argument +pattern+ and no block given,
6633 * returns a new Enumerator.
6634 *
6635 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6636 */
6637
6638static VALUE
6639rb_str_gsub(int argc, VALUE *argv, VALUE str)
6640{
6641 return str_gsub(argc, argv, str, 0);
6642}
6643
6644
6645/*
6646 * call-seq:
6647 * replace(other_string) -> self
6648 *
6649 * Replaces the contents of +self+ with the contents of +other_string+:
6650 *
6651 * s = 'foo' # => "foo"
6652 * s.replace('bar') # => "bar"
6653 *
6654 */
6655
6656VALUE
6658{
6659 str_modifiable(str);
6660 if (str == str2) return str;
6661
6662 StringValue(str2);
6663 str_discard(str);
6664 return str_replace(str, str2);
6665}
6666
6667/*
6668 * call-seq:
6669 * clear -> self
6670 *
6671 * Removes the contents of +self+:
6672 *
6673 * s = 'foo'
6674 * s.clear # => ""
6675 * s # => ""
6676 *
6677 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6678 */
6679
6680static VALUE
6681rb_str_clear(VALUE str)
6682{
6683 str_discard(str);
6684 STR_SET_EMBED(str);
6685 STR_SET_LEN(str, 0);
6686 RSTRING_PTR(str)[0] = 0;
6687 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6689 else
6691 return str;
6692}
6693
6694/*
6695 * call-seq:
6696 * chr -> string
6697 *
6698 * :include: doc/string/chr.rdoc
6699 *
6700 */
6701
6702static VALUE
6703rb_str_chr(VALUE str)
6704{
6705 return rb_str_substr(str, 0, 1);
6706}
6707
6708/*
6709 * call-seq:
6710 * getbyte(index) -> integer or nil
6711 *
6712 * :include: doc/string/getbyte.rdoc
6713 *
6714 */
6715VALUE
6716rb_str_getbyte(VALUE str, VALUE index)
6717{
6718 long pos = NUM2LONG(index);
6719
6720 if (pos < 0)
6721 pos += RSTRING_LEN(str);
6722 if (pos < 0 || RSTRING_LEN(str) <= pos)
6723 return Qnil;
6724
6725 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6726}
6727
6728/*
6729 * call-seq:
6730 * setbyte(index, integer) -> integer
6731 *
6732 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6733 *
6734 * s = 'abcde' # => "abcde"
6735 * s.setbyte(0, 98) # => 98
6736 * s # => "bbcde"
6737 *
6738 * Related: String#getbyte.
6739 */
6740VALUE
6741rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6742{
/*
 * Stores the low 8 bits of value at byte position index of str (negative
 * positions count from the end) and returns value.  Raises IndexError
 * when index lies outside the string.
 *
 * NOTE(review): several source lines are missing from the switch below.
 * An `if` is followed directly by `else`, the code after the first
 * `goto end;` has no visible `case` label, and a statement seems to be
 * missing before the final `*ptr = byte;`.  The missing statements
 * presumably update the cached code range of str; restore them from the
 * upstream file before building - TODO confirm.
 */
6743 long pos = NUM2LONG(index);
6744 long len = RSTRING_LEN(str);
6745 char *ptr, *head, *left = 0;
6746 rb_encoding *enc;
6747 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6748
6749 if (pos < -len || len <= pos)
6750 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6751 if (pos < 0)
6752 pos += len;
6753
/* Convert to Integer and mask in Ruby arithmetic before narrowing to a char. */
6754 VALUE v = rb_to_int(value);
6755 VALUE w = rb_int_and(v, INT2FIX(0xff));
6756 char byte = (char)(NUM2INT(w) & 0xFF);
6757
6758 if (!str_independent(str))
6759 str_make_independent(str);
6760 enc = STR_ENC_GET(str);
6761 head = RSTRING_PTR(str);
6762 ptr = &head[pos];
6763 if (!STR_EMBED_P(str)) {
6764 cr = ENC_CODERANGE(str);
6765 switch (cr) {
6766 case ENC_CODERANGE_7BIT:
6767 left = ptr;
6768 *ptr = byte;
/* Writing an ASCII byte into a 7bit string needs no further checks. */
6769 if (ISASCII(byte)) goto end;
6770 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6771 if (!MBCLEN_CHARFOUND_P(nlen))
6773 else
6775 goto end;
/* Measure the character containing ptr before and after the write. */
6777 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6778 width = rb_enc_precise_mbclen(left, head+len, enc);
6779 *ptr = byte;
6780 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6781 if (!MBCLEN_CHARFOUND_P(nlen))
6783 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6785 goto end;
6786 }
6787 }
6789 *ptr = byte;
6790
6791 end:
6792 return value;
6793}
6794
/*
 * Returns the len bytes of str starting at byte offset beg as a new
 * string carrying the encoding of str.  A negative beg counts from the
 * end; len is clamped to the bytes available.
 *
 * Returns Qnil when beg is out of range or len is negative.  When the
 * resulting length is zero, Qnil is returned unless `empty` is non-zero,
 * in which case an empty string is returned.
 *
 * NOTE(review): the code-range bookkeeping at the end of this function
 * is missing its statements in this copy: `if`/`else` and both `case`
 * arms have no bodies.  Restore them from the upstream file before
 * building - TODO confirm.
 */
6795static VALUE
6796str_byte_substr(VALUE str, long beg, long len, int empty)
6797{
6798 long n = RSTRING_LEN(str);
6799
6800 if (beg > n || len < 0) return Qnil;
6801 if (beg < 0) {
6802 beg += n;
6803 if (beg < 0) return Qnil;
6804 }
6805 if (len > n - beg)
6806 len = n - beg;
6807 if (len <= 0) {
6808 if (!empty) return Qnil;
6809 len = 0;
6810 }
6811
6812 VALUE str2 = str_subseq(str, beg, len);
6813
6814 str_enc_copy_direct(str2, str);
6815
6816 if (RSTRING_LEN(str2) == 0) {
6817 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6819 else
6821 }
6822 else {
6823 switch (ENC_CODERANGE(str)) {
6824 case ENC_CODERANGE_7BIT:
6826 break;
6827 default:
6829 break;
6830 }
6831 }
6832
6833 return str2;
6834}
6835
6836VALUE
6837rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6838{
6839 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6840}
6841
6842static VALUE
6843str_byte_aref(VALUE str, VALUE indx)
6844{
6845 long idx;
6846 if (FIXNUM_P(indx)) {
6847 idx = FIX2LONG(indx);
6848 }
6849 else {
6850 /* check if indx is Range */
6851 long beg, len = RSTRING_LEN(str);
6852
6853 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6854 case Qfalse:
6855 break;
6856 case Qnil:
6857 return Qnil;
6858 default:
6859 return str_byte_substr(str, beg, len, TRUE);
6860 }
6861
6862 idx = NUM2LONG(indx);
6863 }
6864 return str_byte_substr(str, idx, 1, FALSE);
6865}
6866
6867/*
6868 * call-seq:
6869 * byteslice(offset, length = 1) -> string or nil
6870 * byteslice(range) -> string or nil
6871 *
6872 * :include: doc/string/byteslice.rdoc
6873 */
6874
6875static VALUE
6876rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6877{
6878 if (argc == 2) {
6879 long beg = NUM2LONG(argv[0]);
6880 long len = NUM2LONG(argv[1]);
6881 return str_byte_substr(str, beg, len, TRUE);
6882 }
6883 rb_check_arity(argc, 1, 2);
6884 return str_byte_aref(str, argv[0]);
6885}
6886
6887static void
6888str_check_beg_len(VALUE str, long *beg, long *len)
6889{
6890 long end, slen = RSTRING_LEN(str);
6891
6892 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6893 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6894 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6895 }
6896 if (*beg < 0) {
6897 *beg += slen;
6898 }
6899 RUBY_ASSERT(*beg >= 0);
6900 RUBY_ASSERT(*beg <= slen);
6901
6902 if (*len > slen - *beg) {
6903 *len = slen - *beg;
6904 }
6905 end = *beg + *len;
6906 str_ensure_byte_pos(str, *beg);
6907 str_ensure_byte_pos(str, end);
6908}
6909
6910/*
6911 * call-seq:
6912 * bytesplice(offset, length, str) -> self
6913 * bytesplice(offset, length, str, str_offset, str_length) -> self
6914 * bytesplice(range, str) -> self
6915 * bytesplice(range, str, str_range) -> self
6916 *
6917 * :include: doc/string/bytesplice.rdoc
6918 */
6919
6920static VALUE
6921rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6922{
6923 long beg, len, vbeg, vlen;
6924 VALUE val;
6925 int cr;
6926
6927 rb_check_arity(argc, 2, 5);
6928 if (!(argc == 2 || argc == 3 || argc == 5)) {
6929 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6930 }
6931 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6932 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6933 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6934 rb_builtin_class_name(argv[0]));
6935 }
6936 val = argv[1];
6937 StringValue(val);
6938 if (argc == 2) {
6939 /* bytesplice(range, str) */
6940 vbeg = 0;
6941 vlen = RSTRING_LEN(val);
6942 }
6943 else {
6944 /* bytesplice(range, str, str_range) */
6945 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6946 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6947 rb_builtin_class_name(argv[2]));
6948 }
6949 }
6950 }
6951 else {
6952 beg = NUM2LONG(argv[0]);
6953 len = NUM2LONG(argv[1]);
6954 val = argv[2];
6955 StringValue(val);
6956 if (argc == 3) {
6957 /* bytesplice(index, length, str) */
6958 vbeg = 0;
6959 vlen = RSTRING_LEN(val);
6960 }
6961 else {
6962 /* bytesplice(index, length, str, str_index, str_length) */
6963 vbeg = NUM2LONG(argv[3]);
6964 vlen = NUM2LONG(argv[4]);
6965 }
6966 }
6967 str_check_beg_len(str, &beg, &len);
6968 str_check_beg_len(val, &vbeg, &vlen);
6969 str_modify_keep_cr(str);
6970
6971 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6972 rb_enc_associate(str, rb_enc_check(str, val));
6973 }
6974
6975 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6977 if (cr != ENC_CODERANGE_BROKEN)
6978 ENC_CODERANGE_SET(str, cr);
6979 return str;
6980}
6981
6982/*
6983 * call-seq:
6984 * reverse -> string
6985 *
6986 * Returns a new string with the characters from +self+ in reverse order.
6987 *
6988 * 'stressed'.reverse # => "desserts"
6989 *
6990 */
6991
6992static VALUE
6993rb_str_reverse(VALUE str)
6994{
6995 rb_encoding *enc;
6996 VALUE rev;
6997 char *s, *e, *p;
6998 int cr;
6999
7000 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7001 enc = STR_ENC_GET(str);
7002 rev = rb_str_new(0, RSTRING_LEN(str));
7003 s = RSTRING_PTR(str); e = RSTRING_END(str);
7004 p = RSTRING_END(rev);
7005 cr = ENC_CODERANGE(str);
7006
7007 if (RSTRING_LEN(str) > 1) {
7008 if (single_byte_optimizable(str)) {
7009 while (s < e) {
7010 *--p = *s++;
7011 }
7012 }
7013 else if (cr == ENC_CODERANGE_VALID) {
7014 while (s < e) {
7015 int clen = rb_enc_fast_mbclen(s, e, enc);
7016
7017 p -= clen;
7018 memcpy(p, s, clen);
7019 s += clen;
7020 }
7021 }
7022 else {
7023 cr = rb_enc_asciicompat(enc) ?
7025 while (s < e) {
7026 int clen = rb_enc_mbclen(s, e, enc);
7027
7028 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7029 p -= clen;
7030 memcpy(p, s, clen);
7031 s += clen;
7032 }
7033 }
7034 }
7035 STR_SET_LEN(rev, RSTRING_LEN(str));
7036 str_enc_copy_direct(rev, str);
7037 ENC_CODERANGE_SET(rev, cr);
7038
7039 return rev;
7040}
7041
7042
7043/*
7044 * call-seq:
7045 * reverse! -> self
7046 *
7047 * Returns +self+ with its characters reversed:
7048 *
7049 * s = 'stressed'
7050 * s.reverse! # => "desserts"
7051 * s # => "desserts"
7052 *
7053 */
7054
7055static VALUE
7056rb_str_reverse_bang(VALUE str)
7057{
7058 if (RSTRING_LEN(str) > 1) {
7059 if (single_byte_optimizable(str)) {
7060 char *s, *e, c;
7061
7062 str_modify_keep_cr(str);
7063 s = RSTRING_PTR(str);
7064 e = RSTRING_END(str) - 1;
7065 while (s < e) {
7066 c = *s;
7067 *s++ = *e;
7068 *e-- = c;
7069 }
7070 }
7071 else {
7072 str_shared_replace(str, rb_str_reverse(str));
7073 }
7074 }
7075 else {
7076 str_modify_keep_cr(str);
7077 }
7078 return str;
7079}
7080
7081
7082/*
7083 * call-seq:
7084 * include?(other_string) -> true or false
7085 *
7086 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7087 *
7088 * s = 'foo'
7089 * s.include?('f') # => true
7090 * s.include?('fo') # => true
7091 * s.include?('food') # => false
7092 *
7093 */
7094
7095VALUE
7096rb_str_include(VALUE str, VALUE arg)
7097{
7098 long i;
7099
7100 StringValue(arg);
7101 i = rb_str_index(str, arg, 0);
7102
7103 return RBOOL(i != -1);
7104}
7105
7106
7107/*
7108 * call-seq:
7109 * to_i(base = 10) -> integer
7110 *
7111 * Returns the result of interpreting leading characters in +self+
7112 * as an integer in the given +base+ (which must be in (0, 2..36)):
7113 *
7114 * '123456'.to_i # => 123456
7115 * '123def'.to_i(16) # => 1195503
7116 *
7117 * With +base+ zero, +self+ may contain leading characters
7118 * to specify the actual base:
7119 *
7120 * '123def'.to_i(0) # => 123
7121 * '0123def'.to_i(0) # => 83
7122 * '0b123def'.to_i(0) # => 1
7123 * '0o123def'.to_i(0) # => 83
7124 * '0d123def'.to_i(0) # => 123
7125 * '0x123def'.to_i(0) # => 1195503
7126 *
7127 * Characters past a leading valid number (in the given +base+) are ignored:
7128 *
7129 * '12.345'.to_i # => 12
7130 * '12345'.to_i(2) # => 1
7131 *
7132 * Returns zero if there is no leading valid number:
7133 *
7134 * 'abcdef'.to_i # => 0
7135 * '2'.to_i(2) # => 0
7136 *
7137 */
7138
7139static VALUE
7140rb_str_to_i(int argc, VALUE *argv, VALUE str)
7141{
7142 int base = 10;
7143
7144 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7145 rb_raise(rb_eArgError, "invalid radix %d", base);
7146 }
7147 return rb_str_to_inum(str, base, FALSE);
7148}
7149
7150
7151/*
7152 * call-seq:
7153 * to_f -> float
7154 *
7155 * Returns the result of interpreting leading characters in +self+ as a Float:
7156 *
7157 * '3.14159'.to_f # => 3.14159
7158 * '1.234e-2'.to_f # => 0.01234
7159 *
7160 * Characters past a leading valid number are ignored:
7161 *
7162 * '3.14 (pi to two places)'.to_f # => 3.14
7163 *
7164 * Returns zero if there is no leading valid number:
7165 *
7166 * 'abcdef'.to_f # => 0.0
7167 *
7168 */
7169
7170static VALUE
7171rb_str_to_f(VALUE str)
7172{
7173 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7174}
7175
7176
7177/*
7178 * call-seq:
7179 * to_s -> self or string
7180 *
7181 * Returns +self+ if +self+ is a +String+,
7182 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7183 */
7184
7185static VALUE
7186rb_str_to_s(VALUE str)
7187{
7188 if (rb_obj_class(str) != rb_cString) {
7189 return str_duplicate(rb_cString, str);
7190 }
7191 return str;
7192}
7193
/*
 * Compiled out by the `#if 0` guard.  Encodes code point c in encoding
 * enc into a local buffer and appends the resulting bytes to str.
 */
7194#if 0
7195static void
7196str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
7197{
7198 char s[RUBY_MAX_CHAR_LEN];
7199 int n = rb_enc_codelen(c, enc);
7200
7201 rb_enc_mbcput(c, s, enc);
7202 rb_enc_str_buf_cat(str, s, n, enc);
7203}
7204#endif
7205
7206#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7207
7208int
7209rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7210{
7211 char buf[CHAR_ESC_LEN + 1];
7212 int l;
7213
7214#if SIZEOF_INT > 4
7215 c &= 0xffffffff;
7216#endif
7217 if (unicode_p) {
7218 if (c < 0x7F && ISPRINT(c)) {
7219 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7220 }
7221 else if (c < 0x10000) {
7222 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7223 }
7224 else {
7225 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7226 }
7227 }
7228 else {
7229 if (c < 0x100) {
7230 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7231 }
7232 else {
7233 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7234 }
7235 }
7236 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7237 rb_str_buf_cat(result, buf, l);
7238 return l;
7239}
7240
/*
 * Returns the backslash escape used for control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no dedicated
 * escape sequence.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].escape;
    }
    return NULL;
}
7258
7259VALUE
7260rb_str_escape(VALUE str)
7261{
7262 int encidx = ENCODING_GET(str);
7263 rb_encoding *enc = rb_enc_from_index(encidx);
7264 const char *p = RSTRING_PTR(str);
7265 const char *pend = RSTRING_END(str);
7266 const char *prev = p;
7267 char buf[CHAR_ESC_LEN + 1];
7268 VALUE result = rb_str_buf_new(0);
7269 int unicode_p = rb_enc_unicode_p(enc);
7270 int asciicompat = rb_enc_asciicompat(enc);
7271
7272 while (p < pend) {
7273 unsigned int c;
7274 const char *cc;
7275 int n = rb_enc_precise_mbclen(p, pend, enc);
7276 if (!MBCLEN_CHARFOUND_P(n)) {
7277 if (p > prev) str_buf_cat(result, prev, p - prev);
7278 n = rb_enc_mbminlen(enc);
7279 if (pend < p + n)
7280 n = (int)(pend - p);
7281 while (n--) {
7282 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7283 str_buf_cat(result, buf, strlen(buf));
7284 prev = ++p;
7285 }
7286 continue;
7287 }
7288 n = MBCLEN_CHARFOUND_LEN(n);
7289 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7290 p += n;
7291 cc = ruby_escaped_char(c);
7292 if (cc) {
7293 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7294 str_buf_cat(result, cc, strlen(cc));
7295 prev = p;
7296 }
7297 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7298 }
7299 else {
7300 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7301 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7302 prev = p;
7303 }
7304 }
7305 if (p > prev) str_buf_cat(result, prev, p - prev);
7306 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7307
7308 return result;
7309}
7310
7311/*
7312 * call-seq:
7313 * inspect -> string
7314 *
7315 * Returns a printable version of +self+, enclosed in double-quotes,
7316 * and with special characters escaped:
7317 *
7318 * s = "foo\tbar\tbaz\n"
7319 * s.inspect
7320 * # => "\"foo\\tbar\\tbaz\\n\""
7321 *
7322 */
7323
7324VALUE
7326{
7327 int encidx = ENCODING_GET(str);
7328 rb_encoding *enc = rb_enc_from_index(encidx);
7329 const char *p, *pend, *prev;
7330 char buf[CHAR_ESC_LEN + 1];
7331 VALUE result = rb_str_buf_new(0);
7332 rb_encoding *resenc = rb_default_internal_encoding();
7333 int unicode_p = rb_enc_unicode_p(enc);
7334 int asciicompat = rb_enc_asciicompat(enc);
7335
7336 if (resenc == NULL) resenc = rb_default_external_encoding();
7337 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7338 rb_enc_associate(result, resenc);
7339 str_buf_cat2(result, "\"");
7340
7341 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7342 prev = p;
7343 while (p < pend) {
7344 unsigned int c, cc;
7345 int n;
7346
7347 n = rb_enc_precise_mbclen(p, pend, enc);
7348 if (!MBCLEN_CHARFOUND_P(n)) {
7349 if (p > prev) str_buf_cat(result, prev, p - prev);
7350 n = rb_enc_mbminlen(enc);
7351 if (pend < p + n)
7352 n = (int)(pend - p);
7353 while (n--) {
7354 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7355 str_buf_cat(result, buf, strlen(buf));
7356 prev = ++p;
7357 }
7358 continue;
7359 }
7360 n = MBCLEN_CHARFOUND_LEN(n);
7361 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7362 p += n;
7363 if ((asciicompat || unicode_p) &&
7364 (c == '"'|| c == '\\' ||
7365 (c == '#' &&
7366 p < pend &&
7367 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7368 (cc = rb_enc_codepoint(p,pend,enc),
7369 (cc == '$' || cc == '@' || cc == '{'))))) {
7370 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7371 str_buf_cat2(result, "\\");
7372 if (asciicompat || enc == resenc) {
7373 prev = p - n;
7374 continue;
7375 }
7376 }
7377 switch (c) {
7378 case '\n': cc = 'n'; break;
7379 case '\r': cc = 'r'; break;
7380 case '\t': cc = 't'; break;
7381 case '\f': cc = 'f'; break;
7382 case '\013': cc = 'v'; break;
7383 case '\010': cc = 'b'; break;
7384 case '\007': cc = 'a'; break;
7385 case 033: cc = 'e'; break;
7386 default: cc = 0; break;
7387 }
7388 if (cc) {
7389 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7390 buf[0] = '\\';
7391 buf[1] = (char)cc;
7392 str_buf_cat(result, buf, 2);
7393 prev = p;
7394 continue;
7395 }
7396 /* The special casing of 0x85 (NEXT_LINE) here is because
7397 * Oniguruma historically treats it as printable, but it
7398 * doesn't match the print POSIX bracket class or character
7399 * property in regexps.
7400 *
7401 * See Ruby Bug #16842 for details:
7402 * https://bugs.ruby-lang.org/issues/16842
7403 */
7404 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7405 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7406 continue;
7407 }
7408 else {
7409 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7410 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7411 prev = p;
7412 continue;
7413 }
7414 }
7415 if (p > prev) str_buf_cat(result, prev, p - prev);
7416 str_buf_cat2(result, "\"");
7417
7418 return result;
7419}
7420
/* True when p (bounded by e) points at a character that would start
 * interpolation after a '#': '$', '@' or '{'. */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7422
7423/*
7424 * call-seq:
7425 * dump -> new_string
7426 *
7427 * :include: doc/string/dump.rdoc
7428 *
7429 */
7430
7431VALUE
7433{
7434 int encidx = rb_enc_get_index(str);
7435 rb_encoding *enc = rb_enc_from_index(encidx);
7436 long len;
7437 const char *p, *pend;
7438 char *q, *qend;
7439 VALUE result;
7440 int u8 = (encidx == rb_utf8_encindex());
7441 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7442
7443 len = 2; /* "" */
7444 if (!rb_enc_asciicompat(enc)) {
7445 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7446 len += strlen(enc->name);
7447 }
7448
7449 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7450 while (p < pend) {
7451 int clen;
7452 unsigned char c = *p++;
7453
7454 switch (c) {
7455 case '"': case '\\':
7456 case '\n': case '\r':
7457 case '\t': case '\f':
7458 case '\013': case '\010': case '\007': case '\033':
7459 clen = 2;
7460 break;
7461
7462 case '#':
7463 clen = IS_EVSTR(p, pend) ? 2 : 1;
7464 break;
7465
7466 default:
7467 if (ISPRINT(c)) {
7468 clen = 1;
7469 }
7470 else {
7471 if (u8 && c > 0x7F) { /* \u notation */
7472 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7473 if (MBCLEN_CHARFOUND_P(n)) {
7474 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7475 if (cc <= 0xFFFF)
7476 clen = 6; /* \uXXXX */
7477 else if (cc <= 0xFFFFF)
7478 clen = 9; /* \u{XXXXX} */
7479 else
7480 clen = 10; /* \u{XXXXXX} */
7481 p += MBCLEN_CHARFOUND_LEN(n)-1;
7482 break;
7483 }
7484 }
7485 clen = 4; /* \xNN */
7486 }
7487 break;
7488 }
7489
7490 if (clen > LONG_MAX - len) {
7491 rb_raise(rb_eRuntimeError, "string size too big");
7492 }
7493 len += clen;
7494 }
7495
7496 result = rb_str_new(0, len);
7497 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7498 q = RSTRING_PTR(result); qend = q + len + 1;
7499
7500 *q++ = '"';
7501 while (p < pend) {
7502 unsigned char c = *p++;
7503
7504 if (c == '"' || c == '\\') {
7505 *q++ = '\\';
7506 *q++ = c;
7507 }
7508 else if (c == '#') {
7509 if (IS_EVSTR(p, pend)) *q++ = '\\';
7510 *q++ = '#';
7511 }
7512 else if (c == '\n') {
7513 *q++ = '\\';
7514 *q++ = 'n';
7515 }
7516 else if (c == '\r') {
7517 *q++ = '\\';
7518 *q++ = 'r';
7519 }
7520 else if (c == '\t') {
7521 *q++ = '\\';
7522 *q++ = 't';
7523 }
7524 else if (c == '\f') {
7525 *q++ = '\\';
7526 *q++ = 'f';
7527 }
7528 else if (c == '\013') {
7529 *q++ = '\\';
7530 *q++ = 'v';
7531 }
7532 else if (c == '\010') {
7533 *q++ = '\\';
7534 *q++ = 'b';
7535 }
7536 else if (c == '\007') {
7537 *q++ = '\\';
7538 *q++ = 'a';
7539 }
7540 else if (c == '\033') {
7541 *q++ = '\\';
7542 *q++ = 'e';
7543 }
7544 else if (ISPRINT(c)) {
7545 *q++ = c;
7546 }
7547 else {
7548 *q++ = '\\';
7549 if (u8) {
7550 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7551 if (MBCLEN_CHARFOUND_P(n)) {
7552 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7553 p += n;
7554 if (cc <= 0xFFFF)
7555 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7556 else
7557 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7558 q += strlen(q);
7559 continue;
7560 }
7561 }
7562 snprintf(q, qend-q, "x%02X", c);
7563 q += 3;
7564 }
7565 }
7566 *q++ = '"';
7567 *q = '\0';
7568 if (!rb_enc_asciicompat(enc)) {
7569 snprintf(q, qend-q, nonascii_suffix, enc->name);
7570 encidx = rb_ascii8bit_encindex();
7571 }
7572 /* result from dump is ASCII */
7573 rb_enc_associate_index(result, encidx);
7575 return result;
7576}
7577
/*
 * Translates the letter following a backslash ('n', 'r', 't', 'f', 'v',
 * 'b', 'a', 'e') into the control character it denotes.  Returns -1 for
 * any other value; undump_after_backslash() only passes the eight
 * letters above, so that path is not expected to be taken.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* never fall off the end of a non-void function */
    return -1;
}
7601
7602static void
7603undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7604{
7605 const char *s = *ss;
7606 unsigned int c;
7607 int codelen;
7608 size_t hexlen;
7609 unsigned char buf[6];
7610 static rb_encoding *enc_utf8 = NULL;
7611
7612 switch (*s) {
7613 case '\\':
7614 case '"':
7615 case '#':
7616 rb_str_cat(undumped, s, 1); /* cat itself */
7617 s++;
7618 break;
7619 case 'n':
7620 case 'r':
7621 case 't':
7622 case 'f':
7623 case 'v':
7624 case 'b':
7625 case 'a':
7626 case 'e':
7627 *buf = unescape_ascii(*s);
7628 rb_str_cat(undumped, (char *)buf, 1);
7629 s++;
7630 break;
7631 case 'u':
7632 if (*binary) {
7633 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7634 }
7635 *utf8 = true;
7636 if (++s >= s_end) {
7637 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7638 }
7639 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7640 if (*penc != enc_utf8) {
7641 *penc = enc_utf8;
7642 rb_enc_associate(undumped, enc_utf8);
7643 }
7644 if (*s == '{') { /* handle \u{...} form */
7645 s++;
7646 for (;;) {
7647 if (s >= s_end) {
7648 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7649 }
7650 if (*s == '}') {
7651 s++;
7652 break;
7653 }
7654 if (ISSPACE(*s)) {
7655 s++;
7656 continue;
7657 }
7658 c = scan_hex(s, s_end-s, &hexlen);
7659 if (hexlen == 0 || hexlen > 6) {
7660 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7661 }
7662 if (c > 0x10ffff) {
7663 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7664 }
7665 if (0xd800 <= c && c <= 0xdfff) {
7666 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7667 }
7668 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7669 rb_str_cat(undumped, (char *)buf, codelen);
7670 s += hexlen;
7671 }
7672 }
7673 else { /* handle \uXXXX form */
7674 c = scan_hex(s, 4, &hexlen);
7675 if (hexlen != 4) {
7676 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7677 }
7678 if (0xd800 <= c && c <= 0xdfff) {
7679 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7680 }
7681 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7682 rb_str_cat(undumped, (char *)buf, codelen);
7683 s += hexlen;
7684 }
7685 break;
7686 case 'x':
7687 if (*utf8) {
7688 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7689 }
7690 *binary = true;
7691 if (++s >= s_end) {
7692 rb_raise(rb_eRuntimeError, "invalid hex escape");
7693 }
7694 *buf = scan_hex(s, 2, &hexlen);
7695 if (hexlen != 2) {
7696 rb_raise(rb_eRuntimeError, "invalid hex escape");
7697 }
7698 rb_str_cat(undumped, (char *)buf, 1);
7699 s += hexlen;
7700 break;
7701 default:
7702 rb_str_cat(undumped, s-1, 2);
7703 s++;
7704 }
7705
7706 *ss = s;
7707}
7708
7709static VALUE rb_str_is_ascii_only_p(VALUE str);
7710
7711/*
7712 * call-seq:
7713 * undump -> string
7714 *
7715 * Returns an unescaped version of +self+:
7716 *
7717 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7718 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7719 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7720 * s_undumped == s_orig # => true
7721 *
7722 * Related: String#dump (inverse of String#undump).
7723 *
7724 */
7725
7726static VALUE
7727str_undump(VALUE str)
7728{
7729 const char *s = RSTRING_PTR(str);
7730 const char *s_end = RSTRING_END(str);
7731 rb_encoding *enc = rb_enc_get(str);
7732 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7733 bool utf8 = false;
7734 bool binary = false;
7735 int w;
7736
7738 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7739 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7740 }
7741 if (!str_null_check(str, &w)) {
7742 rb_raise(rb_eRuntimeError, "string contains null byte");
7743 }
7744 if (RSTRING_LEN(str) < 2) goto invalid_format;
7745 if (*s != '"') goto invalid_format;
7746
7747 /* strip '"' at the start */
7748 s++;
7749
7750 for (;;) {
7751 if (s >= s_end) {
7752 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7753 }
7754
7755 if (*s == '"') {
7756 /* epilogue */
7757 s++;
7758 if (s == s_end) {
7759 /* ascii compatible dumped string */
7760 break;
7761 }
7762 else {
7763 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7764 static const char dup_suffix[] = ".dup";
7765 const char *encname;
7766 int encidx;
7767 ptrdiff_t size;
7768
7769 /* check separately for strings dumped by older versions */
7770 size = sizeof(dup_suffix) - 1;
7771 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7772
7773 size = sizeof(force_encoding_suffix) - 1;
7774 if (s_end - s <= size) goto invalid_format;
7775 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7776 s += size;
7777
7778 if (utf8) {
7779 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7780 }
7781
7782 encname = s;
7783 s = memchr(s, '"', s_end-s);
7784 size = s - encname;
7785 if (!s) goto invalid_format;
7786 if (s_end - s != 2) goto invalid_format;
7787 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7788
7789 encidx = rb_enc_find_index2(encname, (long)size);
7790 if (encidx < 0) {
7791 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7792 }
7793 rb_enc_associate_index(undumped, encidx);
7794 }
7795 break;
7796 }
7797
7798 if (*s == '\\') {
7799 s++;
7800 if (s >= s_end) {
7801 rb_raise(rb_eRuntimeError, "invalid escape");
7802 }
7803 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7804 }
7805 else {
7806 rb_str_cat(undumped, s++, 1);
7807 }
7808 }
7809
7810 RB_GC_GUARD(str);
7811
7812 return undumped;
7813invalid_format:
7814 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7815}
7816
7817static void
7818rb_str_check_dummy_enc(rb_encoding *enc)
7819{
7820 if (rb_enc_dummy_p(enc)) {
7821 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7822 rb_enc_name(enc));
7823 }
7824}
7825
7826static rb_encoding *
7827str_true_enc(VALUE str)
7828{
7829 rb_encoding *enc = STR_ENC_GET(str);
7830 rb_str_check_dummy_enc(enc);
7831 return enc;
7832}
7833
7834static OnigCaseFoldType
7835check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7836{
7837 if (argc==0)
7838 return flags;
7839 if (argc>2)
7840 rb_raise(rb_eArgError, "too many options");
7841 if (argv[0]==sym_turkic) {
7842 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7843 if (argc==2) {
7844 if (argv[1]==sym_lithuanian)
7845 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7846 else
7847 rb_raise(rb_eArgError, "invalid second option");
7848 }
7849 }
7850 else if (argv[0]==sym_lithuanian) {
7851 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7852 if (argc==2) {
7853 if (argv[1]==sym_turkic)
7854 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7855 else
7856 rb_raise(rb_eArgError, "invalid second option");
7857 }
7858 }
7859 else if (argc>1)
7860 rb_raise(rb_eArgError, "too many options");
7861 else if (argv[0]==sym_ascii)
7862 flags |= ONIGENC_CASE_ASCII_ONLY;
7863 else if (argv[0]==sym_fold) {
7864 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7865 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7866 else
7867 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7868 }
7869 else
7870 rb_raise(rb_eArgError, "invalid option");
7871 return flags;
7872}
7873
7874static inline bool
7875case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7876{
7877 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7878 return true;
7879 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7880}
7881
/* Extra bytes added to every mapping buffer. 16 should already be long enough
 * to absorb any kind of single character length increase; 20 leaves some slack. */
7883#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7884#ifndef CASEMAP_DEBUG
7885# define CASEMAP_DEBUG 0
7886#endif
7887
7888struct mapping_buffer;
7889typedef struct mapping_buffer {
7890 size_t capa;
7891 size_t used;
7892 struct mapping_buffer *next;
7893 OnigUChar space[FLEX_ARY_LEN];
7895
7896static void
7897mapping_buffer_free(void *p)
7898{
7899 mapping_buffer *previous_buffer;
7900 mapping_buffer *current_buffer = p;
7901 while (current_buffer) {
7902 previous_buffer = current_buffer;
7903 current_buffer = current_buffer->next;
7904 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7905 }
7906}
7907
7908static const rb_data_type_t mapping_buffer_type = {
7909 "mapping_buffer",
7910 {0, mapping_buffer_free,},
7911 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7912};
7913
7914static VALUE
7915rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7916{
7917 VALUE target;
7918
7919 const OnigUChar *source_current, *source_end;
7920 int target_length = 0;
7921 VALUE buffer_anchor;
7922 mapping_buffer *current_buffer = 0;
7923 mapping_buffer **pre_buffer;
7924 size_t buffer_count = 0;
7925 int buffer_length_or_invalid;
7926
7927 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7928
7929 source_current = (OnigUChar*)RSTRING_PTR(source);
7930 source_end = (OnigUChar*)RSTRING_END(source);
7931
7932 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7933 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7934 while (source_current < source_end) {
7935 /* increase multiplier using buffer count to converge quickly */
7936 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7937 if (CASEMAP_DEBUG) {
7938 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7939 }
7940 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7941 *pre_buffer = current_buffer;
7942 pre_buffer = &current_buffer->next;
7943 current_buffer->next = NULL;
7944 current_buffer->capa = capa;
7945 buffer_length_or_invalid = enc->case_map(flags,
7946 &source_current, source_end,
7947 current_buffer->space,
7948 current_buffer->space+current_buffer->capa,
7949 enc);
7950 if (buffer_length_or_invalid < 0) {
7951 current_buffer = DATA_PTR(buffer_anchor);
7952 DATA_PTR(buffer_anchor) = 0;
7953 mapping_buffer_free(current_buffer);
7954 rb_raise(rb_eArgError, "input string invalid");
7955 }
7956 target_length += current_buffer->used = buffer_length_or_invalid;
7957 }
7958 if (CASEMAP_DEBUG) {
7959 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7960 }
7961
7962 if (buffer_count==1) {
7963 target = rb_str_new((const char*)current_buffer->space, target_length);
7964 }
7965 else {
7966 char *target_current;
7967
7968 target = rb_str_new(0, target_length);
7969 target_current = RSTRING_PTR(target);
7970 current_buffer = DATA_PTR(buffer_anchor);
7971 while (current_buffer) {
7972 memcpy(target_current, current_buffer->space, current_buffer->used);
7973 target_current += current_buffer->used;
7974 current_buffer = current_buffer->next;
7975 }
7976 }
7977 current_buffer = DATA_PTR(buffer_anchor);
7978 DATA_PTR(buffer_anchor) = 0;
7979 mapping_buffer_free(current_buffer);
7980
7981 RB_GC_GUARD(buffer_anchor);
7982
7983 /* TODO: check about string terminator character */
7984 str_enc_copy_direct(target, source);
7985 /*ENC_CODERANGE_SET(mapped, cr);*/
7986
7987 return target;
7988}
7989
7990static VALUE
7991rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7992{
7993 const OnigUChar *source_current, *source_end;
7994 OnigUChar *target_current, *target_end;
7995 long old_length = RSTRING_LEN(source);
7996 int length_or_invalid;
7997
7998 if (old_length == 0) return Qnil;
7999
8000 source_current = (OnigUChar*)RSTRING_PTR(source);
8001 source_end = (OnigUChar*)RSTRING_END(source);
8002 if (source == target) {
8003 target_current = (OnigUChar*)source_current;
8004 target_end = (OnigUChar*)source_end;
8005 }
8006 else {
8007 target_current = (OnigUChar*)RSTRING_PTR(target);
8008 target_end = (OnigUChar*)RSTRING_END(target);
8009 }
8010
8011 length_or_invalid = onigenc_ascii_only_case_map(flags,
8012 &source_current, source_end,
8013 target_current, target_end, enc);
8014 if (length_or_invalid < 0)
8015 rb_raise(rb_eArgError, "input string invalid");
8016 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8017 fprintf(stderr, "problem with rb_str_ascii_casemap"
8018 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8019 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8020 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8021 }
8022
8023 str_enc_copy(target, source);
8024
8025 return target;
8026}
8027
8028static bool
8029upcase_single(VALUE str)
8030{
8031 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8032 bool modified = false;
8033
8034 while (s < send) {
8035 unsigned int c = *(unsigned char*)s;
8036
8037 if ('a' <= c && c <= 'z') {
8038 *s = 'A' + (c - 'a');
8039 modified = true;
8040 }
8041 s++;
8042 }
8043 return modified;
8044}
8045
8046/*
8047 * call-seq:
8048 * upcase!(mapping) -> self or nil
8049 *
8050 * Upcases the characters in +self+;
8051 * returns +self+ if any changes were made, +nil+ otherwise:
8052 *
8053 * s = 'Hello World!' # => "Hello World!"
8054 * s.upcase! # => "HELLO WORLD!"
8055 * s # => "HELLO WORLD!"
8056 * s.upcase! # => nil
8057 *
8058 * The casing may be affected by the given +mapping+;
8059 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8060 *
8061 * Related: String#upcase, String#downcase, String#downcase!.
8062 *
8063 */
8064
8065static VALUE
8066rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8067{
8068 rb_encoding *enc;
8069 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8070
8071 flags = check_case_options(argc, argv, flags);
8072 str_modify_keep_cr(str);
8073 enc = str_true_enc(str);
8074 if (case_option_single_p(flags, enc, str)) {
8075 if (upcase_single(str))
8076 flags |= ONIGENC_CASE_MODIFIED;
8077 }
8078 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8079 rb_str_ascii_casemap(str, str, &flags, enc);
8080 else
8081 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8082
8083 if (ONIGENC_CASE_MODIFIED&flags) return str;
8084 return Qnil;
8085}
8086
8087
8088/*
8089 * call-seq:
8090 * upcase(mapping) -> string
8091 *
8092 * Returns a string containing the upcased characters in +self+:
8093 *
8094 * s = 'Hello World!' # => "Hello World!"
8095 * s.upcase # => "HELLO WORLD!"
8096 *
8097 * The casing may be affected by the given +mapping+;
8098 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8099 *
8100 * Related: String#upcase!, String#downcase, String#downcase!.
8101 *
8102 */
8103
8104static VALUE
8105rb_str_upcase(int argc, VALUE *argv, VALUE str)
8106{
8107 rb_encoding *enc;
8108 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8109 VALUE ret;
8110
8111 flags = check_case_options(argc, argv, flags);
8112 enc = str_true_enc(str);
8113 if (case_option_single_p(flags, enc, str)) {
8114 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8115 str_enc_copy_direct(ret, str);
8116 upcase_single(ret);
8117 }
8118 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8119 ret = rb_str_new(0, RSTRING_LEN(str));
8120 rb_str_ascii_casemap(str, ret, &flags, enc);
8121 }
8122 else {
8123 ret = rb_str_casemap(str, &flags, enc);
8124 }
8125
8126 return ret;
8127}
8128
8129static bool
8130downcase_single(VALUE str)
8131{
8132 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8133 bool modified = false;
8134
8135 while (s < send) {
8136 unsigned int c = *(unsigned char*)s;
8137
8138 if ('A' <= c && c <= 'Z') {
8139 *s = 'a' + (c - 'A');
8140 modified = true;
8141 }
8142 s++;
8143 }
8144
8145 return modified;
8146}
8147
8148/*
8149 * call-seq:
8150 * downcase!(mapping) -> self or nil
8151 *
8152 * Like String#downcase, except that:
8153 *
8154 * - Changes character casings in +self+ (not in a copy of +self+).
8155 * - Returns +self+ if any changes are made, +nil+ otherwise.
8156 *
8157 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8158 */
8159
8160static VALUE
8161rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8162{
8163 rb_encoding *enc;
8164 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8165
8166 flags = check_case_options(argc, argv, flags);
8167 str_modify_keep_cr(str);
8168 enc = str_true_enc(str);
8169 if (case_option_single_p(flags, enc, str)) {
8170 if (downcase_single(str))
8171 flags |= ONIGENC_CASE_MODIFIED;
8172 }
8173 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8174 rb_str_ascii_casemap(str, str, &flags, enc);
8175 else
8176 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8177
8178 if (ONIGENC_CASE_MODIFIED&flags) return str;
8179 return Qnil;
8180}
8181
8182
8183/*
8184 * call-seq:
8185 * downcase(mapping) -> string
8186 *
8187 * :include: doc/string/downcase.rdoc
8188 *
8189 */
8190
8191static VALUE
8192rb_str_downcase(int argc, VALUE *argv, VALUE str)
8193{
8194 rb_encoding *enc;
8195 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8196 VALUE ret;
8197
8198 flags = check_case_options(argc, argv, flags);
8199 enc = str_true_enc(str);
8200 if (case_option_single_p(flags, enc, str)) {
8201 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8202 str_enc_copy_direct(ret, str);
8203 downcase_single(ret);
8204 }
8205 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8206 ret = rb_str_new(0, RSTRING_LEN(str));
8207 rb_str_ascii_casemap(str, ret, &flags, enc);
8208 }
8209 else {
8210 ret = rb_str_casemap(str, &flags, enc);
8211 }
8212
8213 return ret;
8214}
8215
8216
8217/*
8218 * call-seq:
8219 * capitalize!(mapping = :ascii) -> self or nil
8220 *
8221 * Like String#capitalize, except that:
8222 *
8223 * - Changes character casings in +self+ (not in a copy of +self+).
8224 * - Returns +self+ if any changes are made, +nil+ otherwise.
8225 *
8226 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8227 */
8228
8229static VALUE
8230rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8231{
8232 rb_encoding *enc;
8233 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8234
8235 flags = check_case_options(argc, argv, flags);
8236 str_modify_keep_cr(str);
8237 enc = str_true_enc(str);
8238 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8239 if (flags&ONIGENC_CASE_ASCII_ONLY)
8240 rb_str_ascii_casemap(str, str, &flags, enc);
8241 else
8242 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8243
8244 if (ONIGENC_CASE_MODIFIED&flags) return str;
8245 return Qnil;
8246}
8247
8248
8249/*
8250 * call-seq:
8251 * capitalize(mapping = :ascii) -> string
8252 *
8253 * Returns a string containing the characters in +self+,
8254 * each with possibly changed case:
8255 *
8256 * - The first character is upcased.
8257 * - All other characters are downcased.
8258 *
8259 * Examples:
8260 *
8261 * 'hello world'.capitalize # => "Hello world"
8262 * 'HELLO WORLD'.capitalize # => "Hello world"
8263 *
8264 * Some characters do not have upcase and downcase, and so are not changed;
8265 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8266 *
8267 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8268 *
 *  The casing is affected by the given +mapping+,
 *  which may be +:ascii+, +:turkic+, or +:lithuanian+
 *  (+:fold+ is only allowed for downcasing);
 *  see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8272 *
8273 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8274 */
8275
8276static VALUE
8277rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8278{
8279 rb_encoding *enc;
8280 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8281 VALUE ret;
8282
8283 flags = check_case_options(argc, argv, flags);
8284 enc = str_true_enc(str);
8285 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8286 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8287 ret = rb_str_new(0, RSTRING_LEN(str));
8288 rb_str_ascii_casemap(str, ret, &flags, enc);
8289 }
8290 else {
8291 ret = rb_str_casemap(str, &flags, enc);
8292 }
8293 return ret;
8294}
8295
8296
8297/*
8298 * call-seq:
8299 * swapcase!(mapping) -> self or nil
8300 *
8301 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
8303 * returns +self+ if any changes were made, +nil+ otherwise:
8304 *
8305 * s = 'Hello World!' # => "Hello World!"
8306 * s.swapcase! # => "hELLO wORLD!"
8307 * s # => "hELLO wORLD!"
8308 * ''.swapcase! # => nil
8309 *
8310 * The casing may be affected by the given +mapping+;
8311 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8312 *
8313 * Related: String#swapcase.
8314 *
8315 */
8316
8317static VALUE
8318rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8319{
8320 rb_encoding *enc;
8321 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8322
8323 flags = check_case_options(argc, argv, flags);
8324 str_modify_keep_cr(str);
8325 enc = str_true_enc(str);
8326 if (flags&ONIGENC_CASE_ASCII_ONLY)
8327 rb_str_ascii_casemap(str, str, &flags, enc);
8328 else
8329 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8330
8331 if (ONIGENC_CASE_MODIFIED&flags) return str;
8332 return Qnil;
8333}
8334
8335
8336/*
8337 * call-seq:
8338 * swapcase(mapping) -> string
8339 *
8340 * Returns a string containing the characters in +self+, with cases reversed;
8341 * each uppercase character is downcased;
8342 * each lowercase character is upcased:
8343 *
8344 * s = 'Hello World!' # => "Hello World!"
8345 * s.swapcase # => "hELLO wORLD!"
8346 *
8347 * The casing may be affected by the given +mapping+;
8348 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8349 *
8350 * Related: String#swapcase!.
8351 *
8352 */
8353
8354static VALUE
8355rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8356{
8357 rb_encoding *enc;
8358 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8359 VALUE ret;
8360
8361 flags = check_case_options(argc, argv, flags);
8362 enc = str_true_enc(str);
8363 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8364 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8365 ret = rb_str_new(0, RSTRING_LEN(str));
8366 rb_str_ascii_casemap(str, ret, &flags, enc);
8367 }
8368 else {
8369 ret = rb_str_casemap(str, &flags, enc);
8370 }
8371 return ret;
8372}
8373
8374typedef unsigned char *USTR;
8375
/* Cursor over a tr-style character specification; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range "a-z" is being expanded */
    unsigned int now;   /* code point returned last */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8381
8382static unsigned int
8383trnext(struct tr *t, rb_encoding *enc)
8384{
8385 int n;
8386
8387 for (;;) {
8388 nextpart:
8389 if (!t->gen) {
8390 if (t->p == t->pend) return -1;
8391 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8392 t->p += n;
8393 }
8394 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8395 t->p += n;
8396 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8397 t->p += n;
8398 if (t->p < t->pend) {
8399 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8400 t->p += n;
8401 if (t->now > c) {
8402 if (t->now < 0x80 && c < 0x80) {
8403 rb_raise(rb_eArgError,
8404 "invalid range \"%c-%c\" in string transliteration",
8405 t->now, c);
8406 }
8407 else {
8408 rb_raise(rb_eArgError, "invalid range in string transliteration");
8409 }
8410 continue; /* not reached */
8411 }
8412 else if (t->now < c) {
8413 t->gen = 1;
8414 t->max = c;
8415 }
8416 }
8417 }
8418 return t->now;
8419 }
8420 else {
8421 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8422 if (t->now == t->max) {
8423 t->gen = 0;
8424 goto nextpart;
8425 }
8426 }
8427 if (t->now < t->max) {
8428 return t->now;
8429 }
8430 else {
8431 t->gen = 0;
8432 return t->max;
8433 }
8434 }
8435 }
8436}
8437
8438static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8439
8440static VALUE
8441tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8442{
8443 const unsigned int errc = -1;
8444 unsigned int trans[256];
8445 rb_encoding *enc, *e1, *e2;
8446 struct tr trsrc, trrepl;
8447 int cflag = 0;
8448 unsigned int c, c0, last = 0;
8449 int modify = 0, i, l;
8450 unsigned char *s, *send;
8451 VALUE hash = 0;
8452 int singlebyte = single_byte_optimizable(str);
8453 int termlen;
8454 int cr;
8455
8456#define CHECK_IF_ASCII(c) \
8457 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8458 (cr = ENC_CODERANGE_VALID) : 0)
8459
8460 StringValue(src);
8461 StringValue(repl);
8462 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8463 if (RSTRING_LEN(repl) == 0) {
8464 return rb_str_delete_bang(1, &src, str);
8465 }
8466
8467 cr = ENC_CODERANGE(str);
8468 e1 = rb_enc_check(str, src);
8469 e2 = rb_enc_check(str, repl);
8470 if (e1 == e2) {
8471 enc = e1;
8472 }
8473 else {
8474 enc = rb_enc_check(src, repl);
8475 }
8476 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8477 if (RSTRING_LEN(src) > 1 &&
8478 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8479 trsrc.p + l < trsrc.pend) {
8480 cflag = 1;
8481 trsrc.p += l;
8482 }
8483 trrepl.p = RSTRING_PTR(repl);
8484 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8485 trsrc.gen = trrepl.gen = 0;
8486 trsrc.now = trrepl.now = 0;
8487 trsrc.max = trrepl.max = 0;
8488
8489 if (cflag) {
8490 for (i=0; i<256; i++) {
8491 trans[i] = 1;
8492 }
8493 while ((c = trnext(&trsrc, enc)) != errc) {
8494 if (c < 256) {
8495 trans[c] = errc;
8496 }
8497 else {
8498 if (!hash) hash = rb_hash_new();
8499 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8500 }
8501 }
8502 while ((c = trnext(&trrepl, enc)) != errc)
8503 /* retrieve last replacer */;
8504 last = trrepl.now;
8505 for (i=0; i<256; i++) {
8506 if (trans[i] != errc) {
8507 trans[i] = last;
8508 }
8509 }
8510 }
8511 else {
8512 unsigned int r;
8513
8514 for (i=0; i<256; i++) {
8515 trans[i] = errc;
8516 }
8517 while ((c = trnext(&trsrc, enc)) != errc) {
8518 r = trnext(&trrepl, enc);
8519 if (r == errc) r = trrepl.now;
8520 if (c < 256) {
8521 trans[c] = r;
8522 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8523 }
8524 else {
8525 if (!hash) hash = rb_hash_new();
8526 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8527 }
8528 }
8529 }
8530
8531 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8532 cr = ENC_CODERANGE_7BIT;
8533 str_modify_keep_cr(str);
8534 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8535 termlen = rb_enc_mbminlen(enc);
8536 if (sflag) {
8537 int clen, tlen;
8538 long offset, max = RSTRING_LEN(str);
8539 unsigned int save = -1;
8540 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8541
8542 while (s < send) {
8543 int may_modify = 0;
8544
8545 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8546 if (!MBCLEN_CHARFOUND_P(r)) {
8547 xfree(buf);
8548 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8549 }
8550 clen = MBCLEN_CHARFOUND_LEN(r);
8551 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8552
8553 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8554
8555 s += clen;
8556 if (c < 256) {
8557 c = trans[c];
8558 }
8559 else if (hash) {
8560 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8561 if (NIL_P(tmp)) {
8562 if (cflag) c = last;
8563 else c = errc;
8564 }
8565 else if (cflag) c = errc;
8566 else c = NUM2INT(tmp);
8567 }
8568 else {
8569 c = errc;
8570 }
8571 if (c != (unsigned int)-1) {
8572 if (save == c) {
8573 CHECK_IF_ASCII(c);
8574 continue;
8575 }
8576 save = c;
8577 tlen = rb_enc_codelen(c, enc);
8578 modify = 1;
8579 }
8580 else {
8581 save = -1;
8582 c = c0;
8583 if (enc != e1) may_modify = 1;
8584 }
8585 if ((offset = t - buf) + tlen > max) {
8586 size_t MAYBE_UNUSED(old) = max + termlen;
8587 max = offset + tlen + (send - s);
8588 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8589 t = buf + offset;
8590 }
8591 rb_enc_mbcput(c, t, enc);
8592 if (may_modify && memcmp(s, t, tlen) != 0) {
8593 modify = 1;
8594 }
8595 CHECK_IF_ASCII(c);
8596 t += tlen;
8597 }
8598 if (!STR_EMBED_P(str)) {
8599 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8600 }
8601 TERM_FILL((char *)t, termlen);
8602 RSTRING(str)->as.heap.ptr = (char *)buf;
8603 STR_SET_LEN(str, t - buf);
8604 STR_SET_NOEMBED(str);
8605 RSTRING(str)->as.heap.aux.capa = max;
8606 }
8607 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8608 while (s < send) {
8609 c = (unsigned char)*s;
8610 if (trans[c] != errc) {
8611 if (!cflag) {
8612 c = trans[c];
8613 *s = c;
8614 modify = 1;
8615 }
8616 else {
8617 *s = last;
8618 modify = 1;
8619 }
8620 }
8621 CHECK_IF_ASCII(c);
8622 s++;
8623 }
8624 }
8625 else {
8626 int clen, tlen;
8627 long offset, max = (long)((send - s) * 1.2);
8628 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8629
8630 while (s < send) {
8631 int may_modify = 0;
8632
8633 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8634 if (!MBCLEN_CHARFOUND_P(r)) {
8635 xfree(buf);
8636 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8637 }
8638 clen = MBCLEN_CHARFOUND_LEN(r);
8639 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8640
8641 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8642
8643 if (c < 256) {
8644 c = trans[c];
8645 }
8646 else if (hash) {
8647 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8648 if (NIL_P(tmp)) {
8649 if (cflag) c = last;
8650 else c = errc;
8651 }
8652 else if (cflag) c = errc;
8653 else c = NUM2INT(tmp);
8654 }
8655 else {
8656 c = cflag ? last : errc;
8657 }
8658 if (c != errc) {
8659 tlen = rb_enc_codelen(c, enc);
8660 modify = 1;
8661 }
8662 else {
8663 c = c0;
8664 if (enc != e1) may_modify = 1;
8665 }
8666 if ((offset = t - buf) + tlen > max) {
8667 size_t MAYBE_UNUSED(old) = max + termlen;
8668 max = offset + tlen + (long)((send - s) * 1.2);
8669 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8670 t = buf + offset;
8671 }
8672 if (s != t) {
8673 rb_enc_mbcput(c, t, enc);
8674 if (may_modify && memcmp(s, t, tlen) != 0) {
8675 modify = 1;
8676 }
8677 }
8678 CHECK_IF_ASCII(c);
8679 s += clen;
8680 t += tlen;
8681 }
8682 if (!STR_EMBED_P(str)) {
8683 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8684 }
8685 TERM_FILL((char *)t, termlen);
8686 RSTRING(str)->as.heap.ptr = (char *)buf;
8687 STR_SET_LEN(str, t - buf);
8688 STR_SET_NOEMBED(str);
8689 RSTRING(str)->as.heap.aux.capa = max;
8690 }
8691
8692 if (modify) {
8693 if (cr != ENC_CODERANGE_BROKEN)
8694 ENC_CODERANGE_SET(str, cr);
8695 rb_enc_associate(str, enc);
8696 return str;
8697 }
8698 return Qnil;
8699}
8700
8701
8702/*
8703 * call-seq:
8704 * tr!(selector, replacements) -> self or nil
8705 *
8706 * Like String#tr, but modifies +self+ in place.
8707 * Returns +self+ if any changes were made, +nil+ otherwise.
8708 *
8709 */
8710
8711static VALUE
8712rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8713{
8714 return tr_trans(str, src, repl, 0);
8715}
8716
8717
8718/*
8719 * call-seq:
8720 * tr(selector, replacements) -> new_string
8721 *
8722 * Returns a copy of +self+ with each character specified by string +selector+
8723 * translated to the corresponding character in string +replacements+.
8724 * The correspondence is _positional_:
8725 *
8726 * - Each occurrence of the first character specified by +selector+
8727 * is translated to the first character in +replacements+.
8728 * - Each occurrence of the second character specified by +selector+
8729 * is translated to the second character in +replacements+.
8730 * - And so on.
8731 *
8732 * Example:
8733 *
8734 * 'hello'.tr('el', 'ip') #=> "hippo"
8735 *
8736 * If +replacements+ is shorter than +selector+,
8737 * it is implicitly padded with its own last character:
8738 *
8739 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8740 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8741 *
8742 * Arguments +selector+ and +replacements+ must be valid character selectors
8743 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8744 * and may use any of its valid forms, including negation, ranges, and escaping:
8745 *
8746 * # Negation.
8747 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8748 * # Ranges.
8749 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8750 * # Escapes.
8751 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8752 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8753 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8754 *
8755 */
8756
8757static VALUE
8758rb_str_tr(VALUE str, VALUE src, VALUE repl)
8759{
8760 str = str_duplicate(rb_cString, str);
8761 tr_trans(str, src, repl, 0);
8762 return str;
8763}
8764
8765#define TR_TABLE_MAX (UCHAR_MAX+1)
8766#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8767static void
8768tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8769 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8770{
8771 const unsigned int errc = -1;
8772 char buf[TR_TABLE_MAX];
8773 struct tr tr;
8774 unsigned int c;
8775 VALUE table = 0, ptable = 0;
8776 int i, l, cflag = 0;
8777
8778 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8779 tr.gen = tr.now = tr.max = 0;
8780
8781 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8782 cflag = 1;
8783 tr.p += l;
8784 }
8785 if (first) {
8786 for (i=0; i<TR_TABLE_MAX; i++) {
8787 stable[i] = 1;
8788 }
8789 stable[TR_TABLE_MAX] = cflag;
8790 }
8791 else if (stable[TR_TABLE_MAX] && !cflag) {
8792 stable[TR_TABLE_MAX] = 0;
8793 }
8794 for (i=0; i<TR_TABLE_MAX; i++) {
8795 buf[i] = cflag;
8796 }
8797
8798 while ((c = trnext(&tr, enc)) != errc) {
8799 if (c < TR_TABLE_MAX) {
8800 buf[(unsigned char)c] = !cflag;
8801 }
8802 else {
8803 VALUE key = UINT2NUM(c);
8804
8805 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8806 if (cflag) {
8807 ptable = *ctablep;
8808 table = ptable ? ptable : rb_hash_new();
8809 *ctablep = table;
8810 }
8811 else {
8812 table = rb_hash_new();
8813 ptable = *tablep;
8814 *tablep = table;
8815 }
8816 }
8817 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8818 rb_hash_aset(table, key, Qtrue);
8819 }
8820 }
8821 }
8822 for (i=0; i<TR_TABLE_MAX; i++) {
8823 stable[i] = stable[i] && buf[i];
8824 }
8825 if (!table && !cflag) {
8826 *tablep = 0;
8827 }
8828}
8829
8830
8831static int
8832tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8833{
8834 if (c < TR_TABLE_MAX) {
8835 return table[c] != 0;
8836 }
8837 else {
8838 VALUE v = UINT2NUM(c);
8839
8840 if (del) {
8841 if (!NIL_P(rb_hash_lookup(del, v)) &&
8842 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8843 return TRUE;
8844 }
8845 }
8846 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8847 return FALSE;
8848 }
8849 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8850 }
8851}
8852
8853/*
8854 * call-seq:
8855 * delete!(*selectors) -> self or nil
8856 *
8857 * Like String#delete, but modifies +self+ in place;
8858 * returns +self+ if any characters were deleted, +nil+ otherwise.
8859 *
8860 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8861 */
8862
8863static VALUE
8864rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8865{
8866 char squeez[TR_TABLE_SIZE];
8867 rb_encoding *enc = 0;
8868 char *s, *send, *t;
8869 VALUE del = 0, nodel = 0;
8870 int modify = 0;
8871 int i, ascompat, cr;
8872
8873 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8875 for (i=0; i<argc; i++) {
8876 VALUE s = argv[i];
8877
8878 StringValue(s);
8879 enc = rb_enc_check(str, s);
8880 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8881 }
8882
8883 str_modify_keep_cr(str);
8884 ascompat = rb_enc_asciicompat(enc);
8885 s = t = RSTRING_PTR(str);
8886 send = RSTRING_END(str);
8887 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8888 while (s < send) {
8889 unsigned int c;
8890 int clen;
8891
8892 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8893 if (squeez[c]) {
8894 modify = 1;
8895 }
8896 else {
8897 if (t != s) *t = c;
8898 t++;
8899 }
8900 s++;
8901 }
8902 else {
8903 c = rb_enc_codepoint_len(s, send, &clen, enc);
8904
8905 if (tr_find(c, squeez, del, nodel)) {
8906 modify = 1;
8907 }
8908 else {
8909 if (t != s) rb_enc_mbcput(c, t, enc);
8910 t += clen;
8912 }
8913 s += clen;
8914 }
8915 }
8916 TERM_FILL(t, TERM_LEN(str));
8917 STR_SET_LEN(str, t - RSTRING_PTR(str));
8918 ENC_CODERANGE_SET(str, cr);
8919
8920 if (modify) return str;
8921 return Qnil;
8922}
8923
8924
8925/*
8926 * call-seq:
8927 * delete(*selectors) -> new_string
8928 *
8929 * :include: doc/string/delete.rdoc
8930 *
8931 */
8932
8933static VALUE
8934rb_str_delete(int argc, VALUE *argv, VALUE str)
8935{
8936 str = str_duplicate(rb_cString, str);
8937 rb_str_delete_bang(argc, argv, str);
8938 return str;
8939}
8940
8941
8942/*
8943 * call-seq:
8944 * squeeze!(*selectors) -> self or nil
8945 *
8946 * Like String#squeeze, but modifies +self+ in place.
8947 * Returns +self+ if any changes were made, +nil+ otherwise.
8948 */
8949
8950static VALUE
8951rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8952{
8953 char squeez[TR_TABLE_SIZE];
8954 rb_encoding *enc = 0;
8955 VALUE del = 0, nodel = 0;
8956 unsigned char *s, *send, *t;
8957 int i, modify = 0;
8958 int ascompat, singlebyte = single_byte_optimizable(str);
8959 unsigned int save;
8960
8961 if (argc == 0) {
8962 enc = STR_ENC_GET(str);
8963 }
8964 else {
8965 for (i=0; i<argc; i++) {
8966 VALUE s = argv[i];
8967
8968 StringValue(s);
8969 enc = rb_enc_check(str, s);
8970 if (singlebyte && !single_byte_optimizable(s))
8971 singlebyte = 0;
8972 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8973 }
8974 }
8975
8976 str_modify_keep_cr(str);
8977 s = t = (unsigned char *)RSTRING_PTR(str);
8978 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8979 send = (unsigned char *)RSTRING_END(str);
8980 save = -1;
8981 ascompat = rb_enc_asciicompat(enc);
8982
8983 if (singlebyte) {
8984 while (s < send) {
8985 unsigned int c = *s++;
8986 if (c != save || (argc > 0 && !squeez[c])) {
8987 *t++ = save = c;
8988 }
8989 }
8990 }
8991 else {
8992 while (s < send) {
8993 unsigned int c;
8994 int clen;
8995
8996 if (ascompat && (c = *s) < 0x80) {
8997 if (c != save || (argc > 0 && !squeez[c])) {
8998 *t++ = save = c;
8999 }
9000 s++;
9001 }
9002 else {
9003 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9004
9005 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9006 if (t != s) rb_enc_mbcput(c, t, enc);
9007 save = c;
9008 t += clen;
9009 }
9010 s += clen;
9011 }
9012 }
9013 }
9014
9015 TERM_FILL((char *)t, TERM_LEN(str));
9016 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9017 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9018 modify = 1;
9019 }
9020
9021 if (modify) return str;
9022 return Qnil;
9023}
9024
9025
9026/*
9027 * call-seq:
9028 * squeeze(*selectors) -> new_string
9029 *
9030 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9031 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9032 *
9033 * "Squeezed" means that each multiple-character run of a selected character
9034 * is squeezed down to a single character;
9035 * with no arguments given, squeezes all characters:
9036 *
9037 * "yellow moon".squeeze #=> "yelow mon"
9038 * "  now   is  the".squeeze(" ") #=> " now is the"
9039 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9040 *
9041 */
9042
9043static VALUE
9044rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9045{
9046 str = str_duplicate(rb_cString, str);
9047 rb_str_squeeze_bang(argc, argv, str);
9048 return str;
9049}
9050
9051
9052/*
9053 * call-seq:
9054 * tr_s!(selector, replacements) -> self or nil
9055 *
9056 * Like String#tr_s, but modifies +self+ in place.
9057 * Returns +self+ if any changes were made, +nil+ otherwise.
9058 *
9059 * Related: String#squeeze!.
9060 */
9061
9062static VALUE
9063rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9064{
9065 return tr_trans(str, src, repl, 1);
9066}
9067
9068
9069/*
9070 * call-seq:
9071 * tr_s(selector, replacements) -> string
9072 *
9073 * Like String#tr, but also squeezes the modified portions of the translated string;
9074 * returns a new string (translated and squeezed).
9075 *
9076 * 'hello'.tr_s('l', 'r') #=> "hero"
9077 * 'hello'.tr_s('el', '-') #=> "h-o"
9078 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9079 *
9080 * Related: String#squeeze.
9081 *
9082 */
9083
9084static VALUE
9085rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9086{
9087 str = str_duplicate(rb_cString, str);
9088 tr_trans(str, src, repl, 1);
9089 return str;
9090}
9091
9092
9093/*
9094 * call-seq:
9095 * count(*selectors) -> integer
9096 *
9097 * :include: doc/string/count.rdoc
9098 */
9099
9100static VALUE
9101rb_str_count(int argc, VALUE *argv, VALUE str)
9102{
9103 char table[TR_TABLE_SIZE];
9104 rb_encoding *enc = 0;
9105 VALUE del = 0, nodel = 0, tstr;
9106 char *s, *send;
9107 int i;
9108 int ascompat;
9109 size_t n = 0;
9110
9112
9113 tstr = argv[0];
9114 StringValue(tstr);
9115 enc = rb_enc_check(str, tstr);
9116 if (argc == 1) {
9117 const char *ptstr;
9118 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9119 (ptstr = RSTRING_PTR(tstr),
9120 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9121 !is_broken_string(str)) {
9122 int clen;
9123 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9124
9125 s = RSTRING_PTR(str);
9126 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9127 send = RSTRING_END(str);
9128 while (s < send) {
9129 if (*(unsigned char*)s++ == c) n++;
9130 }
9131 return SIZET2NUM(n);
9132 }
9133 }
9134
9135 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9136 for (i=1; i<argc; i++) {
9137 tstr = argv[i];
9138 StringValue(tstr);
9139 enc = rb_enc_check(str, tstr);
9140 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9141 }
9142
9143 s = RSTRING_PTR(str);
9144 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9145 send = RSTRING_END(str);
9146 ascompat = rb_enc_asciicompat(enc);
9147 while (s < send) {
9148 unsigned int c;
9149
9150 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9151 if (table[c]) {
9152 n++;
9153 }
9154 s++;
9155 }
9156 else {
9157 int clen;
9158 c = rb_enc_codepoint_len(s, send, &clen, enc);
9159 if (tr_find(c, table, del, nodel)) {
9160 n++;
9161 }
9162 s += clen;
9163 }
9164 }
9165
9166 return SIZET2NUM(n);
9167}
9168
9169static VALUE
9170rb_fs_check(VALUE val)
9171{
9172 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9173 val = rb_check_string_type(val);
9174 if (NIL_P(val)) return 0;
9175 }
9176 return val;
9177}
9178
/*
 * Byte-indexed whitespace table: 1 for the bytes 0x09..0x0d and 0x20,
 * 0 for every other byte value (unlisted entries are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9199
9200static long
9201split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9202{
9203 if (empty_count >= 0 && len == 0) {
9204 return empty_count + 1;
9205 }
9206 if (empty_count > 0) {
9207 /* make different substrings */
9208 if (result) {
9209 do {
9210 rb_ary_push(result, str_new_empty_String(str));
9211 } while (--empty_count > 0);
9212 }
9213 else {
9214 do {
9215 rb_yield(str_new_empty_String(str));
9216 } while (--empty_count > 0);
9217 }
9218 }
9219 str = rb_str_subseq(str, beg, len);
9220 if (result) {
9221 rb_ary_push(result, str);
9222 }
9223 else {
9224 rb_yield(str);
9225 }
9226 return empty_count;
9227}
9228
9229typedef enum {
9230 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9231} split_type_t;
9232
9233static split_type_t
9234literal_split_pattern(VALUE spat, split_type_t default_type)
9235{
9236 rb_encoding *enc = STR_ENC_GET(spat);
9237 const char *ptr;
9238 long len;
9239 RSTRING_GETMEM(spat, ptr, len);
9240 if (len == 0) {
9241 /* Special case - split into chars */
9242 return SPLIT_TYPE_CHARS;
9243 }
9244 else if (rb_enc_asciicompat(enc)) {
9245 if (len == 1 && ptr[0] == ' ') {
9246 return SPLIT_TYPE_AWK;
9247 }
9248 }
9249 else {
9250 int l;
9251 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9252 return SPLIT_TYPE_AWK;
9253 }
9254 }
9255 return default_type;
9256}
9257
9258/*
9259 * call-seq:
9260 * split(field_sep = $;, limit = 0) -> array
9261 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9262 *
9263 * :include: doc/string/split.rdoc
9264 *
9265 */
9266
9267static VALUE
9268rb_str_split_m(int argc, VALUE *argv, VALUE str)
9269{
9270 rb_encoding *enc;
9271 VALUE spat;
9272 VALUE limit;
9273 split_type_t split_type;
9274 long beg, end, i = 0, empty_count = -1;
9275 int lim = 0;
9276 VALUE result, tmp;
9277
9278 result = rb_block_given_p() ? Qfalse : Qnil;
9279 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9280 lim = NUM2INT(limit);
9281 if (lim <= 0) limit = Qnil;
9282 else if (lim == 1) {
9283 if (RSTRING_LEN(str) == 0)
9284 return result ? rb_ary_new2(0) : str;
9285 tmp = str_duplicate(rb_cString, str);
9286 if (!result) {
9287 rb_yield(tmp);
9288 return str;
9289 }
9290 return rb_ary_new3(1, tmp);
9291 }
9292 i = 1;
9293 }
9294 if (NIL_P(limit) && !lim) empty_count = 0;
9295
9296 enc = STR_ENC_GET(str);
9297 split_type = SPLIT_TYPE_REGEXP;
9298 if (!NIL_P(spat)) {
9299 spat = get_pat_quoted(spat, 0);
9300 }
9301 else if (NIL_P(spat = rb_fs)) {
9302 split_type = SPLIT_TYPE_AWK;
9303 }
9304 else if (!(spat = rb_fs_check(spat))) {
9305 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9306 }
9307 else {
9308 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9309 }
9310 if (split_type != SPLIT_TYPE_AWK) {
9311 switch (BUILTIN_TYPE(spat)) {
9312 case T_REGEXP:
9313 rb_reg_options(spat); /* check if uninitialized */
9314 tmp = RREGEXP_SRC(spat);
9315 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9316 if (split_type == SPLIT_TYPE_AWK) {
9317 spat = tmp;
9318 split_type = SPLIT_TYPE_STRING;
9319 }
9320 break;
9321
9322 case T_STRING:
9323 mustnot_broken(spat);
9324 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9325 break;
9326
9327 default:
9329 }
9330 }
9331
9332#define SPLIT_STR(beg, len) ( \
9333 empty_count = split_string(result, str, beg, len, empty_count), \
9334 str_mod_check(str, str_start, str_len))
9335
9336 beg = 0;
9337 char *ptr = RSTRING_PTR(str);
9338 char *const str_start = ptr;
9339 const long str_len = RSTRING_LEN(str);
9340 char *const eptr = str_start + str_len;
9341 if (split_type == SPLIT_TYPE_AWK) {
9342 char *bptr = ptr;
9343 int skip = 1;
9344 unsigned int c;
9345
9346 if (result) result = rb_ary_new();
9347 end = beg;
9348 if (is_ascii_string(str)) {
9349 while (ptr < eptr) {
9350 c = (unsigned char)*ptr++;
9351 if (skip) {
9352 if (ascii_isspace(c)) {
9353 beg = ptr - bptr;
9354 }
9355 else {
9356 end = ptr - bptr;
9357 skip = 0;
9358 if (!NIL_P(limit) && lim <= i) break;
9359 }
9360 }
9361 else if (ascii_isspace(c)) {
9362 SPLIT_STR(beg, end-beg);
9363 skip = 1;
9364 beg = ptr - bptr;
9365 if (!NIL_P(limit)) ++i;
9366 }
9367 else {
9368 end = ptr - bptr;
9369 }
9370 }
9371 }
9372 else {
9373 while (ptr < eptr) {
9374 int n;
9375
9376 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9377 ptr += n;
9378 if (skip) {
9379 if (rb_isspace(c)) {
9380 beg = ptr - bptr;
9381 }
9382 else {
9383 end = ptr - bptr;
9384 skip = 0;
9385 if (!NIL_P(limit) && lim <= i) break;
9386 }
9387 }
9388 else if (rb_isspace(c)) {
9389 SPLIT_STR(beg, end-beg);
9390 skip = 1;
9391 beg = ptr - bptr;
9392 if (!NIL_P(limit)) ++i;
9393 }
9394 else {
9395 end = ptr - bptr;
9396 }
9397 }
9398 }
9399 }
9400 else if (split_type == SPLIT_TYPE_STRING) {
9401 char *substr_start = ptr;
9402 char *sptr = RSTRING_PTR(spat);
9403 long slen = RSTRING_LEN(spat);
9404
9405 if (result) result = rb_ary_new();
9406 mustnot_broken(str);
9407 enc = rb_enc_check(str, spat);
9408 while (ptr < eptr &&
9409 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9410 /* Check we are at the start of a char */
9411 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9412 if (t != ptr + end) {
9413 ptr = t;
9414 continue;
9415 }
9416 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9417 str_mod_check(spat, sptr, slen);
9418 ptr += end + slen;
9419 substr_start = ptr;
9420 if (!NIL_P(limit) && lim <= ++i) break;
9421 }
9422 beg = ptr - str_start;
9423 }
9424 else if (split_type == SPLIT_TYPE_CHARS) {
9425 int n;
9426
9427 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9428 mustnot_broken(str);
9429 enc = rb_enc_get(str);
9430 while (ptr < eptr &&
9431 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9432 SPLIT_STR(ptr - str_start, n);
9433 ptr += n;
9434 if (!NIL_P(limit) && lim <= ++i) break;
9435 }
9436 beg = ptr - str_start;
9437 }
9438 else {
9439 if (result) result = rb_ary_new();
9440 long len = RSTRING_LEN(str);
9441 long start = beg;
9442 long idx;
9443 int last_null = 0;
9444 struct re_registers *regs;
9445 VALUE match = 0;
9446
9447 for (; rb_reg_search(spat, str, start, 0) >= 0;
9448 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9449 match = rb_backref_get();
9450 if (!result) rb_match_busy(match);
9451 regs = RMATCH_REGS(match);
9452 end = BEG(0);
9453 if (start == end && BEG(0) == END(0)) {
9454 if (!ptr) {
9455 SPLIT_STR(0, 0);
9456 break;
9457 }
9458 else if (last_null == 1) {
9459 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9460 beg = start;
9461 }
9462 else {
9463 if (start == len)
9464 start++;
9465 else
9466 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9467 last_null = 1;
9468 continue;
9469 }
9470 }
9471 else {
9472 SPLIT_STR(beg, end-beg);
9473 beg = start = END(0);
9474 }
9475 last_null = 0;
9476
9477 for (idx=1; idx < regs->num_regs; idx++) {
9478 if (BEG(idx) == -1) continue;
9479 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9480 }
9481 if (!NIL_P(limit) && lim <= ++i) break;
9482 }
9483 if (match) rb_match_unbusy(match);
9484 }
9485 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9486 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9487 }
9488
9489 return result ? result : str;
9490}
9491
9492VALUE
9493rb_str_split(VALUE str, const char *sep0)
9494{
9495 VALUE sep;
9496
9497 StringValue(str);
9498 sep = rb_str_new_cstr(sep0);
9499 return rb_str_split_m(1, &sep, str);
9500}
9501
9502#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9503
9504static inline int
9505enumerator_element(VALUE ary, VALUE e)
9506{
9507 if (ary) {
9508 rb_ary_push(ary, e);
9509 return 0;
9510 }
9511 else {
9512 rb_yield(e);
9513 return 1;
9514 }
9515}
9516
9517#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9518
9519static const char *
9520chomp_newline(const char *p, const char *e, rb_encoding *enc)
9521{
9522 const char *prev = rb_enc_prev_char(p, e, e, enc);
9523 if (rb_enc_is_newline(prev, e, enc)) {
9524 e = prev;
9525 prev = rb_enc_prev_char(p, e, e, enc);
9526 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9527 e = prev;
9528 }
9529 return e;
9530}
9531
9532static VALUE
9533get_rs(void)
9534{
9535 VALUE rs = rb_rs;
9536 if (!NIL_P(rs) &&
9537 (!RB_TYPE_P(rs, T_STRING) ||
9538 RSTRING_LEN(rs) != 1 ||
9539 RSTRING_PTR(rs)[0] != '\n')) {
9540 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9541 }
9542 return rs;
9543}
9544
9545#define rb_rs get_rs()
9546
9547static VALUE
9548rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9549{
9550 rb_encoding *enc;
9551 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9552 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9553 long pos, len, rslen;
9554 int rsnewline = 0;
9555
9556 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9557 rs = rb_rs;
9558 if (!NIL_P(opts)) {
9559 static ID keywords[1];
9560 if (!keywords[0]) {
9561 keywords[0] = rb_intern_const("chomp");
9562 }
9563 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9564 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9565 }
9566
9567 if (NIL_P(rs)) {
9568 if (!ENUM_ELEM(ary, str)) {
9569 return ary;
9570 }
9571 else {
9572 return orig;
9573 }
9574 }
9575
9576 if (!RSTRING_LEN(str)) goto end;
9577 str = rb_str_new_frozen(str);
9578 ptr = subptr = RSTRING_PTR(str);
9579 pend = RSTRING_END(str);
9580 len = RSTRING_LEN(str);
9581 StringValue(rs);
9582 rslen = RSTRING_LEN(rs);
9583
9584 if (rs == rb_default_rs)
9585 enc = rb_enc_get(str);
9586 else
9587 enc = rb_enc_check(str, rs);
9588
9589 if (rslen == 0) {
9590 /* paragraph mode */
9591 int n;
9592 const char *eol = NULL;
9593 subend = subptr;
9594 while (subend < pend) {
9595 long chomp_rslen = 0;
9596 do {
9597 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9598 n = 0;
9599 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9600 if (rb_enc_is_newline(subend + n, pend, enc)) {
9601 if (eol == subend) break;
9602 subend += rslen;
9603 if (subptr) {
9604 eol = subend;
9605 chomp_rslen = -rslen;
9606 }
9607 }
9608 else {
9609 if (!subptr) subptr = subend;
9610 subend += rslen;
9611 }
9612 rslen = 0;
9613 } while (subend < pend);
9614 if (!subptr) break;
9615 if (rslen == 0) chomp_rslen = 0;
9616 line = rb_str_subseq(str, subptr - ptr,
9617 subend - subptr + (chomp ? chomp_rslen : rslen));
9618 if (ENUM_ELEM(ary, line)) {
9619 str_mod_check(str, ptr, len);
9620 }
9621 subptr = eol = NULL;
9622 }
9623 goto end;
9624 }
9625 else {
9626 rsptr = RSTRING_PTR(rs);
9627 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9628 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9629 rsnewline = 1;
9630 }
9631 }
9632
9633 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9634 rs = rb_str_new(rsptr, rslen);
9635 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9636 rsptr = RSTRING_PTR(rs);
9637 rslen = RSTRING_LEN(rs);
9638 }
9639
9640 while (subptr < pend) {
9641 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9642 if (pos < 0) break;
9643 hit = subptr + pos;
9644 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9645 if (hit != adjusted) {
9646 subptr = adjusted;
9647 continue;
9648 }
9649 subend = hit += rslen;
9650 if (chomp) {
9651 if (rsnewline) {
9652 subend = chomp_newline(subptr, subend, enc);
9653 }
9654 else {
9655 subend -= rslen;
9656 }
9657 }
9658 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9659 if (ENUM_ELEM(ary, line)) {
9660 str_mod_check(str, ptr, len);
9661 }
9662 subptr = hit;
9663 }
9664
9665 if (subptr != pend) {
9666 if (chomp) {
9667 if (rsnewline) {
9668 pend = chomp_newline(subptr, pend, enc);
9669 }
9670 else if (pend - subptr >= rslen &&
9671 memcmp(pend - rslen, rsptr, rslen) == 0) {
9672 pend -= rslen;
9673 }
9674 }
9675 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9676 ENUM_ELEM(ary, line);
9677 RB_GC_GUARD(str);
9678 }
9679
9680 end:
9681 if (ary)
9682 return ary;
9683 else
9684 return orig;
9685}
9686
9687/*
9688 * call-seq:
9689 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9690 * each_line(record_separator = $/, chomp: false) -> enumerator
9691 *
9692 * :include: doc/string/each_line.rdoc
9693 *
9694 */
9695
9696static VALUE
9697rb_str_each_line(int argc, VALUE *argv, VALUE str)
9698{
9699 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9700 return rb_str_enumerate_lines(argc, argv, str, 0);
9701}
9702
9703/*
9704 * call-seq:
9705 * lines(record_separator = $/, chomp: false) -> array_of_strings
9706 *
9707 * Forms substrings ("lines") of +self+ according to the given arguments
9708 * (see String#each_line for details); returns the lines in an array.
9709 *
9710 */
9711
9712static VALUE
9713rb_str_lines(int argc, VALUE *argv, VALUE str)
9714{
9715 VALUE ary = WANTARRAY("lines", 0);
9716 return rb_str_enumerate_lines(argc, argv, str, ary);
9717}
9718
9719static VALUE
9720rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9721{
9722 return LONG2FIX(RSTRING_LEN(str));
9723}
9724
9725static VALUE
9726rb_str_enumerate_bytes(VALUE str, VALUE ary)
9727{
9728 long i;
9729
9730 for (i=0; i<RSTRING_LEN(str); i++) {
9731 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9732 }
9733 if (ary)
9734 return ary;
9735 else
9736 return str;
9737}
9738
9739/*
9740 * call-seq:
9741 * each_byte {|byte| ... } -> self
9742 * each_byte -> enumerator
9743 *
9744 * :include: doc/string/each_byte.rdoc
9745 *
9746 */
9747
9748static VALUE
9749rb_str_each_byte(VALUE str)
9750{
9751 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9752 return rb_str_enumerate_bytes(str, 0);
9753}
9754
9755/*
9756 * call-seq:
9757 * bytes -> array_of_bytes
9758 *
9759 * :include: doc/string/bytes.rdoc
9760 *
9761 */
9762
9763static VALUE
9764rb_str_bytes(VALUE str)
9765{
9766 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9767 return rb_str_enumerate_bytes(str, ary);
9768}
9769
9770static VALUE
9771rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9772{
9773 return rb_str_length(str);
9774}
9775
9776static VALUE
9777rb_str_enumerate_chars(VALUE str, VALUE ary)
9778{
9779 VALUE orig = str;
9780 long i, len, n;
9781 const char *ptr;
9782 rb_encoding *enc;
9783
9784 str = rb_str_new_frozen(str);
9785 ptr = RSTRING_PTR(str);
9786 len = RSTRING_LEN(str);
9787 enc = rb_enc_get(str);
9788
9790 for (i = 0; i < len; i += n) {
9791 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9792 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9793 }
9794 }
9795 else {
9796 for (i = 0; i < len; i += n) {
9797 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9798 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9799 }
9800 }
9801 RB_GC_GUARD(str);
9802 if (ary)
9803 return ary;
9804 else
9805 return orig;
9806}
9807
9808/*
9809 * call-seq:
9810 * each_char {|char| ... } -> self
9811 * each_char -> enumerator
9812 *
9813 * :include: doc/string/each_char.rdoc
9814 *
9815 */
9816
9817static VALUE
9818rb_str_each_char(VALUE str)
9819{
9820 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9821 return rb_str_enumerate_chars(str, 0);
9822}
9823
9824/*
9825 * call-seq:
9826 * chars -> array_of_characters
9827 *
9828 * :include: doc/string/chars.rdoc
9829 *
9830 */
9831
9832static VALUE
9833rb_str_chars(VALUE str)
9834{
9835 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9836 return rb_str_enumerate_chars(str, ary);
9837}
9838
9839static VALUE
9840rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9841{
9842 VALUE orig = str;
9843 int n;
9844 unsigned int c;
9845 const char *ptr, *end;
9846 rb_encoding *enc;
9847
9848 if (single_byte_optimizable(str))
9849 return rb_str_enumerate_bytes(str, ary);
9850
9851 str = rb_str_new_frozen(str);
9852 ptr = RSTRING_PTR(str);
9853 end = RSTRING_END(str);
9854 enc = STR_ENC_GET(str);
9855
9856 while (ptr < end) {
9857 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9858 ENUM_ELEM(ary, UINT2NUM(c));
9859 ptr += n;
9860 }
9861 RB_GC_GUARD(str);
9862 if (ary)
9863 return ary;
9864 else
9865 return orig;
9866}
9867
9868/*
9869 * call-seq:
9870 * each_codepoint {|codepoint| ... } -> self
9871 * each_codepoint -> enumerator
9872 *
9873 * :include: doc/string/each_codepoint.rdoc
9874 *
9875 */
9876
9877static VALUE
9878rb_str_each_codepoint(VALUE str)
9879{
9880 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9881 return rb_str_enumerate_codepoints(str, 0);
9882}
9883
9884/*
9885 * call-seq:
9886 * codepoints -> array_of_integers
9887 *
9888 * :include: doc/string/codepoints.rdoc
9889 *
9890 */
9891
9892static VALUE
9893rb_str_codepoints(VALUE str)
9894{
9895 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9896 return rb_str_enumerate_codepoints(str, ary);
9897}
9898
9899static regex_t *
9900get_reg_grapheme_cluster(rb_encoding *enc)
9901{
9902 int encidx = rb_enc_to_index(enc);
9903
9904 const OnigUChar source_ascii[] = "\\X";
9905 const OnigUChar *source = source_ascii;
9906 size_t source_len = sizeof(source_ascii) - 1;
9907
9908 switch (encidx) {
9909#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9910#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9911#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9912#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9913#define CASE_UTF(e) \
9914 case ENCINDEX_UTF_##e: { \
9915 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9916 source = source_UTF_##e; \
9917 source_len = sizeof(source_UTF_##e); \
9918 break; \
9919 }
9920 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9921#undef CASE_UTF
9922#undef CHARS_16BE
9923#undef CHARS_16LE
9924#undef CHARS_32BE
9925#undef CHARS_32LE
9926 }
9927
9928 regex_t *reg_grapheme_cluster;
9929 OnigErrorInfo einfo;
9930 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9931 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9932 if (r) {
9933 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9934 onig_error_code_to_str(message, r, &einfo);
9935 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9936 }
9937
9938 return reg_grapheme_cluster;
9939}
9940
9941static regex_t *
9942get_cached_reg_grapheme_cluster(rb_encoding *enc)
9943{
9944 int encidx = rb_enc_to_index(enc);
9945 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9946
9947 if (encidx == rb_utf8_encindex()) {
9948 if (!reg_grapheme_cluster_utf8) {
9949 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9950 }
9951
9952 return reg_grapheme_cluster_utf8;
9953 }
9954
9955 return NULL;
9956}
9957
9958static VALUE
9959rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9960{
9961 size_t grapheme_cluster_count = 0;
9962 rb_encoding *enc = get_encoding(str);
9963 const char *ptr, *end;
9964
9965 if (!rb_enc_unicode_p(enc)) {
9966 return rb_str_length(str);
9967 }
9968
9969 bool cached_reg_grapheme_cluster = true;
9970 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9971 if (!reg_grapheme_cluster) {
9972 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9973 cached_reg_grapheme_cluster = false;
9974 }
9975
9976 ptr = RSTRING_PTR(str);
9977 end = RSTRING_END(str);
9978
9979 while (ptr < end) {
9980 OnigPosition len = onig_match(reg_grapheme_cluster,
9981 (const OnigUChar *)ptr, (const OnigUChar *)end,
9982 (const OnigUChar *)ptr, NULL, 0);
9983 if (len <= 0) break;
9984 grapheme_cluster_count++;
9985 ptr += len;
9986 }
9987
9988 if (!cached_reg_grapheme_cluster) {
9989 onig_free(reg_grapheme_cluster);
9990 }
9991
9992 return SIZET2NUM(grapheme_cluster_count);
9993}
9994
9995static VALUE
9996rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9997{
9998 VALUE orig = str;
9999 rb_encoding *enc = get_encoding(str);
10000 const char *ptr0, *ptr, *end;
10001
10002 if (!rb_enc_unicode_p(enc)) {
10003 return rb_str_enumerate_chars(str, ary);
10004 }
10005
10006 if (!ary) str = rb_str_new_frozen(str);
10007
10008 bool cached_reg_grapheme_cluster = true;
10009 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10010 if (!reg_grapheme_cluster) {
10011 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10012 cached_reg_grapheme_cluster = false;
10013 }
10014
10015 ptr0 = ptr = RSTRING_PTR(str);
10016 end = RSTRING_END(str);
10017
10018 while (ptr < end) {
10019 OnigPosition len = onig_match(reg_grapheme_cluster,
10020 (const OnigUChar *)ptr, (const OnigUChar *)end,
10021 (const OnigUChar *)ptr, NULL, 0);
10022 if (len <= 0) break;
10023 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10024 ptr += len;
10025 }
10026
10027 if (!cached_reg_grapheme_cluster) {
10028 onig_free(reg_grapheme_cluster);
10029 }
10030
10031 RB_GC_GUARD(str);
10032 if (ary)
10033 return ary;
10034 else
10035 return orig;
10036}
10037
10038/*
10039 * call-seq:
10040 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
10041 * each_grapheme_cluster -> enumerator
10042 *
10043 * :include: doc/string/each_grapheme_cluster.rdoc
10044 *
10045 */
10046
10047static VALUE
10048rb_str_each_grapheme_cluster(VALUE str)
10049{
10050 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10051 return rb_str_enumerate_grapheme_clusters(str, 0);
10052}
10053
10054/*
10055 * call-seq:
10056 * grapheme_clusters -> array_of_grapheme_clusters
10057 *
10058 * :include: doc/string/grapheme_clusters.rdoc
10059 *
10060 */
10061
10062static VALUE
10063rb_str_grapheme_clusters(VALUE str)
10064{
10065 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10066 return rb_str_enumerate_grapheme_clusters(str, ary);
10067}
10068
10069static long
10070chopped_length(VALUE str)
10071{
10072 rb_encoding *enc = STR_ENC_GET(str);
10073 const char *p, *p2, *beg, *end;
10074
10075 beg = RSTRING_PTR(str);
10076 end = beg + RSTRING_LEN(str);
10077 if (beg >= end) return 0;
10078 p = rb_enc_prev_char(beg, end, end, enc);
10079 if (!p) return 0;
10080 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10081 p2 = rb_enc_prev_char(beg, p, end, enc);
10082 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10083 }
10084 return p - beg;
10085}
10086
10087/*
10088 * call-seq:
10089 * chop! -> self or nil
10090 *
10091 * Like String#chop, except that:
10092 *
10093 * - Removes trailing characters from +self+ (not from a copy of +self+).
10094 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10095 *
10096 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10097 */
10098
10099static VALUE
10100rb_str_chop_bang(VALUE str)
10101{
10102 str_modify_keep_cr(str);
10103 if (RSTRING_LEN(str) > 0) {
10104 long len;
10105 len = chopped_length(str);
10106 STR_SET_LEN(str, len);
10107 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10108 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10110 }
10111 return str;
10112 }
10113 return Qnil;
10114}
10115
10116
10117/*
10118 * call-seq:
10119 * chop -> new_string
10120 *
10121 * :include: doc/string/chop.rdoc
10122 *
10123 */
10124
10125static VALUE
10126rb_str_chop(VALUE str)
10127{
10128 return rb_str_subseq(str, 0, chopped_length(str));
10129}
10130
10131static long
10132smart_chomp(VALUE str, const char *e, const char *p)
10133{
10134 rb_encoding *enc = rb_enc_get(str);
10135 if (rb_enc_mbminlen(enc) > 1) {
10136 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10137 if (rb_enc_is_newline(pp, e, enc)) {
10138 e = pp;
10139 }
10140 pp = e - rb_enc_mbminlen(enc);
10141 if (pp >= p) {
10142 pp = rb_enc_left_char_head(p, pp, e, enc);
10143 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10144 e = pp;
10145 }
10146 }
10147 }
10148 else {
10149 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10150 case '\n':
10151 if (--e > p && *(e-1) == '\r') {
10152 --e;
10153 }
10154 break;
10155 case '\r':
10156 --e;
10157 break;
10158 }
10159 }
10160 return e - p;
10161}
10162
10163static long
10164chompped_length(VALUE str, VALUE rs)
10165{
10166 rb_encoding *enc;
10167 int newline;
10168 char *pp, *e, *rsptr;
10169 long rslen;
10170 char *const p = RSTRING_PTR(str);
10171 long len = RSTRING_LEN(str);
10172
10173 if (len == 0) return 0;
10174 e = p + len;
10175 if (rs == rb_default_rs) {
10176 return smart_chomp(str, e, p);
10177 }
10178
10179 enc = rb_enc_get(str);
10180 RSTRING_GETMEM(rs, rsptr, rslen);
10181 if (rslen == 0) {
10182 if (rb_enc_mbminlen(enc) > 1) {
10183 while (e > p) {
10184 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10185 if (!rb_enc_is_newline(pp, e, enc)) break;
10186 e = pp;
10187 pp -= rb_enc_mbminlen(enc);
10188 if (pp >= p) {
10189 pp = rb_enc_left_char_head(p, pp, e, enc);
10190 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10191 e = pp;
10192 }
10193 }
10194 }
10195 }
10196 else {
10197 while (e > p && *(e-1) == '\n') {
10198 --e;
10199 if (e > p && *(e-1) == '\r')
10200 --e;
10201 }
10202 }
10203 return e - p;
10204 }
10205 if (rslen > len) return len;
10206
10207 enc = rb_enc_get(rs);
10208 newline = rsptr[rslen-1];
10209 if (rslen == rb_enc_mbminlen(enc)) {
10210 if (rslen == 1) {
10211 if (newline == '\n')
10212 return smart_chomp(str, e, p);
10213 }
10214 else {
10215 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10216 return smart_chomp(str, e, p);
10217 }
10218 }
10219
10220 enc = rb_enc_check(str, rs);
10221 if (is_broken_string(rs)) {
10222 return len;
10223 }
10224 pp = e - rslen;
10225 if (p[len-1] == newline &&
10226 (rslen <= 1 ||
10227 memcmp(rsptr, pp, rslen) == 0)) {
10228 if (at_char_boundary(p, pp, e, enc))
10229 return len - rslen;
10230 RB_GC_GUARD(rs);
10231 }
10232 return len;
10233}
10234
10240static VALUE
10241chomp_rs(int argc, const VALUE *argv)
10242{
10243 rb_check_arity(argc, 0, 1);
10244 if (argc > 0) {
10245 VALUE rs = argv[0];
10246 if (!NIL_P(rs)) StringValue(rs);
10247 return rs;
10248 }
10249 else {
10250 return rb_rs;
10251 }
10252}
10253
10254VALUE
10255rb_str_chomp_string(VALUE str, VALUE rs)
10256{
10257 long olen = RSTRING_LEN(str);
10258 long len = chompped_length(str, rs);
10259 if (len >= olen) return Qnil;
10260 str_modify_keep_cr(str);
10261 STR_SET_LEN(str, len);
10262 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10263 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10265 }
10266 return str;
10267}
10268
10269/*
10270 * call-seq:
10271 * chomp!(line_sep = $/) -> self or nil
10272 *
10273 * Like String#chomp, except that:
10274 *
10275 * - Removes trailing characters from +self+ (not from a copy of +self+).
10276 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10277 *
10278 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10279 */
10280
10281static VALUE
10282rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10283{
10284 VALUE rs;
10285 str_modifiable(str);
10286 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10287 rs = chomp_rs(argc, argv);
10288 if (NIL_P(rs)) return Qnil;
10289 return rb_str_chomp_string(str, rs);
10290}
10291
10292
10293/*
10294 * call-seq:
10295 * chomp(line_sep = $/) -> new_string
10296 *
10297 * :include: doc/string/chomp.rdoc
10298 *
10299 */
10300
10301static VALUE
10302rb_str_chomp(int argc, VALUE *argv, VALUE str)
10303{
10304 VALUE rs = chomp_rs(argc, argv);
10305 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10306 return rb_str_subseq(str, 0, chompped_length(str, rs));
10307}
10308
10309static long
10310lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10311{
10312 const char *const start = s;
10313
10314 if (!s || s >= e) return 0;
10315
10316 /* remove spaces at head */
10317 if (single_byte_optimizable(str)) {
10318 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10319 }
10320 else {
10321 while (s < e) {
10322 int n;
10323 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10324
10325 if (cc && !rb_isspace(cc)) break;
10326 s += n;
10327 }
10328 }
10329 return s - start;
10330}
10331
10332/*
10333 * call-seq:
10334 * lstrip! -> self or nil
10335 *
10336 * Like String#lstrip, except that any modifications are made in +self+;
10337 * returns +self+ if any modifications are made, +nil+ otherwise.
10338 *
10339 * Related: String#rstrip!, String#strip!.
10340 */
10341
10342static VALUE
10343rb_str_lstrip_bang(VALUE str)
10344{
10345 rb_encoding *enc;
10346 char *start, *s;
10347 long olen, loffset;
10348
10349 str_modify_keep_cr(str);
10350 enc = STR_ENC_GET(str);
10351 RSTRING_GETMEM(str, start, olen);
10352 loffset = lstrip_offset(str, start, start+olen, enc);
10353 if (loffset > 0) {
10354 long len = olen-loffset;
10355 s = start + loffset;
10356 memmove(start, s, len);
10357 STR_SET_LEN(str, len);
10358 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10359 return str;
10360 }
10361 return Qnil;
10362}
10363
10364
10365/*
10366 * call-seq:
10367 * lstrip -> new_string
10368 *
10369 * Returns a copy of +self+ with leading whitespace removed;
10370 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10371 *
10372 * whitespace = "\x00\t\n\v\f\r "
10373 * s = whitespace + 'abc' + whitespace
10374 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10375 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10376 *
10377 * Related: String#rstrip, String#strip.
10378 */
10379
10380static VALUE
10381rb_str_lstrip(VALUE str)
10382{
10383 char *start;
10384 long len, loffset;
10385 RSTRING_GETMEM(str, start, len);
10386 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10387 if (loffset <= 0) return str_duplicate(rb_cString, str);
10388 return rb_str_subseq(str, loffset, len - loffset);
10389}
10390
10391static long
10392rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10393{
10394 const char *t;
10395
10396 rb_str_check_dummy_enc(enc);
10398 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10399 }
10400 if (!s || s >= e) return 0;
10401 t = e;
10402
10403 /* remove trailing spaces or '\0's */
10404 if (single_byte_optimizable(str)) {
10405 unsigned char c;
10406 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10407 }
10408 else {
10409 char *tp;
10410
10411 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10412 unsigned int c = rb_enc_codepoint(tp, e, enc);
10413 if (c && !rb_isspace(c)) break;
10414 t = tp;
10415 }
10416 }
10417 return e - t;
10418}
10419
10420/*
10421 * call-seq:
10422 * rstrip! -> self or nil
10423 *
10424 * Like String#rstrip, except that any modifications are made in +self+;
10425 * returns +self+ if any modifications are made, +nil+ otherwise.
10426 *
10427 * Related: String#lstrip!, String#strip!.
10428 */
10429
10430static VALUE
10431rb_str_rstrip_bang(VALUE str)
10432{
10433 rb_encoding *enc;
10434 char *start;
10435 long olen, roffset;
10436
10437 str_modify_keep_cr(str);
10438 enc = STR_ENC_GET(str);
10439 RSTRING_GETMEM(str, start, olen);
10440 roffset = rstrip_offset(str, start, start+olen, enc);
10441 if (roffset > 0) {
10442 long len = olen - roffset;
10443
10444 STR_SET_LEN(str, len);
10445 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10446 return str;
10447 }
10448 return Qnil;
10449}
10450
10451
10452/*
10453 * call-seq:
10454 * rstrip -> new_string
10455 *
10456 * Returns a copy of the receiver with trailing whitespace removed;
10457 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10458 *
10459 * whitespace = "\x00\t\n\v\f\r "
10460 * s = whitespace + 'abc' + whitespace
10461 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10462 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10463 *
10464 * Related: String#lstrip, String#strip.
10465 */
10466
10467static VALUE
10468rb_str_rstrip(VALUE str)
10469{
10470 rb_encoding *enc;
10471 char *start;
10472 long olen, roffset;
10473
10474 enc = STR_ENC_GET(str);
10475 RSTRING_GETMEM(str, start, olen);
10476 roffset = rstrip_offset(str, start, start+olen, enc);
10477
10478 if (roffset <= 0) return str_duplicate(rb_cString, str);
10479 return rb_str_subseq(str, 0, olen-roffset);
10480}
10481
10482
10483/*
10484 * call-seq:
10485 * strip! -> self or nil
10486 *
10487 * Like String#strip, except that any modifications are made in +self+;
10488 * returns +self+ if any modifications are made, +nil+ otherwise.
10489 *
10490 * Related: String#lstrip!, String#rstrip!.
10491 */
10492
10493static VALUE
10494rb_str_strip_bang(VALUE str)
10495{
10496 char *start;
10497 long olen, loffset, roffset;
10498 rb_encoding *enc;
10499
10500 str_modify_keep_cr(str);
10501 enc = STR_ENC_GET(str);
10502 RSTRING_GETMEM(str, start, olen);
10503 loffset = lstrip_offset(str, start, start+olen, enc);
10504 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10505
10506 if (loffset > 0 || roffset > 0) {
10507 long len = olen-roffset;
10508 if (loffset > 0) {
10509 len -= loffset;
10510 memmove(start, start + loffset, len);
10511 }
10512 STR_SET_LEN(str, len);
10513 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10514 return str;
10515 }
10516 return Qnil;
10517}
10518
10519
10520/*
10521 * call-seq:
10522 * strip -> new_string
10523 *
10524 * Returns a copy of the receiver with leading and trailing whitespace removed;
10525 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10526 *
10527 * whitespace = "\x00\t\n\v\f\r "
10528 * s = whitespace + 'abc' + whitespace
10529 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10530 * s.strip # => "abc"
10531 *
10532 * Related: String#lstrip, String#rstrip.
10533 */
10534
10535static VALUE
10536rb_str_strip(VALUE str)
10537{
10538 char *start;
10539 long olen, loffset, roffset;
10540 rb_encoding *enc = STR_ENC_GET(str);
10541
10542 RSTRING_GETMEM(str, start, olen);
10543 loffset = lstrip_offset(str, start, start+olen, enc);
10544 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10545
10546 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10547 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10548}
10549
10550static VALUE
10551scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10552{
10553 VALUE result = Qnil;
10554 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10555 if (pos >= 0) {
10556 VALUE match;
10557 struct re_registers *regs;
10558 if (BUILTIN_TYPE(pat) == T_STRING) {
10559 regs = NULL;
10560 end = pos + RSTRING_LEN(pat);
10561 }
10562 else {
10563 match = rb_backref_get();
10564 regs = RMATCH_REGS(match);
10565 pos = BEG(0);
10566 end = END(0);
10567 }
10568
10569 if (pos == end) {
10570 rb_encoding *enc = STR_ENC_GET(str);
10571 /*
10572 * Always consume at least one character of the input string
10573 */
10574 if (RSTRING_LEN(str) > end)
10575 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10576 RSTRING_END(str), enc);
10577 else
10578 *start = end + 1;
10579 }
10580 else {
10581 *start = end;
10582 }
10583
10584 if (!regs || regs->num_regs == 1) {
10585 result = rb_str_subseq(str, pos, end - pos);
10586 return result;
10587 }
10588 else {
10589 result = rb_ary_new2(regs->num_regs);
10590 for (int i = 1; i < regs->num_regs; i++) {
10591 VALUE s = Qnil;
10592 if (BEG(i) >= 0) {
10593 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10594 }
10595
10596 rb_ary_push(result, s);
10597 }
10598 }
10599
10600 RB_GC_GUARD(match);
10601 }
10602
10603 return result;
10604}
10605
10606
10607/*
10608 * call-seq:
10609 * scan(string_or_regexp) -> array
10610 * scan(string_or_regexp) {|matches| ... } -> self
10611 *
10612 * Matches a pattern against +self+; the pattern is:
10613 *
10614 * - +string_or_regexp+ itself, if it is a Regexp.
10615 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10616 *
10617 * Iterates through +self+, generating a collection of matching results:
10618 *
10619 * - If the pattern contains no groups, each result is the
10620 * matched string, <code>$&</code>.
10621 * - If the pattern contains groups, each result is an array
10622 * containing one entry per group.
10623 *
10624 * With no block given, returns an array of the results:
10625 *
10626 * s = 'cruel world'
10627 * s.scan(/\w+/) # => ["cruel", "world"]
10628 * s.scan(/.../) # => ["cru", "el ", "wor"]
10629 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10630 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10631 *
10632 * With a block given, calls the block with each result; returns +self+:
10633 *
10634 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10635 * print "\n"
10636 * s.scan(/(.)(.)/) {|x,y| print y, x }
10637 * print "\n"
10638 *
10639 * Output:
10640 *
10641 * <<cruel>> <<world>>
10642 * rceu lowlr
10643 *
10644 */
10645
10646static VALUE
10647rb_str_scan(VALUE str, VALUE pat)
10648{
10649 VALUE result;
10650 long start = 0;
10651 long last = -1, prev = 0;
10652 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10653
10654 pat = get_pat_quoted(pat, 1);
10655 mustnot_broken(str);
10656 if (!rb_block_given_p()) {
10657 VALUE ary = rb_ary_new();
10658
10659 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10660 last = prev;
10661 prev = start;
10662 rb_ary_push(ary, result);
10663 }
10664 if (last >= 0) rb_pat_search(pat, str, last, 1);
10665 else rb_backref_set(Qnil);
10666 return ary;
10667 }
10668
10669 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10670 last = prev;
10671 prev = start;
10672 rb_yield(result);
10673 str_mod_check(str, p, len);
10674 }
10675 if (last >= 0) rb_pat_search(pat, str, last, 1);
10676 return str;
10677}
10678
10679
10680/*
10681 * call-seq:
10682 * hex -> integer
10683 *
10684 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10685 * (with an optional sign and an optional <code>0x</code>) and returns the
10686 * corresponding number;
10687 * returns zero if there is no such leading substring:
10688 *
10689 * '0x0a'.hex # => 10
10690 * '-1234'.hex # => -4660
10691 * '0'.hex # => 0
10692 * 'non-numeric'.hex # => 0
10693 *
10694 * Related: String#oct.
10695 *
10696 */
10697
10698static VALUE
10699rb_str_hex(VALUE str)
10700{
10701 return rb_str_to_inum(str, 16, FALSE);
10702}
10703
10704
10705/*
10706 * call-seq:
10707 * oct -> integer
10708 *
10709 * Interprets the leading substring of +self+ as a string of octal digits
10710 * (with an optional sign) and returns the corresponding number;
10711 * returns zero if there is no such leading substring:
10712 *
10713 * '123'.oct # => 83
10714 * '-377'.oct # => -255
10715 * '0377non-numeric'.oct # => 255
10716 * 'non-numeric'.oct # => 0
10717 *
10718 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10719 * see Kernel#Integer.
10720 *
10721 * Related: String#hex.
10722 *
10723 */
10724
10725static VALUE
10726rb_str_oct(VALUE str)
10727{
10728 return rb_str_to_inum(str, -8, FALSE);
10729}
10730
10731#ifndef HAVE_CRYPT_R
10732# include "ruby/thread_native.h"
10733# include "ruby/atomic.h"
10734
10735static struct {
10736 rb_nativethread_lock_t lock;
10737} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10738#endif
10739
10740/*
10741 * call-seq:
10742 * crypt(salt_str) -> new_string
10743 *
10744 * Returns the string generated by calling <code>crypt(3)</code>
10745 * standard library function with <code>str</code> and
10746 * <code>salt_str</code>, in this order, as its arguments. Please do
10747 * not use this method any longer. It is legacy; provided only for
10748 * backward compatibility with ruby scripts in earlier days. It is
10749 * bad to use in contemporary programs for several reasons:
10750 *
10751 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10752 * run. The generated string lacks data portability.
10753 *
10754 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10755 * (i.e. silently ends up in unexpected results).
10756 *
10757 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10758 * thread safe.
10759 *
10760 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10761 * very very weak. According to its manpage, Linux's traditional
10762 * <code>crypt(3)</code> output has only 2**56 variations; too
10763 * easy to brute force today. And this is the default behaviour.
10764 *
10765 * * In order to make things robust some OSes implement so-called
10766 * "modular" usage. To go through, you have to do a complex
10767 * build-up of the <code>salt_str</code> parameter, by hand.
10768 * Failure in generation of a proper salt string tends not to
10769 * yield any errors; typos in parameters are normally not
10770 * detectable.
10771 *
10772 * * For instance, in the following example, the second invocation
10773 * of String#crypt is wrong; it has a typo in "round=" (lacks
10774 * "s"). However the call does not fail and something unexpected
10775 * is generated.
10776 *
10777 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10778 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10779 *
10780 * * Even in the "modular" mode, some hash functions are considered
10781 * archaic and no longer recommended at all; for instance module
10782 * <code>$1$</code> is officially abandoned by its author: see
10783 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10784 * instance module <code>$3$</code> is considered completely
10785 * broken: see the manpage of FreeBSD.
10786 *
10787 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10788 * written above, <code>crypt(3)</code> on Mac OS never fails.
10789 * This means even if you build up a proper salt string it
10790 * generates a traditional DES hash anyways, and there is no way
10791 * for you to be aware of.
10792 *
10793 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10794 *
10795 * If for some reason you cannot migrate to other secure contemporary
10796 * password hashing algorithms, install the string-crypt gem and
10797 * <code>require 'string/crypt'</code> to continue using it.
10798 */
10799
10800static VALUE
10801rb_str_crypt(VALUE str, VALUE salt)
10802{
10803#ifdef HAVE_CRYPT_R
10804 VALUE databuf;
10805 struct crypt_data *data;
10806# define CRYPT_END() ALLOCV_END(databuf)
10807#else
10808 char *tmp_buf;
10809 extern char *crypt(const char *, const char *);
10810# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10811#endif
10812 VALUE result;
10813 const char *s, *saltp;
10814 char *res;
10815#ifdef BROKEN_CRYPT
10816 char salt_8bit_clean[3];
10817#endif
10818
10819 StringValue(salt);
10820 mustnot_wchar(str);
10821 mustnot_wchar(salt);
10822 s = StringValueCStr(str);
10823 saltp = RSTRING_PTR(salt);
10824 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10825 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10826 }
10827
10828#ifdef BROKEN_CRYPT
10829 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10830 salt_8bit_clean[0] = saltp[0] & 0x7f;
10831 salt_8bit_clean[1] = saltp[1] & 0x7f;
10832 salt_8bit_clean[2] = '\0';
10833 saltp = salt_8bit_clean;
10834 }
10835#endif
10836#ifdef HAVE_CRYPT_R
10837 data = ALLOCV(databuf, sizeof(struct crypt_data));
10838# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10839 data->initialized = 0;
10840# endif
10841 res = crypt_r(s, saltp, data);
10842#else
10843 rb_nativethread_lock_lock(&crypt_mutex.lock);
10844 res = crypt(s, saltp);
10845#endif
10846 if (!res) {
10847 int err = errno;
10848 CRYPT_END();
10849 rb_syserr_fail(err, "crypt");
10850 }
10851#ifdef HAVE_CRYPT_R
10852 result = rb_str_new_cstr(res);
10853 CRYPT_END();
10854#else
10855 // We need to copy this buffer because it's static and we need to unlock the mutex
10856 // before allocating a new object (the string to be returned). If we allocate while
10857 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10858 // if other ractors are waiting on this lock.
10859 size_t res_size = strlen(res)+1;
10860 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10861 memcpy(tmp_buf, res, res_size);
10862 res = tmp_buf;
10863 CRYPT_END();
10864 result = rb_str_new_cstr(res);
10865#endif
10866 return result;
10867}
10868
10869
10870/*
10871 * call-seq:
10872 * ord -> integer
10873 *
10874 * :include: doc/string/ord.rdoc
10875 *
10876 */
10877
10878static VALUE
10879rb_str_ord(VALUE s)
10880{
10881 unsigned int c;
10882
10883 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10884 return UINT2NUM(c);
10885}
10886/*
10887 * call-seq:
10888 * sum(n = 16) -> integer
10889 *
10890 * :include: doc/string/sum.rdoc
10891 *
10892 */
10893
10894static VALUE
10895rb_str_sum(int argc, VALUE *argv, VALUE str)
10896{
10897 int bits = 16;
10898 char *ptr, *p, *pend;
10899 long len;
10900 VALUE sum = INT2FIX(0);
10901 unsigned long sum0 = 0;
10902
10903 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10904 bits = 0;
10905 }
10906 ptr = p = RSTRING_PTR(str);
10907 len = RSTRING_LEN(str);
10908 pend = p + len;
10909
10910 while (p < pend) {
10911 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10912 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10913 str_mod_check(str, ptr, len);
10914 sum0 = 0;
10915 }
10916 sum0 += (unsigned char)*p;
10917 p++;
10918 }
10919
10920 if (bits == 0) {
10921 if (sum0) {
10922 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10923 }
10924 }
10925 else {
10926 if (sum == INT2FIX(0)) {
10927 if (bits < (int)sizeof(long)*CHAR_BIT) {
10928 sum0 &= (((unsigned long)1)<<bits)-1;
10929 }
10930 sum = LONG2FIX(sum0);
10931 }
10932 else {
10933 VALUE mod;
10934
10935 if (sum0) {
10936 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10937 }
10938
10939 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10940 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10941 sum = rb_funcall(sum, '&', 1, mod);
10942 }
10943 }
10944 return sum;
10945}
10946
10947static VALUE
10948rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10949{
10950 rb_encoding *enc;
10951 VALUE w;
10952 long width, len, flen = 1, fclen = 1;
10953 VALUE res;
10954 char *p;
10955 const char *f = " ";
10956 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10957 VALUE pad;
10958 int singlebyte = 1, cr;
10959 int termlen;
10960
10961 rb_scan_args(argc, argv, "11", &w, &pad);
10962 enc = STR_ENC_GET(str);
10963 termlen = rb_enc_mbminlen(enc);
10964 width = NUM2LONG(w);
10965 if (argc == 2) {
10966 StringValue(pad);
10967 enc = rb_enc_check(str, pad);
10968 f = RSTRING_PTR(pad);
10969 flen = RSTRING_LEN(pad);
10970 fclen = str_strlen(pad, enc); /* rb_enc_check */
10971 singlebyte = single_byte_optimizable(pad);
10972 if (flen == 0 || fclen == 0) {
10973 rb_raise(rb_eArgError, "zero width padding");
10974 }
10975 }
10976 len = str_strlen(str, enc); /* rb_enc_check */
10977 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10978 n = width - len;
10979 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10980 rlen = n - llen;
10981 cr = ENC_CODERANGE(str);
10982 if (flen > 1) {
10983 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10984 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10985 }
10986 size = RSTRING_LEN(str);
10987 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10988 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10989 (len += llen2 + rlen2) >= LONG_MAX - size) {
10990 rb_raise(rb_eArgError, "argument too big");
10991 }
10992 len += size;
10993 res = str_enc_new(rb_cString, 0, len, enc);
10994 p = RSTRING_PTR(res);
10995 if (flen <= 1) {
10996 memset(p, *f, llen);
10997 p += llen;
10998 }
10999 else {
11000 while (llen >= fclen) {
11001 memcpy(p,f,flen);
11002 p += flen;
11003 llen -= fclen;
11004 }
11005 if (llen > 0) {
11006 memcpy(p, f, llen2);
11007 p += llen2;
11008 }
11009 }
11010 memcpy(p, RSTRING_PTR(str), size);
11011 p += size;
11012 if (flen <= 1) {
11013 memset(p, *f, rlen);
11014 p += rlen;
11015 }
11016 else {
11017 while (rlen >= fclen) {
11018 memcpy(p,f,flen);
11019 p += flen;
11020 rlen -= fclen;
11021 }
11022 if (rlen > 0) {
11023 memcpy(p, f, rlen2);
11024 p += rlen2;
11025 }
11026 }
11027 TERM_FILL(p, termlen);
11028 STR_SET_LEN(res, p-RSTRING_PTR(res));
11029
11030 if (argc == 2)
11031 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11032 if (cr != ENC_CODERANGE_BROKEN)
11033 ENC_CODERANGE_SET(res, cr);
11034
11035 RB_GC_GUARD(pad);
11036 return res;
11037}
11038
11039
11040/*
11041 * call-seq:
11042 * ljust(size, pad_string = ' ') -> new_string
11043 *
11044 * :include: doc/string/ljust.rdoc
11045 *
11046 * Related: String#rjust, String#center.
11047 *
11048 */
11049
11050static VALUE
11051rb_str_ljust(int argc, VALUE *argv, VALUE str)
11052{
11053 return rb_str_justify(argc, argv, str, 'l');
11054}
11055
11056/*
11057 * call-seq:
11058 * rjust(size, pad_string = ' ') -> new_string
11059 *
11060 * :include: doc/string/rjust.rdoc
11061 *
11062 * Related: String#ljust, String#center.
11063 *
11064 */
11065
11066static VALUE
11067rb_str_rjust(int argc, VALUE *argv, VALUE str)
11068{
11069 return rb_str_justify(argc, argv, str, 'r');
11070}
11071
11072
11073/*
11074 * call-seq:
11075 * center(size, pad_string = ' ') -> new_string
11076 *
11077 * :include: doc/string/center.rdoc
11078 *
11079 */
11080
11081static VALUE
11082rb_str_center(int argc, VALUE *argv, VALUE str)
11083{
11084 return rb_str_justify(argc, argv, str, 'c');
11085}
11086
11087/*
11088 * call-seq:
11089 * partition(string_or_regexp) -> [head, match, tail]
11090 *
11091 * :include: doc/string/partition.rdoc
11092 *
11093 */
11094
11095static VALUE
11096rb_str_partition(VALUE str, VALUE sep)
11097{
11098 long pos;
11099
11100 sep = get_pat_quoted(sep, 0);
11101 if (RB_TYPE_P(sep, T_REGEXP)) {
11102 if (rb_reg_search(sep, str, 0, 0) < 0) {
11103 goto failed;
11104 }
11105 VALUE match = rb_backref_get();
11106 struct re_registers *regs = RMATCH_REGS(match);
11107
11108 pos = BEG(0);
11109 sep = rb_str_subseq(str, pos, END(0) - pos);
11110 }
11111 else {
11112 pos = rb_str_index(str, sep, 0);
11113 if (pos < 0) goto failed;
11114 }
11115 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11116 sep,
11117 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11118 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11119
11120 failed:
11121 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11122}
11123
11124/*
11125 * call-seq:
11126 * rpartition(sep) -> [head, match, tail]
11127 *
11128 * :include: doc/string/rpartition.rdoc
11129 *
11130 */
11131
11132static VALUE
11133rb_str_rpartition(VALUE str, VALUE sep)
11134{
11135 long pos = RSTRING_LEN(str);
11136
11137 sep = get_pat_quoted(sep, 0);
11138 if (RB_TYPE_P(sep, T_REGEXP)) {
11139 if (rb_reg_search(sep, str, pos, 1) < 0) {
11140 goto failed;
11141 }
11142 VALUE match = rb_backref_get();
11143 struct re_registers *regs = RMATCH_REGS(match);
11144
11145 pos = BEG(0);
11146 sep = rb_str_subseq(str, pos, END(0) - pos);
11147 }
11148 else {
11149 pos = rb_str_sublen(str, pos);
11150 pos = rb_str_rindex(str, sep, pos);
11151 if (pos < 0) {
11152 goto failed;
11153 }
11154 }
11155
11156 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11157 sep,
11158 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11159 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11160 failed:
11161 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11162}
11163
11164/*
11165 * call-seq:
11166 * start_with?(*string_or_regexp) -> true or false
11167 *
11168 * :include: doc/string/start_with_p.rdoc
11169 *
11170 */
11171
11172static VALUE
11173rb_str_start_with(int argc, VALUE *argv, VALUE str)
11174{
11175 int i;
11176
11177 for (i=0; i<argc; i++) {
11178 VALUE tmp = argv[i];
11179 if (RB_TYPE_P(tmp, T_REGEXP)) {
11180 if (rb_reg_start_with_p(tmp, str))
11181 return Qtrue;
11182 }
11183 else {
11184 const char *p, *s, *e;
11185 long slen, tlen;
11186 rb_encoding *enc;
11187
11188 StringValue(tmp);
11189 enc = rb_enc_check(str, tmp);
11190 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11191 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11192 p = RSTRING_PTR(str);
11193 e = p + slen;
11194 s = p + tlen;
11195 if (!at_char_right_boundary(p, s, e, enc))
11196 continue;
11197 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11198 return Qtrue;
11199 }
11200 }
11201 return Qfalse;
11202}
11203
11204/*
11205 * call-seq:
11206 * end_with?(*strings) -> true or false
11207 *
11208 * :include: doc/string/end_with_p.rdoc
11209 *
11210 */
11211
11212static VALUE
11213rb_str_end_with(int argc, VALUE *argv, VALUE str)
11214{
11215 int i;
11216
11217 for (i=0; i<argc; i++) {
11218 VALUE tmp = argv[i];
11219 const char *p, *s, *e;
11220 long slen, tlen;
11221 rb_encoding *enc;
11222
11223 StringValue(tmp);
11224 enc = rb_enc_check(str, tmp);
11225 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11226 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11227 p = RSTRING_PTR(str);
11228 e = p + slen;
11229 s = e - tlen;
11230 if (!at_char_boundary(p, s, e, enc))
11231 continue;
11232 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11233 return Qtrue;
11234 }
11235 return Qfalse;
11236}
11237
11247static long
11248deleted_prefix_length(VALUE str, VALUE prefix)
11249{
11250 const char *strptr, *prefixptr;
11251 long olen, prefixlen;
11252 rb_encoding *enc = rb_enc_get(str);
11253
11254 StringValue(prefix);
11255
11256 if (!is_broken_string(prefix) ||
11257 !rb_enc_asciicompat(enc) ||
11258 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11259 enc = rb_enc_check(str, prefix);
11260 }
11261
11262 /* return 0 if not start with prefix */
11263 prefixlen = RSTRING_LEN(prefix);
11264 if (prefixlen <= 0) return 0;
11265 olen = RSTRING_LEN(str);
11266 if (olen < prefixlen) return 0;
11267 strptr = RSTRING_PTR(str);
11268 prefixptr = RSTRING_PTR(prefix);
11269 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11270 if (is_broken_string(prefix)) {
11271 if (!is_broken_string(str)) {
11272 /* prefix in a valid string cannot be broken */
11273 return 0;
11274 }
11275 const char *strend = strptr + olen;
11276 const char *after_prefix = strptr + prefixlen;
11277 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11278 /* prefix does not end at char-boundary */
11279 return 0;
11280 }
11281 }
11282 /* prefix part in `str` also should be valid. */
11283
11284 return prefixlen;
11285}
11286
11287/*
11288 * call-seq:
11289 * delete_prefix!(prefix) -> self or nil
11290 *
11291 * Like String#delete_prefix, except that +self+ is modified in place;
11292 * returns +self+ if the prefix is removed, +nil+ otherwise.
11293 *
11294 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11295 */
11296
11297static VALUE
11298rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11299{
11300 long prefixlen;
11301 str_modify_keep_cr(str);
11302
11303 prefixlen = deleted_prefix_length(str, prefix);
11304 if (prefixlen <= 0) return Qnil;
11305
11306 return rb_str_drop_bytes(str, prefixlen);
11307}
11308
11309/*
11310 * call-seq:
11311 * delete_prefix(prefix) -> new_string
11312 *
11313 * :include: doc/string/delete_prefix.rdoc
11314 *
11315 */
11316
11317static VALUE
11318rb_str_delete_prefix(VALUE str, VALUE prefix)
11319{
11320 long prefixlen;
11321
11322 prefixlen = deleted_prefix_length(str, prefix);
11323 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11324
11325 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11326}
11327
11337static long
11338deleted_suffix_length(VALUE str, VALUE suffix)
11339{
11340 const char *strptr, *suffixptr;
11341 long olen, suffixlen;
11342 rb_encoding *enc;
11343
11344 StringValue(suffix);
11345 if (is_broken_string(suffix)) return 0;
11346 enc = rb_enc_check(str, suffix);
11347
11348 /* return 0 if not start with suffix */
11349 suffixlen = RSTRING_LEN(suffix);
11350 if (suffixlen <= 0) return 0;
11351 olen = RSTRING_LEN(str);
11352 if (olen < suffixlen) return 0;
11353 strptr = RSTRING_PTR(str);
11354 suffixptr = RSTRING_PTR(suffix);
11355 const char *strend = strptr + olen;
11356 const char *before_suffix = strend - suffixlen;
11357 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11358 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11359
11360 return suffixlen;
11361}
11362
11363/*
11364 * call-seq:
11365 * delete_suffix!(suffix) -> self or nil
11366 *
11367 * Like String#delete_suffix, except that +self+ is modified in place;
11368 * returns +self+ if the suffix is removed, +nil+ otherwise.
11369 *
11370 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11371 */
11372
11373static VALUE
11374rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11375{
11376 long olen, suffixlen, len;
11377 str_modifiable(str);
11378
11379 suffixlen = deleted_suffix_length(str, suffix);
11380 if (suffixlen <= 0) return Qnil;
11381
11382 olen = RSTRING_LEN(str);
11383 str_modify_keep_cr(str);
11384 len = olen - suffixlen;
11385 STR_SET_LEN(str, len);
11386 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11387 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11389 }
11390 return str;
11391}
11392
11393/*
11394 * call-seq:
11395 * delete_suffix(suffix) -> new_string
11396 *
11397 * :include: doc/string/delete_suffix.rdoc
11398 *
11399 */
11400
11401static VALUE
11402rb_str_delete_suffix(VALUE str, VALUE suffix)
11403{
11404 long suffixlen;
11405
11406 suffixlen = deleted_suffix_length(str, suffix);
11407 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11408
11409 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11410}
11411
11412void
11413rb_str_setter(VALUE val, ID id, VALUE *var)
11414{
11415 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11416 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11417 }
11418 *var = val;
11419}
11420
11421static void
11422rb_fs_setter(VALUE val, ID id, VALUE *var)
11423{
11424 val = rb_fs_check(val);
11425 if (!val) {
11426 rb_raise(rb_eTypeError,
11427 "value of %"PRIsVALUE" must be String or Regexp",
11428 rb_id2str(id));
11429 }
11430 if (!NIL_P(val)) {
11431 rb_warn_deprecated("'$;'", NULL);
11432 }
11433 *var = val;
11434}
11435
11436
11437/*
11438 * call-seq:
11439 * force_encoding(encoding) -> self
11440 *
11441 * :include: doc/string/force_encoding.rdoc
11442 *
11443 */
11444
11445static VALUE
11446rb_str_force_encoding(VALUE str, VALUE enc)
11447{
11448 str_modifiable(str);
11449
11450 rb_encoding *encoding = rb_to_encoding(enc);
11451 int idx = rb_enc_to_index(encoding);
11452
11453 // If the encoding is unchanged, we do nothing.
11454 if (ENCODING_GET(str) == idx) {
11455 return str;
11456 }
11457
11458 rb_enc_associate_index(str, idx);
11459
11460 // If the coderange was 7bit and the new encoding is ASCII-compatible
11461 // we can keep the coderange.
11462 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11463 return str;
11464 }
11465
11467 return str;
11468}
11469
11470/*
11471 * call-seq:
11472 * b -> new_string
11473 *
11474 * :include: doc/string/b.rdoc
11475 *
11476 */
11477
11478static VALUE
11479rb_str_b(VALUE str)
11480{
11481 VALUE str2;
11482 if (STR_EMBED_P(str)) {
11483 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11484 }
11485 else {
11486 str2 = str_alloc_heap(rb_cString);
11487 }
11488 str_replace_shared_without_enc(str2, str);
11489
11490 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11491 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11492 // If we know the receiver's code range then we know the result's code range.
11493 int cr = ENC_CODERANGE(str);
11494 switch (cr) {
11495 case ENC_CODERANGE_7BIT:
11497 break;
11501 break;
11502 default:
11503 ENC_CODERANGE_CLEAR(str2);
11504 break;
11505 }
11506 }
11507
11508 return str2;
11509}
11510
11511/*
11512 * call-seq:
11513 * valid_encoding? -> true or false
11514 *
11515 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11516 *
11517 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11518 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11519 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11520 */
11521
11522static VALUE
11523rb_str_valid_encoding_p(VALUE str)
11524{
11525 int cr = rb_enc_str_coderange(str);
11526
11527 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11528}
11529
11530/*
11531 * call-seq:
11532 * ascii_only? -> true or false
11533 *
11534 * Returns whether +self+ contains only ASCII characters:
11535 *
11536 * 'abc'.ascii_only? # => true
11537 * "abc\u{6666}".ascii_only? # => false
11538 *
11539 * Related: see {Querying}[rdoc-ref:String@Querying].
11540 */
11541
11542static VALUE
11543rb_str_is_ascii_only_p(VALUE str)
11544{
11545 int cr = rb_enc_str_coderange(str);
11546
11547 return RBOOL(cr == ENC_CODERANGE_7BIT);
11548}
11549
11550VALUE
11552{
11553 static const char ellipsis[] = "...";
11554 const long ellipsislen = sizeof(ellipsis) - 1;
11555 rb_encoding *const enc = rb_enc_get(str);
11556 const long blen = RSTRING_LEN(str);
11557 const char *const p = RSTRING_PTR(str), *e = p + blen;
11558 VALUE estr, ret = 0;
11559
11560 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11561 if (len * rb_enc_mbminlen(enc) >= blen ||
11562 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11563 ret = str;
11564 }
11565 else if (len <= ellipsislen ||
11566 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11567 if (rb_enc_asciicompat(enc)) {
11568 ret = rb_str_new(ellipsis, len);
11569 rb_enc_associate(ret, enc);
11570 }
11571 else {
11572 estr = rb_usascii_str_new(ellipsis, len);
11573 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11574 }
11575 }
11576 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11577 rb_str_cat(ret, ellipsis, ellipsislen);
11578 }
11579 else {
11580 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11581 rb_enc_from_encoding(enc), 0, Qnil);
11582 rb_str_append(ret, estr);
11583 }
11584 return ret;
11585}
11586
11587static VALUE
11588str_compat_and_valid(VALUE str, rb_encoding *enc)
11589{
11590 int cr;
11591 str = StringValue(str);
11592 cr = rb_enc_str_coderange(str);
11593 if (cr == ENC_CODERANGE_BROKEN) {
11594 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11595 }
11596 else {
11597 rb_encoding *e = STR_ENC_GET(str);
11598 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11599 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11600 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11601 }
11602 }
11603 return str;
11604}
11605
11606static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11607
11608VALUE
11610{
11611 rb_encoding *enc = STR_ENC_GET(str);
11612 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11613}
11614
11615VALUE
11616rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11617{
11618 int cr = ENC_CODERANGE_UNKNOWN;
11619 if (enc == STR_ENC_GET(str)) {
11620 /* cached coderange makes sense only when enc equals the
11621 * actual encoding of str */
11622 cr = ENC_CODERANGE(str);
11623 }
11624 return enc_str_scrub(enc, str, repl, cr);
11625}
11626
11627static VALUE
11628enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11629{
11630 int encidx;
11631 VALUE buf = Qnil;
11632 const char *rep, *p, *e, *p1, *sp;
11633 long replen = -1;
11634 long slen;
11635
11636 if (rb_block_given_p()) {
11637 if (!NIL_P(repl))
11638 rb_raise(rb_eArgError, "both of block and replacement given");
11639 replen = 0;
11640 }
11641
11642 if (ENC_CODERANGE_CLEAN_P(cr))
11643 return Qnil;
11644
11645 if (!NIL_P(repl)) {
11646 repl = str_compat_and_valid(repl, enc);
11647 }
11648
11649 if (rb_enc_dummy_p(enc)) {
11650 return Qnil;
11651 }
11652 encidx = rb_enc_to_index(enc);
11653
11654#define DEFAULT_REPLACE_CHAR(str) do { \
11655 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11656 rep = replace; replen = (int)sizeof(replace); \
11657 } while (0)
11658
11659 slen = RSTRING_LEN(str);
11660 p = RSTRING_PTR(str);
11661 e = RSTRING_END(str);
11662 p1 = p;
11663 sp = p;
11664
11665 if (rb_enc_asciicompat(enc)) {
11666 int rep7bit_p;
11667 if (!replen) {
11668 rep = NULL;
11669 rep7bit_p = FALSE;
11670 }
11671 else if (!NIL_P(repl)) {
11672 rep = RSTRING_PTR(repl);
11673 replen = RSTRING_LEN(repl);
11674 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11675 }
11676 else if (encidx == rb_utf8_encindex()) {
11677 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11678 rep7bit_p = FALSE;
11679 }
11680 else {
11681 DEFAULT_REPLACE_CHAR("?");
11682 rep7bit_p = TRUE;
11683 }
11684 cr = ENC_CODERANGE_7BIT;
11685
11686 p = search_nonascii(p, e);
11687 if (!p) {
11688 p = e;
11689 }
11690 while (p < e) {
11691 int ret = rb_enc_precise_mbclen(p, e, enc);
11692 if (MBCLEN_NEEDMORE_P(ret)) {
11693 break;
11694 }
11695 else if (MBCLEN_CHARFOUND_P(ret)) {
11697 p += MBCLEN_CHARFOUND_LEN(ret);
11698 }
11699 else if (MBCLEN_INVALID_P(ret)) {
11700 /*
11701 * p1~p: valid ascii/multibyte chars
11702 * p ~e: invalid bytes + unknown bytes
11703 */
11704 long clen = rb_enc_mbmaxlen(enc);
11705 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11706 if (p > p1) {
11707 rb_str_buf_cat(buf, p1, p - p1);
11708 }
11709
11710 if (e - p < clen) clen = e - p;
11711 if (clen <= 2) {
11712 clen = 1;
11713 }
11714 else {
11715 const char *q = p;
11716 clen--;
11717 for (; clen > 1; clen--) {
11718 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11719 if (MBCLEN_NEEDMORE_P(ret)) break;
11720 if (MBCLEN_INVALID_P(ret)) continue;
11722 }
11723 }
11724 if (rep) {
11725 rb_str_buf_cat(buf, rep, replen);
11726 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11727 }
11728 else {
11729 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11730 str_mod_check(str, sp, slen);
11731 repl = str_compat_and_valid(repl, enc);
11732 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11735 }
11736 p += clen;
11737 p1 = p;
11738 p = search_nonascii(p, e);
11739 if (!p) {
11740 p = e;
11741 break;
11742 }
11743 }
11744 else {
11746 }
11747 }
11748 if (NIL_P(buf)) {
11749 if (p == e) {
11750 ENC_CODERANGE_SET(str, cr);
11751 return Qnil;
11752 }
11753 buf = rb_str_buf_new(RSTRING_LEN(str));
11754 }
11755 if (p1 < p) {
11756 rb_str_buf_cat(buf, p1, p - p1);
11757 }
11758 if (p < e) {
11759 if (rep) {
11760 rb_str_buf_cat(buf, rep, replen);
11761 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11762 }
11763 else {
11764 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11765 str_mod_check(str, sp, slen);
11766 repl = str_compat_and_valid(repl, enc);
11767 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11770 }
11771 }
11772 }
11773 else {
11774 /* ASCII incompatible */
11775 long mbminlen = rb_enc_mbminlen(enc);
11776 if (!replen) {
11777 rep = NULL;
11778 }
11779 else if (!NIL_P(repl)) {
11780 rep = RSTRING_PTR(repl);
11781 replen = RSTRING_LEN(repl);
11782 }
11783 else if (encidx == ENCINDEX_UTF_16BE) {
11784 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11785 }
11786 else if (encidx == ENCINDEX_UTF_16LE) {
11787 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11788 }
11789 else if (encidx == ENCINDEX_UTF_32BE) {
11790 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11791 }
11792 else if (encidx == ENCINDEX_UTF_32LE) {
11793 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11794 }
11795 else {
11796 DEFAULT_REPLACE_CHAR("?");
11797 }
11798
11799 while (p < e) {
11800 int ret = rb_enc_precise_mbclen(p, e, enc);
11801 if (MBCLEN_NEEDMORE_P(ret)) {
11802 break;
11803 }
11804 else if (MBCLEN_CHARFOUND_P(ret)) {
11805 p += MBCLEN_CHARFOUND_LEN(ret);
11806 }
11807 else if (MBCLEN_INVALID_P(ret)) {
11808 const char *q = p;
11809 long clen = rb_enc_mbmaxlen(enc);
11810 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11811 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11812
11813 if (e - p < clen) clen = e - p;
11814 if (clen <= mbminlen * 2) {
11815 clen = mbminlen;
11816 }
11817 else {
11818 clen -= mbminlen;
11819 for (; clen > mbminlen; clen-=mbminlen) {
11820 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11821 if (MBCLEN_NEEDMORE_P(ret)) break;
11822 if (MBCLEN_INVALID_P(ret)) continue;
11824 }
11825 }
11826 if (rep) {
11827 rb_str_buf_cat(buf, rep, replen);
11828 }
11829 else {
11830 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11831 str_mod_check(str, sp, slen);
11832 repl = str_compat_and_valid(repl, enc);
11833 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11834 }
11835 p += clen;
11836 p1 = p;
11837 }
11838 else {
11840 }
11841 }
11842 if (NIL_P(buf)) {
11843 if (p == e) {
11845 return Qnil;
11846 }
11847 buf = rb_str_buf_new(RSTRING_LEN(str));
11848 }
11849 if (p1 < p) {
11850 rb_str_buf_cat(buf, p1, p - p1);
11851 }
11852 if (p < e) {
11853 if (rep) {
11854 rb_str_buf_cat(buf, rep, replen);
11855 }
11856 else {
11857 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11858 str_mod_check(str, sp, slen);
11859 repl = str_compat_and_valid(repl, enc);
11860 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11861 }
11862 }
11864 }
11865 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11866 return buf;
11867}
11868
11869/*
11870 * call-seq:
11871 * scrub(replacement_string = default_replacement) -> new_string
11872 * scrub{|bytes| ... } -> new_string
11873 *
11874 * :include: doc/string/scrub.rdoc
11875 *
11876 */
11877static VALUE
11878str_scrub(int argc, VALUE *argv, VALUE str)
11879{
11880 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11881 VALUE new = rb_str_scrub(str, repl);
11882 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11883}
11884
11885/*
11886 * call-seq:
11887 * scrub! -> self
11888 * scrub!(replacement_string = default_replacement) -> self
11889 * scrub!{|bytes| ... } -> self
11890 *
11891 * Like String#scrub, except that any replacements are made in +self+.
11892 *
11893 */
11894static VALUE
11895str_scrub_bang(int argc, VALUE *argv, VALUE str)
11896{
11897 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11898 VALUE new = rb_str_scrub(str, repl);
11899 if (!NIL_P(new)) rb_str_replace(str, new);
11900 return str;
11901}
11902
/* Method IDs that the unicode_normalize* entry points pass to
 * unicode_normalize_common(), which sends them to mUnicodeNormalize.
 * NOTE(review): assigned outside this section, presumably during String
 * initialization; confirm. */
static ID id_normalize;
static ID id_normalized_p;
/* Receiver of the calls made by unicode_normalize_common().
 * NOTE(review): also assigned outside this section. */
static VALUE mUnicodeNormalize;
11906
11907static VALUE
11908unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11909{
11910 static int UnicodeNormalizeRequired = 0;
11911 VALUE argv2[2];
11912
11913 if (!UnicodeNormalizeRequired) {
11914 rb_require("unicode_normalize/normalize.rb");
11915 UnicodeNormalizeRequired = 1;
11916 }
11917 argv2[0] = str;
11918 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11919 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11920}
11921
11922/*
11923 * call-seq:
11924 * unicode_normalize(form = :nfc) -> string
11925 *
11926 * Returns a copy of +self+ with
11927 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11928 *
11929 * Argument +form+ must be one of the following symbols
11930 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11931 *
11932 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11933 * - +:nfd+: Canonical decomposition.
11934 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11935 * - +:nfkd+: Compatibility decomposition.
11936 *
11937 * The encoding of +self+ must be one of:
11938 *
11939 * - Encoding::UTF_8
11940 * - Encoding::UTF_16BE
11941 * - Encoding::UTF_16LE
11942 * - Encoding::UTF_32BE
11943 * - Encoding::UTF_32LE
11944 * - Encoding::GB18030
11945 * - Encoding::UCS_2BE
11946 * - Encoding::UCS_4BE
11947 *
11948 * Examples:
11949 *
 *   "a\u0300".unicode_normalize        # => "\u00E0" (a single precomposed character)
 *   "\u00E0".unicode_normalize(:nfd)   # => "a\u0300" ("a" followed by a combining grave accent)
11952 *
11953 * Related: String#unicode_normalize!, String#unicode_normalized?.
11954 */
11955static VALUE
11956rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11957{
11958 return unicode_normalize_common(argc, argv, str, id_normalize);
11959}
11960
11961/*
11962 * call-seq:
11963 * unicode_normalize!(form = :nfc) -> self
11964 *
11965 * Like String#unicode_normalize, except that the normalization
11966 * is performed on +self+.
11967 *
 * Related: String#unicode_normalized?.
11969 *
11970 */
11971static VALUE
11972rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11973{
11974 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11975}
11976
11977/* call-seq:
11978 * unicode_normalized?(form = :nfc) -> true or false
11979 *
11980 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11981 * +false+ otherwise.
11982 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11983 *
11984 * Examples:
11985 *
11986 * "a\u0300".unicode_normalized? # => false
11987 * "a\u0300".unicode_normalized?(:nfd) # => true
11988 * "\u00E0".unicode_normalized? # => true
11989 * "\u00E0".unicode_normalized?(:nfd) # => false
11990 *
11991 *
11992 * Raises an exception if +self+ is not in a Unicode encoding:
11993 *
11994 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11995 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11996 *
11997 * Related: String#unicode_normalize, String#unicode_normalize!.
11998 *
11999 */
12000static VALUE
12001rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12002{
12003 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12004}
12005
12006/**********************************************************************
12007 * Document-class: Symbol
12008 *
12009 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12010 *
12011 * You can create a +Symbol+ object explicitly with:
12012 *
12013 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12014 *
12015 * The same +Symbol+ object will be
12016 * created for a given name or string for the duration of a program's
12017 * execution, regardless of the context or meaning of that name. Thus
12018 * if <code>Fred</code> is a constant in one context, a method in
12019 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12020 * will be the same object in all three contexts.
12021 *
12022 * module One
12023 * class Fred
12024 * end
12025 * $f1 = :Fred
12026 * end
12027 * module Two
12028 * Fred = 1
12029 * $f2 = :Fred
12030 * end
12031 * def Fred()
12032 * end
12033 * $f3 = :Fred
12034 * $f1.object_id #=> 2514190
12035 * $f2.object_id #=> 2514190
12036 * $f3.object_id #=> 2514190
12037 *
12038 * Constant, method, and variable names are returned as symbols:
12039 *
12040 * module One
12041 * Two = 2
12042 * def three; 3 end
12043 * @four = 4
12044 * @@five = 5
12045 * $six = 6
12046 * end
12047 * seven = 7
12048 *
12049 * One.constants
12050 * # => [:Two]
12051 * One.instance_methods(true)
12052 * # => [:three]
12053 * One.instance_variables
12054 * # => [:@four]
12055 * One.class_variables
12056 * # => [:@@five]
12057 * global_variables.grep(/six/)
12058 * # => [:$six]
12059 * local_variables
12060 * # => [:seven]
12061 *
12062 * A +Symbol+ object differs from a String object in that
12063 * a +Symbol+ object represents an identifier, while a String object
12064 * represents text or data.
12065 *
12066 * == What's Here
12067 *
12068 * First, what's elsewhere. Class +Symbol+:
12069 *
12070 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12071 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12072 *
12073 * Here, class +Symbol+ provides methods that are useful for:
12074 *
12075 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12076 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12077 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12078 *
12079 * === Methods for Querying
12080 *
12081 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12082 * - #=~: Returns the index of the first substring in symbol that matches a
12083 * given Regexp or other object; returns +nil+ if no match is found.
12084 * - #[], #slice : Returns a substring of symbol
12085 * determined by a given index, start/length, or range, or string.
12086 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12087 * - #encoding: Returns the Encoding object that represents the encoding
12088 * of symbol.
12089 * - #end_with?: Returns +true+ if symbol ends with
12090 * any of the given strings.
12091 * - #match: Returns a MatchData object if symbol
12092 * matches a given Regexp; +nil+ otherwise.
12093 * - #match?: Returns +true+ if symbol
12094 * matches a given Regexp; +false+ otherwise.
12095 * - #length, #size: Returns the number of characters in symbol.
12096 * - #start_with?: Returns +true+ if symbol starts with
12097 * any of the given strings.
12098 *
12099 * === Methods for Comparing
12100 *
12101 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12102 * or larger than symbol.
12103 * - #==, #===: Returns +true+ if a given symbol has the same content and
12104 * encoding.
12105 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12106 * symbol is smaller than, equal to, or larger than symbol.
12107 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12108 * after Unicode case folding; +false+ otherwise.
12109 *
12110 * === Methods for Converting
12111 *
12112 * - #capitalize: Returns symbol with the first character upcased
12113 * and all other characters downcased.
12114 * - #downcase: Returns symbol with all characters downcased.
12115 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12116 * - #name: Returns the frozen string corresponding to symbol.
12117 * - #succ, #next: Returns the symbol that is the successor to symbol.
12118 * - #swapcase: Returns symbol with all upcase characters downcased
12119 * and all downcase characters upcased.
12120 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12121 * - #to_s, #id2name: Returns the string corresponding to +self+.
12122 * - #to_sym, #intern: Returns +self+.
12123 * - #upcase: Returns symbol with all characters upcased.
12124 *
12125 */
12126
12127
12128/*
12129 * call-seq:
12130 * symbol == object -> true or false
12131 *
12132 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12133 */
12134
12135#define sym_equal rb_obj_equal /* registered below for both Symbol#== and Symbol#=== */
12136
12137static int
12138sym_printable(const char *s, const char *send, rb_encoding *enc)
12139{
12140 while (s < send) {
12141 int n;
12142 int c = rb_enc_precise_mbclen(s, send, enc);
12143
12144 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12145 n = MBCLEN_CHARFOUND_LEN(c);
12146 c = rb_enc_mbc_to_codepoint(s, send, enc);
12147 if (!rb_enc_isprint(c, enc)) return FALSE;
12148 s += n;
12149 }
12150 return TRUE;
12151}
12152
12153int
12154rb_str_symname_p(VALUE sym)
12155{
12156 rb_encoding *enc;
12157 const char *ptr;
12158 long len;
12159 rb_encoding *resenc = rb_default_internal_encoding();
12160
12161 if (resenc == NULL) resenc = rb_default_external_encoding();
12162 enc = STR_ENC_GET(sym);
12163 ptr = RSTRING_PTR(sym);
12164 len = RSTRING_LEN(sym);
12165 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12166 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12167 return FALSE;
12168 }
12169 return TRUE;
12170}
12171
12172VALUE
12173rb_str_quote_unprintable(VALUE str)
12174{
12175 rb_encoding *enc;
12176 const char *ptr;
12177 long len;
12178 rb_encoding *resenc;
12179
12180 Check_Type(str, T_STRING);
12181 resenc = rb_default_internal_encoding();
12182 if (resenc == NULL) resenc = rb_default_external_encoding();
12183 enc = STR_ENC_GET(str);
12184 ptr = RSTRING_PTR(str);
12185 len = RSTRING_LEN(str);
12186 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12187 !sym_printable(ptr, ptr + len, enc)) {
12188 return rb_str_escape(str);
12189 }
12190 return str;
12191}
12192
12193VALUE
12194rb_id_quote_unprintable(ID id)
12195{
12196 VALUE str = rb_id2str(id);
12197 if (!rb_str_symname_p(str)) {
12198 return rb_str_escape(str);
12199 }
12200 return str;
12201}
12202
12203/*
12204 * call-seq:
12205 * inspect -> string
12206 *
12207 * Returns a string representation of +self+ (including the leading colon):
12208 *
12209 * :foo.inspect # => ":foo"
12210 *
12211 * Related: Symbol#to_s, Symbol#name.
12212 *
12213 */
12214
12215static VALUE
12216sym_inspect(VALUE sym)
12217{
12218 VALUE str = rb_sym2str(sym);
12219 const char *ptr;
12220 long len;
12221 char *dest;
12222
12223 if (!rb_str_symname_p(str)) {
12224 str = rb_str_inspect(str);
12225 len = RSTRING_LEN(str);
12226 rb_str_resize(str, len + 1);
12227 dest = RSTRING_PTR(str);
12228 memmove(dest + 1, dest, len);
12229 }
12230 else {
12231 rb_encoding *enc = STR_ENC_GET(str);
12232 VALUE orig_str = str;
12233
12234 len = RSTRING_LEN(orig_str);
12235 str = rb_enc_str_new(0, len + 1, enc);
12236
12237 // Get data pointer after allocation
12238 ptr = RSTRING_PTR(orig_str);
12239 dest = RSTRING_PTR(str);
12240 memcpy(dest + 1, ptr, len);
12241
12242 RB_GC_GUARD(orig_str);
12243 }
12244 dest[0] = ':';
12245
12247
12248 return str;
12249}
12250
12251VALUE
12253{
12254 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12255 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12256 return str;
12257}
12258
12259VALUE
12260rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12261{
12262 VALUE obj;
12263
12264 if (argc < 1) {
12265 rb_raise(rb_eArgError, "no receiver given");
12266 }
12267 obj = argv[0];
12268 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12269}
12270
12271/*
12272 * call-seq:
12273 * succ
12274 *
12275 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12276 *
12277 * :foo.succ # => :fop
12278 *
12279 * Related: String#succ.
12280 */
12281
12282static VALUE
12283sym_succ(VALUE sym)
12284{
12285 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12286}
12287
12288/*
12289 * call-seq:
12290 * symbol <=> object -> -1, 0, +1, or nil
12291 *
12292 * If +object+ is a symbol,
12293 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12294 *
12295 * :bar <=> :foo # => -1
12296 * :foo <=> :foo # => 0
12297 * :foo <=> :bar # => 1
12298 *
12299 * Otherwise, returns +nil+:
12300 *
12301 * :foo <=> 'bar' # => nil
12302 *
12303 * Related: String#<=>.
12304 */
12305
12306static VALUE
12307sym_cmp(VALUE sym, VALUE other)
12308{
12309 if (!SYMBOL_P(other)) {
12310 return Qnil;
12311 }
12312 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12313}
12314
12315/*
12316 * call-seq:
12317 * casecmp(object) -> -1, 0, 1, or nil
12318 *
12319 * :include: doc/symbol/casecmp.rdoc
12320 *
12321 */
12322
12323static VALUE
12324sym_casecmp(VALUE sym, VALUE other)
12325{
12326 if (!SYMBOL_P(other)) {
12327 return Qnil;
12328 }
12329 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12330}
12331
12332/*
12333 * call-seq:
12334 * casecmp?(object) -> true, false, or nil
12335 *
12336 * :include: doc/symbol/casecmp_p.rdoc
12337 *
12338 */
12339
12340static VALUE
12341sym_casecmp_p(VALUE sym, VALUE other)
12342{
12343 if (!SYMBOL_P(other)) {
12344 return Qnil;
12345 }
12346 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12347}
12348
12349/*
12350 * call-seq:
12351 * symbol =~ object -> integer or nil
12352 *
12353 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12354 * including possible updates to global variables;
12355 * see String#=~.
12356 *
12357 */
12358
12359static VALUE
12360sym_match(VALUE sym, VALUE other)
12361{
12362 return rb_str_match(rb_sym2str(sym), other);
12363}
12364
12365/*
12366 * call-seq:
12367 * match(pattern, offset = 0) -> matchdata or nil
12368 * match(pattern, offset = 0) {|matchdata| } -> object
12369 *
12370 * Equivalent to <tt>self.to_s.match</tt>,
12371 * including possible updates to global variables;
12372 * see String#match.
12373 *
12374 */
12375
12376static VALUE
12377sym_match_m(int argc, VALUE *argv, VALUE sym)
12378{
12379 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12380}
12381
12382/*
12383 * call-seq:
12384 * match?(pattern, offset = 0) -> true or false
12385 *
12386 * Equivalent to <tt>sym.to_s.match?</tt>;
12387 * see String#match?.
12388 *
12389 */
12390
12391static VALUE
12392sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12393{
12394 return rb_str_match_m_p(argc, argv, sym);
12395}
12396
12397/*
12398 * call-seq:
12399 * symbol[index] -> string or nil
12400 * symbol[start, length] -> string or nil
12401 * symbol[range] -> string or nil
12402 * symbol[regexp, capture = 0] -> string or nil
12403 * symbol[substring] -> string or nil
12404 *
12405 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12406 *
12407 */
12408
12409static VALUE
12410sym_aref(int argc, VALUE *argv, VALUE sym)
12411{
12412 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12413}
12414
12415/*
12416 * call-seq:
12417 * length -> integer
12418 *
12419 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12420 */
12421
12422static VALUE
12423sym_length(VALUE sym)
12424{
12425 return rb_str_length(rb_sym2str(sym));
12426}
12427
12428/*
12429 * call-seq:
12430 * empty? -> true or false
12431 *
12432 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12433 *
12434 */
12435
12436static VALUE
12437sym_empty(VALUE sym)
12438{
12439 return rb_str_empty(rb_sym2str(sym));
12440}
12441
12442/*
12443 * call-seq:
12444 * upcase(mapping) -> symbol
12445 *
12446 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12447 *
12448 * See String#upcase.
12449 *
12450 */
12451
12452static VALUE
12453sym_upcase(int argc, VALUE *argv, VALUE sym)
12454{
12455 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12456}
12457
12458/*
12459 * call-seq:
12460 * downcase(mapping) -> symbol
12461 *
12462 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12463 *
12464 * See String#downcase.
12465 *
12466 * Related: Symbol#upcase.
12467 *
12468 */
12469
12470static VALUE
12471sym_downcase(int argc, VALUE *argv, VALUE sym)
12472{
12473 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12474}
12475
12476/*
12477 * call-seq:
12478 * capitalize(mapping) -> symbol
12479 *
12480 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12481 *
12482 * See String#capitalize.
12483 *
12484 */
12485
12486static VALUE
12487sym_capitalize(int argc, VALUE *argv, VALUE sym)
12488{
12489 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12490}
12491
12492/*
12493 * call-seq:
12494 * swapcase(mapping) -> symbol
12495 *
12496 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12497 *
12498 * See String#swapcase.
12499 *
12500 */
12501
12502static VALUE
12503sym_swapcase(int argc, VALUE *argv, VALUE sym)
12504{
12505 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12506}
12507
12508/*
12509 * call-seq:
12510 * start_with?(*string_or_regexp) -> true or false
12511 *
12512 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12513 *
12514 */
12515
12516static VALUE
12517sym_start_with(int argc, VALUE *argv, VALUE sym)
12518{
12519 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12520}
12521
12522/*
12523 * call-seq:
12524 * end_with?(*strings) -> true or false
12525 *
12526 *
12527 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12528 *
12529 */
12530
12531static VALUE
12532sym_end_with(int argc, VALUE *argv, VALUE sym)
12533{
12534 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12535}
12536
12537/*
12538 * call-seq:
12539 * encoding -> encoding
12540 *
12541 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12542 *
12543 */
12544
12545static VALUE
12546sym_encoding(VALUE sym)
12547{
12548 return rb_obj_encoding(rb_sym2str(sym));
12549}
12550
12551static VALUE
12552string_for_symbol(VALUE name)
12553{
12554 if (!RB_TYPE_P(name, T_STRING)) {
12555 VALUE tmp = rb_check_string_type(name);
12556 if (NIL_P(tmp)) {
12557 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12558 name);
12559 }
12560 name = tmp;
12561 }
12562 return name;
12563}
12564
12565ID
12567{
12568 if (SYMBOL_P(name)) {
12569 return SYM2ID(name);
12570 }
12571 name = string_for_symbol(name);
12572 return rb_intern_str(name);
12573}
12574
12575VALUE
12577{
12578 if (SYMBOL_P(name)) {
12579 return name;
12580 }
12581 name = string_for_symbol(name);
12582 return rb_str_intern(name);
12583}
12584
12585/*
12586 * call-seq:
12587 * Symbol.all_symbols -> array_of_symbols
12588 *
12589 * Returns an array of all symbols currently in Ruby's symbol table:
12590 *
12591 * Symbol.all_symbols.size # => 9334
12592 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12593 *
12594 */
12595
12596static VALUE
12597sym_all_symbols(VALUE _)
12598{
12599 return rb_sym_all_symbols();
12600}
12601
12602VALUE
12603rb_str_to_interned_str(VALUE str)
12604{
12605 return rb_fstring(str);
12606}
12607
12608VALUE
12609rb_interned_str(const char *ptr, long len)
12610{
12611 struct RString fake_str;
12612 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12613}
12614
12615VALUE
12617{
12618 return rb_interned_str(ptr, strlen(ptr));
12619}
12620
12621VALUE
12622rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12623{
12624 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12625 rb_enc_autoload(enc);
12626 }
12627
12628 struct RString fake_str;
12629 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12630}
12631
12632VALUE
12633rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12634{
12635 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12636 rb_enc_autoload(enc);
12637 }
12638
12639 struct RString fake_str;
12640 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12641}
12642
12643VALUE
12645{
12646 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12647}
12648
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is ASCII-8BIT and the value
 * lies in 0...0xff, the byte is appended directly; every other case is
 * handed to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int is_binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(is_binary)) {
        const ssize_t byte = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(byte >= 0 && byte < 0xff)) {
            rb_str_buf_cat_byte(str, (char)byte);
            return;
        }
    }

    /* General path; note that 0xff itself also ends up here. */
    rb_str_concat(str, codepoint);
}
#endif
12665
12666static int
12667fstring_set_class_i(VALUE *str, void *data)
12668{
12669 RBASIC_SET_CLASS(*str, rb_cString);
12670
12671 return ST_CONTINUE;
12672}
12673
/*
 * Defines the String and Symbol classes and registers their C-implemented
 * methods.  In the registration calls below the last argument is the arity:
 * a non-negative value is the fixed number of arguments, while -1 is used
 * for functions with an (argc, argv, self) signature such as sym_upcase.
 *
 * NOTE(review): the listing numbers embedded in this copy skip in several
 * places (e.g. 12680 -> 12682, 12687 -> 12690, 12693 -> 12696,
 * 12848 -> 12852), so some statements of the original function may be
 * absent here -- confirm against the upstream file before relying on this
 * list being complete.
 */
12674void
12675Init_String(void)
12676{
12677 rb_cString = rb_define_class("String", rb_cObject);
12678
/* Run fstring_set_class_i over fstring_table_obj so each entry gets rb_cString
 * as its class (presumably entries created before the class existed -- TODO confirm). */
12679 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12680
/* Allocation, construction, comparison and basic element access. */
12682 rb_define_alloc_func(rb_cString, empty_str_alloc);
12683 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12684 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12685 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12686 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12687 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12690 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12691 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12692 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12693 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12696 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12697 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12698 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12699 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12702 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12703 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12704 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12705 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12706 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
/* "succ!" and "next!" are the same function under two names. */
12708 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12710 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12711 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12712 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12713 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12714 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12715 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12717 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12718 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12719 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12720 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12721 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12722 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12723 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12724 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12726 rb_define_method(rb_cString, "+@", str_uplus, 0);
12727 rb_define_method(rb_cString, "-@", str_uminus, 0);
12728 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
/* "dedup" is an alias of the unary minus method registered just above. */
12729 rb_define_alias(rb_cString, "dedup", "-@");
12730
/* Conversions; "to_s" and "to_str" share one implementation. */
12731 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12732 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12733 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12734 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12737 rb_define_method(rb_cString, "undump", str_undump, 0);
12738
/* Cache the symbols :ascii, :turkic, :lithuanian and :fold (presumably the
 * option symbols accepted by the case-mapping methods below -- TODO confirm). */
12739 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12740 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12741 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12742 sym_fold = ID2SYM(rb_intern_const("fold"));
12743
/* Case mapping: non-destructive forms followed by their bang counterparts. */
12744 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12745 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12746 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12747 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12748
12749 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12750 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12751 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12752 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12753
12754 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12755 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12756 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12757 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12758 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12759 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12760 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12761 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12762 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12763 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12764 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12765 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12767 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12768 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12769 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12770 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12771 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12772
12773 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12774 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12775 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12776
12777 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12778
12779 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12780 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12781 rb_define_method(rb_cString, "center", rb_str_center, -1);
12782
/* Substitution and trimming: non-destructive forms, then the bang forms. */
12783 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12784 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12785 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12786 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12787 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12788 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12789 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12790 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12791 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12792
12793 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12794 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12795 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12796 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12797 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12798 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12799 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12800 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12801 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12802
12803 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12804 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12805 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12806 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12807 rb_define_method(rb_cString, "count", rb_str_count, -1);
12808
12809 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12810 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12811 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12812 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12813
12814 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12815 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12816 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12817 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12818 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12819
12820 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12821
/* "slice" reuses the same function as "[]" registered earlier. */
12822 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12823 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12824
12825 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12826 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12827
12828 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12829 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12830 rb_define_method(rb_cString, "b", rb_str_b, 0);
12831 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12832 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12833
12834 /* define UnicodeNormalize module here so that we don't have to look it up */
12835 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12836 id_normalize = rb_intern_const("normalize");
12837 id_normalized_p = rb_intern_const("normalized?");
12838
12839 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12840 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12841 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12842
/* $; and $-F are two names for the same variable rb_fs, both written through
 * rb_fs_setter; rb_fs starts as nil and its address is registered with the GC. */
12843 rb_fs = Qnil;
12844 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12845 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12846 rb_gc_register_address(&rb_fs);
12847
/* Symbol: most instance methods are the sym_* wrappers defined earlier in this file. */
12848 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12852 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12853
12854 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12855 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12856 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12857 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12858 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12859 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12860
12861 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12862 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12863 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12864 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12865
12866 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12867 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12868 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12869 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12870 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12871 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12872 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12873
12874 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12875 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12876 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12877 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12878
12879 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12880 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12881
12882 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12883}
12884
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1691
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1474
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1592
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2663
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1036
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3908
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:644
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2123
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2141
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1309
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3537
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:243
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:551
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:175
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1297
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3221
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1317
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:932
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1182
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2988
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1201
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12622
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2294
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3692
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1130
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1422
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1323
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:951
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12644
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:816
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:703
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1490
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2670
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2935
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1746
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:701
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1861
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1072
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1867
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1236
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4225
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3722
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1490
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1927
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1717
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1487
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2447
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3757
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1398
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12252
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2520
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1374
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1711
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3016
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5395
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4121
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3113
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11551
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1768
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1753
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1164
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:986
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1493
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1956
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4107
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3525
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2383
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1974
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6657
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3121
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12616
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1404
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3723
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3063
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4230
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3347
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7325
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2750
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12609
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4177
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3994
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4152
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3699
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3238
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5905
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11609
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1667
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2910
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3210
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3329
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1176
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2704
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7432
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1386
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1683
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2397
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5823
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9493
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1170
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:949
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1815
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2080
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2155
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3309
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1602
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1005
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12576
ID rb_to_id(VALUE str)
Definition string.c:12566
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1866
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3501
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4469
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1416
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2887
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2769
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1410
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2782
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1744
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1586
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:203
Definition string.c:8376
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113