Ruby 4.1.0dev (2026-04-22 revision f6b0f318421b45a46ff83ed1daecd12389512b60)
string.c (f6b0f318421b45a46ff83ed1daecd12389512b60)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby/ractor.h"
49#include "ruby_assert.h"
50#include "shape.h"
51#include "vm_sync.h"
53
54#if defined HAVE_CRYPT_R
55# if defined HAVE_CRYPT_H
56# include <crypt.h>
57# endif
58#elif !defined HAVE_CRYPT
59# include "missing/crypt.h"
60# define HAVE_CRYPT_R 1
61#endif
62
/* Begin / end entries of capture group `no`.  Both macros expand against
 * a variable named `regs`, which must be in scope wherever they are used. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
65
66#undef rb_str_new
67#undef rb_usascii_str_new
68#undef rb_utf8_str_new
69#undef rb_enc_str_new
70#undef rb_str_new_cstr
71#undef rb_usascii_str_new_cstr
72#undef rb_utf8_str_new_cstr
73#undef rb_enc_str_new_cstr
74#undef rb_external_str_new_cstr
75#undef rb_locale_str_new_cstr
76#undef rb_str_dup_frozen
77#undef rb_str_buf_new_cstr
78#undef rb_str_buf_cat
79#undef rb_str_buf_cat2
80#undef rb_str_cat2
81#undef rb_str_cat_cstr
82#undef rb_fstring_cstr
83
86
87/* Flags of RString
88 *
89 * 0: STR_SHARED (equal to ELTS_SHARED)
90 * The string is shared. The buffer this string points to is owned by
91 * another string (the shared root).
92 * 1: RSTRING_NOEMBED
93 * The string is not embedded. When a string is embedded, the contents
 * follow the header. When a string is not embedded, the contents are
 * in a separately allocated buffer.
96 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
97 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
98 * It emits a deprecation warning when mutated for the first time.
99 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
100 * The string was allocated by the `Symbol#to_s` method.
101 * It emits a deprecation warning when mutated for the first time.
102 * 4: STR_PRECOMPUTED_HASH
103 * The string is embedded and has its precomputed hashcode stored
104 * after the terminator.
105 * 5: STR_SHARED_ROOT
106 * Other strings may point to the contents of this string. When this
107 * flag is set, STR_SHARED must not be set.
108 * 6: STR_BORROWED
109 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
110 * to be unshared by rb_str_tmp_frozen_release.
111 * 7: STR_TMPLOCK
112 * The pointer to the buffer is passed to a system call such as
113 * read(2). Any modification and realloc is prohibited.
114 * 8-9: ENC_CODERANGE
115 * Stores the coderange of the string.
116 * 10-16: ENCODING
117 * Stores the encoding of the string.
118 * 17: RSTRING_FSTR
119 * The string is a fstring. The string is deduplicated in the fstring
120 * table.
121 * 18: STR_NOFREE
122 * Do not free this string's buffer when the string is reclaimed
123 * by the garbage collector. Used for when the string buffer is a C
124 * string literal.
125 * 19: STR_FAKESTR
126 * The string is not allocated or managed by the garbage collector.
127 * Typically, the string object header (struct RString) is temporarily
128 * allocated on C stack.
129 */
130
/* NOTE(review): presumably the maximum byte length of one character. */
#define RUBY_MAX_CHAR_LEN 16

/* String-specific flag bits.  The bit numbers match the "Flags of RString"
 * table in the comment above. */
#define STR_PRECOMPUTED_HASH FL_USER4   /*  4: hash stored after the terminator */
#define STR_SHARED_ROOT      FL_USER5   /*  5: other strings may point into this buffer */
#define STR_BORROWED         FL_USER6   /*  6: unsafe to unshare in rb_str_tmp_frozen_release */
#define STR_TMPLOCK          FL_USER7   /*  7: buffer handed to a system call; no modification */
#define STR_NOFREE           FL_USER18  /* 18: buffer must not be freed by the GC */
#define STR_FAKESTR          FL_USER19  /* 19: header is not managed by the GC */
138
/* Mark `str` as heap-backed and clear every sharing-related flag. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Mark `str` as embedded; the flags that only make sense for a heap
 * buffer (NOEMBED, SHARED, NOFREE) are dropped together. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
144
/* Store `n` as the length of `str`.  No terminator is written and no
 * capacity check is made here. */
#define STR_SET_LEN(str, n) do { RSTRING(str)->len = (n); } while (0)
148
/* Width in bytes of the terminator of `str`: 1 on the encoding fast path,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write a `termlen`-byte zero terminator at `ptr`.  The first byte is
 * always stored directly; memset is only used for wider terminators. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
157
/* Resize the buffer of `str` to `capacity` bytes plus the terminator width
 * of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/* Resize the buffer of `str` to `capacity` + `termlen` bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise a heap buffer is allocated, the whole embedded area
 *   is copied into it and the string becomes non-embedded.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be shared (asserted).
 *
 * The macro arguments are evaluated more than once, so they must not have
 * side effects. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), str_embed_capa(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
181
/* Make `str` share the buffer owned by `shared_str`.
 *
 * Nothing happens for fake strings.  Otherwise the root is recorded
 * through a write barrier, `str` is flagged STR_SHARED and registered as a
 * pinning object, and the root is flagged STR_SHARED_ROOT.  A root without
 * a class is additionally flagged STR_BORROWED.  The assertions check that
 * the buffer of `str` lies inside the buffer of `shared_str`. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        rb_gc_register_pinning_obj(str); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
194
/* Pointer to the separately allocated buffer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding object of `str`. */
#define STR_ENC_GET(str) get_encoding(str)
200
/* Return true when the first `n` bytes at `s` are all zero.
 * A non-positive `n` inspects nothing and yields true. */
static inline bool
zero_filled(const char *s, int n)
{
    const char *const stop = s + (n > 0 ? n : 0);

    while (s < stop) {
        if (*s != '\0') return false;
        s++;
    }
    return true;
}
209
210#if !defined SHARABLE_MIDDLE_SUBSTRING
211# define SHARABLE_MIDDLE_SUBSTRING 0
212#endif
213
214static inline bool
215SHARABLE_SUBSTRING_P(VALUE str, long beg, long len)
216{
217#if SHARABLE_MIDDLE_SUBSTRING
218 return true;
219#else
220 long end = beg + len;
221 long source_len = RSTRING_LEN(str);
222 return end == source_len || zero_filled(RSTRING_PTR(str) + end, TERM_LEN(str));
223#endif
224}
225
226static inline long
227str_embed_capa(VALUE str)
228{
229 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
230}
231
232bool
233rb_str_reembeddable_p(VALUE str)
234{
235 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
236}
237
238static inline size_t
239rb_str_embed_size(long capa, long termlen)
240{
241 size_t size = offsetof(struct RString, as.embed.ary) + capa + termlen;
242 if (size < sizeof(struct RString)) size = sizeof(struct RString);
243 return size;
244}
245
246size_t
247rb_str_size_as_embedded(VALUE str)
248{
249 size_t real_size;
250 if (STR_EMBED_P(str)) {
251 size_t capa = RSTRING(str)->len;
252 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
253
254 real_size = rb_str_embed_size(capa, TERM_LEN(str));
255 }
256 /* if the string is not currently embedded, but it can be embedded, how
257 * much space would it require */
258 else if (rb_str_reembeddable_p(str)) {
259 size_t capa = RSTRING(str)->as.heap.aux.capa;
260 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) capa += sizeof(st_index_t);
261
262 real_size = rb_str_embed_size(capa, TERM_LEN(str));
263 }
264 else {
265 real_size = sizeof(struct RString);
266 }
267
268 return real_size;
269}
270
/* True when a string of `len` bytes plus a `termlen`-byte terminator fits
 * in an object size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len, termlen);
    return rb_gc_size_allocatable_p(needed);
}
276
277static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
278static VALUE str_new_frozen(VALUE klass, VALUE orig);
279static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
280static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
281static VALUE str_new(VALUE klass, const char *ptr, long len);
282static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
283static inline void str_modifiable(VALUE str);
284static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
285static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
286
287static inline void
288str_make_independent(VALUE str)
289{
290 long len = RSTRING_LEN(str);
291 int termlen = TERM_LEN(str);
292 str_make_independent_expand((str), len, 0L, termlen);
293}
294
295static inline int str_dependent_p(VALUE str);
296
297void
298rb_str_make_independent(VALUE str)
299{
300 if (str_dependent_p(str)) {
301 str_make_independent(str);
302 }
303}
304
305void
306rb_str_make_embedded(VALUE str)
307{
308 RUBY_ASSERT(rb_str_reembeddable_p(str));
309 RUBY_ASSERT(!STR_EMBED_P(str));
310
311 int termlen = TERM_LEN(str);
312 char *buf = RSTRING(str)->as.heap.ptr;
313 long old_capa = RSTRING(str)->as.heap.aux.capa + termlen;
314 long len = RSTRING(str)->len;
315
316 STR_SET_EMBED(str);
317 STR_SET_LEN(str, len);
318
319 if (len > 0) {
320 memcpy(RSTRING_PTR(str), buf, len);
321 SIZED_FREE_N(buf, old_capa);
322 }
323
324 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
325}
326
/* Debugging aid: print to stderr that `func` is about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
336
337/* symbols for [up|down|swap]case/capitalize options */
338static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
339
340static rb_encoding *
341get_encoding(VALUE str)
342{
343 return rb_enc_from_index(ENCODING_GET(str));
344}
345
346static void
347mustnot_broken(VALUE str)
348{
349 if (is_broken_string(str)) {
350 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
351 }
352}
353
354static void
355mustnot_wchar(VALUE str)
356{
357 rb_encoding *enc = STR_ENC_GET(str);
358 if (rb_enc_mbminlen(enc) > 1) {
359 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
360 }
361}
362
363static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
364
/* The hash of a fake string is cached in its `capa` field (a long), which
 * is only done when a long is as wide as a pointer. */
#if SIZEOF_LONG == SIZEOF_VOIDP
# define PRECOMPUTED_FAKESTR_HASH 1
#endif
369
370static inline bool
371BARE_STRING_P(VALUE str)
372{
373 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
374}
375
376static inline st_index_t
377str_do_hash(VALUE str)
378{
379 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
380 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
381 if (e && !is_ascii_string(str)) {
382 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
383 }
384 return h;
385}
386
387static VALUE
388str_store_precomputed_hash(VALUE str, st_index_t hash)
389{
390 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
391 RUBY_ASSERT(STR_EMBED_P(str));
392
393#if RUBY_DEBUG
394 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
395 size_t free_bytes = str_embed_capa(str) - used_bytes;
396 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
397#endif
398
399 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
400
401 FL_SET(str, STR_PRECOMPUTED_HASH);
402
403 return str;
404}
405
406VALUE
407rb_fstring(VALUE str)
408{
409 VALUE fstr;
410 int bare;
411
412 Check_Type(str, T_STRING);
413
414 if (FL_TEST(str, RSTRING_FSTR))
415 return str;
416
417 bare = BARE_STRING_P(str);
418 if (!bare) {
419 if (STR_EMBED_P(str)) {
420 OBJ_FREEZE(str);
421 return str;
422 }
423
424 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
426 return str;
427 }
428 }
429
430 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
431 rb_str_resize(str, RSTRING_LEN(str));
432
433 fstr = register_fstring(str, false, false);
434
435 if (!bare) {
436 str_replace_shared_without_enc(str, fstr);
437 OBJ_FREEZE(str);
438 return str;
439 }
440 return fstr;
441}
442
443static VALUE fstring_table_obj;
444
445static VALUE
446fstring_concurrent_set_hash(VALUE str)
447{
448#ifdef PRECOMPUTED_FAKESTR_HASH
449 st_index_t h;
450 if (FL_TEST_RAW(str, STR_FAKESTR)) {
451 // register_fstring precomputes the hash and stores it in capa for fake strings
452 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
453 }
454 else {
455 h = rb_str_hash(str);
456 }
457 // rb_str_hash doesn't include the encoding for ascii only strings, so
458 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
459 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
460#else
461 return (VALUE)rb_str_hash(str);
462#endif
463}
464
465static bool
466fstring_concurrent_set_cmp(VALUE a, VALUE b)
467{
468 long alen, blen;
469 const char *aptr, *bptr;
470
473
474 RSTRING_GETMEM(a, aptr, alen);
475 RSTRING_GETMEM(b, bptr, blen);
476 return (alen == blen &&
477 ENCODING_GET(a) == ENCODING_GET(b) &&
478 memcmp(aptr, bptr, alen) == 0);
479}
480
/* Options passed from register_fstring() to the table's create callback. */
struct fstr_create_arg {
    /* For fake strings: copy the bytes into a new string (true) or wrap
     * the original pointer as a static string (false). */
    bool copy;
    /* When copying: prefer an embedded string with room for the
     * precomputed hash, if such a size is allocatable. */
    bool force_precompute_hash;
};
485
486static VALUE
487fstring_concurrent_set_create(VALUE str, void *data)
488{
489 struct fstr_create_arg *arg = data;
490
491 // Unless the string is empty or binary, its coderange has been precomputed.
492 int coderange = ENC_CODERANGE(str);
493
494 if (FL_TEST_RAW(str, STR_FAKESTR)) {
495 if (arg->copy) {
496 VALUE new_str;
497 long len = RSTRING_LEN(str);
498 long capa = len + sizeof(st_index_t);
499 int term_len = TERM_LEN(str);
500
501 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
502 new_str = str_alloc_embed(rb_cString, capa + term_len);
503 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
504 STR_SET_LEN(new_str, RSTRING_LEN(str));
505 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
506 rb_enc_copy(new_str, str);
507 str_store_precomputed_hash(new_str, str_do_hash(str));
508 }
509 else {
510 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
511 rb_enc_copy(new_str, str);
512#ifdef PRECOMPUTED_FAKESTR_HASH
513 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
514 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
515 }
516#endif
517 }
518 str = new_str;
519 }
520 else {
521 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
522 RSTRING(str)->len,
523 ENCODING_GET(str));
524 }
525 OBJ_FREEZE(str);
526 }
527 else {
528 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
529 str = str_new_frozen(rb_cString, str);
530 }
531 if (STR_SHARED_P(str)) { /* str should not be shared */
532 /* shared substring */
533 str_make_independent(str);
535 }
536 if (!BARE_STRING_P(str)) {
537 str = str_new_frozen(rb_cString, str);
538 }
539 }
540
541 ENC_CODERANGE_SET(str, coderange);
542 RBASIC(str)->flags |= RSTRING_FSTR;
543 if (!RB_OBJ_SHAREABLE_P(str)) {
544 RB_OBJ_SET_SHAREABLE(str);
545 }
546 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
549 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
550 RUBY_ASSERT(!rb_shape_obj_has_ivars(str));
552 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
553
554 return str;
555}
556
557static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
558 .hash = fstring_concurrent_set_hash,
559 .cmp = fstring_concurrent_set_cmp,
560 .create = fstring_concurrent_set_create,
561 .free = NULL,
562};
563
564void
565Init_fstring_table(void)
566{
567 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
568 rb_gc_register_address(&fstring_table_obj);
569}
570
571static VALUE
572register_fstring(VALUE str, bool copy, bool force_precompute_hash)
573{
574 struct fstr_create_arg args = {
575 .copy = copy,
576 .force_precompute_hash = force_precompute_hash
577 };
578
579#if SIZEOF_VOIDP == SIZEOF_LONG
580 if (FL_TEST_RAW(str, STR_FAKESTR)) {
581 // if the string hasn't been interned, we'll need the hash twice, so we
582 // compute it once and store it in capa
583 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
584 }
585#endif
586
587 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
588
589 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
591 RUBY_ASSERT(OBJ_FROZEN(result));
593 RUBY_ASSERT((rb_gc_verify_shareable(result), 1));
594 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
596
597 return result;
598}
599
600bool
601rb_obj_is_fstring_table(VALUE obj)
602{
603 ASSERT_vm_locking();
604
605 return obj == fstring_table_obj;
606}
607
608void
609rb_gc_free_fstring(VALUE obj)
610{
611 ASSERT_vm_locking_with_barrier();
612
613 RUBY_ASSERT(FL_TEST(obj, RSTRING_FSTR));
615 RUBY_ASSERT(!FL_TEST(obj, STR_SHARED));
616
617 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
618
619 RB_DEBUG_COUNTER_INC(obj_str_fstr);
620
621 FL_UNSET(obj, RSTRING_FSTR);
622}
623
624void
625rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
626{
627 if (fstring_table_obj) {
628 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
629 }
630}
631
632static VALUE
633setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
634{
635 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
636 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
637
638 if (!name) {
640 name = "";
641 }
642
643 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
644
645 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
646 fake_str->len = len;
647 fake_str->as.heap.ptr = (char *)name;
648 fake_str->as.heap.aux.capa = len;
649 return (VALUE)fake_str;
650}
651
652/*
653 * set up a fake string which refers a static string literal.
654 */
655VALUE
656rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
657{
658 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
659}
660
661/*
662 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
663 * shared string which refers a static string literal. `ptr` must
664 * point a constant string.
665 */
666VALUE
667rb_fstring_new(const char *ptr, long len)
668{
669 struct RString fake_str = {RBASIC_INIT};
670 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
671}
672
673VALUE
674rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
675{
676 struct RString fake_str = {RBASIC_INIT};
677 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
678}
679
680VALUE
681rb_fstring_cstr(const char *ptr)
682{
683 return rb_fstring_new(ptr, strlen(ptr));
684}
685
686static inline bool
687single_byte_optimizable(VALUE str)
688{
689 int encindex = ENCODING_GET(str);
690 switch (encindex) {
691 case ENCINDEX_ASCII_8BIT:
692 case ENCINDEX_US_ASCII:
693 return true;
694 case ENCINDEX_UTF_8:
695 // For UTF-8 it's worth scanning the string coderange when unknown.
696 return rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT;
697 }
698 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
699 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
700 return true;
701 }
702
703 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
704 return true;
705 }
706
707 /* Conservative. Possibly single byte.
708 * "\xa1" in Shift_JIS for example. */
709 return false;
710}
711
713
714static inline const char *
715search_nonascii(const char *p, const char *e)
716{
717 const char *s, *t;
718
719#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
720# if SIZEOF_UINTPTR_T == 8
721# define NONASCII_MASK UINT64_C(0x8080808080808080)
722# elif SIZEOF_UINTPTR_T == 4
723# define NONASCII_MASK UINT32_C(0x80808080)
724# else
725# error "don't know what to do."
726# endif
727#else
728# if SIZEOF_UINTPTR_T == 8
729# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
730# elif SIZEOF_UINTPTR_T == 4
731# define NONASCII_MASK 0x80808080UL /* or...? */
732# else
733# error "don't know what to do."
734# endif
735#endif
736
737 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
738#if !UNALIGNED_WORD_ACCESS
739 if ((uintptr_t)p % SIZEOF_VOIDP) {
740 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
741 p += l;
742 switch (l) {
743 default: UNREACHABLE;
744#if SIZEOF_VOIDP > 4
745 case 7: if (p[-7]&0x80) return p-7;
746 case 6: if (p[-6]&0x80) return p-6;
747 case 5: if (p[-5]&0x80) return p-5;
748 case 4: if (p[-4]&0x80) return p-4;
749#endif
750 case 3: if (p[-3]&0x80) return p-3;
751 case 2: if (p[-2]&0x80) return p-2;
752 case 1: if (p[-1]&0x80) return p-1;
753 case 0: break;
754 }
755 }
756#endif
757#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
758#define aligned_ptr(value) \
759 __builtin_assume_aligned((value), sizeof(uintptr_t))
760#else
761#define aligned_ptr(value) (value)
762#endif
763 s = aligned_ptr(p);
764 t = (e - (SIZEOF_VOIDP-1));
765#undef aligned_ptr
766 for (;s < t; s += sizeof(uintptr_t)) {
767 uintptr_t word;
768 memcpy(&word, s, sizeof(word));
769 if (word & NONASCII_MASK) {
770#ifdef WORDS_BIGENDIAN
771 return (const char *)s + (nlz_intptr(word&NONASCII_MASK)>>3);
772#else
773 return (const char *)s + (ntz_intptr(word&NONASCII_MASK)>>3);
774#endif
775 }
776 }
777 p = (const char *)s;
778 }
779
780 switch (e - p) {
781 default: UNREACHABLE;
782#if SIZEOF_VOIDP > 4
783 case 7: if (e[-7]&0x80) return e-7;
784 case 6: if (e[-6]&0x80) return e-6;
785 case 5: if (e[-5]&0x80) return e-5;
786 case 4: if (e[-4]&0x80) return e-4;
787#endif
788 case 3: if (e[-3]&0x80) return e-3;
789 case 2: if (e[-2]&0x80) return e-2;
790 case 1: if (e[-1]&0x80) return e-1;
791 case 0: return NULL;
792 }
793}
794
795static int
796coderange_scan(const char *p, long len, rb_encoding *enc)
797{
798 const char *e = p + len;
799
800 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
801 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
802 p = search_nonascii(p, e);
804 }
805
806 if (rb_enc_asciicompat(enc)) {
807 p = search_nonascii(p, e);
808 if (!p) return ENC_CODERANGE_7BIT;
809 for (;;) {
810 int ret = rb_enc_precise_mbclen(p, e, enc);
812 p += MBCLEN_CHARFOUND_LEN(ret);
813 if (p == e) break;
814 p = search_nonascii(p, e);
815 if (!p) break;
816 }
817 }
818 else {
819 while (p < e) {
820 int ret = rb_enc_precise_mbclen(p, e, enc);
822 p += MBCLEN_CHARFOUND_LEN(ret);
823 }
824 }
825 return ENC_CODERANGE_VALID;
826}
827
828long
829rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
830{
831 const char *p = s;
832
833 if (*cr == ENC_CODERANGE_BROKEN)
834 return e - s;
835
836 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
837 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
838 if (*cr == ENC_CODERANGE_VALID) return e - s;
839 p = search_nonascii(p, e);
841 return e - s;
842 }
843 else if (rb_enc_asciicompat(enc)) {
844 p = search_nonascii(p, e);
845 if (!p) {
846 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
847 return e - s;
848 }
849 for (;;) {
850 int ret = rb_enc_precise_mbclen(p, e, enc);
851 if (!MBCLEN_CHARFOUND_P(ret)) {
853 return p - s;
854 }
855 p += MBCLEN_CHARFOUND_LEN(ret);
856 if (p == e) break;
857 p = search_nonascii(p, e);
858 if (!p) break;
859 }
860 }
861 else {
862 while (p < e) {
863 int ret = rb_enc_precise_mbclen(p, e, enc);
864 if (!MBCLEN_CHARFOUND_P(ret)) {
866 return p - s;
867 }
868 p += MBCLEN_CHARFOUND_LEN(ret);
869 }
870 }
872 return e - s;
873}
874
875static inline void
876str_enc_copy(VALUE str1, VALUE str2)
877{
878 rb_enc_set_index(str1, ENCODING_GET(str2));
879}
880
881/* Like str_enc_copy, but does not check frozen status of str1.
882 * You should use this only if you're certain that str1 is not frozen. */
883static inline void
884str_enc_copy_direct(VALUE str1, VALUE str2)
885{
886 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
887 if (inlined_encoding == ENCODING_INLINE_MAX) {
888 rb_enc_set_index(str1, rb_enc_get_index(str2));
889 }
890 else {
891 ENCODING_SET_INLINED(str1, inlined_encoding);
892 }
893}
894
895static void
896rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
897{
898 /* this function is designed for copying encoding and coderange
899 * from src to new string "dest" which is made from the part of src.
900 */
901 str_enc_copy(dest, src);
902 if (RSTRING_LEN(dest) == 0) {
903 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
905 else
907 return;
908 }
909 switch (ENC_CODERANGE(src)) {
912 break;
914 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
915 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
917 else
919 break;
920 default:
921 break;
922 }
923}
924
925static void
926rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
927{
928 str_enc_copy(dest, src);
930}
931
932static int
933enc_coderange_scan(VALUE str, rb_encoding *enc)
934{
935 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
936}
937
938int
939rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
940{
941 return enc_coderange_scan(str, enc);
942}
943
944int
945rbimpl_enc_str_coderange_scan(VALUE str)
946{
947 int cr = enc_coderange_scan(str, get_encoding(str));
948 ENC_CODERANGE_SET(str, cr);
949 return cr;
950}
951
952#undef rb_enc_str_coderange
953int
954rb_enc_str_coderange(VALUE str)
955{
956 int cr = ENC_CODERANGE(str);
957
958 if (cr == ENC_CODERANGE_UNKNOWN) {
959 cr = rbimpl_enc_str_coderange_scan(str);
960 }
961 return cr;
962}
963#define rb_enc_str_coderange rb_enc_str_coderange_inline
964
965static inline bool
966rb_enc_str_asciicompat(VALUE str)
967{
968 int encindex = ENCODING_GET_INLINED(str);
969 return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
970}
971
972int
974{
975 switch(ENC_CODERANGE(str)) {
977 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
979 return true;
980 default:
981 return false;
982 }
983}
984
985static inline void
986str_mod_check(VALUE s, const char *p, long len)
987{
988 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
989 rb_raise(rb_eRuntimeError, "string modified");
990 }
991}
992
993static size_t
994str_capacity(VALUE str, const int termlen)
995{
996 if (STR_EMBED_P(str)) {
997 return str_embed_capa(str) - termlen;
998 }
999 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
1000 return RSTRING(str)->len;
1001 }
1002 else {
1003 return RSTRING(str)->as.heap.aux.capa;
1004 }
1005}
1006
1007size_t
1009{
1010 return str_capacity(str, TERM_LEN(str));
1011}
1012
1013static inline void
1014must_not_null(const char *ptr)
1015{
1016 if (!ptr) {
1017 rb_raise(rb_eArgError, "NULL pointer given");
1018 }
1019}
1020
1021static inline VALUE
1022str_alloc_embed(VALUE klass, size_t capa)
1023{
1024 size_t size = rb_str_embed_size(capa, 0);
1025 RUBY_ASSERT(size > 0);
1026 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1027
1028 NEWOBJ_OF(str, struct RString, klass,
1030
1031 str->len = 0;
1032 str->as.embed.ary[0] = 0;
1033
1034 return (VALUE)str;
1035}
1036
1037static inline VALUE
1038str_alloc_heap(VALUE klass)
1039{
1040 NEWOBJ_OF(str, struct RString, klass,
1041 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1042
1043 str->len = 0;
1044 str->as.heap.aux.capa = 0;
1045 str->as.heap.ptr = NULL;
1046
1047 return (VALUE)str;
1048}
1049
1050static inline VALUE
1051empty_str_alloc(VALUE klass)
1052{
1053 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1054 VALUE str = str_alloc_embed(klass, 0);
1055 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1057 return str;
1058}
1059
1060static VALUE
1061str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1062{
1063 VALUE str;
1064
1065 if (len < 0) {
1066 rb_raise(rb_eArgError, "negative string size (or size too big)");
1067 }
1068
1069 if (enc == NULL) {
1070 enc = rb_ascii8bit_encoding();
1071 }
1072
1073 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1074
1075 int termlen = rb_enc_mbminlen(enc);
1076
1077 if (STR_EMBEDDABLE_P(len, termlen)) {
1078 str = str_alloc_embed(klass, len + termlen);
1079 if (len == 0) {
1080 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1081 }
1082 }
1083 else {
1084 str = str_alloc_heap(klass);
1085 RSTRING(str)->as.heap.aux.capa = len;
1086 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1087 * integer overflow. If we can STATIC_ASSERT that, the following
1088 * mul_add_mul can be reverted to a simple ALLOC_N. */
1089 RSTRING(str)->as.heap.ptr =
1090 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1091 }
1092
1093 rb_enc_raw_set(str, enc);
1094
1095 if (ptr) {
1096 memcpy(RSTRING_PTR(str), ptr, len);
1097 }
1098 else {
1099 memset(RSTRING_PTR(str), 0, len);
1100 }
1101
1102 STR_SET_LEN(str, len);
1103 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1104 return str;
1105}
1106
1107static VALUE
1108str_new(VALUE klass, const char *ptr, long len)
1109{
1110 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1111}
1112
1113VALUE
1114rb_str_new(const char *ptr, long len)
1115{
1116 return str_new(rb_cString, ptr, len);
1117}
1118
1119VALUE
1120rb_usascii_str_new(const char *ptr, long len)
1121{
1122 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1123}
1124
1125VALUE
1126rb_utf8_str_new(const char *ptr, long len)
1127{
1128 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1129}
1130
1131VALUE
1132rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1133{
1134 return str_enc_new(rb_cString, ptr, len, enc);
1135}
1136
1137VALUE
1139{
1140 must_not_null(ptr);
1141 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1142 * memory regions, and that cannot be detected by the MSAN. Just
1143 * trust the programmer that the argument passed here is a sane C
1144 * string. */
1145 __msan_unpoison_string(ptr);
1146 return rb_str_new(ptr, strlen(ptr));
1147}
1148
1149VALUE
1151{
1152 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1153}
1154
1155VALUE
1157{
1158 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1159}
1160
1161VALUE
1163{
1164 must_not_null(ptr);
1165 if (rb_enc_mbminlen(enc) != 1) {
1166 rb_raise(rb_eArgError, "wchar encoding given");
1167 }
1168 return rb_enc_str_new(ptr, strlen(ptr), enc);
1169}
1170
1171static VALUE
1172str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1173{
1174 VALUE str;
1175
1176 if (len < 0) {
1177 rb_raise(rb_eArgError, "negative string size (or size too big)");
1178 }
1179
1180 if (!ptr) {
1181 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1182 }
1183 else {
1184 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1185 str = str_alloc_heap(klass);
1186 RSTRING(str)->len = len;
1187 RSTRING(str)->as.heap.ptr = (char *)ptr;
1188 RSTRING(str)->as.heap.aux.capa = len;
1189 RBASIC(str)->flags |= STR_NOFREE;
1190 rb_enc_associate_index(str, encindex);
1191 }
1192 return str;
1193}
1194
1195VALUE
1196rb_str_new_static(const char *ptr, long len)
1197{
1198 return str_new_static(rb_cString, ptr, len, 0);
1199}
1200
1201VALUE
1203{
1204 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1205}
1206
1207VALUE
1209{
1210 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1211}
1212
1213VALUE
1215{
1216 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1217}
1218
1219static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1220 rb_encoding *from, rb_encoding *to,
1221 int ecflags, VALUE ecopts);
1222
1223static inline bool
1224is_enc_ascii_string(VALUE str, rb_encoding *enc)
1225{
1226 int encidx = rb_enc_to_index(enc);
1227 if (rb_enc_get_index(str) == encidx)
1228 return is_ascii_string(str);
1229 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1230}
1231
1232VALUE
1233rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1234{
1235 long len;
1236 const char *ptr;
1237 VALUE newstr;
1238
1239 if (!to) return str;
1240 if (!from) from = rb_enc_get(str);
1241 if (from == to) return str;
1242 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1243 rb_is_ascii8bit_enc(to)) {
1244 if (STR_ENC_GET(str) != to) {
1245 str = rb_str_dup(str);
1246 rb_enc_associate(str, to);
1247 }
1248 return str;
1249 }
1250
1251 RSTRING_GETMEM(str, ptr, len);
1252 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1253 from, to, ecflags, ecopts);
1254 if (NIL_P(newstr)) {
1255 /* some error, return original */
1256 return str;
1257 }
1258 return newstr;
1259}
1260
1261VALUE
1262rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1263 rb_encoding *from, int ecflags, VALUE ecopts)
1264{
1265 long olen;
1266
1267 olen = RSTRING_LEN(newstr);
1268 if (ofs < -olen || olen < ofs)
1269 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1270 if (ofs < 0) ofs += olen;
1271 if (!from) {
1272 STR_SET_LEN(newstr, ofs);
1273 return rb_str_cat(newstr, ptr, len);
1274 }
1275
1276 rb_str_modify(newstr);
1277 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1278 rb_enc_get(newstr),
1279 ecflags, ecopts);
1280}
1281
1282VALUE
1283rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1284{
1285 STR_SET_LEN(str, 0);
1286 rb_enc_associate(str, enc);
1287 rb_str_cat(str, ptr, len);
1288 return str;
1289}
1290
1291static VALUE
1292str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1293 rb_encoding *from, rb_encoding *to,
1294 int ecflags, VALUE ecopts)
1295{
1296 rb_econv_t *ec;
1298 long olen;
1299 VALUE econv_wrapper;
1300 const unsigned char *start, *sp;
1301 unsigned char *dest, *dp;
1302 size_t converted_output = (size_t)ofs;
1303
1304 olen = rb_str_capacity(newstr);
1305
1306 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1307 RBASIC_CLEAR_CLASS(econv_wrapper);
1308 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1309 if (!ec) return Qnil;
1310 DATA_PTR(econv_wrapper) = ec;
1311
1312 sp = (unsigned char*)ptr;
1313 start = sp;
1314 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1315 (dp = dest + converted_output),
1316 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1318 /* destination buffer short */
1319 size_t converted_input = sp - start;
1320 size_t rest = len - converted_input;
1321 converted_output = dp - dest;
1322 rb_str_set_len(newstr, converted_output);
1323 if (converted_input && converted_output &&
1324 rest < (LONG_MAX / converted_output)) {
1325 rest = (rest * converted_output) / converted_input;
1326 }
1327 else {
1328 rest = olen;
1329 }
1330 olen += rest < 2 ? 2 : rest;
1331 rb_str_resize(newstr, olen);
1332 }
1333 DATA_PTR(econv_wrapper) = 0;
1334 RB_GC_GUARD(econv_wrapper);
1335 rb_econv_close(ec);
1336 switch (ret) {
1337 case econv_finished:
1338 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1339 rb_str_set_len(newstr, len);
1340 rb_enc_associate(newstr, to);
1341 return newstr;
1342
1343 default:
1344 return Qnil;
1345 }
1346}
1347
1348VALUE
1350{
1351 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1352}
1353
1354VALUE
1356{
1357 rb_encoding *ienc;
1358 VALUE str;
1359 const int eidx = rb_enc_to_index(eenc);
1360
1361 if (!ptr) {
1362 return rb_enc_str_new(ptr, len, eenc);
1363 }
1364
1365 /* ASCII-8BIT case, no conversion */
1366 if ((eidx == rb_ascii8bit_encindex()) ||
1367 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1368 return rb_str_new(ptr, len);
1369 }
1370 /* no default_internal or same encoding, no conversion */
1371 ienc = rb_default_internal_encoding();
1372 if (!ienc || eenc == ienc) {
1373 return rb_enc_str_new(ptr, len, eenc);
1374 }
1375 /* ASCII compatible, and ASCII only string, no conversion in
1376 * default_internal */
1377 if ((eidx == rb_ascii8bit_encindex()) ||
1378 (eidx == rb_usascii_encindex()) ||
1379 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1380 return rb_enc_str_new(ptr, len, ienc);
1381 }
1382 /* convert from the given encoding to default_internal */
1383 str = rb_enc_str_new(NULL, 0, ienc);
1384 /* when the conversion failed for some reason, just ignore the
1385 * default_internal and result in the given encoding as-is. */
1386 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1387 rb_str_initialize(str, ptr, len, eenc);
1388 }
1389 return str;
1390}
1391
1392VALUE
1393rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1394{
1395 int eidx = rb_enc_to_index(eenc);
1396 if (eidx == rb_usascii_encindex() &&
1397 !is_ascii_string(str)) {
1398 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1399 return str;
1400 }
1401 rb_enc_associate_index(str, eidx);
1402 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1403}
1404
1405VALUE
1406rb_external_str_new(const char *ptr, long len)
1407{
1408 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1409}
1410
1411VALUE
1413{
1414 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1415}
1416
1417VALUE
1418rb_locale_str_new(const char *ptr, long len)
1419{
1420 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1421}
1422
1423VALUE
1425{
1426 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1427}
1428
1429VALUE
1431{
1432 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1433}
1434
1435VALUE
1437{
1438 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1439}
1440
1441VALUE
1443{
1444 return rb_str_export_to_enc(str, rb_default_external_encoding());
1445}
1446
1447VALUE
1449{
1450 return rb_str_export_to_enc(str, rb_locale_encoding());
1451}
1452
1453VALUE
1455{
1456 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1457}
1458
/*
 * Makes str2 hold the same bytes as str without touching str2's encoding.
 *
 * If str's bytes plus terminator fit into str2's embed capacity they are
 * copied in.  Otherwise str2 is pointed at the buffer of a frozen root
 * (str's existing shared root, or a new frozen copy of str) and marked as
 * shared with it.  Returns str2.
 */
1459static VALUE
1460str_replace_shared_without_enc(VALUE str2, VALUE str)
1461{
1462 const int termlen = TERM_LEN(str);
1463 char *ptr;
1464 long len;
1465
1466 RSTRING_GETMEM(str, ptr, len);
1467 if (str_embed_capa(str2) >= len + termlen) {
/* Small enough: copy the bytes into str2's embedded storage. */
1468 char *ptr2 = RSTRING(str2)->as.embed.ary;
1469 STR_SET_EMBED(str2);
1470 memcpy(ptr2, RSTRING_PTR(str), len);
1471 TERM_FILL(ptr2+len, termlen);
1472 }
1473 else {
1474 VALUE root;
1475 if (STR_SHARED_P(str)) {
1476 root = RSTRING(str)->as.heap.aux.shared;
1477 RSTRING_GETMEM(str, ptr, len);
1478 }
1479 else {
/* ptr/len are re-read from the frozen root, not from str. */
1480 root = rb_str_new_frozen(str);
1481 RSTRING_GETMEM(root, ptr, len);
1482 }
1483 RUBY_ASSERT(OBJ_FROZEN(root));
1484
/* Release str2's previous heap buffer only when str2 owns it (not
 * embedded, not shared, not STR_NOFREE) and it is not the buffer
 * about to be installed. */
1485 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1486 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1487 rb_fatal("about to free a possible shared root");
1488 }
1489 char *ptr2 = STR_HEAP_PTR(str2);
1490 if (ptr2 != ptr) {
1491 SIZED_FREE_N(ptr2, STR_HEAP_SIZE(str2));
1492 }
1493 }
1494 FL_SET(str2, STR_NOEMBED);
1495 RSTRING(str2)->as.heap.ptr = ptr;
1496 STR_SET_SHARED(str2, root);
1497 }
1498
1499 STR_SET_LEN(str2, len);
1500
1501 return str2;
1502}
1503
1504static VALUE
1505str_replace_shared(VALUE str2, VALUE str)
1506{
1507 str_replace_shared_without_enc(str2, str);
1508 rb_enc_cr_str_exact_copy(str2, str);
1509 return str2;
1510}
1511
1512static VALUE
1513str_new_shared(VALUE klass, VALUE str)
1514{
1515 return str_replace_shared(str_alloc_heap(klass), str);
1516}
1517
1518VALUE
1520{
1521 return str_new_shared(rb_obj_class(str), str);
1522}
1523
1524VALUE
1526{
1527 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1528 return str_new_frozen(rb_obj_class(orig), orig);
1529}
1530
1531static VALUE
1532rb_str_new_frozen_String(VALUE orig)
1533{
1534 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1535 return str_new_frozen(rb_cString, orig);
1536}
1537
1538
1539VALUE
1540rb_str_frozen_bare_string(VALUE orig)
1541{
1542 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1543 return str_new_frozen(rb_cString, orig);
1544}
1545
1546VALUE
1547rb_str_tmp_frozen_acquire(VALUE orig)
1548{
1549 if (OBJ_FROZEN_RAW(orig)) return orig;
1550 return str_new_frozen_buffer(0, orig, FALSE);
1551}
1552
/*
 * Variant of rb_str_tmp_frozen_acquire() whose result is never an embedded
 * string.  Returns orig itself, the result of rb_str_tmp_frozen_acquire(),
 * or a new class-less frozen heap string that either copies orig's bytes
 * or takes over orig's buffer (orig then shares it).
 */
1553VALUE
1554rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1555{
/* Fast paths: orig is already a frozen heap string that cannot be
 * re-embedded, or orig shares a heap-allocated root. */
1556 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1557 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1558
1559 VALUE str = str_alloc_heap(0);
1560 OBJ_FREEZE(str);
1561 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1562 FL_SET(str, STR_SHARED_ROOT);
1563
1564 size_t capa = str_capacity(orig, TERM_LEN(orig));
1565
1566 /* If the string is embedded then we want to create a copy that is heap
1567 * allocated. If the string is shared then the shared root must be
1568 * embedded, so we want to create a copy. If the string is a shared root
1569 * then it must be embedded, so we want to create a copy. */
1570 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT | RSTRING_FSTR)) {
/* The buffer is sized capa + TERM_LEN(orig) bytes, but only capa
 * bytes are copied. */
1571 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1572 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1573 }
1574 else {
1575 /* orig must be heap allocated and not shared, so we can safely transfer
1576 * the pointer to str. */
1577 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* STR_NOFREE moves from orig to str together with the buffer. */
1578 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1579 RBASIC(orig)->flags &= ~STR_NOFREE;
1580 STR_SET_SHARED(orig, str);
1581 if (RB_OBJ_SHAREABLE_P(orig)) {
1582 RB_OBJ_SET_SHAREABLE(str);
1583 RUBY_ASSERT((rb_gc_verify_shareable(str), 1));
1584 }
1585 }
1586
1587 RSTRING(str)->len = RSTRING(orig)->len;
/* Adjust the recorded capacity by the difference in terminator widths. */
1588 RSTRING(str)->as.heap.aux.capa = capa + (TERM_LEN(orig) - TERM_LEN(str));
1589
1590 return str;
1591}
1592
1593void
1594rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1595{
1596 if (RBASIC_CLASS(tmp) != 0)
1597 return;
1598
1599 if (STR_EMBED_P(tmp)) {
1601 }
1602 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1603 !OBJ_FROZEN_RAW(orig)) {
1604 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1605
1606 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1607 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1608 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1609
1610 /* Unshare orig since the root (tmp) only has this one child. */
1611 FL_UNSET_RAW(orig, STR_SHARED);
1612 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1613 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1615
1616 /* Make tmp embedded and empty so it is safe for sweeping. */
1617 STR_SET_EMBED(tmp);
1618 STR_SET_LEN(tmp, 0);
1619 }
1620 }
1621}
1622
1623static VALUE
1624str_new_frozen(VALUE klass, VALUE orig)
1625{
1626 return str_new_frozen_buffer(klass, orig, TRUE);
1627}
1628
1629static VALUE
1630heap_str_make_shared(VALUE klass, VALUE orig)
1631{
1632 RUBY_ASSERT(!STR_EMBED_P(orig));
1633 RUBY_ASSERT(!STR_SHARED_P(orig));
1635
1636 VALUE str = str_alloc_heap(klass);
1637 STR_SET_LEN(str, RSTRING_LEN(orig));
1638 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1639 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1640 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1641 RBASIC(orig)->flags &= ~STR_NOFREE;
1642 STR_SET_SHARED(orig, str);
1643 if (klass == 0)
1644 FL_UNSET_RAW(str, STR_BORROWED);
1645 return str;
1646}
1647
/*
 * Returns a frozen string of class klass holding orig's bytes.
 *
 * copy_encoding - when true the result carries orig's encoding and code
 *                 range; when false ASCII-8BIT and a one-byte terminator
 *                 are assumed.
 *
 * Depending on orig's layout the result is an embedded copy, a new string
 * sharing orig's root, the root itself, a heap copy, or a new root that
 * orig is made to share (heap_str_make_shared()).
 */
1648static VALUE
1649str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1650{
1651 VALUE str;
1652
1653 long len = RSTRING_LEN(orig);
1654 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1655 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1656
1657 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1658 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1659 RUBY_ASSERT(STR_EMBED_P(str));
1660 }
1661 else {
1662 if (FL_TEST_RAW(orig, STR_SHARED)) {
1663 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs/rest: bytes of the root before and after orig's window. */
1664 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1665 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1666 RUBY_ASSERT(ofs >= 0);
1667 RUBY_ASSERT(rest >= 0);
1668 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1670
/* The root can be returned directly only if orig covers it entirely
 * and class and encoding both match. */
1671 if ((ofs > 0) || (rest > 0) ||
1672 (klass != RBASIC(shared)->klass) ||
1673 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1674 str = str_new_shared(klass, shared);
1675 RUBY_ASSERT(!STR_EMBED_P(str));
1676 RSTRING(str)->as.heap.ptr += ofs;
1677 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1678 }
1679 else {
/* A class-less root handed out here is flagged STR_BORROWED. */
1680 if (RBASIC_CLASS(shared) == 0)
1681 FL_SET_RAW(shared, STR_BORROWED);
1682 return shared;
1683 }
1684 }
1685 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1686 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1687 STR_SET_EMBED(str);
1688 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1689 STR_SET_LEN(str, RSTRING_LEN(orig));
1690 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1691 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1692 }
1693 else {
/* Shareable originals get a plain copy instead of being re-pointed
 * at a new root. */
1694 if (RB_OBJ_SHAREABLE_P(orig)) {
1695 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1696 }
1697 else {
1698 str = heap_str_make_shared(klass, orig);
1699 }
1700 }
1701 }
1702
1703 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1704 OBJ_FREEZE(str);
1705 return str;
1706}
1707
1708VALUE
1709rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1710{
1711 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1712}
1713
1714static VALUE
1715str_new_empty_String(VALUE str)
1716{
1717 VALUE v = rb_str_new(0, 0);
1718 rb_enc_copy(v, str);
1719 return v;
1720}
1721
1722#define STR_BUF_MIN_SIZE 63
1723
1724VALUE
1726{
1727 if (STR_EMBEDDABLE_P(capa, 1)) {
1728 return str_alloc_embed(rb_cString, capa + 1);
1729 }
1730
1731 VALUE str = str_alloc_heap(rb_cString);
1732
1733 RSTRING(str)->as.heap.aux.capa = capa;
1734 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1735 RSTRING(str)->as.heap.ptr[0] = '\0';
1736
1737 return str;
1738}
1739
1740VALUE
1742{
1743 VALUE str;
1744 long len = strlen(ptr);
1745
1746 str = rb_str_buf_new(len);
1747 rb_str_buf_cat(str, ptr, len);
1748
1749 return str;
1750}
1751
1752VALUE
1754{
1755 return str_new(0, 0, len);
1756}
1757
1758void
1760{
1761 if (STR_EMBED_P(str)) {
1762 RB_DEBUG_COUNTER_INC(obj_str_embed);
1763 }
1764 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1765 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1766 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1767 }
1768 else {
1769 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1770 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1771 }
1772}
1773
1774size_t
1775rb_str_memsize(VALUE str)
1776{
1777 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1778 return STR_HEAP_SIZE(str);
1779 }
1780 else {
1781 return 0;
1782 }
1783}
1784
1785VALUE
1787{
1788 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1789}
1790
1791static inline void str_discard(VALUE str);
1792static void str_shared_replace(VALUE str, VALUE str2);
1793
1794void
1796{
1797 if (str != str2) str_shared_replace(str, str2);
1798}
1799
/*
 * Moves the contents of str2 into str and leaves str2 empty.
 *
 * If the bytes (plus terminator) fit in str's embed capacity they are
 * copied and str2 is left untouched; otherwise str takes over str2's heap
 * buffer (or its shared root) and str2 is reset to an empty embedded
 * string.  Encoding and code range of str2 are carried over to str.
 * str and str2 must be different objects (asserted).
 */
1800static void
1801str_shared_replace(VALUE str, VALUE str2)
1802{
1803 rb_encoding *enc;
1804 int cr;
1805 int termlen;
1806
1807 RUBY_ASSERT(str2 != str);
/* Capture encoding and code range before anything is modified. */
1808 enc = STR_ENC_GET(str2);
1809 cr = ENC_CODERANGE(str2);
1810 str_discard(str);
1811 termlen = rb_enc_mbminlen(enc);
1812
1813 STR_SET_LEN(str, RSTRING_LEN(str2));
1814
1815 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1816 STR_SET_EMBED(str);
1817 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1818 rb_enc_associate(str, enc);
1819 ENC_CODERANGE_SET(str, cr);
1820 }
1821 else {
1822 if (STR_EMBED_P(str2)) {
/* str2 is embedded but does not fit in str: move its bytes to a
 * heap buffer first so the pointer can be handed over below. */
1823 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1824 long len = RSTRING_LEN(str2);
1825 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1826
1827 char *new_ptr = ALLOC_N(char, len + termlen);
1828 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1829 RSTRING(str2)->as.heap.ptr = new_ptr;
1830 STR_SET_LEN(str2, len);
1831 RSTRING(str2)->as.heap.aux.capa = len;
1832 STR_SET_NOEMBED(str2);
1833 }
1834
1835 STR_SET_NOEMBED(str);
1836 FL_UNSET(str, STR_SHARED);
1837 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1838
/* aux holds either the shared root or the capacity, depending on
 * STR_SHARED; copy whichever applies. */
1839 if (FL_TEST(str2, STR_SHARED)) {
1840 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1841 STR_SET_SHARED(str, shared);
1842 }
1843 else {
1844 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1845 }
1846
1847 /* abandon str2 */
1848 STR_SET_EMBED(str2);
1849 RSTRING_PTR(str2)[0] = 0;
1850 STR_SET_LEN(str2, 0);
1851 rb_enc_associate(str, enc);
1852 ENC_CODERANGE_SET(str, cr);
1853 }
1854}
1855
1856VALUE
1858{
1859 VALUE str;
1860
1861 if (RB_TYPE_P(obj, T_STRING)) {
1862 return obj;
1863 }
1864 str = rb_funcall(obj, idTo_s, 0);
1865 return rb_obj_as_string_result(str, obj);
1866}
1867
1868VALUE
1869rb_obj_as_string_result(VALUE str, VALUE obj)
1870{
1871 if (!RB_TYPE_P(str, T_STRING))
1872 return rb_any_to_s(obj);
1873 return str;
1874}
1875
1876static VALUE
1877str_replace(VALUE str, VALUE str2)
1878{
1879 long len;
1880
1881 len = RSTRING_LEN(str2);
1882 if (STR_SHARED_P(str2)) {
1883 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1885 STR_SET_NOEMBED(str);
1886 STR_SET_LEN(str, len);
1887 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1888 STR_SET_SHARED(str, shared);
1889 rb_enc_cr_str_exact_copy(str, str2);
1890 }
1891 else {
1892 str_replace_shared(str, str2);
1893 }
1894
1895 return str;
1896}
1897
1898static inline VALUE
1899ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1900{
1901 size_t size = rb_str_embed_size(capa, 0);
1902 RUBY_ASSERT(size > 0);
1903 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1904
1905 NEWOBJ_OF(str, struct RString, klass,
1907
1908 str->len = 0;
1909
1910 return (VALUE)str;
1911}
1912
1913static inline VALUE
1914ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1915{
1916 NEWOBJ_OF(str, struct RString, klass,
1917 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1918
1919 str->as.heap.aux.capa = 0;
1920 str->as.heap.ptr = NULL;
1921
1922 return (VALUE)str;
1923}
1924
1925static inline VALUE
1926str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1927{
1928 int encidx = 0;
1929 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1930 encidx = rb_enc_get_index(str);
1931 flags &= ~ENCODING_MASK;
1932 }
1933 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1934 if (encidx) rb_enc_associate_index(dup, encidx);
1935 return dup;
1936}
1937
1938static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1939
1940static inline VALUE
1941str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1942{
1943 VALUE flags = FL_TEST_RAW(str, flag_mask);
1944 long len = RSTRING_LEN(str);
1945
1946 RUBY_ASSERT(STR_EMBED_P(dup));
1947 RUBY_ASSERT(str_embed_capa(dup) >= len + TERM_LEN(str));
1948 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + TERM_LEN(str));
1949 STR_SET_LEN(dup, RSTRING_LEN(str));
1950 return str_duplicate_setup_encoding(str, dup, flags);
1951}
1952
1953static inline VALUE
1954str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1955{
1956 VALUE flags = FL_TEST_RAW(str, flag_mask);
1957 VALUE root = str;
1958 if (FL_TEST_RAW(str, STR_SHARED)) {
1959 root = RSTRING(str)->as.heap.aux.shared;
1960 }
1961 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1962 root = str = str_new_frozen(klass, str);
1963 flags = FL_TEST_RAW(str, flag_mask);
1964 }
1965 RUBY_ASSERT(!STR_SHARED_P(root));
1967
1968 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1969 FL_SET_RAW(dup, RSTRING_NOEMBED);
1970 STR_SET_SHARED(dup, root);
1971 flags |= RSTRING_NOEMBED | STR_SHARED;
1972
1973 STR_SET_LEN(dup, RSTRING_LEN(str));
1974 return str_duplicate_setup_encoding(str, dup, flags);
1975}
1976
1977static inline VALUE
1978str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1979{
1980 if (STR_EMBED_P(str)) {
1981 return str_duplicate_setup_embed(klass, str, dup);
1982 }
1983 else {
1984 return str_duplicate_setup_heap(klass, str, dup);
1985 }
1986}
1987
1988static inline VALUE
1989str_duplicate(VALUE klass, VALUE str)
1990{
1991 VALUE dup;
1992 if (STR_EMBED_P(str)) {
1993 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1994 }
1995 else {
1996 dup = str_alloc_heap(klass);
1997 }
1998
1999 return str_duplicate_setup(klass, str, dup);
2000}
2001
2002VALUE
2004{
2005 return str_duplicate(rb_obj_class(str), str);
2006}
2007
2008/* :nodoc: */
2009VALUE
2010rb_str_dup_m(VALUE str)
2011{
2012 if (LIKELY(BARE_STRING_P(str))) {
2013 return str_duplicate(rb_cString, str);
2014 }
2015 else {
2016 return rb_obj_dup(str);
2017 }
2018}
2019
2020VALUE
2022{
2023 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2024 return str_duplicate(rb_cString, str);
2025}
2026
2027VALUE
2028rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
2029{
2030 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
2031 VALUE new_str, klass = rb_cString;
2032
2033 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
2034 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
2035 str_duplicate_setup_embed(klass, str, new_str);
2036 }
2037 else {
2038 new_str = ec_str_alloc_heap(ec, klass);
2039 str_duplicate_setup_heap(klass, str, new_str);
2040 }
2041 if (chilled) {
2042 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
2043 }
2044 return new_str;
2045}
2046
2047VALUE
2048rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2049{
2050 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2051 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2052 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2053 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2054 return rb_str_freeze(str);
2055}
2056
2057/*
2058 * The documentation block below uses an include (instead of inline text)
2059 * because the included text has non-ASCII characters (which are not allowed in a C file).
2060 */
2061
2062/*
2063 *
2064 * call-seq:
2065 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2066 *
2067 * :include: doc/string/new.rdoc
2068 *
2069 */
2070
2071static VALUE
2072rb_str_init(int argc, VALUE *argv, VALUE str)
2073{
2074 static ID keyword_ids[2];
2075 VALUE orig, opt, venc, vcapa;
2076 VALUE kwargs[2];
2077 rb_encoding *enc = 0;
2078 int n;
2079
/* Keyword IDs (encoding, capacity) are resolved once and cached. */
2080 if (!keyword_ids[0]) {
2081 keyword_ids[0] = rb_id_encoding();
2082 CONST_ID(keyword_ids[1], "capacity");
2083 }
2084
/* n is the number of positional arguments given (0 or 1). */
2085 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2086 if (!NIL_P(opt)) {
2087 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2088 venc = kwargs[0];
2089 vcapa = kwargs[1];
2090 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2091 enc = rb_to_encoding(venc);
2092 }
2093 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2094 long capa = NUM2LONG(vcapa);
2095 long len = 0;
2096 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2097
/* The effective capacity is at least STR_BUF_MIN_SIZE and at least
 * the length of the source string. */
2098 if (capa < STR_BUF_MIN_SIZE) {
2099 capa = STR_BUF_MIN_SIZE;
2100 }
2101 if (n == 1) {
2102 StringValue(orig);
2103 len = RSTRING_LEN(orig);
2104 if (capa < len) {
2105 capa = len;
2106 }
/* Initializing from itself: skip the copy step further down. */
2107 if (orig == str) n = 0;
2108 }
2109 str_modifiable(str);
2110 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2111 /* make noembed always */
2112 const size_t size = (size_t)capa + termlen;
2113 const char *const old_ptr = RSTRING_PTR(str);
2114 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2115 char *new_ptr = ALLOC_N(char, size);
2116 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2117 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2118 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2119 RSTRING(str)->as.heap.ptr = new_ptr;
2120 }
2121 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
/* str already owns a heap buffer: resize it in place. */
2122 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2123 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2124 }
2125 STR_SET_LEN(str, len);
2126 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2127 if (n == 1) {
2128 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2129 rb_enc_cr_str_exact_copy(str, orig);
2130 }
2131 FL_SET(str, STR_NOEMBED);
2132 RSTRING(str)->as.heap.aux.capa = capa;
2133 }
2134 else if (n == 1) {
2135 rb_str_replace(str, orig);
2136 }
/* An explicit encoding: keyword is applied after the contents. */
2137 if (enc) {
2138 rb_enc_associate(str, enc);
2140 }
2141 }
2142 else if (n == 1) {
2143 rb_str_replace(str, orig);
2144 }
2145 return str;
2146}
2147
2148/* :nodoc: */
2149static VALUE
2150rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2151{
/* Subclasses always go through the generic allocate + initialize path. */
2152 if (klass != rb_cString) {
2153 return rb_class_new_instance_pass_kw(argc, argv, klass);
2154 }
2155
2156 static ID keyword_ids[2];
2157 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2158 VALUE kwargs[2];
2159 rb_encoding *enc = NULL;
2160
/* Without keyword arguments the generic path is used as well. */
2161 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2162 if (NIL_P(opt)) {
2163 return rb_class_new_instance_pass_kw(argc, argv, klass);
2164 }
2165
2166 keyword_ids[0] = rb_id_encoding();
2167 CONST_ID(keyword_ids[1], "capacity");
2168 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2169 encoding = kwargs[0];
2170 capacity = kwargs[1];
2171
2172 if (n == 1) {
2173 orig = StringValue(orig);
2174 }
2175 else {
2176 orig = Qnil;
2177 }
2178
/* No encoding: keyword given: inherit the source string's encoding. */
2179 if (UNDEF_P(encoding)) {
2180 if (!NIL_P(orig)) {
2181 encoding = rb_obj_encoding(orig);
2182 }
2183 }
2184
2185 if (!UNDEF_P(encoding)) {
2186 enc = rb_to_encoding(encoding);
2187 }
2188
2189 // If capacity is nil, we're basically just duping `orig`.
2190 if (UNDEF_P(capacity)) {
2191 if (NIL_P(orig)) {
2192 VALUE empty_str = str_new(klass, "", 0);
2193 if (enc) {
2194 rb_enc_associate(empty_str, enc);
2195 }
2196 return empty_str;
2197 }
2198 VALUE copy = str_duplicate(klass, orig);
2199 rb_enc_associate(copy, enc);
2200 ENC_CODERANGE_CLEAR(copy);
2201 return copy;
2202 }
2203
/* Negative capacities are clamped to 0; the source string's capacity
 * acts as a lower bound. */
2204 long capa = 0;
2205 capa = NUM2LONG(capacity);
2206 if (capa < 0) {
2207 capa = 0;
2208 }
2209
2210 if (!NIL_P(orig)) {
2211 long orig_capa = rb_str_capacity(orig);
2212 if (orig_capa > capa) {
2213 capa = orig_capa;
2214 }
2215 }
2216
2217 VALUE str = str_enc_new(klass, NULL, capa, enc);
2218 STR_SET_LEN(str, 0);
/* NOTE(review): the terminator width here is rb_enc_mbmaxlen(), while
 * rb_str_init() uses rb_enc_mbminlen() for the same purpose -- confirm
 * the buffer always has room for mbmaxlen bytes when capa is small. */
2219 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2220
2221 if (!NIL_P(orig)) {
2222 rb_str_buf_append(str, orig);
2223 }
2224
2225 return str;
2226}
2227
2228#ifdef NONASCII_MASK
2229#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2230
2231/*
2232 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2233 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2234 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2235 *
2236 * if (!(byte & 0x80))
2237 * byte |= 0x40; // turn on bit6
2238 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2239 *
2240 * This function calculates whether a byte is leading or not for all bytes
2241 * in the argument word by concurrently using the above logic, and then
2242 * adds up the number of leading bytes in the word.
2243 */
/* Returns the number of UTF-8 leading bytes in the machine word at s. */
2244static inline uintptr_t
2245count_utf8_lead_bytes_with_word(const uintptr_t *s)
2246{
2247 uintptr_t d = *s;
2248
2249 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2250 d = (d>>6) | (~d>>7);
2251 d &= NONASCII_MASK >> 7;
2252
2253 /* Gather all bytes. */
2254#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2255 /* use only if it can use POPCNT */
2256 return rb_popcount_intptr(d);
2257#else
/* Fold the per-byte 0/1 flags together by repeated shift-and-add; the
 * total ends up in the low nibble. */
2258 d += (d>>8);
2259 d += (d>>16);
2260# if SIZEOF_VOIDP == 8
2261 d += (d>>32);
2262# endif
2263 return (d&0xF);
2264#endif
2265}
2266#endif
2267
/*
 * Counts the characters in the byte range [p, e) for encoding enc.
 * cr is the already-known code range of the data and selects the
 * counting strategy; it is not modified.
 */
2268static inline long
2269enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2270{
2271 long c;
2272 const char *q;
2273
/* Fixed-width encoding: divide, rounding a trailing partial
 * character up to one. */
2274 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2275 long diff = (long)(e - p);
2276 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2277 }
2278#ifdef NONASCII_MASK
/* Valid UTF-8: count leading bytes, a word at a time over the
 * aligned middle part and byte by byte at both ends. */
2279 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2280 uintptr_t len = 0;
2281 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2282 const uintptr_t *s, *t;
2283 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2284 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2285 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2286 while (p < (const char *)s) {
2287 if (is_utf8_lead_byte(*p)) len++;
2288 p++;
2289 }
2290 while (s < t) {
2291 len += count_utf8_lead_bytes_with_word(s);
2292 s++;
2293 }
2294 p = (const char *)s;
2295 }
2296 while (p < e) {
2297 if (is_utf8_lead_byte(*p)) len++;
2298 p++;
2299 }
2300 return (long)len;
2301 }
2302#endif
/* ASCII-compatible encoding: skip runs of ASCII with
 * search_nonascii() and measure each non-ASCII character. */
2303 else if (rb_enc_asciicompat(enc)) {
2304 c = 0;
2305 if (ENC_CODERANGE_CLEAN_P(cr)) {
2306 while (p < e) {
2307 if (ISASCII(*p)) {
2308 q = search_nonascii(p, e);
2309 if (!q)
2310 return c + (e - p);
2311 c += q - p;
2312 p = q;
2313 }
2314 p += rb_enc_fast_mbclen(p, e, enc);
2315 c++;
2316 }
2317 }
2318 else {
2319 while (p < e) {
2320 if (ISASCII(*p)) {
2321 q = search_nonascii(p, e);
2322 if (!q)
2323 return c + (e - p);
2324 c += q - p;
2325 p = q;
2326 }
2327 p += rb_enc_mbclen(p, e, enc);
2328 c++;
2329 }
2330 }
2331 return c;
2332 }
2333
/* Anything else: step one character at a time. */
2334 for (c=0; p<e; c++) {
2335 p += rb_enc_mbclen(p, e, enc);
2336 }
2337 return c;
2338}
2339
2340long
2341rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2342{
2343 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2344}
2345
2346/* To get strlen with cr
2347 * Note that given cr is not used.
2348 */
2349long
2350rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2351{
2352 long c;
2353 const char *q;
2354 int ret;
2355
2356 *cr = 0;
2357 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2358 long diff = (long)(e - p);
2359 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2360 }
2361 else if (rb_enc_asciicompat(enc)) {
2362 c = 0;
2363 while (p < e) {
2364 if (ISASCII(*p)) {
2365 q = search_nonascii(p, e);
2366 if (!q) {
2367 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2368 return c + (e - p);
2369 }
2370 c += q - p;
2371 p = q;
2372 }
2373 ret = rb_enc_precise_mbclen(p, e, enc);
2374 if (MBCLEN_CHARFOUND_P(ret)) {
2375 *cr |= ENC_CODERANGE_VALID;
2376 p += MBCLEN_CHARFOUND_LEN(ret);
2377 }
2378 else {
2380 p++;
2381 }
2382 c++;
2383 }
2384 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2385 return c;
2386 }
2387
2388 for (c=0; p<e; c++) {
2389 ret = rb_enc_precise_mbclen(p, e, enc);
2390 if (MBCLEN_CHARFOUND_P(ret)) {
2391 *cr |= ENC_CODERANGE_VALID;
2392 p += MBCLEN_CHARFOUND_LEN(ret);
2393 }
2394 else {
2396 if (p + rb_enc_mbminlen(enc) <= e)
2397 p += rb_enc_mbminlen(enc);
2398 else
2399 p = e;
2400 }
2401 }
2402 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2403 return c;
2404}
2405
2406/* enc must be str's enc or rb_enc_check(str, str2) */
2407static long
2408str_strlen(VALUE str, rb_encoding *enc)
2409{
2410 const char *p, *e;
2411 int cr;
2412
2413 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2414 if (!enc) enc = STR_ENC_GET(str);
2415 p = RSTRING_PTR(str);
2416 e = RSTRING_END(str);
2417 cr = ENC_CODERANGE(str);
2418
2419 if (cr == ENC_CODERANGE_UNKNOWN) {
2420 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2421 if (cr) ENC_CODERANGE_SET(str, cr);
2422 return n;
2423 }
2424 else {
2425 return enc_strlen(p, e, enc, cr);
2426 }
2427}
2428
2429long
2431{
2432 return str_strlen(str, NULL);
2433}
2434
2435/*
2436 * call-seq:
2437 * length -> integer
2438 *
2439 * :include: doc/string/length.rdoc
2440 *
2441 */
2442
2443VALUE
2445{
2446 return LONG2NUM(str_strlen(str, NULL));
2447}
2448
2449/*
2450 * call-seq:
2451 * bytesize -> integer
2452 *
2453 * :include: doc/string/bytesize.rdoc
2454 *
2455 */
2456
2457VALUE
2458rb_str_bytesize(VALUE str)
2459{
2460 return LONG2NUM(RSTRING_LEN(str));
2461}
2462
2463/*
2464 * call-seq:
2465 * empty? -> true or false
2466 *
2467 * Returns whether the length of +self+ is zero:
2468 *
2469 * 'hello'.empty? # => false
2470 * ' '.empty? # => false
2471 * ''.empty? # => true
2472 *
2473 * Related: see {Querying}[rdoc-ref:String@Querying].
2474 */
2475
2476static VALUE
2477rb_str_empty(VALUE str)
2478{
2479 return RBOOL(RSTRING_LEN(str) == 0);
2480}
2481
2482/*
2483 * call-seq:
2484 * self + other_string -> new_string
2485 *
2486 * Returns a new string containing +other_string+ concatenated to +self+:
2487 *
2488 * 'Hello from ' + self.to_s # => "Hello from main"
2489 *
2490 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2491 */
2492
2493VALUE
2495{
2496 VALUE str3;
2497 rb_encoding *enc;
2498 char *ptr1, *ptr2, *ptr3;
2499 long len1, len2;
2500 int termlen;
2501
2502 StringValue(str2);
2503 enc = rb_enc_check_str(str1, str2);
2504 RSTRING_GETMEM(str1, ptr1, len1);
2505 RSTRING_GETMEM(str2, ptr2, len2);
2506 termlen = rb_enc_mbminlen(enc);
2507 if (len1 > LONG_MAX - len2) {
2508 rb_raise(rb_eArgError, "string size too big");
2509 }
2510 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2511 ptr3 = RSTRING_PTR(str3);
2512 memcpy(ptr3, ptr1, len1);
2513 memcpy(ptr3+len1, ptr2, len2);
2514 TERM_FILL(&ptr3[len1+len2], termlen);
2515
2516 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2518 RB_GC_GUARD(str1);
2519 RB_GC_GUARD(str2);
2520 return str3;
2521}
2522
2523/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2524VALUE
2525rb_str_opt_plus(VALUE str1, VALUE str2)
2526{
2529 long len1, len2;
2530 MAYBE_UNUSED(char) *ptr1, *ptr2;
2531 RSTRING_GETMEM(str1, ptr1, len1);
2532 RSTRING_GETMEM(str2, ptr2, len2);
2533 int enc1 = rb_enc_get_index(str1);
2534 int enc2 = rb_enc_get_index(str2);
2535
2536 if (enc1 < 0) {
2537 return Qundef;
2538 }
2539 else if (enc2 < 0) {
2540 return Qundef;
2541 }
2542 else if (enc1 != enc2) {
2543 return Qundef;
2544 }
2545 else if (len1 > LONG_MAX - len2) {
2546 return Qundef;
2547 }
2548 else {
2549 return rb_str_plus(str1, str2);
2550 }
2551
2552}
2553
2554/*
2555 * call-seq:
2556 * self * n -> new_string
2557 *
2558 * Returns a new string containing +n+ copies of +self+:
2559 *
2560 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2561 * 'No!' * 0 # => ""
2562 *
2563 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2564 */
2565
2566VALUE
2568{
2569 VALUE str2;
2570 long n, len;
2571 char *ptr2;
2572 int termlen;
2573
2574 if (times == INT2FIX(1)) {
2575 return str_duplicate(rb_cString, str);
2576 }
2577 if (times == INT2FIX(0)) {
2578 str2 = str_alloc_embed(rb_cString, 0);
2579 rb_enc_copy(str2, str);
2580 return str2;
2581 }
2582 len = NUM2LONG(times);
2583 if (len < 0) {
2584 rb_raise(rb_eArgError, "negative argument");
2585 }
2586 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2587 if (STR_EMBEDDABLE_P(len, 1)) {
2588 str2 = str_alloc_embed(rb_cString, len + 1);
2589 memset(RSTRING_PTR(str2), 0, len + 1);
2590 }
2591 else {
2592 str2 = str_alloc_heap(rb_cString);
2593 RSTRING(str2)->as.heap.aux.capa = len;
2594 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2595 }
2596 STR_SET_LEN(str2, len);
2597 rb_enc_copy(str2, str);
2598 return str2;
2599 }
2600 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2601 rb_raise(rb_eArgError, "argument too big");
2602 }
2603
2604 len *= RSTRING_LEN(str);
2605 termlen = TERM_LEN(str);
2606 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2607 ptr2 = RSTRING_PTR(str2);
2608 if (len) {
2609 n = RSTRING_LEN(str);
2610 memcpy(ptr2, RSTRING_PTR(str), n);
2611 while (n <= len/2) {
2612 memcpy(ptr2 + n, ptr2, n);
2613 n *= 2;
2614 }
2615 memcpy(ptr2 + n, ptr2, len-n);
2616 }
2617 STR_SET_LEN(str2, len);
2618 TERM_FILL(&ptr2[len], termlen);
2619 rb_enc_cr_str_copy_for_substr(str2, str);
2620
2621 return str2;
2622}
2623
2624/*
2625 * call-seq:
2626 * self % object -> new_string
2627 *
2628 * Returns the result of formatting +object+ into the format specifications
2629 * contained in +self+
2630 * (see {Format Specifications}[rdoc-ref:language/format_specifications.rdoc]):
2631 *
2632 * '%05d' % 123 # => "00123"
2633 *
2634 * If +self+ contains multiple format specifications,
2635 * +object+ must be an array or hash containing the objects to be formatted:
2636 *
2637 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2638 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2639 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2640 *
2641 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2642 */
2643
2644static VALUE
2645rb_str_format_m(VALUE str, VALUE arg)
2646{
2647 VALUE tmp = rb_check_array_type(arg);
2648
2649 if (!NIL_P(tmp)) {
2650 VALUE result = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2651 RB_GC_GUARD(tmp);
2652 return result;
2653 }
2654 return rb_str_format(1, &arg, str);
2655}
2656
2657static inline void
2658rb_check_lockedtmp(VALUE str)
2659{
2660 if (FL_TEST(str, STR_TMPLOCK)) {
2661 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2662 }
2663}
2664
2665// If none of these flags are set, we know we have an modifiable string.
2666// If any is set, we need to do more detailed checks.
2667#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2668static inline void
2669str_modifiable(VALUE str)
2670{
2671 RUBY_ASSERT(ruby_thread_has_gvl_p());
2672
2673 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2674 if (CHILLED_STRING_P(str)) {
2675 CHILLED_STRING_MUTATED(str);
2676 }
2677 rb_check_lockedtmp(str);
2678 rb_check_frozen(str);
2679 }
2680}
2681
2682static inline int
2683str_dependent_p(VALUE str)
2684{
2685 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2686 return FALSE;
2687 }
2688 else {
2689 return TRUE;
2690 }
2691}
2692
2693// If none of these flags are set, we know we have an independent string.
2694// If any is set, we need to do more detailed checks.
2695#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2696static inline int
2697str_independent(VALUE str)
2698{
2699 RUBY_ASSERT(ruby_thread_has_gvl_p());
2700
2701 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2702 str_modifiable(str);
2703 return !str_dependent_p(str);
2704 }
2705 return TRUE;
2706}
2707
2708static void
2709str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2710{
2711 RUBY_ASSERT(ruby_thread_has_gvl_p());
2712
2713 char *ptr;
2714 char *oldptr;
2715 long capa = len + expand;
2716
2717 if (len > capa) len = capa;
2718
2719 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2720 ptr = RSTRING(str)->as.heap.ptr;
2721 STR_SET_EMBED(str);
2722 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2723 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2724 STR_SET_LEN(str, len);
2725 return;
2726 }
2727
2728 ptr = ALLOC_N(char, (size_t)capa + termlen);
2729 oldptr = RSTRING_PTR(str);
2730 if (oldptr) {
2731 memcpy(ptr, oldptr, len);
2732 }
2733 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2734 SIZED_FREE_N(oldptr, STR_HEAP_SIZE(str));
2735 }
2736 STR_SET_NOEMBED(str);
2737 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2738 TERM_FILL(ptr + len, termlen);
2739 RSTRING(str)->as.heap.ptr = ptr;
2740 STR_SET_LEN(str, len);
2741 RSTRING(str)->as.heap.aux.capa = capa;
2742}
2743
2744void
2745rb_str_modify(VALUE str)
2746{
2747 if (!str_independent(str))
2748 str_make_independent(str);
2750}
2751
2752void
2754{
2755 RUBY_ASSERT(ruby_thread_has_gvl_p());
2756
2757 int termlen = TERM_LEN(str);
2758 long len = RSTRING_LEN(str);
2759
2760 if (expand < 0) {
2761 rb_raise(rb_eArgError, "negative expanding string size");
2762 }
2763 if (expand >= LONG_MAX - len) {
2764 rb_raise(rb_eArgError, "string size too big");
2765 }
2766
2767 if (!str_independent(str)) {
2768 str_make_independent_expand(str, len, expand, termlen);
2769 }
2770 else if (expand > 0) {
2771 RESIZE_CAPA_TERM(str, len + expand, termlen);
2772 }
2774}
2775
2776/* As rb_str_modify(), but don't clear coderange */
2777static void
2778str_modify_keep_cr(VALUE str)
2779{
2780 if (!str_independent(str))
2781 str_make_independent(str);
2783 /* Force re-scan later */
2785}
2786
2787static inline void
2788str_discard(VALUE str)
2789{
2790 str_modifiable(str);
2791 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2792 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2793 RSTRING(str)->as.heap.ptr = 0;
2794 STR_SET_LEN(str, 0);
2795 }
2796}
2797
2798void
2800{
2801 int encindex = rb_enc_get_index(str);
2802
2803 if (RB_UNLIKELY(encindex == -1)) {
2804 rb_raise(rb_eTypeError, "not encoding capable object");
2805 }
2806
2807 if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) {
2808 return;
2809 }
2810
2811 rb_encoding *enc = rb_enc_from_index(encindex);
2812 if (!rb_enc_asciicompat(enc)) {
2813 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2814 }
2815}
2816
2817VALUE
2819{
2820 RUBY_ASSERT(ruby_thread_has_gvl_p());
2821
2822 VALUE s = *ptr;
2823 if (!RB_TYPE_P(s, T_STRING)) {
2824 s = rb_str_to_str(s);
2825 *ptr = s;
2826 }
2827 return s;
2828}
2829
2830char *
2832{
2833 VALUE str = rb_string_value(ptr);
2834 return RSTRING_PTR(str);
2835}
2836
2837static const char *
2838str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2839{
2840 const char *e = s + len;
2841
2842 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2843 if (zero_filled(s, minlen)) return s;
2844 }
2845 return 0;
2846}
2847
2848static char *
2849str_fill_term(VALUE str, char *s, long len, int termlen)
2850{
2851 /* This function assumes that (capa + termlen) bytes of memory
2852 * is allocated, like many other functions in this file.
2853 */
2854 if (str_dependent_p(str)) {
2855 if (!zero_filled(s + len, termlen))
2856 str_make_independent_expand(str, len, 0L, termlen);
2857 }
2858 else {
2859 TERM_FILL(s + len, termlen);
2860 return s;
2861 }
2862 return RSTRING_PTR(str);
2863}
2864
2865void
2866rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2867{
2868 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2869 long len = RSTRING_LEN(str);
2870
2871 RUBY_ASSERT(capa >= len);
2872 if (capa - len < termlen) {
2873 rb_check_lockedtmp(str);
2874 str_make_independent_expand(str, len, 0L, termlen);
2875 }
2876 else if (str_dependent_p(str)) {
2877 if (termlen > oldtermlen)
2878 str_make_independent_expand(str, len, 0L, termlen);
2879 }
2880 else {
2881 if (!STR_EMBED_P(str)) {
2882 /* modify capa instead of realloc */
2883 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2884 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2885 }
2886 if (termlen > oldtermlen) {
2887 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2888 }
2889 }
2890
2891 return;
2892}
2893
2894static char *
2895str_null_check(VALUE str, int *w)
2896{
2897 char *s = RSTRING_PTR(str);
2898 long len = RSTRING_LEN(str);
2899 int minlen = 1;
2900
2901 if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) {
2902 rb_encoding *enc = rb_str_enc_get(str);
2903 minlen = rb_enc_mbminlen(enc);
2904
2905 if (minlen > 1) {
2906 *w = 1;
2907 if (str_null_char(s, len, minlen, enc)) {
2908 return NULL;
2909 }
2910 return str_fill_term(str, s, len, minlen);
2911 }
2912 }
2913
2914 *w = 0;
2915 if (!s || memchr(s, 0, len)) {
2916 return NULL;
2917 }
2918 if (s[len]) {
2919 s = str_fill_term(str, s, len, minlen);
2920 }
2921 return s;
2922}
2923
2924const char *
2925rb_str_null_check(VALUE str)
2926{
2928
2929 char *s;
2930 long len;
2931 RSTRING_GETMEM(str, s, len);
2932
2933 if (RB_LIKELY(rb_str_enc_fastpath(str))) {
2934 if (!s || memchr(s, 0, len)) {
2935 rb_raise(rb_eArgError, "string contains null byte");
2936 }
2937 }
2938 else {
2939 int w;
2940 const char *s = str_null_check(str, &w);
2941 if (!s) {
2942 if (w) {
2943 rb_raise(rb_eArgError, "string contains null char");
2944 }
2945 rb_raise(rb_eArgError, "string contains null byte");
2946 }
2947 }
2948
2949 return s;
2950}
2951
2952char *
2953rb_str_to_cstr(VALUE str)
2954{
2955 int w;
2956 return str_null_check(str, &w);
2957}
2958
2959char *
2961{
2962 VALUE str = rb_string_value(ptr);
2963 int w;
2964 char *s = str_null_check(str, &w);
2965 if (!s) {
2966 if (w) {
2967 rb_raise(rb_eArgError, "string contains null char");
2968 }
2969 rb_raise(rb_eArgError, "string contains null byte");
2970 }
2971 return s;
2972}
2973
2974char *
2975rb_str_fill_terminator(VALUE str, const int newminlen)
2976{
2977 char *s = RSTRING_PTR(str);
2978 long len = RSTRING_LEN(str);
2979 return str_fill_term(str, s, len, newminlen);
2980}
2981
2982VALUE
2984{
2985 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2986 return str;
2987}
2988
2989/*
2990 * call-seq:
2991 * String.try_convert(object) -> object, new_string, or nil
2992 *
2993 * Attempts to convert the given +object+ to a string.
2994 *
2995 * If +object+ is already a string, returns +object+, unmodified.
2996 *
2997 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2998 * calls <tt>object.to_str</tt> and returns the result.
2999 *
3000 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
3001 *
3002 * Raises an exception unless <tt>object.to_str</tt> returns a string.
3003 */
3004static VALUE
3005rb_str_s_try_convert(VALUE dummy, VALUE str)
3006{
3007 return rb_check_string_type(str);
3008}
3009
3010static char*
3011str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
3012{
3013 long nth = *nthp;
3014 if (rb_enc_mbmaxlen(enc) == 1) {
3015 p += nth;
3016 }
3017 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3018 p += nth * rb_enc_mbmaxlen(enc);
3019 }
3020 else if (rb_enc_asciicompat(enc)) {
3021 const char *p2, *e2;
3022 int n;
3023
3024 while (p < e && 0 < nth) {
3025 e2 = p + nth;
3026 if (e < e2) {
3027 *nthp = nth;
3028 return (char *)e;
3029 }
3030 if (ISASCII(*p)) {
3031 p2 = search_nonascii(p, e2);
3032 if (!p2) {
3033 nth -= e2 - p;
3034 *nthp = nth;
3035 return (char *)e2;
3036 }
3037 nth -= p2 - p;
3038 p = p2;
3039 }
3040 n = rb_enc_mbclen(p, e, enc);
3041 p += n;
3042 nth--;
3043 }
3044 *nthp = nth;
3045 if (nth != 0) {
3046 return (char *)e;
3047 }
3048 return (char *)p;
3049 }
3050 else {
3051 while (p < e && nth--) {
3052 p += rb_enc_mbclen(p, e, enc);
3053 }
3054 }
3055 if (p > e) p = e;
3056 *nthp = nth;
3057 return (char*)p;
3058}
3059
3060char*
3061rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
3062{
3063 return str_nth_len(p, e, &nth, enc);
3064}
3065
3066static char*
3067str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3068{
3069 if (singlebyte)
3070 p += nth;
3071 else {
3072 p = str_nth_len(p, e, &nth, enc);
3073 }
3074 if (!p) return 0;
3075 if (p > e) p = e;
3076 return (char *)p;
3077}
3078
3079/* char offset to byte offset */
3080static long
3081str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3082{
3083 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3084 if (!pp) return e - p;
3085 return pp - p;
3086}
3087
3088long
3089rb_str_offset(VALUE str, long pos)
3090{
3091 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3092 STR_ENC_GET(str), single_byte_optimizable(str));
3093}
3094
3095#ifdef NONASCII_MASK
3096static char *
3097str_utf8_nth(const char *p, const char *e, long *nthp)
3098{
3099 long nth = *nthp;
3100 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3101 const uintptr_t *s, *t;
3102 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3103 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3104 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3105 while (p < (const char *)s) {
3106 if (is_utf8_lead_byte(*p)) nth--;
3107 p++;
3108 }
3109 do {
3110 nth -= count_utf8_lead_bytes_with_word(s);
3111 s++;
3112 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3113 p = (char *)s;
3114 }
3115 while (p < e) {
3116 if (is_utf8_lead_byte(*p)) {
3117 if (nth == 0) break;
3118 nth--;
3119 }
3120 p++;
3121 }
3122 *nthp = nth;
3123 return (char *)p;
3124}
3125
3126static long
3127str_utf8_offset(const char *p, const char *e, long nth)
3128{
3129 const char *pp = str_utf8_nth(p, e, &nth);
3130 return pp - p;
3131}
3132#endif
3133
3134/* byte offset to char offset */
3135long
3136rb_str_sublen(VALUE str, long pos)
3137{
3138 if (single_byte_optimizable(str) || pos < 0)
3139 return pos;
3140 else {
3141 char *p = RSTRING_PTR(str);
3142 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3143 }
3144}
3145
3146static VALUE
3147str_subseq(VALUE str, long beg, long len)
3148{
3149 VALUE str2;
3150
3151 RUBY_ASSERT(beg >= 0);
3152 RUBY_ASSERT(len >= 0);
3153 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3154
3155 const int termlen = TERM_LEN(str);
3156 if (!SHARABLE_SUBSTRING_P(str, beg, len)) {
3157 str2 = rb_enc_str_new(RSTRING_PTR(str) + beg, len, rb_str_enc_get(str));
3158 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3160 }
3161 RB_GC_GUARD(str);
3162 return str2;
3163 }
3164
3165 str2 = str_alloc_heap(rb_cString);
3166 if (str_embed_capa(str2) >= len + termlen) {
3167 char *ptr2 = RSTRING(str2)->as.embed.ary;
3168 STR_SET_EMBED(str2);
3169 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3170 TERM_FILL(ptr2+len, termlen);
3171
3172 STR_SET_LEN(str2, len);
3173 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
3175 }
3176
3177 RB_GC_GUARD(str);
3178 }
3179 else {
3180 str_replace_shared(str2, str);
3181 RUBY_ASSERT(!STR_EMBED_P(str2));
3182 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3183 ENC_CODERANGE_CLEAR(str2);
3184 }
3185
3186 RSTRING(str2)->as.heap.ptr += beg;
3187 if (RSTRING_LEN(str2) > len) {
3188 STR_SET_LEN(str2, len);
3189 }
3190 }
3191
3192 return str2;
3193}
3194
3195VALUE
3196rb_str_subseq(VALUE str, long beg, long len)
3197{
3198 VALUE str2 = str_subseq(str, beg, len);
3199 rb_enc_cr_str_copy_for_substr(str2, str);
3200 return str2;
3201}
3202
3203char *
3204rb_str_subpos(VALUE str, long beg, long *lenp)
3205{
3206 long len = *lenp;
3207 long slen = -1L;
3208 const long blen = RSTRING_LEN(str);
3209 rb_encoding *enc = STR_ENC_GET(str);
3210 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3211
3212 if (len < 0) return 0;
3213 if (beg < 0 && -beg < 0) return 0;
3214 if (!blen) {
3215 len = 0;
3216 }
3217 if (single_byte_optimizable(str)) {
3218 if (beg > blen) return 0;
3219 if (beg < 0) {
3220 beg += blen;
3221 if (beg < 0) return 0;
3222 }
3223 if (len > blen - beg)
3224 len = blen - beg;
3225 if (len < 0) return 0;
3226 p = s + beg;
3227 goto end;
3228 }
3229 if (beg < 0) {
3230 if (len > -beg) len = -beg;
3231 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3232 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3233 beg = -beg;
3234 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3235 p = e;
3236 if (!p) return 0;
3237 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3238 if (!p) return 0;
3239 len = e - p;
3240 goto end;
3241 }
3242 else {
3243 slen = str_strlen(str, enc);
3244 beg += slen;
3245 if (beg < 0) return 0;
3246 p = s + beg;
3247 if (len == 0) goto end;
3248 }
3249 }
3250 else if (beg > 0 && beg > blen) {
3251 return 0;
3252 }
3253 if (len == 0) {
3254 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3255 p = s + beg;
3256 }
3257#ifdef NONASCII_MASK
3258 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3259 enc == rb_utf8_encoding()) {
3260 p = str_utf8_nth(s, e, &beg);
3261 if (beg > 0) return 0;
3262 len = str_utf8_offset(p, e, len);
3263 }
3264#endif
3265 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3266 int char_sz = rb_enc_mbmaxlen(enc);
3267
3268 p = s + beg * char_sz;
3269 if (p > e) {
3270 return 0;
3271 }
3272 else if (len * char_sz > e - p)
3273 len = e - p;
3274 else
3275 len *= char_sz;
3276 }
3277 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3278 if (beg > 0) return 0;
3279 len = 0;
3280 }
3281 else {
3282 len = str_offset(p, e, len, enc, 0);
3283 }
3284 end:
3285 *lenp = len;
3286 RB_GC_GUARD(str);
3287 return p;
3288}
3289
3290static VALUE str_substr(VALUE str, long beg, long len, int empty);
3291
3292VALUE
3293rb_str_substr(VALUE str, long beg, long len)
3294{
3295 return str_substr(str, beg, len, TRUE);
3296}
3297
3298VALUE
3299rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3300{
3301 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3302}
3303
3304static VALUE
3305str_substr(VALUE str, long beg, long len, int empty)
3306{
3307 char *p = rb_str_subpos(str, beg, &len);
3308
3309 if (!p) return Qnil;
3310 if (!len && !empty) return Qnil;
3311
3312 beg = p - RSTRING_PTR(str);
3313
3314 VALUE str2 = str_subseq(str, beg, len);
3315 rb_enc_cr_str_copy_for_substr(str2, str);
3316 return str2;
3317}
3318
3319/* :nodoc: */
3320VALUE
3322{
3323 if (CHILLED_STRING_P(str)) {
3324 FL_UNSET_RAW(str, STR_CHILLED);
3325 }
3326
3327 if (OBJ_FROZEN(str)) return str;
3328 rb_str_resize(str, RSTRING_LEN(str));
3329 return rb_obj_freeze(str);
3330}
3331
3332/*
3333 * call-seq:
3334 * +string -> new_string or self
3335 *
3336 * Returns +self+ if +self+ is not frozen and can be mutated
3337 * without warning issuance.
3338 *
3339 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3340 *
3341 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3342 */
3343static VALUE
3344str_uplus(VALUE str)
3345{
3346 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3347 return rb_str_dup(str);
3348 }
3349 else {
3350 return str;
3351 }
3352}
3353
3354/*
3355 * call-seq:
3356 * -self -> frozen_string
3357 *
3358 * Returns a frozen string equal to +self+.
3359 *
3360 * The returned string is +self+ if and only if all of the following are true:
3361 *
3362 * - +self+ is already frozen.
3363 * - +self+ is an instance of \String (rather than of a subclass of \String)
3364 * - +self+ has no instance variables set on it.
3365 *
3366 * Otherwise, the returned string is a frozen copy of +self+.
3367 *
3368 * Returning +self+, when possible, saves duplicating +self+;
3369 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3370 *
3371 * It may also save duplicating other, already-existing, strings:
3372 *
3373 * s0 = 'foo'
3374 * s1 = 'foo'
3375 * s0.object_id == s1.object_id # => false
3376 * (-s0).object_id == (-s1).object_id # => true
3377 *
3378 * Note that method #-@ is convenient for defining a constant:
3379 *
3380 * FileName = -'config/database.yml'
3381 *
3382 * While its alias #dedup is better suited for chaining:
3383 *
3384 * 'foo'.dedup.gsub!('o')
3385 *
3386 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@FreezingUnfreezing].
3387 */
3388static VALUE
3389str_uminus(VALUE str)
3390{
3391 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3392 str = rb_str_dup(str);
3393 }
3394 return rb_fstring(str);
3395}
3396
3397RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3398#define rb_str_dup_frozen rb_str_new_frozen
3399
3400VALUE
3402{
3403 rb_check_frozen(str);
3404 if (FL_TEST(str, STR_TMPLOCK)) {
3405 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3406 }
3407 FL_SET(str, STR_TMPLOCK);
3408 return str;
3409}
3410
3411VALUE
3413{
3414 rb_check_frozen(str);
3415 if (!FL_TEST(str, STR_TMPLOCK)) {
3416 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3417 }
3418 FL_UNSET(str, STR_TMPLOCK);
3419 return str;
3420}
3421
3422VALUE
3423rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3424{
3425 rb_str_locktmp(str);
3426 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3427}
3428
3429void
3431{
3432 RUBY_ASSERT(ruby_thread_has_gvl_p());
3433
3434 long capa;
3435 const int termlen = TERM_LEN(str);
3436
3437 str_modifiable(str);
3438 if (STR_SHARED_P(str)) {
3439 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3440 }
3441 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3442 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3443 }
3444
3445 int cr = ENC_CODERANGE(str);
3446 if (len == 0) {
3447 /* Empty string does not contain non-ASCII */
3449 }
3450 else if (cr == ENC_CODERANGE_UNKNOWN) {
3451 /* Leave unknown. */
3452 }
3453 else if (len > RSTRING_LEN(str)) {
3454 if (ENC_CODERANGE_CLEAN_P(cr)) {
3455 /* Update the coderange regarding the extended part. */
3456 const char *const prev_end = RSTRING_END(str);
3457 const char *const new_end = RSTRING_PTR(str) + len;
3458 rb_encoding *enc = rb_enc_get(str);
3459 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3460 ENC_CODERANGE_SET(str, cr);
3461 }
3462 else if (cr == ENC_CODERANGE_BROKEN) {
3463 /* May be valid now, by appended part. */
3465 }
3466 }
3467 else if (len < RSTRING_LEN(str)) {
3468 if (cr != ENC_CODERANGE_7BIT) {
3469 /* ASCII-only string is keeping after truncated. Valid
3470 * and broken may be invalid or valid, leave unknown. */
3472 }
3473 }
3474
3475 STR_SET_LEN(str, len);
3476 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3477}
3478
3479VALUE
3480rb_str_resize(VALUE str, long len)
3481{
3482 if (len < 0) {
3483 rb_raise(rb_eArgError, "negative string size (or size too big)");
3484 }
3485
3486 int independent = str_independent(str);
3487 long slen = RSTRING_LEN(str);
3488 const int termlen = TERM_LEN(str);
3489
3490 if (slen > len || (termlen != 1 && slen < len)) {
3492 }
3493
3494 {
3495 long capa;
3496 if (STR_EMBED_P(str)) {
3497 if (len == slen) return str;
3498 if (str_embed_capa(str) >= len + termlen) {
3499 STR_SET_LEN(str, len);
3500 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3501 return str;
3502 }
3503 str_make_independent_expand(str, slen, len - slen, termlen);
3504 }
3505 else if (str_embed_capa(str) >= len + termlen) {
3506 capa = RSTRING(str)->as.heap.aux.capa;
3507 char *ptr = STR_HEAP_PTR(str);
3508 STR_SET_EMBED(str);
3509 if (slen > len) slen = len;
3510 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3511 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3512 STR_SET_LEN(str, len);
3513 if (independent) {
3514 SIZED_FREE_N(ptr, capa + termlen);
3515 }
3516 return str;
3517 }
3518 else if (!independent) {
3519 if (len == slen) return str;
3520 str_make_independent_expand(str, slen, len - slen, termlen);
3521 }
3522 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3523 (capa - len) > (len < 1024 ? len : 1024)) {
3524 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3525 (size_t)len + termlen, STR_HEAP_SIZE(str));
3526 RSTRING(str)->as.heap.aux.capa = len;
3527 }
3528 else if (len == slen) return str;
3529 STR_SET_LEN(str, len);
3530 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3531 }
3532 return str;
3533}
3534
3535static void
3536str_ensure_available_capa(VALUE str, long len)
3537{
3538 str_modify_keep_cr(str);
3539
3540 const int termlen = TERM_LEN(str);
3541 long olen = RSTRING_LEN(str);
3542
3543 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3544 rb_raise(rb_eArgError, "string sizes too big");
3545 }
3546
3547 long total = olen + len;
3548 long capa = str_capacity(str, termlen);
3549
3550 if (capa < total) {
3551 if (total >= LONG_MAX / 2) {
3552 capa = total;
3553 }
3554 while (total > capa) {
3555 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3556 }
3557 RESIZE_CAPA_TERM(str, capa, termlen);
3558 }
3559}
3560
3561static VALUE
3562str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3563{
3564 if (keep_cr) {
3565 str_modify_keep_cr(str);
3566 }
3567 else {
3568 rb_str_modify(str);
3569 }
3570 if (len == 0) return 0;
3571
3572 long total, olen, off = -1;
3573 char *sptr;
3574 const int termlen = TERM_LEN(str);
3575
3576 RSTRING_GETMEM(str, sptr, olen);
3577 if (ptr >= sptr && ptr <= sptr + olen) {
3578 off = ptr - sptr;
3579 }
3580
3581 long capa = str_capacity(str, termlen);
3582
3583 if (olen > LONG_MAX - len) {
3584 rb_raise(rb_eArgError, "string sizes too big");
3585 }
3586 total = olen + len;
3587 if (capa < total) {
3588 if (total >= LONG_MAX / 2) {
3589 capa = total;
3590 }
3591 while (total > capa) {
3592 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3593 }
3594 RESIZE_CAPA_TERM(str, capa, termlen);
3595 sptr = RSTRING_PTR(str);
3596 }
3597 if (off != -1) {
3598 ptr = sptr + off;
3599 }
3600 memcpy(sptr + olen, ptr, len);
3601 STR_SET_LEN(str, total);
3602 TERM_FILL(sptr + total, termlen); /* sentinel */
3603
3604 return str;
3605}
3606
3607#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3608#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3609
3610VALUE
3611rb_str_cat(VALUE str, const char *ptr, long len)
3612{
3613 if (len == 0) return str;
3614 if (len < 0) {
3615 rb_raise(rb_eArgError, "negative string size (or size too big)");
3616 }
3617 return str_buf_cat(str, ptr, len);
3618}
3619
3620VALUE
3621rb_str_cat_cstr(VALUE str, const char *ptr)
3622{
3623 must_not_null(ptr);
3624 return rb_str_buf_cat(str, ptr, strlen(ptr));
3625}
3626
3627static void
3628rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3629{
3630 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3631
3632 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3633 if (UNLIKELY(!str_independent(str))) {
3634 str_make_independent(str);
3635 }
3636
3637 long string_length = -1;
3638 const int null_terminator_length = 1;
3639 char *sptr;
3640 RSTRING_GETMEM(str, sptr, string_length);
3641
3642 // Ensure the resulting string wouldn't be too long.
3643 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3644 rb_raise(rb_eArgError, "string sizes too big");
3645 }
3646
3647 long string_capacity = str_capacity(str, null_terminator_length);
3648
3649 // Get the code range before any modifications since those might clear the code range.
3650 int cr = ENC_CODERANGE(str);
3651
3652 // Check if the string has spare string_capacity to write the new byte.
3653 if (LIKELY(string_capacity >= string_length + 1)) {
3654 // In fast path we can write the new byte and note the string's new length.
3655 sptr[string_length] = byte;
3656 STR_SET_LEN(str, string_length + 1);
3657 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3658 }
3659 else {
3660 // If there's not enough string_capacity, make a call into the general string concatenation function.
3661 str_buf_cat(str, (char *)&byte, 1);
3662 }
3663
3664 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3665 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3666 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3667 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3668 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3669 if (ISASCII(byte)) {
3671 }
3672 else {
3674
3675 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3676 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3677 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3678 }
3679 }
3680 }
3681}
3682
/*
 * Alias entry points.  Each RUBY_ALIAS_FUNCTION invocation names an alias
 * prototype, the function it targets, and the argument list handed through:
 * rb_str_buf_cat targets rb_str_cat, while rb_str_buf_cat2 and rb_str_cat2
 * both target rb_str_cat_cstr.  (The macro itself is defined outside this
 * section of the file.)
 */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3686
3687static VALUE
3688rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3689 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3690{
3691 int str_encindex = ENCODING_GET(str);
3692 int res_encindex;
3693 int str_cr, res_cr;
3694 rb_encoding *str_enc, *ptr_enc;
3695
3696 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3697
3698 if (str_encindex == ptr_encindex) {
3699 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3700 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3701 }
3702 }
3703 else {
3704 str_enc = rb_enc_from_index(str_encindex);
3705 ptr_enc = rb_enc_from_index(ptr_encindex);
3706 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3707 if (len == 0)
3708 return str;
3709 if (RSTRING_LEN(str) == 0) {
3710 rb_str_buf_cat(str, ptr, len);
3711 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3712 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3713 return str;
3714 }
3715 goto incompatible;
3716 }
3717 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3718 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3719 }
3720 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3721 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3722 str_cr = rb_enc_str_coderange(str);
3723 }
3724 }
3725 }
3726 if (ptr_cr_ret)
3727 *ptr_cr_ret = ptr_cr;
3728
3729 if (str_encindex != ptr_encindex &&
3730 str_cr != ENC_CODERANGE_7BIT &&
3731 ptr_cr != ENC_CODERANGE_7BIT) {
3732 str_enc = rb_enc_from_index(str_encindex);
3733 ptr_enc = rb_enc_from_index(ptr_encindex);
3734 goto incompatible;
3735 }
3736
3737 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3738 res_encindex = str_encindex;
3739 res_cr = ENC_CODERANGE_UNKNOWN;
3740 }
3741 else if (str_cr == ENC_CODERANGE_7BIT) {
3742 if (ptr_cr == ENC_CODERANGE_7BIT) {
3743 res_encindex = str_encindex;
3744 res_cr = ENC_CODERANGE_7BIT;
3745 }
3746 else {
3747 res_encindex = ptr_encindex;
3748 res_cr = ptr_cr;
3749 }
3750 }
3751 else if (str_cr == ENC_CODERANGE_VALID) {
3752 res_encindex = str_encindex;
3753 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3754 res_cr = str_cr;
3755 else
3756 res_cr = ptr_cr;
3757 }
3758 else { /* str_cr == ENC_CODERANGE_BROKEN */
3759 res_encindex = str_encindex;
3760 res_cr = str_cr;
3761 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3762 }
3763
3764 if (len < 0) {
3765 rb_raise(rb_eArgError, "negative string size (or size too big)");
3766 }
3767 str_buf_cat(str, ptr, len);
3768 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3769 return str;
3770
3771 incompatible:
3772 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3773 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3775}
3776
3777VALUE
3778rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3779{
3780 return rb_enc_cr_str_buf_cat(str, ptr, len,
3781 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3782}
3783
3784VALUE
3786{
3787 /* ptr must reference NUL terminated ASCII string. */
3788 int encindex = ENCODING_GET(str);
3789 rb_encoding *enc = rb_enc_from_index(encindex);
3790 if (rb_enc_asciicompat(enc)) {
3791 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3792 encindex, ENC_CODERANGE_7BIT, 0);
3793 }
3794 else {
3795 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3796 while (*ptr) {
3797 unsigned int c = (unsigned char)*ptr;
3798 int len = rb_enc_codelen(c, enc);
3799 rb_enc_mbcput(c, buf, enc);
3800 rb_enc_cr_str_buf_cat(str, buf, len,
3801 encindex, ENC_CODERANGE_VALID, 0);
3802 ptr++;
3803 }
3804 return str;
3805 }
3806}
3807
3808VALUE
3810{
3811 int str2_cr = rb_enc_str_coderange(str2);
3812
3813 if (rb_str_enc_fastpath(str)) {
3814 switch (str2_cr) {
3815 case ENC_CODERANGE_7BIT:
3816 // If RHS is 7bit we can do simple concatenation
3817 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3818 RB_GC_GUARD(str2);
3819 return str;
3821 // If RHS is valid, we can do simple concatenation if encodings are the same
3822 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3823 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3824 int str_cr = ENC_CODERANGE(str);
3825 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3826 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3827 }
3828 RB_GC_GUARD(str2);
3829 return str;
3830 }
3831 }
3832 }
3833
3834 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3835 ENCODING_GET(str2), str2_cr, &str2_cr);
3836
3837 ENC_CODERANGE_SET(str2, str2_cr);
3838
3839 return str;
3840}
3841
3842VALUE
3844{
3845 StringValue(str2);
3846 return rb_str_buf_append(str, str2);
3847}
3848
3849VALUE
3850rb_str_concat_literals(size_t num, const VALUE *strary)
3851{
3852 VALUE str;
3853 size_t i, s = 0;
3854 unsigned long len = 1;
3855
3856 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3857 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3858
3859 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3860 str = rb_str_buf_new(len);
3861 str_enc_copy_direct(str, strary[0]);
3862
3863 for (i = s; i < num; ++i) {
3864 const VALUE v = strary[i];
3865 int encidx = ENCODING_GET(v);
3866
3867 rb_str_buf_append(str, v);
3868 if (encidx != ENCINDEX_US_ASCII) {
3869 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3870 rb_enc_set_index(str, encidx);
3871 }
3872 }
3873 return str;
3874}
3875
3876/*
3877 * call-seq:
3878 * concat(*objects) -> string
3879 *
3880 * :include: doc/string/concat.rdoc
3881 */
3882static VALUE
3883rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3884{
3885 str_modifiable(str);
3886
3887 if (argc == 1) {
3888 return rb_str_concat(str, argv[0]);
3889 }
3890 else if (argc > 1) {
3891 int i;
3892 VALUE arg_str = rb_str_tmp_new(0);
3893 rb_enc_copy(arg_str, str);
3894 for (i = 0; i < argc; i++) {
3895 rb_str_concat(arg_str, argv[i]);
3896 }
3897 rb_str_buf_append(str, arg_str);
3898 }
3899
3900 return str;
3901}
3902
3903/*
3904 * call-seq:
3905 * append_as_bytes(*objects) -> self
3906 *
3907 * Concatenates each object in +objects+ into +self+; returns +self+;
3908 * performs no encoding validation or conversion:
3909 *
3910 * s = 'foo'
3911 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3912 * s.valid_encoding? # => false
3913 * s.append_as_bytes("\xAC 12")
3914 * s.valid_encoding? # => true
3915 *
3916 * When a given object is an integer,
3917 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3919 * appends only the low-order byte (similar to String#setbyte):
3920 *
3921 * s = ""
3922 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3923 * s.bytesize # => 2
3924 *
3925 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3926 */
3927
3928VALUE
3929rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3930{
3931 long needed_capacity = 0;
3932 volatile VALUE t0;
3933 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3934
3935 for (int index = 0; index < argc; index++) {
3936 VALUE obj = argv[index];
3937 enum ruby_value_type type = types[index] = rb_type(obj);
3938 switch (type) {
3939 case T_FIXNUM:
3940 case T_BIGNUM:
3941 needed_capacity++;
3942 break;
3943 case T_STRING:
3944 needed_capacity += RSTRING_LEN(obj);
3945 break;
3946 default:
3947 rb_raise(
3949 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3950 rb_obj_class(obj)
3951 );
3952 break;
3953 }
3954 }
3955
3956 str_ensure_available_capa(str, needed_capacity);
3957 char *sptr = RSTRING_END(str);
3958
3959 for (int index = 0; index < argc; index++) {
3960 VALUE obj = argv[index];
3961 enum ruby_value_type type = types[index];
3962 switch (type) {
3963 case T_FIXNUM:
3964 case T_BIGNUM: {
3965 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3966 char byte = (char)(NUM2INT(obj) & 0xFF);
3967 *sptr = byte;
3968 sptr++;
3969 break;
3970 }
3971 case T_STRING: {
3972 const char *ptr;
3973 long len;
3974 RSTRING_GETMEM(obj, ptr, len);
3975 memcpy(sptr, ptr, len);
3976 sptr += len;
3977 break;
3978 }
3979 default:
3980 rb_bug("append_as_bytes arguments should have been validated");
3981 }
3982 }
3983
3984 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3985 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3986
3987 int cr = ENC_CODERANGE(str);
3988 switch (cr) {
3989 case ENC_CODERANGE_7BIT: {
3990 for (int index = 0; index < argc; index++) {
3991 VALUE obj = argv[index];
3992 enum ruby_value_type type = types[index];
3993 switch (type) {
3994 case T_FIXNUM:
3995 case T_BIGNUM: {
3996 if (!ISASCII(NUM2INT(obj))) {
3997 goto clear_cr;
3998 }
3999 break;
4000 }
4001 case T_STRING: {
4002 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
4003 goto clear_cr;
4004 }
4005 break;
4006 }
4007 default:
4008 rb_bug("append_as_bytes arguments should have been validated");
4009 }
4010 }
4011 break;
4012 }
4014 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
4015 goto keep_cr;
4016 }
4017 else {
4018 goto clear_cr;
4019 }
4020 break;
4021 default:
4022 goto clear_cr;
4023 break;
4024 }
4025
4026 RB_GC_GUARD(t0);
4027
4028 clear_cr:
4029 // If no fast path was hit, we clear the coderange.
4030 // append_as_bytes is predominantly meant to be used in
4031 // buffering situation, hence it's likely the coderange
4032 // will never be scanned, so it's not worth spending time
4033 // precomputing the coderange except for simple and common
4034 // situations.
4036 keep_cr:
4037 return str;
4038}
4039
4040/*
4041 * call-seq:
4042 * self << object -> self
4043 *
4044 * Appends a string representation of +object+ to +self+;
4045 * returns +self+.
4046 *
4047 * If +object+ is a string, appends it to +self+:
4048 *
4049 * s = 'foo'
4050 * s << 'bar' # => "foobar"
4051 * s # => "foobar"
4052 *
4053 * If +object+ is an integer,
4054 * its value is considered a codepoint;
4055 * converts the value to a character before concatenating:
4056 *
4057 * s = 'foo'
4058 * s << 33 # => "foo!"
4059 *
 *  Additionally, if the codepoint is in range <tt>0x80..0xff</tt>
4061 * and the encoding of +self+ is Encoding::US_ASCII,
4062 * changes the encoding to Encoding::ASCII_8BIT:
4063 *
4064 * s = 'foo'.encode(Encoding::US_ASCII)
4065 * s.encoding # => #<Encoding:US-ASCII>
4066 * s << 0xff # => "foo\xFF"
4067 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
4068 *
4069 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
4070 *
4071 * s = 'foo'
 *    s.encoding # => #<Encoding:UTF-8>
4073 * s << 0x00110000 # 1114112 out of char range (RangeError)
4074 * s = 'foo'.encode(Encoding::EUC_JP)
4075 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4076 *
4077 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4078 */
4079VALUE
4081{
4082 unsigned int code;
4083 rb_encoding *enc = STR_ENC_GET(str1);
4084 int encidx;
4085
4086 if (RB_INTEGER_TYPE_P(str2)) {
4087 if (rb_num_to_uint(str2, &code) == 0) {
4088 }
4089 else if (FIXNUM_P(str2)) {
4090 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4091 }
4092 else {
4093 rb_raise(rb_eRangeError, "bignum out of char range");
4094 }
4095 }
4096 else {
4097 return rb_str_append(str1, str2);
4098 }
4099
4100 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4101
4102 if (encidx >= 0) {
4103 rb_str_buf_cat_byte(str1, (unsigned char)code);
4104 }
4105 else {
4106 long pos = RSTRING_LEN(str1);
4107 int cr = ENC_CODERANGE(str1);
4108 int len;
4109 char *buf;
4110
4111 switch (len = rb_enc_codelen(code, enc)) {
4112 case ONIGERR_INVALID_CODE_POINT_VALUE:
4113 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4114 break;
4115 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4116 case 0:
4117 rb_raise(rb_eRangeError, "%u out of char range", code);
4118 break;
4119 }
4120 buf = ALLOCA_N(char, len + 1);
4121 rb_enc_mbcput(code, buf, enc);
4122 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4123 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4124 }
4125 rb_str_resize(str1, pos+len);
4126 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4127 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4129 }
4130 else if (cr == ENC_CODERANGE_BROKEN) {
4132 }
4133 ENC_CODERANGE_SET(str1, cr);
4134 }
4135 return str1;
4136}
4137
4138int
4139rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4140{
4141 int encidx = rb_enc_to_index(enc);
4142
4143 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4144 /* US-ASCII automatically extended to ASCII-8BIT */
4145 if (code > 0xFF) {
4146 rb_raise(rb_eRangeError, "%u out of char range", code);
4147 }
4148 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4149 return ENCINDEX_ASCII_8BIT;
4150 }
4151 return encidx;
4152 }
4153 else {
4154 return -1;
4155 }
4156}
4157
4158/*
4159 * call-seq:
 *    prepend(*other_strings) -> self
4161 *
4162 * Prefixes to +self+ the concatenation of the given +other_strings+; returns +self+:
4163 *
4164 * 'baz'.prepend('foo', 'bar') # => "foobarbaz"
4165 *
4166 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4167 *
4168 */
4169
4170static VALUE
4171rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4172{
4173 str_modifiable(str);
4174
4175 if (argc == 1) {
4176 rb_str_update(str, 0L, 0L, argv[0]);
4177 }
4178 else if (argc > 1) {
4179 int i;
4180 VALUE arg_str = rb_str_tmp_new(0);
4181 rb_enc_copy(arg_str, str);
4182 for (i = 0; i < argc; i++) {
4183 rb_str_append(arg_str, argv[i]);
4184 }
4185 rb_str_update(str, 0L, 0L, arg_str);
4186 }
4187
4188 return str;
4189}
4190
4191st_index_t
4193{
4194 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4195 st_index_t precomputed_hash;
4196 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4197
4198 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4199 return precomputed_hash;
4200 }
4201
4202 return str_do_hash(str);
4203}
4204
4205int
4207{
4208 long len1, len2;
4209 const char *ptr1, *ptr2;
4210 RSTRING_GETMEM(str1, ptr1, len1);
4211 RSTRING_GETMEM(str2, ptr2, len2);
4212 return (len1 != len2 ||
4213 !rb_str_comparable(str1, str2) ||
4214 memcmp(ptr1, ptr2, len1) != 0);
4215}
4216
4217/*
4218 * call-seq:
4219 * hash -> integer
4220 *
4221 * :include: doc/string/hash.rdoc
4222 *
4223 */
4224
4225static VALUE
4226rb_str_hash_m(VALUE str)
4227{
4228 st_index_t hval = rb_str_hash(str);
4229 return ST2FIX(hval);
4230}
4231
/* Yields the smaller of a and b (b when they compare equal).  Each argument
 * may be evaluated twice, so pass only side-effect-free expressions. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4233
4234int
4236{
4237 int idx1, idx2;
4238 int rc1, rc2;
4239
4240 if (RSTRING_LEN(str1) == 0) return TRUE;
4241 if (RSTRING_LEN(str2) == 0) return TRUE;
4242 idx1 = ENCODING_GET(str1);
4243 idx2 = ENCODING_GET(str2);
4244 if (idx1 == idx2) return TRUE;
4245 rc1 = rb_enc_str_coderange(str1);
4246 rc2 = rb_enc_str_coderange(str2);
4247 if (rc1 == ENC_CODERANGE_7BIT) {
4248 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4249 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4250 return TRUE;
4251 }
4252 if (rc2 == ENC_CODERANGE_7BIT) {
4253 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4254 return TRUE;
4255 }
4256 return FALSE;
4257}
4258
4259int
4261{
4262 long len1, len2;
4263 const char *ptr1, *ptr2;
4264 int retval;
4265
4266 if (str1 == str2) return 0;
4267 RSTRING_GETMEM(str1, ptr1, len1);
4268 RSTRING_GETMEM(str2, ptr2, len2);
4269 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4270 if (len1 == len2) {
4271 if (!rb_str_comparable(str1, str2)) {
4272 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4273 return 1;
4274 return -1;
4275 }
4276 return 0;
4277 }
4278 if (len1 > len2) return 1;
4279 return -1;
4280 }
4281 if (retval > 0) return 1;
4282 return -1;
4283}
4284
4285/*
4286 * call-seq:
4287 * self == other -> true or false
4288 *
4289 * Returns whether +other+ is equal to +self+.
4290 *
4291 * When +other+ is a string, returns whether +other+ has the same length and content as +self+:
4292 *
4293 * s = 'foo'
4294 * s == 'foo' # => true
4295 * s == 'food' # => false
4296 * s == 'FOO' # => false
4297 *
4298 * Returns +false+ if the two strings' encodings are not compatible:
4299 *
4300 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4301 *
4302 * When +other+ is not a string:
4303 *
4304 * - If +other+ responds to method <tt>to_str</tt>,
4305 * <tt>other == self</tt> is called and its return value is returned.
4306 * - If +other+ does not respond to <tt>to_str</tt>,
4307 * +false+ is returned.
4308 *
4309 * Related: {Comparing}[rdoc-ref:String@Comparing].
4310 */
4311
4312VALUE
4314{
4315 if (str1 == str2) return Qtrue;
4316 if (!RB_TYPE_P(str2, T_STRING)) {
4317 if (!rb_respond_to(str2, idTo_str)) {
4318 return Qfalse;
4319 }
4320 return rb_equal(str2, str1);
4321 }
4322 return rb_str_eql_internal(str1, str2);
4323}
4324
4325/*
4326 * call-seq:
4327 * eql?(object) -> true or false
4328 *
4329 * :include: doc/string/eql_p.rdoc
4330 *
4331 */
4332
4333VALUE
4334rb_str_eql(VALUE str1, VALUE str2)
4335{
4336 if (str1 == str2) return Qtrue;
4337 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4338 return rb_str_eql_internal(str1, str2);
4339}
4340
4341/*
4342 * call-seq:
4343 * self <=> other -> -1, 0, 1, or nil
4344 *
4345 * Compares +self+ and +other+,
4346 * evaluating their _contents_, not their _lengths_.
4347 *
4348 * Returns:
4349 *
4350 * - +-1+, if +self+ is smaller.
4351 * - +0+, if the two are equal.
4352 * - +1+, if +self+ is larger.
4353 * - +nil+, if the two are incomparable.
4354 *
4355 * Examples:
4356 *
4357 * 'a' <=> 'b' # => -1
4358 * 'a' <=> 'ab' # => -1
4359 * 'a' <=> 'a' # => 0
4360 * 'b' <=> 'a' # => 1
4361 * 'ab' <=> 'a' # => 1
4362 * 'a' <=> :a # => nil
4363 *
4364 * \Class \String includes module Comparable,
4365 * each of whose methods uses String#<=> for comparison.
4366 *
4367 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4368 */
4369
4370static VALUE
4371rb_str_cmp_m(VALUE str1, VALUE str2)
4372{
4373 int result;
4374 VALUE s = rb_check_string_type(str2);
4375 if (NIL_P(s)) {
4376 return rb_invcmp(str1, str2);
4377 }
4378 result = rb_str_cmp(str1, s);
4379 return INT2FIX(result);
4380}
4381
/* Forward declarations: both helpers are defined after the method functions
 * that call them. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4384
4385/*
4386 * call-seq:
4387 * casecmp(other_string) -> -1, 0, 1, or nil
4388 *
4389 * Ignoring case, compares +self+ and +other_string+; returns:
4390 *
4391 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4392 * - 0 if the two are equal.
4393 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4394 * - +nil+ if the two are incomparable.
4395 *
4396 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4397 *
4398 * Examples:
4399 *
4400 * 'foo'.casecmp('goo') # => -1
4401 * 'goo'.casecmp('foo') # => 1
4402 * 'foo'.casecmp('food') # => -1
4403 * 'food'.casecmp('foo') # => 1
4404 * 'FOO'.casecmp('foo') # => 0
4405 * 'foo'.casecmp('FOO') # => 0
4406 * 'foo'.casecmp(1) # => nil
4407 *
4408 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4409 */
4410
4411static VALUE
4412rb_str_casecmp(VALUE str1, VALUE str2)
4413{
4414 VALUE s = rb_check_string_type(str2);
4415 if (NIL_P(s)) {
4416 return Qnil;
4417 }
4418 return str_casecmp(str1, s);
4419}
4420
4421static VALUE
4422str_casecmp(VALUE str1, VALUE str2)
4423{
4424 long len;
4425 rb_encoding *enc;
4426 const char *p1, *p1end, *p2, *p2end;
4427
4428 enc = rb_enc_compatible(str1, str2);
4429 if (!enc) {
4430 return Qnil;
4431 }
4432
4433 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4434 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4435 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4436 while (p1 < p1end && p2 < p2end) {
4437 if (*p1 != *p2) {
4438 unsigned int c1 = TOLOWER(*p1 & 0xff);
4439 unsigned int c2 = TOLOWER(*p2 & 0xff);
4440 if (c1 != c2)
4441 return INT2FIX(c1 < c2 ? -1 : 1);
4442 }
4443 p1++;
4444 p2++;
4445 }
4446 }
4447 else {
4448 while (p1 < p1end && p2 < p2end) {
4449 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4450 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4451
4452 if (0 <= c1 && 0 <= c2) {
4453 c1 = TOLOWER(c1);
4454 c2 = TOLOWER(c2);
4455 if (c1 != c2)
4456 return INT2FIX(c1 < c2 ? -1 : 1);
4457 }
4458 else {
4459 int r;
4460 l1 = rb_enc_mbclen(p1, p1end, enc);
4461 l2 = rb_enc_mbclen(p2, p2end, enc);
4462 len = l1 < l2 ? l1 : l2;
4463 r = memcmp(p1, p2, len);
4464 if (r != 0)
4465 return INT2FIX(r < 0 ? -1 : 1);
4466 if (l1 != l2)
4467 return INT2FIX(l1 < l2 ? -1 : 1);
4468 }
4469 p1 += l1;
4470 p2 += l2;
4471 }
4472 }
4473 if (p1 == p1end && p2 == p2end) return INT2FIX(0);
4474 if (p1 == p1end) return INT2FIX(-1);
4475 return INT2FIX(1);
4476}
4477
4478/*
4479 * call-seq:
4480 * casecmp?(other_string) -> true, false, or nil
4481 *
4482 * Returns +true+ if +self+ and +other_string+ are equal after
4483 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4484 *
4485 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4486 *
4487 * Examples:
4488 *
4489 * 'foo'.casecmp?('goo') # => false
4490 * 'goo'.casecmp?('foo') # => false
4491 * 'foo'.casecmp?('food') # => false
4492 * 'food'.casecmp?('foo') # => false
4493 * 'FOO'.casecmp?('foo') # => true
4494 * 'foo'.casecmp?('FOO') # => true
4495 * 'foo'.casecmp?(1) # => nil
4496 *
4497 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4498 */
4499
4500static VALUE
4501rb_str_casecmp_p(VALUE str1, VALUE str2)
4502{
4503 VALUE s = rb_check_string_type(str2);
4504 if (NIL_P(s)) {
4505 return Qnil;
4506 }
4507 return str_casecmp_p(str1, s);
4508}
4509
4510static VALUE
4511str_casecmp_p(VALUE str1, VALUE str2)
4512{
4513 rb_encoding *enc;
4514 VALUE folded_str1, folded_str2;
4515 VALUE fold_opt = sym_fold;
4516
4517 enc = rb_enc_compatible(str1, str2);
4518 if (!enc) {
4519 return Qnil;
4520 }
4521
4522 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4523 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4524
4525 return rb_str_eql(folded_str1, folded_str2);
4526}
4527
4528static long
4529strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4530 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4531{
4532 const char *search_start = str_ptr;
4533 long pos, search_len = str_len - offset;
4534
4535 for (;;) {
4536 const char *t;
4537 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4538 if (pos < 0) return pos;
4539 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4540 if (t == search_start + pos) break;
4541 search_len -= t - search_start;
4542 if (search_len <= 0) return -1;
4543 offset += t - search_start;
4544 search_start = t;
4545 }
4546 return pos + offset;
4547}
4548
/* Both wrappers return the found index in bytes.  They differ only in the
 * in_byte flag handed to rb_strseq_index: rb_str_index passes 0 and
 * rb_str_byteindex passes 1. */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4552
4553static long
4554rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4555{
4556 const char *str_ptr, *str_ptr_end, *sub_ptr;
4557 long str_len, sub_len;
4558 rb_encoding *enc;
4559
4560 enc = rb_enc_check(str, sub);
4561 if (is_broken_string(sub)) return -1;
4562
4563 str_ptr = RSTRING_PTR(str);
4564 str_ptr_end = RSTRING_END(str);
4565 str_len = RSTRING_LEN(str);
4566 sub_ptr = RSTRING_PTR(sub);
4567 sub_len = RSTRING_LEN(sub);
4568
4569 if (str_len < sub_len) return -1;
4570
4571 if (offset != 0) {
4572 long str_len_char, sub_len_char;
4573 int single_byte = single_byte_optimizable(str);
4574 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4575 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4576 if (offset < 0) {
4577 offset += str_len_char;
4578 if (offset < 0) return -1;
4579 }
4580 if (str_len_char - offset < sub_len_char) return -1;
4581 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4582 str_ptr += offset;
4583 }
4584 if (sub_len == 0) return offset;
4585
4586 /* need proceed one character at a time */
4587 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4588}
4589
4590
4591/*
4592 * call-seq:
4593 * index(pattern, offset = 0) -> integer or nil
4594 *
4595 * :include: doc/string/index.rdoc
4596 *
4597 */
4598
4599static VALUE
4600rb_str_index_m(int argc, VALUE *argv, VALUE str)
4601{
4602 VALUE sub;
4603 VALUE initpos;
4604 rb_encoding *enc = STR_ENC_GET(str);
4605 long pos;
4606
4607 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4608 long slen = str_strlen(str, enc); /* str's enc */
4609 pos = NUM2LONG(initpos);
4610 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4611 if (RB_TYPE_P(sub, T_REGEXP)) {
4613 }
4614 return Qnil;
4615 }
4616 }
4617 else {
4618 pos = 0;
4619 }
4620
4621 if (RB_TYPE_P(sub, T_REGEXP)) {
4622 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4623 enc, single_byte_optimizable(str));
4624
4625 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4626 VALUE match = rb_backref_get();
4627 struct re_registers *regs = RMATCH_REGS(match);
4628 pos = rb_str_sublen(str, BEG(0));
4629 return LONG2NUM(pos);
4630 }
4631 }
4632 else {
4633 StringValue(sub);
4634 pos = rb_str_index(str, sub, pos);
4635 if (pos >= 0) {
4636 pos = rb_str_sublen(str, pos);
4637 return LONG2NUM(pos);
4638 }
4639 }
4640 return Qnil;
4641}
4642
4643/* Ensure that the given pos is a valid character boundary.
4644 * Note that in this function, "character" means a code point
4645 * (Unicode scalar value), not a grapheme cluster.
4646 */
4647static void
4648str_ensure_byte_pos(VALUE str, long pos)
4649{
4650 if (!single_byte_optimizable(str)) {
4651 const char *s = RSTRING_PTR(str);
4652 const char *e = RSTRING_END(str);
4653 const char *p = s + pos;
4654 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4655 rb_raise(rb_eIndexError,
4656 "offset %ld does not land on character boundary", pos);
4657 }
4658 }
4659}
4660
4661/*
4662 * call-seq:
4663 * byteindex(object, offset = 0) -> integer or nil
4664 *
4665 * Returns the 0-based integer index of a substring of +self+
4666 * specified by +object+ (a string or Regexp) and +offset+,
4667 * or +nil+ if there is no such substring;
4668 * the returned index is the count of _bytes_ (not characters).
4669 *
4670 * When +object+ is a string,
4671 * returns the index of the first found substring equal to +object+:
4672 *
4673 * s = 'foo' # => "foo"
4674 * s.size # => 3 # Three 1-byte characters.
4675 * s.bytesize # => 3 # Three bytes.
4676 * s.byteindex('f') # => 0
4677 * s.byteindex('o') # => 1
4678 * s.byteindex('oo') # => 1
4679 * s.byteindex('ooo') # => nil
4680 *
4681 * When +object+ is a Regexp,
4682 * returns the index of the first found substring matching +object+;
4683 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4684 *
4685 * s = 'foo'
4686 * s.byteindex(/f/) # => 0
4687 * $~ # => #<MatchData "f">
4688 * s.byteindex(/o/) # => 1
4689 * s.byteindex(/oo/) # => 1
4690 * s.byteindex(/ooo/) # => nil
4691 * $~ # => nil
4692 *
4693 * \Integer argument +offset+, if given, specifies the 0-based index
4694 * of the byte where searching is to begin.
4695 *
4696 * When +offset+ is non-negative,
4697 * searching begins at byte position +offset+:
4698 *
4699 * s = 'foo'
4700 * s.byteindex('o', 1) # => 1
4701 * s.byteindex('o', 2) # => 2
4702 * s.byteindex('o', 3) # => nil
4703 *
4704 * When +offset+ is negative, counts backward from the end of +self+:
4705 *
4706 * s = 'foo'
4707 * s.byteindex('o', -1) # => 2
4708 * s.byteindex('o', -2) # => 1
4709 * s.byteindex('o', -3) # => 1
4710 * s.byteindex('o', -4) # => nil
4711 *
4712 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4713 *
4714 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4715 * s.size # => 2 # Two 3-byte characters.
4716 * s.bytesize # => 6 # Six bytes.
4717 * s.byteindex("\uFFFF") # => 0
4718 * s.byteindex("\uFFFF", 1) # Raises IndexError
4719 * s.byteindex("\uFFFF", 2) # Raises IndexError
4720 * s.byteindex("\uFFFF", 3) # => 3
4721 * s.byteindex("\uFFFF", 4) # Raises IndexError
4722 * s.byteindex("\uFFFF", 5) # Raises IndexError
4723 * s.byteindex("\uFFFF", 6) # => nil
4724 *
4725 * Related: see {Querying}[rdoc-ref:String@Querying].
4726 */
4727
4728static VALUE
4729rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4730{
4731 VALUE sub;
4732 VALUE initpos;
4733 long pos;
4734
4735 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4736 long slen = RSTRING_LEN(str);
4737 pos = NUM2LONG(initpos);
4738 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4739 if (RB_TYPE_P(sub, T_REGEXP)) {
4741 }
4742 return Qnil;
4743 }
4744 }
4745 else {
4746 pos = 0;
4747 }
4748
4749 str_ensure_byte_pos(str, pos);
4750
4751 if (RB_TYPE_P(sub, T_REGEXP)) {
4752 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4753 VALUE match = rb_backref_get();
4754 struct re_registers *regs = RMATCH_REGS(match);
4755 pos = BEG(0);
4756 return LONG2NUM(pos);
4757 }
4758 }
4759 else {
4760 StringValue(sub);
4761 pos = rb_str_byteindex(str, sub, pos);
4762 if (pos >= 0) return LONG2NUM(pos);
4763 }
4764 return Qnil;
4765}
4766
#ifndef HAVE_MEMRCHR
/* Fallback used when HAVE_MEMRCHR is not defined: scan the first search_len
 * bytes of search_str backwards and return a pointer to the last byte equal
 * to chr, or a null pointer when there is none. */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long remaining = search_len; remaining > 0; remaining--) {
        const char *candidate = search_str + (remaining - 1);
        if ((unsigned char)*candidate == chr) {
            return (void *)candidate;
        }
    }

    return NULL;
}
#endif
4779
4780static long
4781str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4782{
4783 char *hit, *adjusted;
4784 int c;
4785 long slen, searchlen;
4786 char *sbeg, *e, *t;
4787
4788 sbeg = RSTRING_PTR(str);
4789 slen = RSTRING_LEN(sub);
4790 if (slen == 0) return s - sbeg;
4791 e = RSTRING_END(str);
4792 t = RSTRING_PTR(sub);
4793 c = *t & 0xff;
4794 searchlen = s - sbeg + 1;
4795
4796 if (memcmp(s, t, slen) == 0) {
4797 return s - sbeg;
4798 }
4799
4800 do {
4801 hit = memrchr(sbeg, c, searchlen);
4802 if (!hit) break;
4803 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4804 if (hit != adjusted) {
4805 searchlen = adjusted - sbeg;
4806 continue;
4807 }
4808 if (memcmp(hit, t, slen) == 0)
4809 return hit - sbeg;
4810 searchlen = adjusted - sbeg;
4811 } while (searchlen > 0);
4812
4813 return -1;
4814}
4815
4816/* found index in byte */
4817static long
4818rb_str_rindex(VALUE str, VALUE sub, long pos)
4819{
4820 long len, slen;
4821 char *sbeg, *s;
4822 rb_encoding *enc;
4823 int singlebyte;
4824
4825 enc = rb_enc_check(str, sub);
4826 if (is_broken_string(sub)) return -1;
4827 singlebyte = single_byte_optimizable(str);
4828 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4829 slen = str_strlen(sub, enc); /* rb_enc_check */
4830
4831 /* substring longer than string */
4832 if (len < slen) return -1;
4833 if (len - pos < slen) pos = len - slen;
4834 if (len == 0) return pos;
4835
4836 sbeg = RSTRING_PTR(str);
4837
4838 if (pos == 0) {
4839 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4840 return 0;
4841 else
4842 return -1;
4843 }
4844
4845 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4846 return str_rindex(str, sub, s, enc);
4847}
4848
4849/*
4850 * call-seq:
4851 * rindex(pattern, offset = self.length) -> integer or nil
4852 *
4853 * :include:doc/string/rindex.rdoc
4854 *
4855 */
4856
4857static VALUE
4858rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4859{
4860 VALUE sub;
4861 VALUE initpos;
4862 rb_encoding *enc = STR_ENC_GET(str);
4863 long pos, len = str_strlen(str, enc); /* str's enc */
4864
4865 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4866 pos = NUM2LONG(initpos);
4867 if (pos < 0 && (pos += len) < 0) {
4868 if (RB_TYPE_P(sub, T_REGEXP)) {
4870 }
4871 return Qnil;
4872 }
4873 if (pos > len) pos = len;
4874 }
4875 else {
4876 pos = len;
4877 }
4878
4879 if (RB_TYPE_P(sub, T_REGEXP)) {
4880 /* enc = rb_enc_check(str, sub); */
4881 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4882 enc, single_byte_optimizable(str));
4883
4884 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4885 VALUE match = rb_backref_get();
4886 struct re_registers *regs = RMATCH_REGS(match);
4887 pos = rb_str_sublen(str, BEG(0));
4888 return LONG2NUM(pos);
4889 }
4890 }
4891 else {
4892 StringValue(sub);
4893 pos = rb_str_rindex(str, sub, pos);
4894 if (pos >= 0) {
4895 pos = rb_str_sublen(str, pos);
4896 return LONG2NUM(pos);
4897 }
4898 }
4899 return Qnil;
4900}
4901
4902static long
4903rb_str_byterindex(VALUE str, VALUE sub, long pos)
4904{
4905 long len, slen;
4906 char *sbeg, *s;
4907 rb_encoding *enc;
4908
4909 enc = rb_enc_check(str, sub);
4910 if (is_broken_string(sub)) return -1;
4911 len = RSTRING_LEN(str);
4912 slen = RSTRING_LEN(sub);
4913
4914 /* substring longer than string */
4915 if (len < slen) return -1;
4916 if (len - pos < slen) pos = len - slen;
4917 if (len == 0) return pos;
4918
4919 sbeg = RSTRING_PTR(str);
4920
4921 if (pos == 0) {
4922 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4923 return 0;
4924 else
4925 return -1;
4926 }
4927
4928 s = sbeg + pos;
4929 return str_rindex(str, sub, s, enc);
4930}
4931
4932/*
4933 * call-seq:
4934 * byterindex(object, offset = self.bytesize) -> integer or nil
4935 *
4936 * Returns the 0-based integer index of a substring of +self+
4937 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4938 * or +nil+ if there is no such substring;
4939 * the returned index is the count of _bytes_ (not characters).
4940 *
4941 * When +object+ is a string,
4942 * returns the index of the _last_ found substring equal to +object+:
4943 *
4944 * s = 'foo' # => "foo"
4945 * s.size # => 3 # Three 1-byte characters.
4946 * s.bytesize # => 3 # Three bytes.
4947 * s.byterindex('f') # => 0
4948 * s.byterindex('o') # => 2
4949 * s.byterindex('oo') # => 1
4950 * s.byterindex('ooo') # => nil
4951 *
4952 * When +object+ is a Regexp,
4953 * returns the index of the last found substring matching +object+;
4954 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4955 *
4956 * s = 'foo'
4957 * s.byterindex(/f/) # => 0
4958 * $~ # => #<MatchData "f">
4959 * s.byterindex(/o/) # => 2
4960 * s.byterindex(/oo/) # => 1
4961 * s.byterindex(/ooo/) # => nil
4962 * $~ # => nil
4963 *
4964 * The last match means starting at the possible last position,
4965 * not the last of the longest matches:
4966 *
4967 * s = 'foo'
4968 * s.byterindex(/o+/) # => 2
4969 * $~ #=> #<MatchData "o">
4970 *
4971 * To get the last longest match, use a negative lookbehind:
4972 *
4973 * s = 'foo'
4974 * s.byterindex(/(?<!o)o+/) # => 1
4975 * $~ # => #<MatchData "oo">
4976 *
4977 * Or use method #byteindex with negative lookahead:
4978 *
4979 * s = 'foo'
4980 * s.byteindex(/o+(?!.*o)/) # => 1
4981 * $~ #=> #<MatchData "oo">
4982 *
4983 * \Integer argument +offset+, if given, specifies the 0-based index
4984 * of the byte where searching is to end.
4985 *
4986 * When +offset+ is non-negative,
4987 * searching ends at byte position +offset+:
4988 *
4989 * s = 'foo'
4990 * s.byterindex('o', 0) # => nil
4991 * s.byterindex('o', 1) # => 1
4992 * s.byterindex('o', 2) # => 2
4993 * s.byterindex('o', 3) # => 2
4994 *
4995 * When +offset+ is negative, counts backward from the end of +self+:
4996 *
4997 * s = 'foo'
4998 * s.byterindex('o', -1) # => 2
4999 * s.byterindex('o', -2) # => 1
5000 * s.byterindex('o', -3) # => nil
5001 *
5002 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
5003 *
5004 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
5005 * s.size # => 2 # Two 3-byte characters.
5006 * s.bytesize # => 6 # Six bytes.
5007 * s.byterindex("\uFFFF") # => 3
5008 * s.byterindex("\uFFFF", 1) # Raises IndexError
5009 * s.byterindex("\uFFFF", 2) # Raises IndexError
5010 * s.byterindex("\uFFFF", 3) # => 3
5011 * s.byterindex("\uFFFF", 4) # Raises IndexError
5012 * s.byterindex("\uFFFF", 5) # Raises IndexError
5013 * s.byterindex("\uFFFF", 6) # => 3
5014 *
5015 * Related: see {Querying}[rdoc-ref:String@Querying].
5016 */
5017
5018static VALUE
5019rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5020{
5021 VALUE sub;
5022 VALUE initpos;
5023 long pos, len = RSTRING_LEN(str);
5024
5025 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5026 pos = NUM2LONG(initpos);
5027 if (pos < 0 && (pos += len) < 0) {
5028 if (RB_TYPE_P(sub, T_REGEXP)) {
5030 }
5031 return Qnil;
5032 }
5033 if (pos > len) pos = len;
5034 }
5035 else {
5036 pos = len;
5037 }
5038
5039 str_ensure_byte_pos(str, pos);
5040
5041 if (RB_TYPE_P(sub, T_REGEXP)) {
5042 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5043 VALUE match = rb_backref_get();
5044 struct re_registers *regs = RMATCH_REGS(match);
5045 pos = BEG(0);
5046 return LONG2NUM(pos);
5047 }
5048 }
5049 else {
5050 StringValue(sub);
5051 pos = rb_str_byterindex(str, sub, pos);
5052 if (pos >= 0) return LONG2NUM(pos);
5053 }
5054 return Qnil;
5055}
5056
5057/*
5058 * call-seq:
5059 * self =~ other -> integer or nil
5060 *
5061 * When +other+ is a Regexp:
5062 *
5063 * - Returns the integer index (in characters) of the first match
5064 * for +self+ and +other+, or +nil+ if none;
5065 * - Updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables].
5066 *
5067 * Examples:
5068 *
5069 * 'foo' =~ /f/ # => 0
5070 * $~ # => #<MatchData "f">
5071 * 'foo' =~ /o/ # => 1
5072 * $~ # => #<MatchData "o">
5073 * 'foo' =~ /x/ # => nil
5074 * $~ # => nil
5075 *
5076 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5077 * (see Regexp#=~):
5078 *
5079 * number = nil
5080 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5081 * number # => nil # Not assigned.
5082 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5083 * number # => "9" # Assigned.
5084 *
5085 * When +other+ is not a Regexp, returns the value
5086 * returned by <tt>other =~ self</tt>.
5087 *
5088 * Related: see {Querying}[rdoc-ref:String@Querying].
5089 */
5090
5091static VALUE
5092rb_str_match(VALUE x, VALUE y)
5093{
5094 switch (OBJ_BUILTIN_TYPE(y)) {
5095 case T_STRING:
5096 rb_raise(rb_eTypeError, "type mismatch: String given");
5097
5098 case T_REGEXP:
5099 return rb_reg_match(y, x);
5100
5101 default:
5102 return rb_funcall(y, idEqTilde, 1, x);
5103 }
5104}
5105
5106
5107static VALUE get_pat(VALUE);
5108
5109
5110/*
5111 * call-seq:
5112 * match(pattern, offset = 0) -> matchdata or nil
5113 * match(pattern, offset = 0) {|matchdata| ... } -> object
5114 *
5115 * Creates a MatchData object based on +self+ and the given arguments;
5116 * updates {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5117 *
5118 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5119 *
5120 * regexp = Regexp.new(pattern)
5121 *
5122 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5123 * (see Regexp#match):
5124 *
5125 * matchdata = regexp.match(self[offset..])
5126 *
5127 * With no block given, returns the computed +matchdata+ or +nil+:
5128 *
5129 * 'foo'.match('f') # => #<MatchData "f">
5130 * 'foo'.match('o') # => #<MatchData "o">
5131 * 'foo'.match('x') # => nil
5132 * 'foo'.match('f', 1) # => nil
5133 * 'foo'.match('o', 1) # => #<MatchData "o">
5134 *
5135 * With a block given and computed +matchdata+ non-nil, calls the block with +matchdata+;
5136 * returns the block's return value:
5137 *
5138 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5139 *
5140 * With a block given and +nil+ +matchdata+, does not call the block:
5141 *
5142 * 'foo'.match(/x/) {|matchdata| fail 'Cannot happen' } # => nil
5143 *
5144 * Related: see {Querying}[rdoc-ref:String@Querying].
5145 */
5146
5147static VALUE
5148rb_str_match_m(int argc, VALUE *argv, VALUE str)
5149{
5150 VALUE re, result;
5151 if (argc < 1)
5152 rb_check_arity(argc, 1, 2);
5153 re = argv[0];
5154 argv[0] = str;
5155 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5156 if (!NIL_P(result) && rb_block_given_p()) {
5157 return rb_yield(result);
5158 }
5159 return result;
5160}
5161
5162/*
5163 * call-seq:
5164 * match?(pattern, offset = 0) -> true or false
5165 *
5166 * Returns whether a match is found for +self+ and the given arguments;
5167 * does not update {Regexp Global Variables}[rdoc-ref:Regexp@Global+Variables].
5168 *
5169 * Computes +regexp+ by converting +pattern+ (if not already a Regexp):
5170 *
5171 * regexp = Regexp.new(pattern)
5172 *
5173 * Returns +true+ if <tt>self[offset..].match(regexp)</tt> returns a MatchData object,
5174 * +false+ otherwise:
5175 *
5176 * 'foo'.match?(/o/) # => true
5177 * 'foo'.match?('o') # => true
5178 * 'foo'.match?(/x/) # => false
5179 * 'foo'.match?('f', 1) # => false
5180 * 'foo'.match?('o', 1) # => true
5181 *
5182 * Related: see {Querying}[rdoc-ref:String@Querying].
5183 */
5184
5185static VALUE
5186rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5187{
5188 VALUE re;
5189 rb_check_arity(argc, 1, 2);
5190 re = get_pat(argv[0]);
5191 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5192}
5193
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* the bytes are not a steppable character */
    NEIGHBOR_FOUND,     /* a neighbor of the same byte length was written */
    NEIGHBOR_WRAPPED    /* stepping ran out of the range of that byte length */
};
5199
5200static enum neighbor_char
5201enc_succ_char(char *p, long len, rb_encoding *enc)
5202{
5203 long i;
5204 int l;
5205
5206 if (rb_enc_mbminlen(enc) > 1) {
5207 /* wchar, trivial case */
5208 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5209 if (!MBCLEN_CHARFOUND_P(r)) {
5210 return NEIGHBOR_NOT_CHAR;
5211 }
5212 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5213 l = rb_enc_code_to_mbclen(c, enc);
5214 if (!l) return NEIGHBOR_NOT_CHAR;
5215 if (l != len) return NEIGHBOR_WRAPPED;
5216 rb_enc_mbcput(c, p, enc);
5217 r = rb_enc_precise_mbclen(p, p + len, enc);
5218 if (!MBCLEN_CHARFOUND_P(r)) {
5219 return NEIGHBOR_NOT_CHAR;
5220 }
5221 return NEIGHBOR_FOUND;
5222 }
5223 while (1) {
5224 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5225 p[i] = '\0';
5226 if (i < 0)
5227 return NEIGHBOR_WRAPPED;
5228 ++((unsigned char*)p)[i];
5229 l = rb_enc_precise_mbclen(p, p+len, enc);
5230 if (MBCLEN_CHARFOUND_P(l)) {
5231 l = MBCLEN_CHARFOUND_LEN(l);
5232 if (l == len) {
5233 return NEIGHBOR_FOUND;
5234 }
5235 else {
5236 memset(p+l, 0xff, len-l);
5237 }
5238 }
5239 if (MBCLEN_INVALID_P(l) && i < len-1) {
5240 long len2;
5241 int l2;
5242 for (len2 = len-1; 0 < len2; len2--) {
5243 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5244 if (!MBCLEN_INVALID_P(l2))
5245 break;
5246 }
5247 memset(p+len2+1, 0xff, len-(len2+1));
5248 }
5249 }
5250}
5251
5252static enum neighbor_char
5253enc_pred_char(char *p, long len, rb_encoding *enc)
5254{
5255 long i;
5256 int l;
5257 if (rb_enc_mbminlen(enc) > 1) {
5258 /* wchar, trivial case */
5259 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5260 if (!MBCLEN_CHARFOUND_P(r)) {
5261 return NEIGHBOR_NOT_CHAR;
5262 }
5263 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5264 if (!c) return NEIGHBOR_NOT_CHAR;
5265 --c;
5266 l = rb_enc_code_to_mbclen(c, enc);
5267 if (!l) return NEIGHBOR_NOT_CHAR;
5268 if (l != len) return NEIGHBOR_WRAPPED;
5269 rb_enc_mbcput(c, p, enc);
5270 r = rb_enc_precise_mbclen(p, p + len, enc);
5271 if (!MBCLEN_CHARFOUND_P(r)) {
5272 return NEIGHBOR_NOT_CHAR;
5273 }
5274 return NEIGHBOR_FOUND;
5275 }
5276 while (1) {
5277 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5278 p[i] = '\xff';
5279 if (i < 0)
5280 return NEIGHBOR_WRAPPED;
5281 --((unsigned char*)p)[i];
5282 l = rb_enc_precise_mbclen(p, p+len, enc);
5283 if (MBCLEN_CHARFOUND_P(l)) {
5284 l = MBCLEN_CHARFOUND_LEN(l);
5285 if (l == len) {
5286 return NEIGHBOR_FOUND;
5287 }
5288 else {
5289 memset(p+l, 0, len-l);
5290 }
5291 }
5292 if (MBCLEN_INVALID_P(l) && i < len-1) {
5293 long len2;
5294 int l2;
5295 for (len2 = len-1; 0 < len2; len2--) {
5296 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5297 if (!MBCLEN_INVALID_P(l2))
5298 break;
5299 }
5300 memset(p+len2+1, 0, len-(len2+1));
5301 }
5302 }
5303}
5304
5305/*
5306 overwrite +p+ by succeeding letter in +enc+ and returns
5307 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5308 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5309 assuming each ranges are successive, and mbclen
5310 never change in each ranges.
5311 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5312 character.
5313 */
5314static enum neighbor_char
5315enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5316{
5317 enum neighbor_char ret;
5318 unsigned int c;
5319 int ctype;
5320 int range;
5321 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5322
5323 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5324 int try;
5325 const int max_gaps = 1;
5326
5327 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5328 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5329 ctype = ONIGENC_CTYPE_DIGIT;
5330 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5331 ctype = ONIGENC_CTYPE_ALPHA;
5332 else
5333 return NEIGHBOR_NOT_CHAR;
5334
5335 MEMCPY(save, p, char, len);
5336 for (try = 0; try <= max_gaps; ++try) {
5337 ret = enc_succ_char(p, len, enc);
5338 if (ret == NEIGHBOR_FOUND) {
5339 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5340 if (rb_enc_isctype(c, ctype, enc))
5341 return NEIGHBOR_FOUND;
5342 }
5343 }
5344 MEMCPY(p, save, char, len);
5345 range = 1;
5346 while (1) {
5347 MEMCPY(save, p, char, len);
5348 ret = enc_pred_char(p, len, enc);
5349 if (ret == NEIGHBOR_FOUND) {
5350 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5351 if (!rb_enc_isctype(c, ctype, enc)) {
5352 MEMCPY(p, save, char, len);
5353 break;
5354 }
5355 }
5356 else {
5357 MEMCPY(p, save, char, len);
5358 break;
5359 }
5360 range++;
5361 }
5362 if (range == 1) {
5363 return NEIGHBOR_NOT_CHAR;
5364 }
5365
5366 if (ctype != ONIGENC_CTYPE_DIGIT) {
5367 MEMCPY(carry, p, char, len);
5368 return NEIGHBOR_WRAPPED;
5369 }
5370
5371 MEMCPY(carry, p, char, len);
5372 enc_succ_char(carry, len, enc);
5373 return NEIGHBOR_WRAPPED;
5374}
5375
5376
5377static VALUE str_succ(VALUE str);
5378
5379/*
5380 * call-seq:
5381 * succ -> new_str
5382 *
5383 * :include: doc/string/succ.rdoc
5384 *
5385 */
5386
5387VALUE
5389{
5390 VALUE str;
5391 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5392 rb_enc_cr_str_copy_for_substr(str, orig);
5393 return str_succ(str);
5394}
5395
5396static VALUE
5397str_succ(VALUE str)
5398{
5399 rb_encoding *enc;
5400 char *sbeg, *s, *e, *last_alnum = 0;
5401 int found_alnum = 0;
5402 long l, slen;
5403 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5404 long carry_pos = 0, carry_len = 1;
5405 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5406
5407 slen = RSTRING_LEN(str);
5408 if (slen == 0) return str;
5409
5410 enc = STR_ENC_GET(str);
5411 sbeg = RSTRING_PTR(str);
5412 s = e = sbeg + slen;
5413
5414 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5415 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5416 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5417 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5418 break;
5419 }
5420 }
5421 l = rb_enc_precise_mbclen(s, e, enc);
5422 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5423 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5424 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5425 switch (neighbor) {
5426 case NEIGHBOR_NOT_CHAR:
5427 continue;
5428 case NEIGHBOR_FOUND:
5429 return str;
5430 case NEIGHBOR_WRAPPED:
5431 last_alnum = s;
5432 break;
5433 }
5434 found_alnum = 1;
5435 carry_pos = s - sbeg;
5436 carry_len = l;
5437 }
5438 if (!found_alnum) { /* str contains no alnum */
5439 s = e;
5440 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5441 enum neighbor_char neighbor;
5442 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5443 l = rb_enc_precise_mbclen(s, e, enc);
5444 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5445 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5446 MEMCPY(tmp, s, char, l);
5447 neighbor = enc_succ_char(tmp, l, enc);
5448 switch (neighbor) {
5449 case NEIGHBOR_FOUND:
5450 MEMCPY(s, tmp, char, l);
5451 return str;
5452 break;
5453 case NEIGHBOR_WRAPPED:
5454 MEMCPY(s, tmp, char, l);
5455 break;
5456 case NEIGHBOR_NOT_CHAR:
5457 break;
5458 }
5459 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5460 /* wrapped to \0...\0. search next valid char. */
5461 enc_succ_char(s, l, enc);
5462 }
5463 if (!rb_enc_asciicompat(enc)) {
5464 MEMCPY(carry, s, char, l);
5465 carry_len = l;
5466 }
5467 carry_pos = s - sbeg;
5468 }
5470 }
5471 RESIZE_CAPA(str, slen + carry_len);
5472 sbeg = RSTRING_PTR(str);
5473 s = sbeg + carry_pos;
5474 memmove(s + carry_len, s, slen - carry_pos);
5475 memmove(s, carry, carry_len);
5476 slen += carry_len;
5477 STR_SET_LEN(str, slen);
5478 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5479 rb_enc_str_coderange(str);
5480 return str;
5481}
5482
5483
5484/*
5485 * call-seq:
5486 * succ! -> self
5487 *
5488 * Like String#succ, but modifies +self+ in place; returns +self+.
5489 *
5490 * Related: see {Modifying}[rdoc-ref:String@Modifying].
5491 */
5492
5493static VALUE
5494rb_str_succ_bang(VALUE str)
5495{
5496 rb_str_modify(str);
5497 str_succ(str);
5498 return str;
5499}
5500
/* Returns 1 when every one of the +len+ bytes at +s+ is a digit
 * (vacuously so for len <= 0), 0 otherwise. */
static int
all_digits_p(const char *s, long len)
{
    const char *p = s;
    long remaining;

    for (remaining = len; remaining > 0; remaining--, p++) {
        if (!ISDIGIT(*p)) {
            return 0;
        }
    }
    return 1;
}
5510
5511static int
5512str_upto_i(VALUE str, VALUE arg)
5513{
5514 rb_yield(str);
5515 return 0;
5516}
5517
5518/*
5519 * call-seq:
5520 * upto(other_string, exclusive = false) {|string| ... } -> self
5521 * upto(other_string, exclusive = false) -> new_enumerator
5522 *
5523 * :include: doc/string/upto.rdoc
5524 *
5525 */
5526
5527static VALUE
5528rb_str_upto(int argc, VALUE *argv, VALUE beg)
5529{
5530 VALUE end, exclusive;
5531
5532 rb_scan_args(argc, argv, "11", &end, &exclusive);
5533 RETURN_ENUMERATOR(beg, argc, argv);
5534 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5535}
5536
5537VALUE
5538rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5539{
5540 VALUE current, after_end;
5541 ID succ;
5542 int n, ascii;
5543 rb_encoding *enc;
5544
5545 CONST_ID(succ, "succ");
5546 StringValue(end);
5547 enc = rb_enc_check(beg, end);
5548 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5549 /* single character */
5550 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5551 char c = RSTRING_PTR(beg)[0];
5552 char e = RSTRING_PTR(end)[0];
5553
5554 if (c > e || (excl && c == e)) return beg;
5555 for (;;) {
5556 VALUE str = rb_enc_str_new(&c, 1, enc);
5558 if ((*each)(str, arg)) break;
5559 if (!excl && c == e) break;
5560 c++;
5561 if (excl && c == e) break;
5562 }
5563 return beg;
5564 }
5565 /* both edges are all digits */
5566 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5567 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5568 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5569 VALUE b, e;
5570 int width;
5571
5572 width = RSTRING_LENINT(beg);
5573 b = rb_str_to_inum(beg, 10, FALSE);
5574 e = rb_str_to_inum(end, 10, FALSE);
5575 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5576 long bi = FIX2LONG(b);
5577 long ei = FIX2LONG(e);
5578 rb_encoding *usascii = rb_usascii_encoding();
5579
5580 while (bi <= ei) {
5581 if (excl && bi == ei) break;
5582 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5583 bi++;
5584 }
5585 }
5586 else {
5587 ID op = excl ? '<' : idLE;
5588 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5589
5590 args[0] = INT2FIX(width);
5591 while (rb_funcall(b, op, 1, e)) {
5592 args[1] = b;
5593 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5594 b = rb_funcallv(b, succ, 0, 0);
5595 }
5596 }
5597 return beg;
5598 }
5599 /* normal case */
5600 n = rb_str_cmp(beg, end);
5601 if (n > 0 || (excl && n == 0)) return beg;
5602
5603 after_end = rb_funcallv(end, succ, 0, 0);
5604 current = str_duplicate(rb_cString, beg);
5605 while (!rb_str_equal(current, after_end)) {
5606 VALUE next = Qnil;
5607 if (excl || !rb_str_equal(current, end))
5608 next = rb_funcallv(current, succ, 0, 0);
5609 if ((*each)(current, arg)) break;
5610 if (NIL_P(next)) break;
5611 current = next;
5612 StringValue(current);
5613 if (excl && rb_str_equal(current, end)) break;
5614 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5615 break;
5616 }
5617
5618 return beg;
5619}
5620
5621VALUE
5622rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5623{
5624 VALUE current;
5625 ID succ;
5626
5627 CONST_ID(succ, "succ");
5628 /* both edges are all digits */
5629 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5630 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5631 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5632 int width = RSTRING_LENINT(beg);
5633 b = rb_str_to_inum(beg, 10, FALSE);
5634 if (FIXNUM_P(b)) {
5635 long bi = FIX2LONG(b);
5636 rb_encoding *usascii = rb_usascii_encoding();
5637
5638 while (FIXABLE(bi)) {
5639 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5640 bi++;
5641 }
5642 b = LONG2NUM(bi);
5643 }
5644 args[0] = INT2FIX(width);
5645 while (1) {
5646 args[1] = b;
5647 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5648 b = rb_funcallv(b, succ, 0, 0);
5649 }
5650 }
5651 /* normal case */
5652 current = str_duplicate(rb_cString, beg);
5653 while (1) {
5654 VALUE next = rb_funcallv(current, succ, 0, 0);
5655 if ((*each)(current, arg)) break;
5656 current = next;
5657 StringValue(current);
5658 if (RSTRING_LEN(current) == 0)
5659 break;
5660 }
5661
5662 return beg;
5663}
5664
5665static int
5666include_range_i(VALUE str, VALUE arg)
5667{
5668 VALUE *argp = (VALUE *)arg;
5669 if (!rb_equal(str, *argp)) return 0;
5670 *argp = Qnil;
5671 return 1;
5672}
5673
5674VALUE
5675rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5676{
5677 beg = rb_str_new_frozen(beg);
5678 StringValue(end);
5679 end = rb_str_new_frozen(end);
5680 if (NIL_P(val)) return Qfalse;
5681 val = rb_check_string_type(val);
5682 if (NIL_P(val)) return Qfalse;
5683 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5684 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5685 rb_enc_asciicompat(STR_ENC_GET(val))) {
5686 const char *bp = RSTRING_PTR(beg);
5687 const char *ep = RSTRING_PTR(end);
5688 const char *vp = RSTRING_PTR(val);
5689 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5690 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5691 return Qfalse;
5692 else {
5693 char b = *bp;
5694 char e = *ep;
5695 char v = *vp;
5696
5697 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5698 if (b <= v && v < e) return Qtrue;
5699 return RBOOL(!RTEST(exclusive) && v == e);
5700 }
5701 }
5702 }
5703#if 0
5704 /* both edges are all digits */
5705 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5706 all_digits_p(bp, RSTRING_LEN(beg)) &&
5707 all_digits_p(ep, RSTRING_LEN(end))) {
5708 /* TODO */
5709 }
5710#endif
5711 }
5712 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5713
5714 return RBOOL(NIL_P(val));
5715}
5716
5717static VALUE
5718rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5719{
5720 if (rb_reg_search(re, str, 0, 0) >= 0) {
5721 VALUE match = rb_backref_get();
5722 int nth = rb_reg_backref_number(match, backref);
5723 return rb_reg_nth_match(nth, match);
5724 }
5725 return Qnil;
5726}
5727
5728static VALUE
5729rb_str_aref(VALUE str, VALUE indx)
5730{
5731 long idx;
5732
5733 if (FIXNUM_P(indx)) {
5734 idx = FIX2LONG(indx);
5735 }
5736 else if (RB_TYPE_P(indx, T_REGEXP)) {
5737 return rb_str_subpat(str, indx, INT2FIX(0));
5738 }
5739 else if (RB_TYPE_P(indx, T_STRING)) {
5740 if (rb_str_index(str, indx, 0) != -1)
5741 return str_duplicate(rb_cString, indx);
5742 return Qnil;
5743 }
5744 else {
5745 /* check if indx is Range */
5746 long beg, len = str_strlen(str, NULL);
5747 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5748 case Qfalse:
5749 break;
5750 case Qnil:
5751 return Qnil;
5752 default:
5753 return rb_str_substr(str, beg, len);
5754 }
5755 idx = NUM2LONG(indx);
5756 }
5757
5758 return str_substr(str, idx, 1, FALSE);
5759}
5760
5761
5762/*
5763 * call-seq:
5764 * self[offset] -> new_string or nil
5765 * self[offset, size] -> new_string or nil
5766 * self[range] -> new_string or nil
5767 * self[regexp, capture = 0] -> new_string or nil
5768 * self[substring] -> new_string or nil
5769 *
5770 * :include: doc/string/aref.rdoc
5771 *
5772 */
5773
5774static VALUE
5775rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5776{
5777 if (argc == 2) {
5778 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5779 return rb_str_subpat(str, argv[0], argv[1]);
5780 }
5781 else {
5782 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5783 }
5784 }
5785 rb_check_arity(argc, 1, 2);
5786 return rb_str_aref(str, argv[0]);
5787}
5788
5789VALUE
5791{
5792 char *ptr = RSTRING_PTR(str);
5793 long olen = RSTRING_LEN(str), nlen;
5794
5795 str_modifiable(str);
5796 if (len > olen) len = olen;
5797 nlen = olen - len;
5798 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5799 char *oldptr = ptr;
5800 size_t old_capa = RSTRING(str)->as.heap.aux.capa + TERM_LEN(str);
5801 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5802 STR_SET_EMBED(str);
5803 ptr = RSTRING(str)->as.embed.ary;
5804 memmove(ptr, oldptr + len, nlen);
5805 if (fl == STR_NOEMBED) {
5806 SIZED_FREE_N(oldptr, old_capa);
5807 }
5808 }
5809 else {
5810 if (!STR_SHARED_P(str)) {
5811 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5812 rb_enc_cr_str_exact_copy(shared, str);
5813 OBJ_FREEZE(shared);
5814 }
5815 ptr = RSTRING(str)->as.heap.ptr += len;
5816 }
5817 STR_SET_LEN(str, nlen);
5818
5819 if (!SHARABLE_MIDDLE_SUBSTRING) {
5820 TERM_FILL(ptr + nlen, TERM_LEN(str));
5821 }
5823 return str;
5824}
5825
5826static void
5827rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5828{
5829 char *sptr;
5830 long slen;
5831 int cr;
5832
5833 if (beg == 0 && vlen == 0) {
5834 rb_str_drop_bytes(str, len);
5835 return;
5836 }
5837
5838 str_modify_keep_cr(str);
5839 RSTRING_GETMEM(str, sptr, slen);
5840 if (len < vlen) {
5841 /* expand string */
5842 RESIZE_CAPA(str, slen + vlen - len);
5843 sptr = RSTRING_PTR(str);
5844 }
5845
5847 cr = rb_enc_str_coderange(val);
5848 else
5850
5851 if (vlen != len) {
5852 memmove(sptr + beg + vlen,
5853 sptr + beg + len,
5854 slen - (beg + len));
5855 }
5856 if (vlen < beg && len < 0) {
5857 MEMZERO(sptr + slen, char, -len);
5858 }
5859 if (vlen > 0) {
5860 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5861 }
5862 slen += vlen - len;
5863 STR_SET_LEN(str, slen);
5864 TERM_FILL(&sptr[slen], TERM_LEN(str));
5865 ENC_CODERANGE_SET(str, cr);
5866}
5867
5868static inline void
5869rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5870{
5871 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5872}
5873
5874void
5875rb_str_update(VALUE str, long beg, long len, VALUE val)
5876{
5877 long slen;
5878 char *p, *e;
5879 rb_encoding *enc;
5880 int singlebyte = single_byte_optimizable(str);
5881 int cr;
5882
5883 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5884
5885 StringValue(val);
5886 enc = rb_enc_check(str, val);
5887 slen = str_strlen(str, enc); /* rb_enc_check */
5888
5889 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5890 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5891 }
5892 if (beg < 0) {
5893 beg += slen;
5894 }
5895 RUBY_ASSERT(beg >= 0);
5896 RUBY_ASSERT(beg <= slen);
5897
5898 if (len > slen - beg) {
5899 len = slen - beg;
5900 }
5901 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5902 if (!p) p = RSTRING_END(str);
5903 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5904 if (!e) e = RSTRING_END(str);
5905 /* error check */
5906 beg = p - RSTRING_PTR(str); /* physical position */
5907 len = e - p; /* physical length */
5908 rb_str_update_0(str, beg, len, val);
5909 rb_enc_associate(str, enc);
5911 if (cr != ENC_CODERANGE_BROKEN)
5912 ENC_CODERANGE_SET(str, cr);
5913}
5914
5915static void
5916rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5917{
5918 int nth;
5919 VALUE match;
5920 long start, end, len;
5921 rb_encoding *enc;
5922 struct re_registers *regs;
5923
5924 if (rb_reg_search(re, str, 0, 0) < 0) {
5925 rb_raise(rb_eIndexError, "regexp not matched");
5926 }
5927 match = rb_backref_get();
5928 nth = rb_reg_backref_number(match, backref);
5929 regs = RMATCH_REGS(match);
5930 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5931 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5932 }
5933 if (nth < 0) {
5934 nth += regs->num_regs;
5935 }
5936
5937 start = BEG(nth);
5938 if (start == -1) {
5939 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5940 }
5941 end = END(nth);
5942 len = end - start;
5943 StringValue(val);
5944 enc = rb_enc_check_str(str, val);
5945 rb_str_update_0(str, start, len, val);
5946 rb_enc_associate(str, enc);
5947}
5948
5949static VALUE
5950rb_str_aset(VALUE str, VALUE indx, VALUE val)
5951{
5952 long idx, beg;
5953
5954 switch (TYPE(indx)) {
5955 case T_REGEXP:
5956 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5957 return val;
5958
5959 case T_STRING:
5960 beg = rb_str_index(str, indx, 0);
5961 if (beg < 0) {
5962 rb_raise(rb_eIndexError, "string not matched");
5963 }
5964 beg = rb_str_sublen(str, beg);
5965 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5966 return val;
5967
5968 default:
5969 /* check if indx is Range */
5970 {
5971 long beg, len;
5972 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5973 rb_str_update(str, beg, len, val);
5974 return val;
5975 }
5976 }
5977 /* FALLTHROUGH */
5978
5979 case T_FIXNUM:
5980 idx = NUM2LONG(indx);
5981 rb_str_update(str, idx, 1, val);
5982 return val;
5983 }
5984}
5985
5986/*
5987 * call-seq:
5988 * self[index] = other_string -> new_string
5989 * self[start, length] = other_string -> new_string
5990 * self[range] = other_string -> new_string
5991 * self[regexp, capture = 0] = other_string -> new_string
5992 * self[substring] = other_string -> new_string
5993 *
5994 * :include: doc/string/aset.rdoc
5995 *
5996 */
5997
5998static VALUE
5999rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6000{
6001 if (argc == 3) {
6002 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6003 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6004 }
6005 else {
6006 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6007 }
6008 return argv[2];
6009 }
6010 rb_check_arity(argc, 2, 3);
6011 return rb_str_aset(str, argv[0], argv[1]);
6012}
6013
6014/*
6015 * call-seq:
6016 * insert(offset, other_string) -> self
6017 *
6018 * :include: doc/string/insert.rdoc
6019 *
6020 */
6021
6022static VALUE
6023rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6024{
6025 long pos = NUM2LONG(idx);
6026
6027 if (pos == -1) {
6028 return rb_str_append(str, str2);
6029 }
6030 else if (pos < 0) {
6031 pos++;
6032 }
6033 rb_str_update(str, pos, 0, str2);
6034 return str;
6035}
6036
6037
6038/*
6039 * call-seq:
6040 * slice!(index) -> new_string or nil
6041 * slice!(start, length) -> new_string or nil
6042 * slice!(range) -> new_string or nil
6043 * slice!(regexp, capture = 0) -> new_string or nil
6044 * slice!(substring) -> new_string or nil
6045 *
6046 * Like String#[] (and its alias String#slice), except that:
6047 *
6048 * - Performs substitutions in +self+ (not in a copy of +self+).
6049 * - Returns the removed substring if any modifications were made, +nil+ otherwise.
6050 *
6051 * A few examples:
6052 *
6053 * s = 'hello'
6054 * s.slice!('e') # => "e"
6055 * s # => "hllo"
6056 * s.slice!('e') # => nil
6057 * s # => "hllo"
6058 *
6059 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6060 */
6061
6062static VALUE
6063rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6064{
6065 VALUE result = Qnil;
6066 VALUE indx;
6067 long beg, len = 1;
6068 char *p;
6069
6070 rb_check_arity(argc, 1, 2);
6071 str_modify_keep_cr(str);
6072 indx = argv[0];
6073 if (RB_TYPE_P(indx, T_REGEXP)) {
6074 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6075 VALUE match = rb_backref_get();
6076 struct re_registers *regs = RMATCH_REGS(match);
6077 int nth = 0;
6078 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6079 if ((nth += regs->num_regs) <= 0) return Qnil;
6080 }
6081 else if (nth >= regs->num_regs) return Qnil;
6082 beg = BEG(nth);
6083 len = END(nth) - beg;
6084 goto subseq;
6085 }
6086 else if (argc == 2) {
6087 beg = NUM2LONG(indx);
6088 len = NUM2LONG(argv[1]);
6089 goto num_index;
6090 }
6091 else if (FIXNUM_P(indx)) {
6092 beg = FIX2LONG(indx);
6093 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6094 if (!len) return Qnil;
6095 beg = p - RSTRING_PTR(str);
6096 goto subseq;
6097 }
6098 else if (RB_TYPE_P(indx, T_STRING)) {
6099 beg = rb_str_index(str, indx, 0);
6100 if (beg == -1) return Qnil;
6101 len = RSTRING_LEN(indx);
6102 result = str_duplicate(rb_cString, indx);
6103 goto squash;
6104 }
6105 else {
6106 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6107 case Qnil:
6108 return Qnil;
6109 case Qfalse:
6110 beg = NUM2LONG(indx);
6111 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6112 if (!len) return Qnil;
6113 beg = p - RSTRING_PTR(str);
6114 goto subseq;
6115 default:
6116 goto num_index;
6117 }
6118 }
6119
6120 num_index:
6121 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6122 beg = p - RSTRING_PTR(str);
6123
6124 subseq:
6125 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6126 rb_enc_cr_str_copy_for_substr(result, str);
6127
6128 squash:
6129 if (len > 0) {
6130 if (beg == 0) {
6131 rb_str_drop_bytes(str, len);
6132 }
6133 else {
6134 char *sptr = RSTRING_PTR(str);
6135 long slen = RSTRING_LEN(str);
6136 if (beg + len > slen) /* pathological check */
6137 len = slen - beg;
6138 memmove(sptr + beg,
6139 sptr + beg + len,
6140 slen - (beg + len));
6141 slen -= len;
6142 STR_SET_LEN(str, slen);
6143 TERM_FILL(&sptr[slen], TERM_LEN(str));
6144 }
6145 }
6146 return result;
6147}
6148
6149static VALUE
6150get_pat(VALUE pat)
6151{
6152 VALUE val;
6153
6154 switch (OBJ_BUILTIN_TYPE(pat)) {
6155 case T_REGEXP:
6156 return pat;
6157
6158 case T_STRING:
6159 break;
6160
6161 default:
6162 val = rb_check_string_type(pat);
6163 if (NIL_P(val)) {
6164 Check_Type(pat, T_REGEXP);
6165 }
6166 pat = val;
6167 }
6168
6169 return rb_reg_regcomp(pat);
6170}
6171
6172static VALUE
6173get_pat_quoted(VALUE pat, int check)
6174{
6175 VALUE val;
6176
6177 switch (OBJ_BUILTIN_TYPE(pat)) {
6178 case T_REGEXP:
6179 return pat;
6180
6181 case T_STRING:
6182 break;
6183
6184 default:
6185 val = rb_check_string_type(pat);
6186 if (NIL_P(val)) {
6187 Check_Type(pat, T_REGEXP);
6188 }
6189 pat = val;
6190 }
6191 if (check && is_broken_string(pat)) {
6192 rb_exc_raise(rb_reg_check_preprocess(pat));
6193 }
6194 return pat;
6195}
6196
6197static long
6198rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6199{
6200 if (BUILTIN_TYPE(pat) == T_STRING) {
6201 pos = rb_str_byteindex(str, pat, pos);
6202 if (set_backref_str) {
6203 if (pos >= 0) {
6204 str = rb_str_new_frozen_String(str);
6205 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6206 if (match) {
6207 *match = match_data;
6208 }
6209 }
6210 else {
6212 }
6213 }
6214 return pos;
6215 }
6216 else {
6217 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6218 }
6219}
6220
6221static long
6222rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6223{
6224 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6225}
6226
6227
6228/*
6229 * call-seq:
6230 * sub!(pattern, replacement) -> self or nil
6231 * sub!(pattern) {|match| ... } -> self or nil
6232 *
6233 * Like String#sub, except that:
6234 *
 * - Changes are made to +self+, not to a copy of +self+.
6236 * - Returns +self+ if any changes are made, +nil+ otherwise.
6237 *
6238 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6239 */
6240
6241static VALUE
6242rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6243{
6244 VALUE pat, repl, hash = Qnil;
6245 int iter = 0;
6246 long plen;
6247 int min_arity = rb_block_given_p() ? 1 : 2;
6248 long beg;
6249
6250 rb_check_arity(argc, min_arity, 2);
6251 if (argc == 1) {
6252 iter = 1;
6253 }
6254 else {
6255 repl = argv[1];
6256 if (!RB_TYPE_P(repl, T_STRING)) {
6257 hash = rb_check_hash_type(repl);
6258 if (NIL_P(hash)) {
6259 StringValue(repl);
6260 }
6261 }
6262 }
6263
6264 pat = get_pat_quoted(argv[0], 1);
6265
6266 str_modifiable(str);
6267 beg = rb_pat_search(pat, str, 0, 1);
6268 if (beg >= 0) {
6269 rb_encoding *enc;
6270 int cr = ENC_CODERANGE(str);
6271 long beg0, end0;
6272 VALUE match, match0 = Qnil;
6273 struct re_registers *regs;
6274 char *p, *rp;
6275 long len, rlen;
6276
6277 match = rb_backref_get();
6278 regs = RMATCH_REGS(match);
6279 if (RB_TYPE_P(pat, T_STRING)) {
6280 beg0 = beg;
6281 end0 = beg0 + RSTRING_LEN(pat);
6282 match0 = pat;
6283 }
6284 else {
6285 beg0 = BEG(0);
6286 end0 = END(0);
6287 if (iter) match0 = rb_reg_nth_match(0, match);
6288 }
6289
6290 if (iter || !NIL_P(hash)) {
6291 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6292
6293 if (iter) {
6294 repl = rb_obj_as_string(rb_yield(match0));
6295 }
6296 else {
6297 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6298 repl = rb_obj_as_string(repl);
6299 }
6300 str_mod_check(str, p, len);
6301 rb_check_frozen(str);
6302 }
6303 else {
6304 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6305 }
6306
6307 enc = rb_enc_compatible(str, repl);
6308 if (!enc) {
6309 rb_encoding *str_enc = STR_ENC_GET(str);
6310 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6311 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6312 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6313 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6314 rb_enc_inspect_name(str_enc),
6315 rb_enc_inspect_name(STR_ENC_GET(repl)));
6316 }
6317 enc = STR_ENC_GET(repl);
6318 }
6319 rb_str_modify(str);
6320 rb_enc_associate(str, enc);
6322 int cr2 = ENC_CODERANGE(repl);
6323 if (cr2 == ENC_CODERANGE_BROKEN ||
6324 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6326 else
6327 cr = cr2;
6328 }
6329 plen = end0 - beg0;
6330 rlen = RSTRING_LEN(repl);
6331 len = RSTRING_LEN(str);
6332 if (rlen > plen) {
6333 RESIZE_CAPA(str, len + rlen - plen);
6334 }
6335 p = RSTRING_PTR(str);
6336 if (rlen != plen) {
6337 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6338 }
6339 rp = RSTRING_PTR(repl);
6340 memmove(p + beg0, rp, rlen);
6341 len += rlen - plen;
6342 STR_SET_LEN(str, len);
6343 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6344 ENC_CODERANGE_SET(str, cr);
6345
6346 RB_GC_GUARD(match);
6347
6348 return str;
6349 }
6350 return Qnil;
6351}
6352
6353
6354/*
6355 * call-seq:
6356 * sub(pattern, replacement) -> new_string
6357 * sub(pattern) {|match| ... } -> new_string
6358 *
6359 * :include: doc/string/sub.rdoc
6360 */
6361
6362static VALUE
6363rb_str_sub(int argc, VALUE *argv, VALUE str)
6364{
6365 str = str_duplicate(rb_cString, str);
6366 rb_str_sub_bang(argc, argv, str);
6367 return str;
6368}
6369
6370static VALUE
6371str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6372{
6373 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6374 long beg, beg0, end0;
6375 long offset, blen, slen, len, last;
6376 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6377 char *sp, *cp;
6378 int need_backref_str = -1;
6379 rb_encoding *str_enc;
6380
6381 switch (argc) {
6382 case 1:
6383 RETURN_ENUMERATOR(str, argc, argv);
6384 mode = ITER;
6385 break;
6386 case 2:
6387 repl = argv[1];
6388 if (!RB_TYPE_P(repl, T_STRING)) {
6389 hash = rb_check_hash_type(repl);
6390 if (NIL_P(hash)) {
6391 StringValue(repl);
6392 }
6393 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6394 mode = FAST_MAP;
6395 }
6396 else {
6397 mode = MAP;
6398 }
6399 }
6400 break;
6401 default:
6402 rb_error_arity(argc, 1, 2);
6403 }
6404
6405 pat = get_pat_quoted(argv[0], 1);
6406 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6407
6408 if (beg < 0) {
6409 if (bang) return Qnil; /* no match, no substitution */
6410 return str_duplicate(rb_cString, str);
6411 }
6412
6413 offset = 0;
6414 blen = RSTRING_LEN(str) + 30; /* len + margin */
6415 dest = rb_str_buf_new(blen);
6416 sp = RSTRING_PTR(str);
6417 slen = RSTRING_LEN(str);
6418 cp = sp;
6419 str_enc = STR_ENC_GET(str);
6420 rb_enc_associate(dest, str_enc);
6421 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6422
6423 do {
6424 struct re_registers *regs = RMATCH_REGS(match);
6425 if (RB_TYPE_P(pat, T_STRING)) {
6426 beg0 = beg;
6427 end0 = beg0 + RSTRING_LEN(pat);
6428 match0 = pat;
6429 }
6430 else {
6431 beg0 = BEG(0);
6432 end0 = END(0);
6433 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6434 }
6435
6436 if (mode != STR) {
6437 if (mode == ITER) {
6438 val = rb_obj_as_string(rb_yield(match0));
6439 }
6440 else {
6441 struct RString fake_str = {RBASIC_INIT};
6442 VALUE key;
6443 if (mode == FAST_MAP) {
6444 // It is safe to use a fake_str here because we established that it won't escape,
6445 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6446 // default proc.
6447 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6448 }
6449 else {
6450 key = rb_str_subseq(str, beg0, end0 - beg0);
6451 }
6452 val = rb_hash_aref(hash, key);
6453 val = rb_obj_as_string(val);
6454 }
6455 str_mod_check(str, sp, slen);
6456 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6457 rb_raise(rb_eRuntimeError, "block should not cheat");
6458 }
6459 }
6460 else if (need_backref_str) {
6461 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6462 if (need_backref_str < 0) {
6463 need_backref_str = val != repl;
6464 }
6465 }
6466 else {
6467 val = repl;
6468 }
6469
6470 len = beg0 - offset; /* copy pre-match substr */
6471 if (len) {
6472 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6473 }
6474
6475 rb_str_buf_append(dest, val);
6476
6477 last = offset;
6478 offset = end0;
6479 if (beg0 == end0) {
6480 /*
6481 * Always consume at least one character of the input string
6482 * in order to prevent infinite loops.
6483 */
6484 if (RSTRING_LEN(str) <= end0) break;
6485 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6486 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6487 offset = end0 + len;
6488 }
6489 cp = RSTRING_PTR(str) + offset;
6490 if (offset > RSTRING_LEN(str)) break;
6491
6492 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6493 if (mode != FAST_MAP && mode != STR) {
6494 match = Qnil;
6495 }
6496 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6497
6498 RB_GC_GUARD(match);
6499 } while (beg >= 0);
6500
6501 if (RSTRING_LEN(str) > offset) {
6502 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6503 }
6504 rb_pat_search0(pat, str, last, 1, &match);
6505 if (bang) {
6506 str_shared_replace(str, dest);
6507 }
6508 else {
6509 str = dest;
6510 }
6511
6512 return str;
6513}
6514
6515
6516/*
6517 * call-seq:
6518 * gsub!(pattern, replacement) -> self or nil
6519 * gsub!(pattern) {|match| ... } -> self or nil
6520 * gsub!(pattern) -> an_enumerator
6521 *
6522 * Like String#gsub, except that:
6523 *
6524 * - Performs substitutions in +self+ (not in a copy of +self+).
 * - Returns +self+ if any substitutions were made, +nil+ otherwise.
6526 *
6527 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6528 */
6529
6530static VALUE
6531rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6532{
6533 str_modify_keep_cr(str);
6534 return str_gsub(argc, argv, str, 1);
6535}
6536
6537
6538/*
6539 * call-seq:
6540 * gsub(pattern, replacement) -> new_string
6541 * gsub(pattern) {|match| ... } -> new_string
6542 * gsub(pattern) -> enumerator
6543 *
6544 * Returns a copy of +self+ with zero or more substrings replaced.
6545 *
6546 * Argument +pattern+ may be a string or a Regexp;
6547 * argument +replacement+ may be a string or a Hash.
6548 * Varying types for the argument values makes this method very versatile.
6549 *
6550 * Below are some simple examples;
6551 * for many more examples, see {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6552 *
6553 * With arguments +pattern+ and string +replacement+ given,
6554 * replaces each matching substring with the given +replacement+ string:
6555 *
6556 * s = 'abracadabra'
6557 * s.gsub('ab', 'AB') # => "ABracadABra"
6558 * s.gsub(/[a-c]/, 'X') # => "XXrXXXdXXrX"
6559 *
6560 * With arguments +pattern+ and hash +replacement+ given,
6561 * replaces each matching substring with a value from the given +replacement+ hash,
6562 * or removes it:
6563 *
6564 * h = {'a' => 'A', 'b' => 'B', 'c' => 'C'}
6565 * s.gsub(/[a-c]/, h) # => "ABrACAdABrA" # 'a', 'b', 'c' replaced.
6566 * s.gsub(/[a-d]/, h) # => "ABrACAABrA" # 'd' removed.
6567 *
6568 * With argument +pattern+ and a block given,
6569 * calls the block with each matching substring;
6570 * replaces that substring with the block's return value:
6571 *
6572 * s.gsub(/[a-d]/) {|substring| substring.upcase }
6573 * # => "ABrACADABrA"
6574 *
6575 * With argument +pattern+ and no block given,
6576 * returns a new Enumerator.
6577 *
6578 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6579 */
6580
6581static VALUE
6582rb_str_gsub(int argc, VALUE *argv, VALUE str)
6583{
6584 return str_gsub(argc, argv, str, 0);
6585}
6586
6587
6588/*
6589 * call-seq:
6590 * replace(other_string) -> self
6591 *
6592 * Replaces the contents of +self+ with the contents of +other_string+;
6593 * returns +self+:
6594 *
6595 * s = 'foo' # => "foo"
6596 * s.replace('bar') # => "bar"
6597 *
6598 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6599 */
6600
6601VALUE
6603{
6604 str_modifiable(str);
6605 if (str == str2) return str;
6606
6607 StringValue(str2);
6608 str_discard(str);
6609 return str_replace(str, str2);
6610}
6611
6612/*
6613 * call-seq:
6614 * clear -> self
6615 *
6616 * Removes the contents of +self+:
6617 *
6618 * s = 'foo'
6619 * s.clear # => ""
6620 * s # => ""
6621 *
6622 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6623 */
6624
6625static VALUE
6626rb_str_clear(VALUE str)
6627{
6628 str_discard(str);
6629 STR_SET_EMBED(str);
6630 STR_SET_LEN(str, 0);
6631 RSTRING_PTR(str)[0] = 0;
6632 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6634 else
6636 return str;
6637}
6638
6639/*
6640 * call-seq:
6641 * chr -> string
6642 *
6643 * :include: doc/string/chr.rdoc
6644 *
6645 */
6646
6647static VALUE
6648rb_str_chr(VALUE str)
6649{
6650 return rb_str_substr(str, 0, 1);
6651}
6652
6653/*
6654 * call-seq:
6655 * getbyte(index) -> integer or nil
6656 *
6657 * :include: doc/string/getbyte.rdoc
6658 *
6659 */
6660VALUE
6661rb_str_getbyte(VALUE str, VALUE index)
6662{
6663 long pos = NUM2LONG(index);
6664
6665 if (pos < 0)
6666 pos += RSTRING_LEN(str);
6667 if (pos < 0 || RSTRING_LEN(str) <= pos)
6668 return Qnil;
6669
6670 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6671}
6672
6673/*
6674 * call-seq:
6675 * setbyte(index, integer) -> integer
6676 *
6677 * Sets the byte at zero-based offset +index+ to the value of the given +integer+;
6678 * returns +integer+:
6679 *
6680 * s = 'xyzzy'
6681 * s.setbyte(2, 129) # => 129
6682 * s # => "xy\x81zy"
6683 *
6684 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6685 */
6686VALUE
6687rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6688{
6689 long pos = NUM2LONG(index);
6690 long len = RSTRING_LEN(str);
6691 char *ptr, *head, *left = 0;
6692 rb_encoding *enc;
6693 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6694
6695 if (pos < -len || len <= pos)
6696 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6697 if (pos < 0)
6698 pos += len;
6699
6700 VALUE v = rb_to_int(value);
6701 VALUE w = rb_int_and(v, INT2FIX(0xff));
6702 char byte = (char)(NUM2INT(w) & 0xFF);
6703
6704 if (!str_independent(str))
6705 str_make_independent(str);
6706 enc = STR_ENC_GET(str);
6707 head = RSTRING_PTR(str);
6708 ptr = &head[pos];
6709 if (!STR_EMBED_P(str)) {
6710 cr = ENC_CODERANGE(str);
6711 switch (cr) {
6712 case ENC_CODERANGE_7BIT:
6713 left = ptr;
6714 *ptr = byte;
6715 if (ISASCII(byte)) goto end;
6716 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6717 if (!MBCLEN_CHARFOUND_P(nlen))
6719 else
6721 goto end;
6723 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6724 width = rb_enc_precise_mbclen(left, head+len, enc);
6725 *ptr = byte;
6726 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6727 if (!MBCLEN_CHARFOUND_P(nlen))
6729 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6731 goto end;
6732 }
6733 }
6735 *ptr = byte;
6736
6737 end:
6738 return value;
6739}
6740
6741static VALUE
6742str_byte_substr(VALUE str, long beg, long len, int empty)
6743{
6744 long n = RSTRING_LEN(str);
6745
6746 if (beg > n || len < 0) return Qnil;
6747 if (beg < 0) {
6748 beg += n;
6749 if (beg < 0) return Qnil;
6750 }
6751 if (len > n - beg)
6752 len = n - beg;
6753 if (len <= 0) {
6754 if (!empty) return Qnil;
6755 len = 0;
6756 }
6757
6758 VALUE str2 = str_subseq(str, beg, len);
6759
6760 str_enc_copy_direct(str2, str);
6761
6762 if (RSTRING_LEN(str2) == 0) {
6763 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6765 else
6767 }
6768 else {
6769 switch (ENC_CODERANGE(str)) {
6770 case ENC_CODERANGE_7BIT:
6772 break;
6773 default:
6775 break;
6776 }
6777 }
6778
6779 return str2;
6780}
6781
6782VALUE
6783rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6784{
6785 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6786}
6787
6788static VALUE
6789str_byte_aref(VALUE str, VALUE indx)
6790{
6791 long idx;
6792 if (FIXNUM_P(indx)) {
6793 idx = FIX2LONG(indx);
6794 }
6795 else {
6796 /* check if indx is Range */
6797 long beg, len = RSTRING_LEN(str);
6798
6799 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6800 case Qfalse:
6801 break;
6802 case Qnil:
6803 return Qnil;
6804 default:
6805 return str_byte_substr(str, beg, len, TRUE);
6806 }
6807
6808 idx = NUM2LONG(indx);
6809 }
6810 return str_byte_substr(str, idx, 1, FALSE);
6811}
6812
6813/*
6814 * call-seq:
6815 * byteslice(offset, length = 1) -> string or nil
6816 * byteslice(range) -> string or nil
6817 *
6818 * :include: doc/string/byteslice.rdoc
6819 */
6820
6821static VALUE
6822rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6823{
6824 if (argc == 2) {
6825 long beg = NUM2LONG(argv[0]);
6826 long len = NUM2LONG(argv[1]);
6827 return str_byte_substr(str, beg, len, TRUE);
6828 }
6829 rb_check_arity(argc, 1, 2);
6830 return str_byte_aref(str, argv[0]);
6831}
6832
6833static void
6834str_check_beg_len(VALUE str, long *beg, long *len)
6835{
6836 long end, slen = RSTRING_LEN(str);
6837
6838 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6839 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6840 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6841 }
6842 if (*beg < 0) {
6843 *beg += slen;
6844 }
6845 RUBY_ASSERT(*beg >= 0);
6846 RUBY_ASSERT(*beg <= slen);
6847
6848 if (*len > slen - *beg) {
6849 *len = slen - *beg;
6850 }
6851 end = *beg + *len;
6852 str_ensure_byte_pos(str, *beg);
6853 str_ensure_byte_pos(str, end);
6854}
6855
6856/*
6857 * call-seq:
6858 * bytesplice(offset, length, str) -> self
6859 * bytesplice(offset, length, str, str_offset, str_length) -> self
6860 * bytesplice(range, str) -> self
6861 * bytesplice(range, str, str_range) -> self
6862 *
6863 * :include: doc/string/bytesplice.rdoc
6864 */
6865
6866static VALUE
6867rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6868{
6869 long beg, len, vbeg, vlen;
6870 VALUE val;
6871 int cr;
6872
6873 rb_check_arity(argc, 2, 5);
6874 if (!(argc == 2 || argc == 3 || argc == 5)) {
6875 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6876 }
6877 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6878 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6879 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6880 rb_builtin_class_name(argv[0]));
6881 }
6882 val = argv[1];
6883 StringValue(val);
6884 if (argc == 2) {
6885 /* bytesplice(range, str) */
6886 vbeg = 0;
6887 vlen = RSTRING_LEN(val);
6888 }
6889 else {
6890 /* bytesplice(range, str, str_range) */
6891 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6892 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6893 rb_builtin_class_name(argv[2]));
6894 }
6895 }
6896 }
6897 else {
6898 beg = NUM2LONG(argv[0]);
6899 len = NUM2LONG(argv[1]);
6900 val = argv[2];
6901 StringValue(val);
6902 if (argc == 3) {
6903 /* bytesplice(index, length, str) */
6904 vbeg = 0;
6905 vlen = RSTRING_LEN(val);
6906 }
6907 else {
6908 /* bytesplice(index, length, str, str_index, str_length) */
6909 vbeg = NUM2LONG(argv[3]);
6910 vlen = NUM2LONG(argv[4]);
6911 }
6912 }
6913 str_check_beg_len(str, &beg, &len);
6914 str_check_beg_len(val, &vbeg, &vlen);
6915 str_modify_keep_cr(str);
6916
6917 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6918 rb_enc_associate(str, rb_enc_check(str, val));
6919 }
6920
6921 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6923 if (cr != ENC_CODERANGE_BROKEN)
6924 ENC_CODERANGE_SET(str, cr);
6925 return str;
6926}
6927
6928/*
6929 * call-seq:
6930 * reverse -> new_string
6931 *
6932 * Returns a new string with the characters from +self+ in reverse order.
6933 *
6934 * 'drawer'.reverse # => "reward"
6935 * 'reviled'.reverse # => "deliver"
6936 * 'stressed'.reverse # => "desserts"
6937 * 'semordnilaps'.reverse # => "spalindromes"
6938 *
6939 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
6940 */
6941
6942static VALUE
6943rb_str_reverse(VALUE str)
6944{
6945 rb_encoding *enc;
6946 VALUE rev;
6947 char *s, *e, *p;
6948 int cr;
6949
6950 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6951 enc = STR_ENC_GET(str);
6952 rev = rb_str_new(0, RSTRING_LEN(str));
6953 s = RSTRING_PTR(str); e = RSTRING_END(str);
6954 p = RSTRING_END(rev);
6955 cr = ENC_CODERANGE(str);
6956
6957 if (RSTRING_LEN(str) > 1) {
6958 if (single_byte_optimizable(str)) {
6959 while (s < e) {
6960 *--p = *s++;
6961 }
6962 }
6963 else if (cr == ENC_CODERANGE_VALID) {
6964 while (s < e) {
6965 int clen = rb_enc_fast_mbclen(s, e, enc);
6966
6967 p -= clen;
6968 memcpy(p, s, clen);
6969 s += clen;
6970 }
6971 }
6972 else {
6973 cr = rb_enc_asciicompat(enc) ?
6975 while (s < e) {
6976 int clen = rb_enc_mbclen(s, e, enc);
6977
6978 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6979 p -= clen;
6980 memcpy(p, s, clen);
6981 s += clen;
6982 }
6983 }
6984 }
6985 STR_SET_LEN(rev, RSTRING_LEN(str));
6986 str_enc_copy_direct(rev, str);
6987 ENC_CODERANGE_SET(rev, cr);
6988
6989 return rev;
6990}
6991
6992
6993/*
6994 * call-seq:
6995 * reverse! -> self
6996 *
6997 * Returns +self+ with its characters reversed:
6998 *
6999 * 'drawer'.reverse! # => "reward"
7000 * 'reviled'.reverse! # => "deliver"
7001 * 'stressed'.reverse! # => "desserts"
7002 * 'semordnilaps'.reverse! # => "spalindromes"
7003 *
7004 * Related: see {Modifying}[rdoc-ref:String@Modifying].
7005 */
7006
7007static VALUE
7008rb_str_reverse_bang(VALUE str)
7009{
7010 if (RSTRING_LEN(str) > 1) {
7011 if (single_byte_optimizable(str)) {
7012 char *s, *e, c;
7013
7014 str_modify_keep_cr(str);
7015 s = RSTRING_PTR(str);
7016 e = RSTRING_END(str) - 1;
7017 while (s < e) {
7018 c = *s;
7019 *s++ = *e;
7020 *e-- = c;
7021 }
7022 }
7023 else {
7024 str_shared_replace(str, rb_str_reverse(str));
7025 }
7026 }
7027 else {
7028 str_modify_keep_cr(str);
7029 }
7030 return str;
7031}
7032
7033
7034/*
7035 * call-seq:
7036 * include?(other_string) -> true or false
7037 *
7038 * Returns whether +self+ contains +other_string+:
7039 *
7040 * s = 'bar'
7041 * s.include?('ba') # => true
7042 * s.include?('ar') # => true
7043 * s.include?('bar') # => true
7044 * s.include?('a') # => true
7045 * s.include?('') # => true
7046 * s.include?('foo') # => false
7047 *
7048 * Related: see {Querying}[rdoc-ref:String@Querying].
7049 */
7050
7051VALUE
7052rb_str_include(VALUE str, VALUE arg)
7053{
7054 long i;
7055
7056 StringValue(arg);
7057 i = rb_str_index(str, arg, 0);
7058
7059 return RBOOL(i != -1);
7060}
7061
7062
7063/*
7064 * call-seq:
7065 * to_i(base = 10) -> integer
7066 *
7067 * Returns the result of interpreting leading characters in +self+
7068 * as an integer in the given +base+;
7069 * +base+ must be either +0+ or in range <tt>(2..36)</tt>:
7070 *
7071 * '123456'.to_i # => 123456
7072 * '123def'.to_i(16) # => 1195503
7073 *
7074 * With +base+ zero given, string +object+ may contain leading characters
7075 * to specify the actual base:
7076 *
7077 * '123def'.to_i(0) # => 123
7078 * '0123def'.to_i(0) # => 83
7079 * '0b123def'.to_i(0) # => 1
7080 * '0o123def'.to_i(0) # => 83
7081 * '0d123def'.to_i(0) # => 123
7082 * '0x123def'.to_i(0) # => 1195503
7083 *
7084 * Characters past a leading valid number (in the given +base+) are ignored:
7085 *
7086 * '12.345'.to_i # => 12
7087 * '12345'.to_i(2) # => 1
7088 *
7089 * Returns zero if there is no leading valid number:
7090 *
7091 * 'abcdef'.to_i # => 0
7092 * '2'.to_i(2) # => 0
7093 *
7094 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7095 */
7096
7097static VALUE
7098rb_str_to_i(int argc, VALUE *argv, VALUE str)
7099{
7100 int base = 10;
7101
7102 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7103 rb_raise(rb_eArgError, "invalid radix %d", base);
7104 }
7105 return rb_str_to_inum(str, base, FALSE);
7106}
7107
7108
7109/*
7110 * call-seq:
7111 * to_f -> float
7112 *
7113 * Returns the result of interpreting leading characters in +self+ as a Float:
7114 *
7115 * '3.14159'.to_f # => 3.14159
7116 * '1.234e-2'.to_f # => 0.01234
7117 *
7118 * Characters past a leading valid number are ignored:
7119 *
7120 * '3.14 (pi to two places)'.to_f # => 3.14
7121 *
7122 * Returns zero if there is no leading valid number:
7123 *
7124 * 'abcdef'.to_f # => 0.0
7125 *
7126 * See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
7127 */
7128
7129static VALUE
7130rb_str_to_f(VALUE str)
7131{
7132 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7133}
7134
7135
7136/*
7137 * call-seq:
7138 * to_s -> self or new_string
7139 *
7140 * Returns +self+ if +self+ is a +String+,
 * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7142 *
7143 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7144 */
7145
7146static VALUE
7147rb_str_to_s(VALUE str)
7148{
7149 if (rb_obj_class(str) != rb_cString) {
7150 return str_duplicate(rb_cString, str);
7151 }
7152 return str;
7153}
7154
#if 0
/* Compiled out: appends the codepoint +c+, encoded in +enc+, to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7166
7167#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7168
7169int
7170rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7171{
7172 char buf[CHAR_ESC_LEN + 1];
7173 int l;
7174
7175#if SIZEOF_INT > 4
7176 c &= 0xffffffff;
7177#endif
7178 if (unicode_p) {
7179 if (c < 0x7F && ISPRINT(c)) {
7180 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7181 }
7182 else if (c < 0x10000) {
7183 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7184 }
7185 else {
7186 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7187 }
7188 }
7189 else {
7190 if (c < 0x100) {
7191 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7192 }
7193 else {
7194 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7195 }
7196 }
7197 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7198 rb_str_buf_cat(result, buf, l);
7199 return l;
7200}
7201
/*
 * Returns the backslash notation for the control characters that have one,
 * or NULL when +c+ has no such notation.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int ch;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].ch == c) return escapes[i].notation;
    }
    return NULL;
}
7219
7220VALUE
7221rb_str_escape(VALUE str)
7222{
7223 int encidx = ENCODING_GET(str);
7224 rb_encoding *enc = rb_enc_from_index(encidx);
7225 const char *p = RSTRING_PTR(str);
7226 const char *pend = RSTRING_END(str);
7227 const char *prev = p;
7228 char buf[CHAR_ESC_LEN + 1];
7229 VALUE result = rb_str_buf_new(0);
7230 int unicode_p = rb_enc_unicode_p(enc);
7231 int asciicompat = rb_enc_asciicompat(enc);
7232
7233 while (p < pend) {
7234 unsigned int c;
7235 const char *cc;
7236 int n = rb_enc_precise_mbclen(p, pend, enc);
7237 if (!MBCLEN_CHARFOUND_P(n)) {
7238 if (p > prev) str_buf_cat(result, prev, p - prev);
7239 n = rb_enc_mbminlen(enc);
7240 if (pend < p + n)
7241 n = (int)(pend - p);
7242 while (n--) {
7243 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7244 str_buf_cat(result, buf, strlen(buf));
7245 prev = ++p;
7246 }
7247 continue;
7248 }
7249 n = MBCLEN_CHARFOUND_LEN(n);
7250 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7251 p += n;
7252 cc = ruby_escaped_char(c);
7253 if (cc) {
7254 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7255 str_buf_cat(result, cc, strlen(cc));
7256 prev = p;
7257 }
7258 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7259 }
7260 else {
7261 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7262 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7263 prev = p;
7264 }
7265 }
7266 if (p > prev) str_buf_cat(result, prev, p - prev);
7267 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7268
7269 return result;
7270}
7271
7272/*
7273 * call-seq:
7274 * inspect -> string
7275 *
7276 * :include: doc/string/inspect.rdoc
7277 *
7278 */
7279
7280VALUE
7282{
 /*
  * Builds and returns a new string holding a double-quoted, escaped
  * rendering of `str`.
  *
  * NOTE(review): the listing jumps from line 7280 to 7282, so the line that
  * carries the function name and parameter list is not visible here;
  * judging by the call-seq comment above this is presumably the
  * String#inspect implementation taking `VALUE str` -- confirm against the
  * upstream file.
  *
  * Throughout the loop `prev` marks the start of the run of source bytes
  * that can be copied verbatim and has not been appended to `result` yet.
  */
7283 int encidx = ENCODING_GET(str);
7284 rb_encoding *enc = rb_enc_from_index(encidx);
7285 const char *p, *pend, *prev;
7286 char buf[CHAR_ESC_LEN + 1];
7287 VALUE result = rb_str_buf_new(0);
7288 rb_encoding *resenc = rb_default_internal_encoding();
7289 int unicode_p = rb_enc_unicode_p(enc);
7290 int asciicompat = rb_enc_asciicompat(enc);
7291
 /* Result encoding: default internal, else default external; US-ASCII is
  * used instead when that choice is not ASCII-compatible. */
7292 if (resenc == NULL) resenc = rb_default_external_encoding();
7293 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7294 rb_enc_associate(result, resenc);
7295 str_buf_cat2(result, "\"");
7296
7297 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7298 prev = p;
7299 while (p < pend) {
7300 unsigned int c, cc;
7301 int n;
7302
7303 n = rb_enc_precise_mbclen(p, pend, enc);
7304 if (!MBCLEN_CHARFOUND_P(n)) {
 /* No valid character at p: flush the pending run, then emit up to
  * mbminlen bytes (bounded by the end of the string) as \xNN each. */
7305 if (p > prev) str_buf_cat(result, prev, p - prev);
7306 n = rb_enc_mbminlen(enc);
7307 if (pend < p + n)
7308 n = (int)(pend - p);
7309 while (n--) {
7310 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7311 str_buf_cat(result, buf, strlen(buf));
7312 prev = ++p;
7313 }
7314 continue;
7315 }
7316 n = MBCLEN_CHARFOUND_LEN(n);
7317 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7318 p += n;
 /* From here on p already points past the character; p - n is its start. */
 /* A double quote, a backslash, or a '#' that is followed by '$', '@' or
  * '{' gets a backslash in front of it. */
7319 if ((asciicompat || unicode_p) &&
7320 (c == '"'|| c == '\\' ||
7321 (c == '#' &&
7322 p < pend &&
7323 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7324 (cc = rb_enc_codepoint(p,pend,enc),
7325 (cc == '$' || cc == '@' || cc == '{'))))) {
7326 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7327 str_buf_cat2(result, "\\");
7328 if (asciicompat || enc == resenc) {
 /* The character itself stays in the pending run and is copied as-is. */
7329 prev = p - n;
7330 continue;
7331 }
7332 }
 /* Control characters that have a mnemonic escape letter. */
7333 switch (c) {
7334 case '\n': cc = 'n'; break;
7335 case '\r': cc = 'r'; break;
7336 case '\t': cc = 't'; break;
7337 case '\f': cc = 'f'; break;
7338 case '\013': cc = 'v'; break;
7339 case '\010': cc = 'b'; break;
7340 case '\007': cc = 'a'; break;
7341 case 033: cc = 'e'; break;
7342 default: cc = 0; break;
7343 }
7344 if (cc) {
7345 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7346 buf[0] = '\\';
7347 buf[1] = (char)cc;
7348 str_buf_cat(result, buf, 2);
7349 prev = p;
7350 continue;
7351 }
7352 /* The special casing of 0x85 (NEXT_LINE) here is because
7353 * Oniguruma historically treats it as printable, but it
7354 * doesn't match the print POSIX bracket class or character
7355 * property in regexps.
7356 *
7357 * See Ruby Bug #16842 for details:
7358 * https://bugs.ruby-lang.org/issues/16842
7359 */
7360 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7361 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
 /* Printable: leave it in the pending run (prev is not moved). */
7362 continue;
7363 }
7364 else {
 /* Anything else: flush the run and append an escaped form of c. */
7365 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7366 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7367 prev = p;
7368 continue;
7369 }
7370 }
 /* Flush whatever is left of the pending run and close the quote. */
7371 if (p > prev) str_buf_cat(result, prev, p - prev);
7372 str_buf_cat2(result, "\"");
7373
7374 return result;
7375}
7376
/* True when p lies before e and the byte at p is '$', '@' or '{'.
 * The argument p is evaluated more than once, so pass it without side effects. */
7377#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7378
7379/*
7380 * call-seq:
7381 * dump -> new_string
7382 *
7383 * :include: doc/string/dump.rdoc
7384 *
7385 */
7386
7387VALUE
7389{
 /*
  * Produces a quoted, escaped copy of `str` in two passes over its bytes:
  * the first pass adds up the length of the escaped output, the second
  * writes the escaped text into a string allocated with that length.
  *
  * NOTE(review): the listing jumps from line 7387 to 7389, so the line with
  * the function name and parameter list is not visible here; judging by the
  * call-seq comment above this is presumably the String#dump implementation
  * taking `VALUE str` -- confirm against the upstream file.
  */
7390 int encidx = rb_enc_get_index(str);
7391 rb_encoding *enc = rb_enc_from_index(encidx);
7392 long len;
7393 const char *p, *pend;
7394 char *q, *qend;
7395 VALUE result;
7396 int u8 = (encidx == rb_utf8_encindex());
7397 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7398
7399 len = 2; /* "" */
 /* For an encoding that is not ASCII-compatible the suffix above is
  * appended with the encoding name substituted for %s, so reserve room. */
7400 if (!rb_enc_asciicompat(enc)) {
7401 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7402 len += strlen(enc->name);
7403 }
7404
 /* Pass 1: clen is the number of output bytes the current input unit needs. */
7405 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7406 while (p < pend) {
7407 int clen;
7408 unsigned char c = *p++;
7409
7410 switch (c) {
7411 case '"': case '\\':
7412 case '\n': case '\r':
7413 case '\t': case '\f':
7414 case '\013': case '\010': case '\007': case '\033':
7415 clen = 2;
7416 break;
7417
7418 case '#':
7419 clen = IS_EVSTR(p, pend) ? 2 : 1;
7420 break;
7421
7422 default:
7423 if (ISPRINT(c)) {
7424 clen = 1;
7425 }
7426 else {
7427 if (u8 && c > 0x7F) { /* \u notation */
7428 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7429 if (MBCLEN_CHARFOUND_P(n)) {
7430 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7431 if (cc <= 0xFFFF)
7432 clen = 6; /* \uXXXX */
7433 else if (cc <= 0xFFFFF)
7434 clen = 9; /* \u{XXXXX} */
7435 else
7436 clen = 10; /* \u{XXXXXX} */
 /* The lead byte was consumed by *p++ above; skip the rest of the character. */
7437 p += MBCLEN_CHARFOUND_LEN(n)-1;
7438 break;
7439 }
7440 }
7441 clen = 4; /* \xNN */
7442 }
7443 break;
7444 }
7445
 /* Refuse to let the running total exceed LONG_MAX. */
7446 if (clen > LONG_MAX - len) {
7447 rb_raise(rb_eRuntimeError, "string size too big");
7448 }
7449 len += clen;
7450 }
7451
 /* Pass 2: write the escaped text. qend (one past len bytes) is the bound
  * handed to the snprintf calls below. */
7452 result = rb_str_new(0, len);
7453 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7454 q = RSTRING_PTR(result); qend = q + len + 1;
7455
7456 *q++ = '"';
7457 while (p < pend) {
7458 unsigned char c = *p++;
7459
7460 if (c == '"' || c == '\\') {
7461 *q++ = '\\';
7462 *q++ = c;
7463 }
7464 else if (c == '#') {
7465 if (IS_EVSTR(p, pend)) *q++ = '\\';
7466 *q++ = '#';
7467 }
7468 else if (c == '\n') {
7469 *q++ = '\\';
7470 *q++ = 'n';
7471 }
7472 else if (c == '\r') {
7473 *q++ = '\\';
7474 *q++ = 'r';
7475 }
7476 else if (c == '\t') {
7477 *q++ = '\\';
7478 *q++ = 't';
7479 }
7480 else if (c == '\f') {
7481 *q++ = '\\';
7482 *q++ = 'f';
7483 }
7484 else if (c == '\013') {
7485 *q++ = '\\';
7486 *q++ = 'v';
7487 }
7488 else if (c == '\010') {
7489 *q++ = '\\';
7490 *q++ = 'b';
7491 }
7492 else if (c == '\007') {
7493 *q++ = '\\';
7494 *q++ = 'a';
7495 }
7496 else if (c == '\033') {
7497 *q++ = '\\';
7498 *q++ = 'e';
7499 }
7500 else if (ISPRINT(c)) {
7501 *q++ = c;
7502 }
7503 else {
7504 *q++ = '\\';
7505 if (u8) {
 /* NOTE(review): here MBCLEN_CHARFOUND_P is applied to the value after 1
  * has been subtracted, whereas the sizing pass tests the raw return
  * value of rb_enc_precise_mbclen; confirm the two passes always agree. */
7506 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7507 if (MBCLEN_CHARFOUND_P(n)) {
7508 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7509 p += n;
7510 if (cc <= 0xFFFF)
7511 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7512 else
7513 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7514 q += strlen(q);
7515 continue;
7516 }
7517 }
 /* Fallback: 'x' plus two hex digits, hence q advances by 3. */
7518 snprintf(q, qend-q, "x%02X", c);
7519 q += 3;
7520 }
7521 }
7522 *q++ = '"';
7523 *q = '\0';
7524 if (!rb_enc_asciicompat(enc)) {
7525 snprintf(q, qend-q, nonascii_suffix, enc->name);
7526 encidx = rb_ascii8bit_encindex();
7527 }
7528 /* result from dump is ASCII */
7529 rb_enc_associate_index(result, encidx);
 /* NOTE(review): the listing skips line 7530 at this point; check the
  * upstream file for a statement that belongs between these two lines. */
7531 return result;
7532}
7533
/*
 * Translates the letter of a single-letter backslash escape into the
 * control character it stands for:
 *
 *   n -> LF, r -> CR, t -> TAB, f -> FF, v -> VT, b -> BS, a -> BEL, e -> ESC
 *
 * Returns -1 for any other value of +c+.  Without this explicit fallback
 * the function would run off its end for an unrecognized letter, which is
 * undefined behavior once the caller uses the result.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* not one of the single-letter escapes handled above */
        return -1;
    }
}
7557
/*
 * Decodes one escape sequence of a dumped string.
 *
 * On entry *ss points at the character that follows a backslash.  The
 * decoded bytes are appended to `undumped` and *ss is moved past the
 * sequence.  `s_end` is the end of the dumped text.
 *
 * *penc is the encoding currently associated with `undumped`; a \u escape
 * switches both to UTF-8.  *utf8 is set once a \u escape has been seen and
 * *binary once a \x escape produced a non-ASCII byte; seeing the other kind
 * afterwards raises RuntimeError, as do malformed escapes.
 */
7558static void
7559undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7560{
7561 const char *s = *ss;
7562 unsigned int c;
7563 int codelen;
7564 size_t hexlen;
7565 unsigned char buf[6];
7566 static rb_encoding *enc_utf8 = NULL;
7567
7568 switch (*s) {
7569 case '\\':
7570 case '"':
7571 case '#':
7572 rb_str_cat(undumped, s, 1); /* cat itself */
7573 s++;
7574 break;
7575 case 'n':
7576 case 'r':
7577 case 't':
7578 case 'f':
7579 case 'v':
7580 case 'b':
7581 case 'a':
7582 case 'e':
 /* single-letter escape: append the control character it names */
7583 *buf = unescape_ascii(*s);
7584 rb_str_cat(undumped, (char *)buf, 1);
7585 s++;
7586 break;
7587 case 'u':
7588 if (*binary) {
7589 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7590 }
7591 *utf8 = true;
7592 if (++s >= s_end) {
7593 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7594 }
 /* enc_utf8 caches the UTF-8 encoding pointer across calls. */
7595 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7596 if (*penc != enc_utf8) {
7597 *penc = enc_utf8;
7598 rb_enc_associate(undumped, enc_utf8);
7599 }
7600 if (*s == '{') { /* handle \u{...} form */
7601 s++;
 /* Each pass consumes the closing brace, one whitespace character, or
  * one code point of 1 to 6 hex digits. */
7602 for (;;) {
7603 if (s >= s_end) {
7604 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7605 }
7606 if (*s == '}') {
7607 s++;
7608 break;
7609 }
7610 if (ISSPACE(*s)) {
7611 s++;
7612 continue;
7613 }
7614 c = scan_hex(s, s_end-s, &hexlen);
7615 if (hexlen == 0 || hexlen > 6) {
7616 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7617 }
7618 if (c > 0x10ffff) {
7619 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7620 }
 /* surrogate code points are rejected */
7621 if (0xd800 <= c && c <= 0xdfff) {
7622 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7623 }
7624 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7625 rb_str_cat(undumped, (char *)buf, codelen);
7626 s += hexlen;
7627 }
7628 }
7629 else { /* handle \uXXXX form */
7630 c = scan_hex(s, 4, &hexlen);
7631 if (hexlen != 4) {
7632 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7633 }
7634 if (0xd800 <= c && c <= 0xdfff) {
7635 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7636 }
7637 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7638 rb_str_cat(undumped, (char *)buf, codelen);
7639 s += hexlen;
7640 }
7641 break;
7642 case 'x':
7643 if (++s >= s_end) {
7644 rb_raise(rb_eRuntimeError, "invalid hex escape");
7645 }
 /* exactly two hex digits are required */
7646 *buf = scan_hex(s, 2, &hexlen);
7647 if (hexlen != 2) {
7648 rb_raise(rb_eRuntimeError, "invalid hex escape");
7649 }
7650 if (!ISASCII(*buf)) {
7651 if (*utf8) {
7652 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7653 }
7654 *binary = true;
7655 }
7656 rb_str_cat(undumped, (char *)buf, 1);
7657 s += hexlen;
7658 break;
7659 default:
 /* Unrecognized escape: keep the backslash and the character unchanged. */
7660 rb_str_cat(undumped, s-1, 2);
7661 s++;
7662 }
7663
7664 *ss = s;
7665}
7666
7667static VALUE rb_str_is_ascii_only_p(VALUE str);
7668
7669/*
7670 * call-seq:
7671 * undump -> new_string
7672 *
7673 * Inverse of String#dump; returns a copy of +self+ with changes of the kinds made by String#dump "undone."
7674 *
7675 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
7676 */
7677
7678static VALUE
7679str_undump(VALUE str)
7680{
7681 const char *s = RSTRING_PTR(str);
7682 const char *s_end = RSTRING_END(str);
7683 rb_encoding *enc = rb_enc_get(str);
7684 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7685 bool utf8 = false;
7686 bool binary = false;
7687 int w;
7688
7690 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7691 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7692 }
7693 if (!str_null_check(str, &w)) {
7694 rb_raise(rb_eRuntimeError, "string contains null byte");
7695 }
7696 if (RSTRING_LEN(str) < 2) goto invalid_format;
7697 if (*s != '"') goto invalid_format;
7698
7699 /* strip '"' at the start */
7700 s++;
7701
7702 for (;;) {
7703 if (s >= s_end) {
7704 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7705 }
7706
7707 if (*s == '"') {
7708 /* epilogue */
7709 s++;
7710 if (s == s_end) {
7711 /* ascii compatible dumped string */
7712 break;
7713 }
7714 else {
7715 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7716 static const char dup_suffix[] = ".dup";
7717 const char *encname;
7718 int encidx;
7719 ptrdiff_t size;
7720
7721 /* check separately for strings dumped by older versions */
7722 size = sizeof(dup_suffix) - 1;
7723 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7724
7725 size = sizeof(force_encoding_suffix) - 1;
7726 if (s_end - s <= size) goto invalid_format;
7727 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7728 s += size;
7729
7730 if (utf8) {
7731 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7732 }
7733
7734 encname = s;
7735 s = memchr(s, '"', s_end-s);
7736 size = s - encname;
7737 if (!s) goto invalid_format;
7738 if (s_end - s != 2) goto invalid_format;
7739 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7740
7741 encidx = rb_enc_find_index2(encname, (long)size);
7742 if (encidx < 0) {
7743 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7744 }
7745 rb_enc_associate_index(undumped, encidx);
7746 }
7747 break;
7748 }
7749
7750 if (*s == '\\') {
7751 s++;
7752 if (s >= s_end) {
7753 rb_raise(rb_eRuntimeError, "invalid escape");
7754 }
7755 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7756 }
7757 else {
7758 rb_str_cat(undumped, s++, 1);
7759 }
7760 }
7761
7762 RB_GC_GUARD(str);
7763
7764 return undumped;
7765invalid_format:
7766 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7767}
7768
7769static void
7770rb_str_check_dummy_enc(rb_encoding *enc)
7771{
7772 if (rb_enc_dummy_p(enc)) {
7773 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7774 rb_enc_name(enc));
7775 }
7776}
7777
7778static rb_encoding *
7779str_true_enc(VALUE str)
7780{
7781 rb_encoding *enc = STR_ENC_GET(str);
7782 rb_str_check_dummy_enc(enc);
7783 return enc;
7784}
7785
7786static OnigCaseFoldType
7787check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7788{
7789 if (argc==0)
7790 return flags;
7791 if (argc>2)
7792 rb_raise(rb_eArgError, "too many options");
7793 if (argv[0]==sym_turkic) {
7794 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7795 if (argc==2) {
7796 if (argv[1]==sym_lithuanian)
7797 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7798 else
7799 rb_raise(rb_eArgError, "invalid second option");
7800 }
7801 }
7802 else if (argv[0]==sym_lithuanian) {
7803 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7804 if (argc==2) {
7805 if (argv[1]==sym_turkic)
7806 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7807 else
7808 rb_raise(rb_eArgError, "invalid second option");
7809 }
7810 }
7811 else if (argc>1)
7812 rb_raise(rb_eArgError, "too many options");
7813 else if (argv[0]==sym_ascii)
7814 flags |= ONIGENC_CASE_ASCII_ONLY;
7815 else if (argv[0]==sym_fold) {
7816 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7817 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7818 else
7819 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7820 }
7821 else
7822 rb_raise(rb_eArgError, "invalid option");
7823 return flags;
7824}
7825
7826static inline bool
7827case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7828{
7829 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7830 return true;
7831 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7832}
7833
7834/* Extra bytes added to the size of every case-mapping buffer. 16 should be long enough to absorb any kind of single character length increase; the value actually used below is 20. */
7835#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When CASEMAP_DEBUG is non-zero, rb_str_casemap prints buffer sizes and
 * counts to stderr and rb_str_ascii_casemap reports length mismatches.
 * It defaults to 0 unless defined beforehand. */
7836#ifndef CASEMAP_DEBUG
7837# define CASEMAP_DEBUG 0
7838#endif
7839
7840struct mapping_buffer;
7841typedef struct mapping_buffer {
7842 size_t capa;
7843 size_t used;
7844 struct mapping_buffer *next;
7845 OnigUChar space[FLEX_ARY_LEN];
7847
7848static void
7849mapping_buffer_free(void *p)
7850{
7851 mapping_buffer *previous_buffer;
7852 mapping_buffer *current_buffer = p;
7853 while (current_buffer) {
7854 previous_buffer = current_buffer;
7855 current_buffer = current_buffer->next;
7856 ruby_xfree_sized(previous_buffer, offsetof(mapping_buffer, space) + previous_buffer->capa);
7857 }
7858}
7859
/*
 * Typed-data descriptor used by rb_str_casemap for the object that anchors
 * a mapping_buffer chain.  The first function slot is 0 and the second is
 * mapping_buffer_free (presumably the mark and free callbacks respectively
 * -- confirm against the rb_data_type_t definition).
 */
7860static const rb_data_type_t mapping_buffer_type = {
7861 "mapping_buffer",
7862 {0, mapping_buffer_free,},
7863 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7864};
7865
/*
 * Case-maps `source` through enc->case_map and returns the result as a new
 * string that gets `source`'s encoding via str_enc_copy_direct.
 *
 * The output is collected in a chain of mapping_buffer nodes: the mapper is
 * called repeatedly, each time with a freshly allocated buffer, until all of
 * the source has been consumed.  `*flags` is passed through to the mapper.
 *
 * An empty `source` yields str_duplicate(rb_cString, source).  A negative
 * return value from the mapper raises ArgumentError ("input string invalid")
 * after the chain has been freed.
 */
7866static VALUE
7867rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7868{
7869 VALUE target;
7870
7871 const OnigUChar *source_current, *source_end;
 /* NOTE(review): target_length sums the buffer lengths in an int; verify
  * that very long results cannot exceed INT_MAX. */
7872 int target_length = 0;
7873 VALUE buffer_anchor;
7874 mapping_buffer *current_buffer = 0;
7875 mapping_buffer **pre_buffer;
7876 size_t buffer_count = 0;
7877 int buffer_length_or_invalid;
7878
7879 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7880
7881 source_current = (OnigUChar*)RSTRING_PTR(source);
7882 source_end = (OnigUChar*)RSTRING_END(source);
7883
 /* The anchor object's data pointer is the head of the chain; pre_buffer
  * always points at the link where the next node has to be stored.
  * Presumably the anchor exists so the chain is released by
  * mapping_buffer_free if the function is left abnormally -- confirm. */
7884 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7885 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7886 while (source_current < source_end) {
7887 /* increase multiplier using buffer count to converge quickly */
7888 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7889 if (CASEMAP_DEBUG) {
7890 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7891 }
7892 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7893 *pre_buffer = current_buffer;
7894 pre_buffer = &current_buffer->next;
7895 current_buffer->next = NULL;
7896 current_buffer->capa = capa;
 /* The mapper advances source_current itself, which drives the loop. */
7897 buffer_length_or_invalid = enc->case_map(flags,
7898 &source_current, source_end,
7899 current_buffer->space,
7900 current_buffer->space+current_buffer->capa,
7901 enc);
7902 if (buffer_length_or_invalid < 0) {
 /* Detach the chain from the anchor before freeing it by hand. */
7903 current_buffer = DATA_PTR(buffer_anchor);
7904 DATA_PTR(buffer_anchor) = 0;
7905 mapping_buffer_free(current_buffer);
7906 rb_raise(rb_eArgError, "input string invalid");
7907 }
7908 target_length += current_buffer->used = buffer_length_or_invalid;
7909 }
7910 if (CASEMAP_DEBUG) {
7911 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7912 }
7913
7914 if (buffer_count==1) {
 /* single buffer: build the string straight from it */
7915 target = rb_str_new((const char*)current_buffer->space, target_length);
7916 }
7917 else {
 /* several buffers: allocate the result, then copy each node's used bytes */
7918 char *target_current;
7919
7920 target = rb_str_new(0, target_length);
7921 target_current = RSTRING_PTR(target);
7922 current_buffer = DATA_PTR(buffer_anchor);
7923 while (current_buffer) {
7924 memcpy(target_current, current_buffer->space, current_buffer->used);
7925 target_current += current_buffer->used;
7926 current_buffer = current_buffer->next;
7927 }
7928 }
 /* Detach the chain from the anchor and free it now. */
7929 current_buffer = DATA_PTR(buffer_anchor);
7930 DATA_PTR(buffer_anchor) = 0;
7931 mapping_buffer_free(current_buffer);
7932
7933 RB_GC_GUARD(buffer_anchor);
7934
7935 /* TODO: check about string terminator character */
7936 str_enc_copy_direct(target, source);
7937 /*ENC_CODERANGE_SET(mapped, cr);*/
7938
7939 return target;
7940}
7941
7942static VALUE
7943rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7944{
7945 const OnigUChar *source_current, *source_end;
7946 OnigUChar *target_current, *target_end;
7947 long old_length = RSTRING_LEN(source);
7948 int length_or_invalid;
7949
7950 if (old_length == 0) return Qnil;
7951
7952 source_current = (OnigUChar*)RSTRING_PTR(source);
7953 source_end = (OnigUChar*)RSTRING_END(source);
7954 if (source == target) {
7955 target_current = (OnigUChar*)source_current;
7956 target_end = (OnigUChar*)source_end;
7957 }
7958 else {
7959 target_current = (OnigUChar*)RSTRING_PTR(target);
7960 target_end = (OnigUChar*)RSTRING_END(target);
7961 }
7962
7963 length_or_invalid = onigenc_ascii_only_case_map(flags,
7964 &source_current, source_end,
7965 target_current, target_end, enc);
7966 if (length_or_invalid < 0)
7967 rb_raise(rb_eArgError, "input string invalid");
7968 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7969 fprintf(stderr, "problem with rb_str_ascii_casemap"
7970 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7971 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7972 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7973 }
7974
7975 str_enc_copy(target, source);
7976
7977 return target;
7978}
7979
7980static bool
7981upcase_single(VALUE str)
7982{
7983 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7984 bool modified = false;
7985
7986 while (s < send) {
7987 unsigned int c = *(unsigned char*)s;
7988
7989 if ('a' <= c && c <= 'z') {
7990 *s = 'A' + (c - 'a');
7991 modified = true;
7992 }
7993 s++;
7994 }
7995 return modified;
7996}
7997
7998/*
7999 * call-seq:
8000 * upcase!(mapping) -> self or nil
8001 *
8002 * Like String#upcase, except that:
8003 *
8004 * - Changes character casings in +self+ (not in a copy of +self+).
8005 * - Returns +self+ if any changes are made, +nil+ otherwise.
8006 *
8007 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8008 */
8009
8010static VALUE
8011rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8012{
8013 rb_encoding *enc;
8014 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8015
8016 flags = check_case_options(argc, argv, flags);
8017 str_modify_keep_cr(str);
8018 enc = str_true_enc(str);
8019 if (case_option_single_p(flags, enc, str)) {
8020 if (upcase_single(str))
8021 flags |= ONIGENC_CASE_MODIFIED;
8022 }
8023 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8024 rb_str_ascii_casemap(str, str, &flags, enc);
8025 else
8026 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8027
8028 if (ONIGENC_CASE_MODIFIED&flags) return str;
8029 return Qnil;
8030}
8031
8032
8033/*
8034 * call-seq:
8035 * upcase(mapping = :ascii) -> new_string
8036 *
8037 * :include: doc/string/upcase.rdoc
8038 */
8039
8040static VALUE
8041rb_str_upcase(int argc, VALUE *argv, VALUE str)
8042{
8043 rb_encoding *enc;
8044 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8045 VALUE ret;
8046
8047 flags = check_case_options(argc, argv, flags);
8048 enc = str_true_enc(str);
8049 if (case_option_single_p(flags, enc, str)) {
8050 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8051 str_enc_copy_direct(ret, str);
8052 upcase_single(ret);
8053 }
8054 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8055 ret = rb_str_new(0, RSTRING_LEN(str));
8056 rb_str_ascii_casemap(str, ret, &flags, enc);
8057 }
8058 else {
8059 ret = rb_str_casemap(str, &flags, enc);
8060 }
8061
8062 return ret;
8063}
8064
8065static bool
8066downcase_single(VALUE str)
8067{
8068 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8069 bool modified = false;
8070
8071 while (s < send) {
8072 unsigned int c = *(unsigned char*)s;
8073
8074 if ('A' <= c && c <= 'Z') {
8075 *s = 'a' + (c - 'A');
8076 modified = true;
8077 }
8078 s++;
8079 }
8080
8081 return modified;
8082}
8083
8084/*
8085 * call-seq:
8086 * downcase!(mapping) -> self or nil
8087 *
8088 * Like String#downcase, except that:
8089 *
8090 * - Changes character casings in +self+ (not in a copy of +self+).
8091 * - Returns +self+ if any changes are made, +nil+ otherwise.
8092 *
8093 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8094 */
8095
8096static VALUE
8097rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8098{
8099 rb_encoding *enc;
8100 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8101
8102 flags = check_case_options(argc, argv, flags);
8103 str_modify_keep_cr(str);
8104 enc = str_true_enc(str);
8105 if (case_option_single_p(flags, enc, str)) {
8106 if (downcase_single(str))
8107 flags |= ONIGENC_CASE_MODIFIED;
8108 }
8109 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8110 rb_str_ascii_casemap(str, str, &flags, enc);
8111 else
8112 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8113
8114 if (ONIGENC_CASE_MODIFIED&flags) return str;
8115 return Qnil;
8116}
8117
8118
8119/*
8120 * call-seq:
8121 * downcase(mapping = :ascii) -> new_string
8122 *
8123 * :include: doc/string/downcase.rdoc
8124 *
8125 */
8126
8127static VALUE
8128rb_str_downcase(int argc, VALUE *argv, VALUE str)
8129{
8130 rb_encoding *enc;
8131 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8132 VALUE ret;
8133
8134 flags = check_case_options(argc, argv, flags);
8135 enc = str_true_enc(str);
8136 if (case_option_single_p(flags, enc, str)) {
8137 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8138 str_enc_copy_direct(ret, str);
8139 downcase_single(ret);
8140 }
8141 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8142 ret = rb_str_new(0, RSTRING_LEN(str));
8143 rb_str_ascii_casemap(str, ret, &flags, enc);
8144 }
8145 else {
8146 ret = rb_str_casemap(str, &flags, enc);
8147 }
8148
8149 return ret;
8150}
8151
8152
8153/*
8154 * call-seq:
8155 * capitalize!(mapping = :ascii) -> self or nil
8156 *
8157 * Like String#capitalize, except that:
8158 *
8159 * - Changes character casings in +self+ (not in a copy of +self+).
8160 * - Returns +self+ if any changes are made, +nil+ otherwise.
8161 *
8162 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8163 */
8164
8165static VALUE
8166rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8167{
8168 rb_encoding *enc;
8169 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8170
8171 flags = check_case_options(argc, argv, flags);
8172 str_modify_keep_cr(str);
8173 enc = str_true_enc(str);
8174 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8175 if (flags&ONIGENC_CASE_ASCII_ONLY)
8176 rb_str_ascii_casemap(str, str, &flags, enc);
8177 else
8178 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8179
8180 if (ONIGENC_CASE_MODIFIED&flags) return str;
8181 return Qnil;
8182}
8183
8184
8185/*
8186 * call-seq:
8187 * capitalize(mapping = :ascii) -> new_string
8188 *
8189 * :include: doc/string/capitalize.rdoc
8190 *
8191 */
8192
8193static VALUE
8194rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8195{
8196 rb_encoding *enc;
8197 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8198 VALUE ret;
8199
8200 flags = check_case_options(argc, argv, flags);
8201 enc = str_true_enc(str);
8202 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8203 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8204 ret = rb_str_new(0, RSTRING_LEN(str));
8205 rb_str_ascii_casemap(str, ret, &flags, enc);
8206 }
8207 else {
8208 ret = rb_str_casemap(str, &flags, enc);
8209 }
8210 return ret;
8211}
8212
8213
8214/*
8215 * call-seq:
8216 * swapcase!(mapping) -> self or nil
8217 *
8218 * Like String#swapcase, except that:
8219 *
8220 * - Changes are made to +self+, not to a copy of +self+.
8221 * - Returns +self+ if any changes are made, +nil+ otherwise.
8222 *
8223 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8224 */
8225
8226static VALUE
8227rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8228{
8229 rb_encoding *enc;
8230 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8231
8232 flags = check_case_options(argc, argv, flags);
8233 str_modify_keep_cr(str);
8234 enc = str_true_enc(str);
8235 if (flags&ONIGENC_CASE_ASCII_ONLY)
8236 rb_str_ascii_casemap(str, str, &flags, enc);
8237 else
8238 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8239
8240 if (ONIGENC_CASE_MODIFIED&flags) return str;
8241 return Qnil;
8242}
8243
8244
8245/*
8246 * call-seq:
8247 * swapcase(mapping = :ascii) -> new_string
8248 *
8249 * :include: doc/string/swapcase.rdoc
8250 *
8251 */
8252
8253static VALUE
8254rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8255{
8256 rb_encoding *enc;
8257 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8258 VALUE ret;
8259
8260 flags = check_case_options(argc, argv, flags);
8261 enc = str_true_enc(str);
8262 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8263 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8264 ret = rb_str_new(0, RSTRING_LEN(str));
8265 rb_str_ascii_casemap(str, ret, &flags, enc);
8266 }
8267 else {
8268 ret = rb_str_casemap(str, &flags, enc);
8269 }
8270 return ret;
8271}
8272
8273typedef unsigned char *USTR;
8274
/* Read state over a tr-style character specification; advanced by trnext(). */
8275struct tr {
 /* non-zero while a range (such as a-z) is being expanded */
8276 int gen;
 /* now: code point produced most recently; max: last code point of the range being expanded */
8277 unsigned int now, max;
 /* p: read position in the specification text; pend: its end */
8278 char *p, *pend;
8279};
8280
/*
 * Produces the next code point described by the specification in `t`,
 * or (unsigned int)-1 once the specification is exhausted.
 *
 * Outside a range (t->gen == 0) one character is read, with a leading
 * backslash skipped when something follows it.  If that character is
 * followed by '-' and a further character, the pair forms a range:
 * a descending range raises ArgumentError, an ascending one switches to
 * range mode, and an equal pair yields just the one character.
 *
 * Inside a range (t->gen != 0) the code points up to t->max are handed out
 * one per call, skipping values for which ONIGENC_CODE_TO_MBCLEN is not
 * positive.
 */
8281static unsigned int
8282trnext(struct tr *t, rb_encoding *enc)
8283{
8284 int n;
8285
8286 for (;;) {
8287 nextpart:
8288 if (!t->gen) {
8289 if (t->p == t->pend) return -1;
 /* a backslash that is not the last character is dropped; the character
  * after it is then read like any other */
8290 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8291 t->p += n;
8292 }
8293 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8294 t->p += n;
 /* a '-' only starts a range when something follows it */
8295 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8296 t->p += n;
8297 if (t->p < t->pend) {
8298 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8299 t->p += n;
8300 if (t->now > c) {
 /* the two end points are only shown in the message when both are below 0x80 */
8301 if (t->now < 0x80 && c < 0x80) {
8302 rb_raise(rb_eArgError,
8303 "invalid range \"%c-%c\" in string transliteration",
8304 t->now, c);
8305 }
8306 else {
8307 rb_raise(rb_eArgError, "invalid range in string transliteration");
8308 }
8309 continue; /* not reached */
8310 }
8311 else if (t->now < c) {
8312 t->gen = 1;
8313 t->max = c;
8314 }
8315 }
8316 }
8317 return t->now;
8318 }
8319 else {
 /* step to the next code point that the encoding can represent; if the
  * end of the range is reached while skipping, leave range mode and
  * read the next part of the specification */
8320 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8321 if (t->now == t->max) {
8322 t->gen = 0;
8323 goto nextpart;
8324 }
8325 }
8326 if (t->now < t->max) {
8327 return t->now;
8328 }
8329 else {
 /* last code point of the range: leave range mode */
8330 t->gen = 0;
8331 return t->max;
8332 }
8333 }
8334 }
8335}
8336
8337static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8338
8339static VALUE
8340tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8341{
8342 const unsigned int errc = -1;
8343 unsigned int trans[256];
8344 rb_encoding *enc, *e1, *e2;
8345 struct tr trsrc, trrepl;
8346 int cflag = 0;
8347 unsigned int c, c0, last = 0;
8348 int modify = 0, i, l;
8349 unsigned char *s, *send;
8350 VALUE hash = 0;
8351 int singlebyte = single_byte_optimizable(str);
8352 int termlen;
8353 int cr;
8354
8355#define CHECK_IF_ASCII(c) \
8356 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8357 (cr = ENC_CODERANGE_VALID) : 0)
8358
8359 StringValue(src);
8360 StringValue(repl);
8361 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8362 if (RSTRING_LEN(repl) == 0) {
8363 return rb_str_delete_bang(1, &src, str);
8364 }
8365
8366 cr = ENC_CODERANGE(str);
8367 e1 = rb_enc_check(str, src);
8368 e2 = rb_enc_check(str, repl);
8369 if (e1 == e2) {
8370 enc = e1;
8371 }
8372 else {
8373 enc = rb_enc_check(src, repl);
8374 }
8375 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8376 if (RSTRING_LEN(src) > 1 &&
8377 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8378 trsrc.p + l < trsrc.pend) {
8379 cflag = 1;
8380 trsrc.p += l;
8381 }
8382 trrepl.p = RSTRING_PTR(repl);
8383 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8384 trsrc.gen = trrepl.gen = 0;
8385 trsrc.now = trrepl.now = 0;
8386 trsrc.max = trrepl.max = 0;
8387
8388 if (cflag) {
8389 for (i=0; i<256; i++) {
8390 trans[i] = 1;
8391 }
8392 while ((c = trnext(&trsrc, enc)) != errc) {
8393 if (c < 256) {
8394 trans[c] = errc;
8395 }
8396 else {
8397 if (!hash) hash = rb_hash_new();
8398 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8399 }
8400 }
8401 while ((c = trnext(&trrepl, enc)) != errc)
8402 /* retrieve last replacer */;
8403 last = trrepl.now;
8404 for (i=0; i<256; i++) {
8405 if (trans[i] != errc) {
8406 trans[i] = last;
8407 }
8408 }
8409 }
8410 else {
8411 unsigned int r;
8412
8413 for (i=0; i<256; i++) {
8414 trans[i] = errc;
8415 }
8416 while ((c = trnext(&trsrc, enc)) != errc) {
8417 r = trnext(&trrepl, enc);
8418 if (r == errc) r = trrepl.now;
8419 if (c < 256) {
8420 trans[c] = r;
8421 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8422 }
8423 else {
8424 if (!hash) hash = rb_hash_new();
8425 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8426 }
8427 }
8428 }
8429
8430 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8431 cr = ENC_CODERANGE_7BIT;
8432 str_modify_keep_cr(str);
8433 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8434 termlen = rb_enc_mbminlen(enc);
8435 if (sflag) {
8436 int clen, tlen;
8437 long offset, max = RSTRING_LEN(str);
8438 unsigned int save = -1;
8439 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8440
8441 while (s < send) {
8442 int may_modify = 0;
8443
8444 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8445 if (!MBCLEN_CHARFOUND_P(r)) {
8446 SIZED_FREE_N(buf, max + termlen);
8447 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8448 }
8449 clen = MBCLEN_CHARFOUND_LEN(r);
8450 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8451
8452 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8453
8454 s += clen;
8455 if (c < 256) {
8456 c = trans[c];
8457 }
8458 else if (hash) {
8459 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8460 if (NIL_P(tmp)) {
8461 if (cflag) c = last;
8462 else c = errc;
8463 }
8464 else if (cflag) c = errc;
8465 else c = NUM2INT(tmp);
8466 }
8467 else {
8468 c = errc;
8469 }
8470 if (c != (unsigned int)-1) {
8471 if (save == c) {
8472 CHECK_IF_ASCII(c);
8473 continue;
8474 }
8475 save = c;
8476 tlen = rb_enc_codelen(c, enc);
8477 modify = 1;
8478 }
8479 else {
8480 save = -1;
8481 c = c0;
8482 if (enc != e1) may_modify = 1;
8483 }
8484 if ((offset = t - buf) + tlen > max) {
8485 size_t MAYBE_UNUSED(old) = max + termlen;
8486 max = offset + tlen + (send - s);
8487 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8488 t = buf + offset;
8489 }
8490 rb_enc_mbcput(c, t, enc);
8491 if (may_modify && memcmp(s, t, tlen) != 0) {
8492 modify = 1;
8493 }
8494 CHECK_IF_ASCII(c);
8495 t += tlen;
8496 }
8497 if (!STR_EMBED_P(str)) {
8498 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8499 }
8500 TERM_FILL((char *)t, termlen);
8501 RSTRING(str)->as.heap.ptr = (char *)buf;
8502 STR_SET_LEN(str, t - buf);
8503 STR_SET_NOEMBED(str);
8504 RSTRING(str)->as.heap.aux.capa = max;
8505 }
8506 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8507 while (s < send) {
8508 c = (unsigned char)*s;
8509 if (trans[c] != errc) {
8510 if (!cflag) {
8511 c = trans[c];
8512 *s = c;
8513 modify = 1;
8514 }
8515 else {
8516 *s = last;
8517 modify = 1;
8518 }
8519 }
8520 CHECK_IF_ASCII(c);
8521 s++;
8522 }
8523 }
8524 else {
8525 int clen, tlen;
8526 long offset, max = (long)((send - s) * 1.2);
8527 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8528
8529 while (s < send) {
8530 int may_modify = 0;
8531
8532 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8533 if (!MBCLEN_CHARFOUND_P(r)) {
8534 SIZED_FREE_N(buf, max + termlen);
8535 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8536 }
8537 clen = MBCLEN_CHARFOUND_LEN(r);
8538 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8539
8540 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8541
8542 if (c < 256) {
8543 c = trans[c];
8544 }
8545 else if (hash) {
8546 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8547 if (NIL_P(tmp)) {
8548 if (cflag) c = last;
8549 else c = errc;
8550 }
8551 else if (cflag) c = errc;
8552 else c = NUM2INT(tmp);
8553 }
8554 else {
8555 c = cflag ? last : errc;
8556 }
8557 if (c != errc) {
8558 tlen = rb_enc_codelen(c, enc);
8559 modify = 1;
8560 }
8561 else {
8562 c = c0;
8563 if (enc != e1) may_modify = 1;
8564 }
8565 if ((offset = t - buf) + tlen > max) {
8566 size_t MAYBE_UNUSED(old) = max + termlen;
8567 max = offset + tlen + (long)((send - s) * 1.2);
8568 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8569 t = buf + offset;
8570 }
8571 if (s != t) {
8572 rb_enc_mbcput(c, t, enc);
8573 if (may_modify && memcmp(s, t, tlen) != 0) {
8574 modify = 1;
8575 }
8576 }
8577 CHECK_IF_ASCII(c);
8578 s += clen;
8579 t += tlen;
8580 }
8581 if (!STR_EMBED_P(str)) {
8582 SIZED_FREE_N(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8583 }
8584 TERM_FILL((char *)t, termlen);
8585 RSTRING(str)->as.heap.ptr = (char *)buf;
8586 STR_SET_LEN(str, t - buf);
8587 STR_SET_NOEMBED(str);
8588 RSTRING(str)->as.heap.aux.capa = max;
8589 }
8590
8591 if (modify) {
8592 if (cr != ENC_CODERANGE_BROKEN)
8593 ENC_CODERANGE_SET(str, cr);
8594 rb_enc_associate(str, enc);
8595 return str;
8596 }
8597 return Qnil;
8598}
8599
8600
8601/*
8602 * call-seq:
8603 * tr!(selector, replacements) -> self or nil
8604 *
8605 * Like String#tr, except:
8606 *
8607 * - Performs substitutions in +self+ (not in a copy of +self+).
8608 * - Returns +self+ if any modifications were made, +nil+ otherwise.
8609 *
8610 * Related: {Modifying}[rdoc-ref:String@Modifying].
8611 */
8612
8613static VALUE
8614rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8615{
8616 return tr_trans(str, src, repl, 0);
8617}
8618
8619
8620/*
8621 * call-seq:
8622 * tr(selector, replacements) -> new_string
8623 *
8624 * Returns a copy of +self+ with each character specified by string +selector+
8625 * translated to the corresponding character in string +replacements+.
8626 * The correspondence is _positional_:
8627 *
8628 * - Each occurrence of the first character specified by +selector+
8629 * is translated to the first character in +replacements+.
8630 * - Each occurrence of the second character specified by +selector+
8631 * is translated to the second character in +replacements+.
8632 * - And so on.
8633 *
8634 * Example:
8635 *
8636 * 'hello'.tr('el', 'ip') #=> "hippo"
8637 *
8638 * If +replacements+ is shorter than +selector+,
8639 * it is implicitly padded with its own last character:
8640 *
8641 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8642 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8643 *
8644 * Arguments +selector+ and +replacements+ must be valid character selectors
8645 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8646 * and may use any of its valid forms, including negation, ranges, and escapes:
8647 *
8648 * 'hello'.tr('^aeiou', '-') # => "-e--o" # Negation.
8649 * 'ibm'.tr('b-z', 'a-z') # => "hal" # Range.
8650 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8651 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8652 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8653 *
8654 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8655 */
8656
8657static VALUE
8658rb_str_tr(VALUE str, VALUE src, VALUE repl)
8659{
8660 str = str_duplicate(rb_cString, str);
8661 tr_trans(str, src, repl, 0);
8662 return str;
8663}
8664
8665#define TR_TABLE_MAX (UCHAR_MAX+1)
8666#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8667static void
8668tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8669 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8670{
8671 const unsigned int errc = -1;
8672 char buf[TR_TABLE_MAX];
8673 struct tr tr;
8674 unsigned int c;
8675 VALUE table = 0, ptable = 0;
8676 int i, l, cflag = 0;
8677
8678 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8679 tr.gen = tr.now = tr.max = 0;
8680
8681 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8682 cflag = 1;
8683 tr.p += l;
8684 }
8685 if (first) {
8686 for (i=0; i<TR_TABLE_MAX; i++) {
8687 stable[i] = 1;
8688 }
8689 stable[TR_TABLE_MAX] = cflag;
8690 }
8691 else if (stable[TR_TABLE_MAX] && !cflag) {
8692 stable[TR_TABLE_MAX] = 0;
8693 }
8694 for (i=0; i<TR_TABLE_MAX; i++) {
8695 buf[i] = cflag;
8696 }
8697
8698 while ((c = trnext(&tr, enc)) != errc) {
8699 if (c < TR_TABLE_MAX) {
8700 buf[(unsigned char)c] = !cflag;
8701 }
8702 else {
8703 VALUE key = UINT2NUM(c);
8704
8705 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8706 if (cflag) {
8707 ptable = *ctablep;
8708 table = ptable ? ptable : rb_hash_new();
8709 *ctablep = table;
8710 }
8711 else {
8712 table = rb_hash_new();
8713 ptable = *tablep;
8714 *tablep = table;
8715 }
8716 }
8717 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8718 rb_hash_aset(table, key, Qtrue);
8719 }
8720 }
8721 }
8722 for (i=0; i<TR_TABLE_MAX; i++) {
8723 stable[i] = stable[i] && buf[i];
8724 }
8725 if (!table && !cflag) {
8726 *tablep = 0;
8727 }
8728}
8729
8730
8731static int
8732tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8733{
8734 if (c < TR_TABLE_MAX) {
8735 return table[c] != 0;
8736 }
8737 else {
8738 VALUE v = UINT2NUM(c);
8739
8740 if (del) {
8741 if (!NIL_P(rb_hash_lookup(del, v)) &&
8742 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8743 return TRUE;
8744 }
8745 }
8746 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8747 return FALSE;
8748 }
8749 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8750 }
8751}
8752
8753/*
8754 * call-seq:
8755 * delete!(*selectors) -> self or nil
8756 *
8757 * Like String#delete, but modifies +self+ in place;
8758 * returns +self+ if any characters were deleted, +nil+ otherwise.
8759 *
8760 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8761 */
8762
8763static VALUE
8764rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8765{
8766 char squeez[TR_TABLE_SIZE];
8767 rb_encoding *enc = 0;
8768 char *s, *send, *t;
8769 VALUE del = 0, nodel = 0;
8770 int modify = 0;
8771 int i, ascompat, cr;
8772
8773 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8775 for (i=0; i<argc; i++) {
8776 VALUE s = argv[i];
8777
8778 StringValue(s);
8779 enc = rb_enc_check(str, s);
8780 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8781 }
8782
8783 str_modify_keep_cr(str);
8784 ascompat = rb_enc_asciicompat(enc);
8785 s = t = RSTRING_PTR(str);
8786 send = RSTRING_END(str);
8787 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8788 while (s < send) {
8789 unsigned int c;
8790 int clen;
8791
8792 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8793 if (squeez[c]) {
8794 modify = 1;
8795 }
8796 else {
8797 if (t != s) *t = c;
8798 t++;
8799 }
8800 s++;
8801 }
8802 else {
8803 c = rb_enc_codepoint_len(s, send, &clen, enc);
8804
8805 if (tr_find(c, squeez, del, nodel)) {
8806 modify = 1;
8807 }
8808 else {
8809 if (t != s) rb_enc_mbcput(c, t, enc);
8810 t += clen;
8812 }
8813 s += clen;
8814 }
8815 }
8816 TERM_FILL(t, TERM_LEN(str));
8817 STR_SET_LEN(str, t - RSTRING_PTR(str));
8818 ENC_CODERANGE_SET(str, cr);
8819
8820 if (modify) return str;
8821 return Qnil;
8822}
8823
8824
8825/*
8826 * call-seq:
8827 * delete(*selectors) -> new_string
8828 *
8829 * :include: doc/string/delete.rdoc
8830 *
8831 */
8832
8833static VALUE
8834rb_str_delete(int argc, VALUE *argv, VALUE str)
8835{
8836 str = str_duplicate(rb_cString, str);
8837 rb_str_delete_bang(argc, argv, str);
8838 return str;
8839}
8840
8841
8842/*
8843 * call-seq:
8844 * squeeze!(*selectors) -> self or nil
8845 *
8846 * Like String#squeeze, except that:
8847 *
8848 * - Characters are squeezed in +self+ (not in a copy of +self+).
8849 * - Returns +self+ if any changes are made, +nil+ otherwise.
8850 *
8851 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8852 */
8853
8854static VALUE
8855rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8856{
8857 char squeez[TR_TABLE_SIZE];
8858 rb_encoding *enc = 0;
8859 VALUE del = 0, nodel = 0;
8860 unsigned char *s, *send, *t;
8861 int i, modify = 0;
8862 int ascompat, singlebyte = single_byte_optimizable(str);
8863 unsigned int save;
8864
8865 if (argc == 0) {
8866 enc = STR_ENC_GET(str);
8867 }
8868 else {
8869 for (i=0; i<argc; i++) {
8870 VALUE s = argv[i];
8871
8872 StringValue(s);
8873 enc = rb_enc_check(str, s);
8874 if (singlebyte && !single_byte_optimizable(s))
8875 singlebyte = 0;
8876 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8877 }
8878 }
8879
8880 str_modify_keep_cr(str);
8881 s = t = (unsigned char *)RSTRING_PTR(str);
8882 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8883 send = (unsigned char *)RSTRING_END(str);
8884 save = -1;
8885 ascompat = rb_enc_asciicompat(enc);
8886
8887 if (singlebyte) {
8888 while (s < send) {
8889 unsigned int c = *s++;
8890 if (c != save || (argc > 0 && !squeez[c])) {
8891 *t++ = save = c;
8892 }
8893 }
8894 }
8895 else {
8896 while (s < send) {
8897 unsigned int c;
8898 int clen;
8899
8900 if (ascompat && (c = *s) < 0x80) {
8901 if (c != save || (argc > 0 && !squeez[c])) {
8902 *t++ = save = c;
8903 }
8904 s++;
8905 }
8906 else {
8907 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8908
8909 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8910 if (t != s) rb_enc_mbcput(c, t, enc);
8911 save = c;
8912 t += clen;
8913 }
8914 s += clen;
8915 }
8916 }
8917 }
8918
8919 TERM_FILL((char *)t, TERM_LEN(str));
8920 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8921 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8922 modify = 1;
8923 }
8924
8925 if (modify) return str;
8926 return Qnil;
8927}
8928
8929
8930/*
8931 * call-seq:
8932 * squeeze(*selectors) -> new_string
8933 *
8934 * :include: doc/string/squeeze.rdoc
8935 *
8936 */
8937
8938static VALUE
8939rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8940{
8941 str = str_duplicate(rb_cString, str);
8942 rb_str_squeeze_bang(argc, argv, str);
8943 return str;
8944}
8945
8946
8947/*
8948 * call-seq:
8949 * tr_s!(selector, replacements) -> self or nil
8950 *
8951 * Like String#tr_s, except:
8952 *
8953 * - Modifies +self+ in place (not a copy of +self+).
8954 * - Returns +self+ if any changes were made, +nil+ otherwise.
8955 *
8956 * Related: {Modifying}[rdoc-ref:String@Modifying].
8957 */
8958
8959static VALUE
8960rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8961{
8962 return tr_trans(str, src, repl, 1);
8963}
8964
8965
8966/*
8967 * call-seq:
8968 * tr_s(selector, replacements) -> new_string
8969 *
8970 * Like String#tr, except:
8971 *
8972 * - Also squeezes the modified portions of the translated string;
8973 * see String#squeeze.
8974 * - Returns the translated and squeezed string.
8975 *
8976 * Examples:
8977 *
8978 * 'hello'.tr_s('l', 'r') #=> "hero"
8979 * 'hello'.tr_s('el', '-') #=> "h-o"
8980 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8981 *
8982 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8983 *
8984 */
8985
8986static VALUE
8987rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8988{
8989 str = str_duplicate(rb_cString, str);
8990 tr_trans(str, src, repl, 1);
8991 return str;
8992}
8993
8994
8995/*
8996 * call-seq:
8997 * count(*selectors) -> integer
8998 *
8999 * :include: doc/string/count.rdoc
9000 */
9001
9002static VALUE
9003rb_str_count(int argc, VALUE *argv, VALUE str)
9004{
9005 char table[TR_TABLE_SIZE];
9006 rb_encoding *enc = 0;
9007 VALUE del = 0, nodel = 0, tstr;
9008 char *s, *send;
9009 int i;
9010 int ascompat;
9011 size_t n = 0;
9012
9014
9015 tstr = argv[0];
9016 StringValue(tstr);
9017 enc = rb_enc_check(str, tstr);
9018 if (argc == 1) {
9019 const char *ptstr;
9020 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9021 (ptstr = RSTRING_PTR(tstr),
9022 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9023 !is_broken_string(str)) {
9024 int clen;
9025 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9026
9027 s = RSTRING_PTR(str);
9028 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9029 send = RSTRING_END(str);
9030 while (s < send) {
9031 if (*(unsigned char*)s++ == c) n++;
9032 }
9033 return SIZET2NUM(n);
9034 }
9035 }
9036
9037 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9038 for (i=1; i<argc; i++) {
9039 tstr = argv[i];
9040 StringValue(tstr);
9041 enc = rb_enc_check(str, tstr);
9042 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9043 }
9044
9045 s = RSTRING_PTR(str);
9046 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9047 send = RSTRING_END(str);
9048 ascompat = rb_enc_asciicompat(enc);
9049 while (s < send) {
9050 unsigned int c;
9051
9052 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9053 if (table[c]) {
9054 n++;
9055 }
9056 s++;
9057 }
9058 else {
9059 int clen;
9060 c = rb_enc_codepoint_len(s, send, &clen, enc);
9061 if (tr_find(c, table, del, nodel)) {
9062 n++;
9063 }
9064 s += clen;
9065 }
9066 }
9067
9068 return SIZET2NUM(n);
9069}
9070
9071static VALUE
9072rb_fs_check(VALUE val)
9073{
9074 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9075 val = rb_check_string_type(val);
9076 if (NIL_P(val)) return 0;
9077 }
9078 return val;
9079}
9080
/* Byte-indexed flag table: 1 for 0x09..0x0d and 0x20, 0 for every other
 * byte value (unlisted entries are zero-initialized). */
static const char isspacetable[256] = {
    [0x09] = 1,
    [0x0a] = 1,
    [0x0b] = 1,
    [0x0c] = 1,
    [0x0d] = 1,
    [0x20] = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9101
9102static long
9103split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9104{
9105 if (empty_count >= 0 && len == 0) {
9106 return empty_count + 1;
9107 }
9108 if (empty_count > 0) {
9109 /* make different substrings */
9110 if (result) {
9111 do {
9112 rb_ary_push(result, str_new_empty_String(str));
9113 } while (--empty_count > 0);
9114 }
9115 else {
9116 do {
9117 rb_yield(str_new_empty_String(str));
9118 } while (--empty_count > 0);
9119 }
9120 }
9121 str = rb_str_subseq(str, beg, len);
9122 if (result) {
9123 rb_ary_push(result, str);
9124 }
9125 else {
9126 rb_yield(str);
9127 }
9128 return empty_count;
9129}
9130
/* Splitting strategies used by rb_str_split_m. */
typedef enum {
    SPLIT_TYPE_AWK,
    SPLIT_TYPE_STRING,
    SPLIT_TYPE_REGEXP,
    SPLIT_TYPE_CHARS
} split_type_t;
9134
9135static split_type_t
9136literal_split_pattern(VALUE spat, split_type_t default_type)
9137{
9138 rb_encoding *enc = STR_ENC_GET(spat);
9139 const char *ptr;
9140 long len;
9141 RSTRING_GETMEM(spat, ptr, len);
9142 if (len == 0) {
9143 /* Special case - split into chars */
9144 return SPLIT_TYPE_CHARS;
9145 }
9146 else if (rb_enc_asciicompat(enc)) {
9147 if (len == 1 && ptr[0] == ' ') {
9148 return SPLIT_TYPE_AWK;
9149 }
9150 }
9151 else {
9152 int l;
9153 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9154 return SPLIT_TYPE_AWK;
9155 }
9156 }
9157 return default_type;
9158}
9159
9160/*
9161 * call-seq:
9162 * split(field_sep = $;, limit = 0) -> array_of_substrings
9163 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9164 *
9165 * :include: doc/string/split.rdoc
9166 *
9167 */
9168
9169static VALUE
9170rb_str_split_m(int argc, VALUE *argv, VALUE str)
9171{
9172 rb_encoding *enc;
9173 VALUE spat;
9174 VALUE limit;
9175 split_type_t split_type;
9176 long beg, end, i = 0, empty_count = -1;
9177 int lim = 0;
9178 VALUE result, tmp;
9179
9180 result = rb_block_given_p() ? Qfalse : Qnil;
9181 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9182 lim = NUM2INT(limit);
9183 if (lim <= 0) limit = Qnil;
9184 else if (lim == 1) {
9185 if (RSTRING_LEN(str) == 0)
9186 return result ? rb_ary_new2(0) : str;
9187 tmp = str_duplicate(rb_cString, str);
9188 if (!result) {
9189 rb_yield(tmp);
9190 return str;
9191 }
9192 return rb_ary_new3(1, tmp);
9193 }
9194 i = 1;
9195 }
9196 if (NIL_P(limit) && !lim) empty_count = 0;
9197
9198 enc = STR_ENC_GET(str);
9199 split_type = SPLIT_TYPE_REGEXP;
9200 if (!NIL_P(spat)) {
9201 spat = get_pat_quoted(spat, 0);
9202 }
9203 else if (NIL_P(spat = rb_fs)) {
9204 split_type = SPLIT_TYPE_AWK;
9205 }
9206 else if (!(spat = rb_fs_check(spat))) {
9207 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9208 }
9209 else {
9210 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9211 }
9212 if (split_type != SPLIT_TYPE_AWK) {
9213 switch (BUILTIN_TYPE(spat)) {
9214 case T_REGEXP:
9215 rb_reg_options(spat); /* check if uninitialized */
9216 tmp = RREGEXP_SRC(spat);
9217 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9218 if (split_type == SPLIT_TYPE_AWK) {
9219 spat = tmp;
9220 split_type = SPLIT_TYPE_STRING;
9221 }
9222 break;
9223
9224 case T_STRING:
9225 mustnot_broken(spat);
9226 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9227 break;
9228
9229 default:
9231 }
9232 }
9233
9234#define SPLIT_STR(beg, len) ( \
9235 empty_count = split_string(result, str, beg, len, empty_count), \
9236 str_mod_check(str, str_start, str_len))
9237
9238 beg = 0;
9239 char *ptr = RSTRING_PTR(str);
9240 char *const str_start = ptr;
9241 const long str_len = RSTRING_LEN(str);
9242 char *const eptr = str_start + str_len;
9243 if (split_type == SPLIT_TYPE_AWK) {
9244 char *bptr = ptr;
9245 int skip = 1;
9246 unsigned int c;
9247
9248 if (result) result = rb_ary_new();
9249 end = beg;
9250 if (is_ascii_string(str)) {
9251 while (ptr < eptr) {
9252 c = (unsigned char)*ptr++;
9253 if (skip) {
9254 if (ascii_isspace(c)) {
9255 beg = ptr - bptr;
9256 }
9257 else {
9258 end = ptr - bptr;
9259 skip = 0;
9260 if (!NIL_P(limit) && lim <= i) break;
9261 }
9262 }
9263 else if (ascii_isspace(c)) {
9264 SPLIT_STR(beg, end-beg);
9265 skip = 1;
9266 beg = ptr - bptr;
9267 if (!NIL_P(limit)) ++i;
9268 }
9269 else {
9270 end = ptr - bptr;
9271 }
9272 }
9273 }
9274 else {
9275 while (ptr < eptr) {
9276 int n;
9277
9278 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9279 ptr += n;
9280 if (skip) {
9281 if (rb_isspace(c)) {
9282 beg = ptr - bptr;
9283 }
9284 else {
9285 end = ptr - bptr;
9286 skip = 0;
9287 if (!NIL_P(limit) && lim <= i) break;
9288 }
9289 }
9290 else if (rb_isspace(c)) {
9291 SPLIT_STR(beg, end-beg);
9292 skip = 1;
9293 beg = ptr - bptr;
9294 if (!NIL_P(limit)) ++i;
9295 }
9296 else {
9297 end = ptr - bptr;
9298 }
9299 }
9300 }
9301 }
9302 else if (split_type == SPLIT_TYPE_STRING) {
9303 char *substr_start = ptr;
9304 char *sptr = RSTRING_PTR(spat);
9305 long slen = RSTRING_LEN(spat);
9306
9307 if (result) result = rb_ary_new();
9308 mustnot_broken(str);
9309 enc = rb_enc_check(str, spat);
9310 while (ptr < eptr &&
9311 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9312 /* Check we are at the start of a char */
9313 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9314 if (t != ptr + end) {
9315 ptr = t;
9316 continue;
9317 }
9318 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9319 str_mod_check(spat, sptr, slen);
9320 ptr += end + slen;
9321 substr_start = ptr;
9322 if (!NIL_P(limit) && lim <= ++i) break;
9323 }
9324 beg = ptr - str_start;
9325 }
9326 else if (split_type == SPLIT_TYPE_CHARS) {
9327 int n;
9328
9329 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9330 mustnot_broken(str);
9331 enc = rb_enc_get(str);
9332 while (ptr < eptr &&
9333 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9334 SPLIT_STR(ptr - str_start, n);
9335 ptr += n;
9336 if (!NIL_P(limit) && lim <= ++i) break;
9337 }
9338 beg = ptr - str_start;
9339 }
9340 else {
9341 if (result) result = rb_ary_new();
9342 long len = RSTRING_LEN(str);
9343 long start = beg;
9344 long idx;
9345 int last_null = 0;
9346 struct re_registers *regs;
9347 VALUE match = 0;
9348
9349 for (; rb_reg_search(spat, str, start, 0) >= 0;
9350 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9351 match = rb_backref_get();
9352 if (!result) rb_match_busy(match);
9353 regs = RMATCH_REGS(match);
9354 end = BEG(0);
9355 if (start == end && BEG(0) == END(0)) {
9356 if (!ptr) {
9357 SPLIT_STR(0, 0);
9358 break;
9359 }
9360 else if (last_null == 1) {
9361 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9362 beg = start;
9363 }
9364 else {
9365 if (start == len)
9366 start++;
9367 else
9368 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9369 last_null = 1;
9370 continue;
9371 }
9372 }
9373 else {
9374 SPLIT_STR(beg, end-beg);
9375 beg = start = END(0);
9376 }
9377 last_null = 0;
9378
9379 for (idx=1; idx < regs->num_regs; idx++) {
9380 if (BEG(idx) == -1) continue;
9381 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9382 }
9383 if (!NIL_P(limit) && lim <= ++i) break;
9384 }
9385 if (match) rb_match_unbusy(match);
9386 }
9387 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9388 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9389 }
9390
9391 return result ? result : str;
9392}
9393
9394VALUE
9395rb_str_split(VALUE str, const char *sep0)
9396{
9397 VALUE sep;
9398
9399 StringValue(str);
9400 sep = rb_str_new_cstr(sep0);
9401 return rb_str_split_m(1, &sep, str);
9402}
9403
9404#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9405
9406static inline int
9407enumerator_element(VALUE ary, VALUE e)
9408{
9409 if (ary) {
9410 rb_ary_push(ary, e);
9411 return 0;
9412 }
9413 else {
9414 rb_yield(e);
9415 return 1;
9416 }
9417}
9418
9419#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9420
9421static const char *
9422chomp_newline(const char *p, const char *e, rb_encoding *enc)
9423{
9424 const char *prev = rb_enc_prev_char(p, e, e, enc);
9425 if (rb_enc_is_newline(prev, e, enc)) {
9426 e = prev;
9427 prev = rb_enc_prev_char(p, e, e, enc);
9428 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9429 e = prev;
9430 }
9431 return e;
9432}
9433
9434static VALUE
9435get_rs(void)
9436{
9437 VALUE rs = rb_rs;
9438 if (!NIL_P(rs) &&
9439 (!RB_TYPE_P(rs, T_STRING) ||
9440 RSTRING_LEN(rs) != 1 ||
9441 RSTRING_PTR(rs)[0] != '\n')) {
9442 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9443 }
9444 return rs;
9445}
9446
9447#define rb_rs get_rs()
9448
9449static VALUE
9450rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9451{
9452 rb_encoding *enc;
9453 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9454 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9455 long pos, len, rslen;
9456 int rsnewline = 0;
9457
9458 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9459 rs = rb_rs;
9460 if (!NIL_P(opts)) {
9461 static ID keywords[1];
9462 if (!keywords[0]) {
9463 keywords[0] = rb_intern_const("chomp");
9464 }
9465 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9466 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9467 }
9468
9469 if (NIL_P(rs)) {
9470 if (!ENUM_ELEM(ary, str)) {
9471 return ary;
9472 }
9473 else {
9474 return orig;
9475 }
9476 }
9477
9478 if (!RSTRING_LEN(str)) goto end;
9479 str = rb_str_new_frozen(str);
9480 ptr = subptr = RSTRING_PTR(str);
9481 pend = RSTRING_END(str);
9482 len = RSTRING_LEN(str);
9483 StringValue(rs);
9484 rslen = RSTRING_LEN(rs);
9485
9486 if (rs == rb_default_rs)
9487 enc = rb_enc_get(str);
9488 else
9489 enc = rb_enc_check(str, rs);
9490
9491 if (rslen == 0) {
9492 /* paragraph mode */
9493 int n;
9494 const char *eol = NULL;
9495 subend = subptr;
9496 while (subend < pend) {
9497 long chomp_rslen = 0;
9498 do {
9499 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9500 n = 0;
9501 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9502 if (rb_enc_is_newline(subend + n, pend, enc)) {
9503 if (eol == subend) break;
9504 subend += rslen;
9505 if (subptr) {
9506 eol = subend;
9507 chomp_rslen = -rslen;
9508 }
9509 }
9510 else {
9511 if (!subptr) subptr = subend;
9512 subend += rslen;
9513 }
9514 rslen = 0;
9515 } while (subend < pend);
9516 if (!subptr) break;
9517 if (rslen == 0) chomp_rslen = 0;
9518 line = rb_str_subseq(str, subptr - ptr,
9519 subend - subptr + (chomp ? chomp_rslen : rslen));
9520 if (ENUM_ELEM(ary, line)) {
9521 str_mod_check(str, ptr, len);
9522 }
9523 subptr = eol = NULL;
9524 }
9525 goto end;
9526 }
9527 else {
9528 rsptr = RSTRING_PTR(rs);
9529 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9530 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9531 rsnewline = 1;
9532 }
9533 }
9534
9535 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9536 rs = rb_str_new(rsptr, rslen);
9537 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9538 rsptr = RSTRING_PTR(rs);
9539 rslen = RSTRING_LEN(rs);
9540 }
9541
9542 while (subptr < pend) {
9543 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9544 if (pos < 0) break;
9545 hit = subptr + pos;
9546 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9547 if (hit != adjusted) {
9548 subptr = adjusted;
9549 continue;
9550 }
9551 subend = hit += rslen;
9552 if (chomp) {
9553 if (rsnewline) {
9554 subend = chomp_newline(subptr, subend, enc);
9555 }
9556 else {
9557 subend -= rslen;
9558 }
9559 }
9560 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9561 if (ENUM_ELEM(ary, line)) {
9562 str_mod_check(str, ptr, len);
9563 }
9564 subptr = hit;
9565 }
9566
9567 if (subptr != pend) {
9568 if (chomp) {
9569 if (rsnewline) {
9570 pend = chomp_newline(subptr, pend, enc);
9571 }
9572 else if (pend - subptr >= rslen &&
9573 memcmp(pend - rslen, rsptr, rslen) == 0) {
9574 pend -= rslen;
9575 }
9576 }
9577 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9578 ENUM_ELEM(ary, line);
9579 RB_GC_GUARD(str);
9580 }
9581
9582 end:
9583 if (ary)
9584 return ary;
9585 else
9586 return orig;
9587}
9588
9589/*
9590 * call-seq:
9591 * each_line(record_separator = $/, chomp: false) {|substring| ... } -> self
9592 * each_line(record_separator = $/, chomp: false) -> enumerator
9593 *
9594 * :include: doc/string/each_line.rdoc
9595 *
9596 */
9597
9598static VALUE
9599rb_str_each_line(int argc, VALUE *argv, VALUE str)
9600{
9601 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9602 return rb_str_enumerate_lines(argc, argv, str, 0);
9603}
9604
9605/*
9606 * call-seq:
9607 * lines(record_separator = $/, chomp: false) -> array_of_strings
9608 *
9609 * Returns substrings ("lines") of +self+
9610 * according to the given arguments:
9611 *
9612 * s = <<~EOT
9613 * This is the first line.
9614 * This is line two.
9615 *
9616 * This is line four.
9617 * This is line five.
9618 * EOT
9619 *
9620 * With the default argument values:
9621 *
9622 * $/ # => "\n"
9623 * s.lines
9624 * # =>
9625 * ["This is the first line.\n",
9626 * "This is line two.\n",
9627 * "\n",
9628 * "This is line four.\n",
9629 * "This is line five.\n"]
9630 *
9631 * With a different +record_separator+:
9632 *
9633 * record_separator = ' is '
9634 * s.lines(record_separator)
9635 * # =>
9636 * ["This is ",
9637 * "the first line.\nThis is ",
9638 * "line two.\n\nThis is ",
9639 * "line four.\nThis is ",
9640 * "line five.\n"]
9641 *
9642 * With keyword argument +chomp+ as +true+,
9643 * removes the trailing newline from each line:
9644 *
9645 * s.lines(chomp: true)
9646 * # =>
9647 * ["This is the first line.",
9648 * "This is line two.",
9649 * "",
9650 * "This is line four.",
9651 * "This is line five."]
9652 *
9653 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
9654 */
9655
9656static VALUE
9657rb_str_lines(int argc, VALUE *argv, VALUE str)
9658{
9659 VALUE ary = WANTARRAY("lines", 0);
9660 return rb_str_enumerate_lines(argc, argv, str, ary);
9661}
9662
9663static VALUE
9664rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9665{
9666 return LONG2FIX(RSTRING_LEN(str));
9667}
9668
9669static VALUE
9670rb_str_enumerate_bytes(VALUE str, VALUE ary)
9671{
9672 long i;
9673
9674 for (i=0; i<RSTRING_LEN(str); i++) {
9675 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9676 }
9677 if (ary)
9678 return ary;
9679 else
9680 return str;
9681}
9682
9683/*
9684 * call-seq:
9685 * each_byte {|byte| ... } -> self
9686 * each_byte -> enumerator
9687 *
9688 * :include: doc/string/each_byte.rdoc
9689 *
9690 */
9691
9692static VALUE
9693rb_str_each_byte(VALUE str)
9694{
9695 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9696 return rb_str_enumerate_bytes(str, 0);
9697}
9698
9699/*
9700 * call-seq:
9701 * bytes -> array_of_bytes
9702 *
9703 * :include: doc/string/bytes.rdoc
9704 *
9705 */
9706
9707static VALUE
9708rb_str_bytes(VALUE str)
9709{
9710 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9711 return rb_str_enumerate_bytes(str, ary);
9712}
9713
9714static VALUE
9715rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9716{
9717 return rb_str_length(str);
9718}
9719
9720static VALUE
9721rb_str_enumerate_chars(VALUE str, VALUE ary)
9722{
9723 VALUE orig = str;
9724 long i, len, n;
9725 const char *ptr;
9726 rb_encoding *enc;
9727
9728 str = rb_str_new_frozen(str);
9729 ptr = RSTRING_PTR(str);
9730 len = RSTRING_LEN(str);
9731 enc = rb_enc_get(str);
9732
9734 for (i = 0; i < len; i += n) {
9735 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9736 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9737 }
9738 }
9739 else {
9740 for (i = 0; i < len; i += n) {
9741 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9742 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9743 }
9744 }
9745 RB_GC_GUARD(str);
9746 if (ary)
9747 return ary;
9748 else
9749 return orig;
9750}
9751
9752/*
9753 * call-seq:
9754 * each_char {|char| ... } -> self
9755 * each_char -> enumerator
9756 *
9757 * :include: doc/string/each_char.rdoc
9758 *
9759 */
9760
9761static VALUE
9762rb_str_each_char(VALUE str)
9763{
9764 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9765 return rb_str_enumerate_chars(str, 0);
9766}
9767
9768/*
9769 * call-seq:
9770 * chars -> array_of_characters
9771 *
9772 * :include: doc/string/chars.rdoc
9773 *
9774 */
9775
9776static VALUE
9777rb_str_chars(VALUE str)
9778{
9779 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9780 return rb_str_enumerate_chars(str, ary);
9781}
9782
9783static VALUE
9784rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9785{
9786 VALUE orig = str;
9787 int n;
9788 unsigned int c;
9789 const char *ptr, *end;
9790 rb_encoding *enc;
9791
9792 if (single_byte_optimizable(str))
9793 return rb_str_enumerate_bytes(str, ary);
9794
9795 str = rb_str_new_frozen(str);
9796 ptr = RSTRING_PTR(str);
9797 end = RSTRING_END(str);
9798 enc = STR_ENC_GET(str);
9799
9800 while (ptr < end) {
9801 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9802 ENUM_ELEM(ary, UINT2NUM(c));
9803 ptr += n;
9804 }
9805 RB_GC_GUARD(str);
9806 if (ary)
9807 return ary;
9808 else
9809 return orig;
9810}
9811
9812/*
9813 * call-seq:
9814 * each_codepoint {|codepoint| ... } -> self
9815 * each_codepoint -> enumerator
9816 *
9817 * :include: doc/string/each_codepoint.rdoc
9818 *
9819 */
9820
9821static VALUE
9822rb_str_each_codepoint(VALUE str)
9823{
9824 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9825 return rb_str_enumerate_codepoints(str, 0);
9826}
9827
9828/*
9829 * call-seq:
9830 * codepoints -> array_of_integers
9831 *
9832 * :include: doc/string/codepoints.rdoc
9833 *
9834 */
9835
9836static VALUE
9837rb_str_codepoints(VALUE str)
9838{
9839 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9840 return rb_str_enumerate_codepoints(str, ary);
9841}
9842
9843static regex_t *
9844get_reg_grapheme_cluster(rb_encoding *enc)
9845{
9846 int encidx = rb_enc_to_index(enc);
9847
9848 const OnigUChar source_ascii[] = "\\X";
9849 const OnigUChar *source = source_ascii;
9850 size_t source_len = sizeof(source_ascii) - 1;
9851
9852 switch (encidx) {
9853#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9854#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9855#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9856#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9857#define CASE_UTF(e) \
9858 case ENCINDEX_UTF_##e: { \
9859 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9860 source = source_UTF_##e; \
9861 source_len = sizeof(source_UTF_##e); \
9862 break; \
9863 }
9864 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9865#undef CASE_UTF
9866#undef CHARS_16BE
9867#undef CHARS_16LE
9868#undef CHARS_32BE
9869#undef CHARS_32LE
9870 }
9871
9872 regex_t *reg_grapheme_cluster;
9873 OnigErrorInfo einfo;
9874 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9875 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9876 if (r) {
9877 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9878 onig_error_code_to_str(message, r, &einfo);
9879 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9880 }
9881
9882 return reg_grapheme_cluster;
9883}
9884
9885static regex_t *
9886get_cached_reg_grapheme_cluster(rb_encoding *enc)
9887{
9888 int encidx = rb_enc_to_index(enc);
9889 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9890
9891 if (encidx == rb_utf8_encindex()) {
9892 if (!reg_grapheme_cluster_utf8) {
9893 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9894 }
9895
9896 return reg_grapheme_cluster_utf8;
9897 }
9898
9899 return NULL;
9900}
9901
9902static VALUE
9903rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9904{
9905 size_t grapheme_cluster_count = 0;
9906 rb_encoding *enc = get_encoding(str);
9907 const char *ptr, *end;
9908
9909 if (!rb_enc_unicode_p(enc)) {
9910 return rb_str_length(str);
9911 }
9912
9913 bool cached_reg_grapheme_cluster = true;
9914 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9915 if (!reg_grapheme_cluster) {
9916 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9917 cached_reg_grapheme_cluster = false;
9918 }
9919
9920 ptr = RSTRING_PTR(str);
9921 end = RSTRING_END(str);
9922
9923 while (ptr < end) {
9924 OnigPosition len = onig_match(reg_grapheme_cluster,
9925 (const OnigUChar *)ptr, (const OnigUChar *)end,
9926 (const OnigUChar *)ptr, NULL, 0);
9927 if (len <= 0) break;
9928 grapheme_cluster_count++;
9929 ptr += len;
9930 }
9931
9932 if (!cached_reg_grapheme_cluster) {
9933 onig_free(reg_grapheme_cluster);
9934 }
9935
9936 return SIZET2NUM(grapheme_cluster_count);
9937}
9938
9939static VALUE
9940rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9941{
9942 VALUE orig = str;
9943 rb_encoding *enc = get_encoding(str);
9944 const char *ptr0, *ptr, *end;
9945
9946 if (!rb_enc_unicode_p(enc)) {
9947 return rb_str_enumerate_chars(str, ary);
9948 }
9949
9950 if (!ary) str = rb_str_new_frozen(str);
9951
9952 bool cached_reg_grapheme_cluster = true;
9953 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9954 if (!reg_grapheme_cluster) {
9955 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9956 cached_reg_grapheme_cluster = false;
9957 }
9958
9959 ptr0 = ptr = RSTRING_PTR(str);
9960 end = RSTRING_END(str);
9961
9962 while (ptr < end) {
9963 OnigPosition len = onig_match(reg_grapheme_cluster,
9964 (const OnigUChar *)ptr, (const OnigUChar *)end,
9965 (const OnigUChar *)ptr, NULL, 0);
9966 if (len <= 0) break;
9967 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9968 ptr += len;
9969 }
9970
9971 if (!cached_reg_grapheme_cluster) {
9972 onig_free(reg_grapheme_cluster);
9973 }
9974
9975 RB_GC_GUARD(str);
9976 if (ary)
9977 return ary;
9978 else
9979 return orig;
9980}
9981
9982/*
9983 * call-seq:
9984 * each_grapheme_cluster {|grapheme_cluster| ... } -> self
9985 * each_grapheme_cluster -> enumerator
9986 *
9987 * :include: doc/string/each_grapheme_cluster.rdoc
9988 *
9989 */
9990
9991static VALUE
9992rb_str_each_grapheme_cluster(VALUE str)
9993{
9994 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9995 return rb_str_enumerate_grapheme_clusters(str, 0);
9996}
9997
9998/*
9999 * call-seq:
10000 * grapheme_clusters -> array_of_grapheme_clusters
10001 *
10002 * :include: doc/string/grapheme_clusters.rdoc
10003 *
10004 */
10005
10006static VALUE
10007rb_str_grapheme_clusters(VALUE str)
10008{
10009 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10010 return rb_str_enumerate_grapheme_clusters(str, ary);
10011}
10012
10013static long
10014chopped_length(VALUE str)
10015{
10016 rb_encoding *enc = STR_ENC_GET(str);
10017 const char *p, *p2, *beg, *end;
10018
10019 beg = RSTRING_PTR(str);
10020 end = beg + RSTRING_LEN(str);
10021 if (beg >= end) return 0;
10022 p = rb_enc_prev_char(beg, end, end, enc);
10023 if (!p) return 0;
10024 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10025 p2 = rb_enc_prev_char(beg, p, end, enc);
10026 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10027 }
10028 return p - beg;
10029}
10030
10031/*
10032 * call-seq:
10033 * chop! -> self or nil
10034 *
10035 * Like String#chop, except that:
10036 *
10037 * - Removes trailing characters from +self+ (not from a copy of +self+).
10038 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10039 *
10040 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10041 */
10042
10043static VALUE
10044rb_str_chop_bang(VALUE str)
10045{
10046 str_modify_keep_cr(str);
10047 if (RSTRING_LEN(str) > 0) {
10048 long len;
10049 len = chopped_length(str);
10050 STR_SET_LEN(str, len);
10051 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10052 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10054 }
10055 return str;
10056 }
10057 return Qnil;
10058}
10059
10060
10061/*
10062 * call-seq:
10063 * chop -> new_string
10064 *
10065 * :include: doc/string/chop.rdoc
10066 *
10067 */
10068
10069static VALUE
10070rb_str_chop(VALUE str)
10071{
10072 return rb_str_subseq(str, 0, chopped_length(str));
10073}
10074
10075static long
10076smart_chomp(VALUE str, const char *e, const char *p)
10077{
10078 rb_encoding *enc = rb_enc_get(str);
10079 if (rb_enc_mbminlen(enc) > 1) {
10080 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10081 if (rb_enc_is_newline(pp, e, enc)) {
10082 e = pp;
10083 }
10084 pp = e - rb_enc_mbminlen(enc);
10085 if (pp >= p) {
10086 pp = rb_enc_left_char_head(p, pp, e, enc);
10087 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10088 e = pp;
10089 }
10090 }
10091 }
10092 else {
10093 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10094 case '\n':
10095 if (--e > p && *(e-1) == '\r') {
10096 --e;
10097 }
10098 break;
10099 case '\r':
10100 --e;
10101 break;
10102 }
10103 }
10104 return e - p;
10105}
10106
10107static long
10108chompped_length(VALUE str, VALUE rs)
10109{
10110 rb_encoding *enc;
10111 int newline;
10112 char *pp, *e, *rsptr;
10113 long rslen;
10114 char *const p = RSTRING_PTR(str);
10115 long len = RSTRING_LEN(str);
10116
10117 if (len == 0) return 0;
10118 e = p + len;
10119 if (rs == rb_default_rs) {
10120 return smart_chomp(str, e, p);
10121 }
10122
10123 enc = rb_enc_get(str);
10124 RSTRING_GETMEM(rs, rsptr, rslen);
10125 if (rslen == 0) {
10126 if (rb_enc_mbminlen(enc) > 1) {
10127 while (e > p) {
10128 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10129 if (!rb_enc_is_newline(pp, e, enc)) break;
10130 e = pp;
10131 pp -= rb_enc_mbminlen(enc);
10132 if (pp >= p) {
10133 pp = rb_enc_left_char_head(p, pp, e, enc);
10134 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10135 e = pp;
10136 }
10137 }
10138 }
10139 }
10140 else {
10141 while (e > p && *(e-1) == '\n') {
10142 --e;
10143 if (e > p && *(e-1) == '\r')
10144 --e;
10145 }
10146 }
10147 return e - p;
10148 }
10149 if (rslen > len) return len;
10150
10151 enc = rb_enc_get(rs);
10152 newline = rsptr[rslen-1];
10153 if (rslen == rb_enc_mbminlen(enc)) {
10154 if (rslen == 1) {
10155 if (newline == '\n')
10156 return smart_chomp(str, e, p);
10157 }
10158 else {
10159 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10160 return smart_chomp(str, e, p);
10161 }
10162 }
10163
10164 enc = rb_enc_check(str, rs);
10165 if (is_broken_string(rs)) {
10166 return len;
10167 }
10168 pp = e - rslen;
10169 if (p[len-1] == newline &&
10170 (rslen <= 1 ||
10171 memcmp(rsptr, pp, rslen) == 0)) {
10172 if (at_char_boundary(p, pp, e, enc))
10173 return len - rslen;
10174 RB_GC_GUARD(rs);
10175 }
10176 return len;
10177}
10178
10184static VALUE
10185chomp_rs(int argc, const VALUE *argv)
10186{
10187 rb_check_arity(argc, 0, 1);
10188 if (argc > 0) {
10189 VALUE rs = argv[0];
10190 if (!NIL_P(rs)) StringValue(rs);
10191 return rs;
10192 }
10193 else {
10194 return rb_rs;
10195 }
10196}
10197
10198VALUE
10199rb_str_chomp_string(VALUE str, VALUE rs)
10200{
10201 long olen = RSTRING_LEN(str);
10202 long len = chompped_length(str, rs);
10203 if (len >= olen) return Qnil;
10204 str_modify_keep_cr(str);
10205 STR_SET_LEN(str, len);
10206 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10207 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10209 }
10210 return str;
10211}
10212
10213/*
10214 * call-seq:
10215 * chomp!(line_sep = $/) -> self or nil
10216 *
10217 * Like String#chomp, except that:
10218 *
10219 * - Removes trailing characters from +self+ (not from a copy of +self+).
10220 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10221 *
10222 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10223 */
10224
10225static VALUE
10226rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10227{
10228 VALUE rs;
10229 str_modifiable(str);
10230 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10231 rs = chomp_rs(argc, argv);
10232 if (NIL_P(rs)) return Qnil;
10233 return rb_str_chomp_string(str, rs);
10234}
10235
10236
10237/*
10238 * call-seq:
10239 * chomp(line_sep = $/) -> new_string
10240 *
10241 * :include: doc/string/chomp.rdoc
10242 *
10243 */
10244
10245static VALUE
10246rb_str_chomp(int argc, VALUE *argv, VALUE str)
10247{
10248 VALUE rs = chomp_rs(argc, argv);
10249 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10250 return rb_str_subseq(str, 0, chompped_length(str, rs));
10251}
10252
10253static void
10254tr_setup_table_multi(char table[TR_TABLE_SIZE], VALUE *tablep, VALUE *ctablep,
10255 VALUE str, int num_selectors, VALUE *selectors)
10256{
10257 int i;
10258
10259 for (i=0; i<num_selectors; i++) {
10260 VALUE selector = selectors[i];
10261 rb_encoding *enc;
10262
10263 StringValue(selector);
10264 enc = rb_enc_check(str, selector);
10265 tr_setup_table(selector, table, i==0, tablep, ctablep, enc);
10266 }
10267}
10268
10269static long
10270lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10271{
10272 const char *const start = s;
10273
10274 if (!s || s >= e) return 0;
10275
10276 /* remove spaces at head */
10277 if (single_byte_optimizable(str)) {
10278 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10279 }
10280 else {
10281 while (s < e) {
10282 int n;
10283 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10284
10285 if (cc && !rb_isspace(cc)) break;
10286 s += n;
10287 }
10288 }
10289 return s - start;
10290}
10291
10292static long
10293lstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10294 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10295{
10296 const char *const start = s;
10297
10298 if (!s || s >= e) return 0;
10299
10300 /* remove leading characters in the table */
10301 while (s < e) {
10302 int n;
10303 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10304
10305 if (!tr_find(cc, table, del, nodel)) break;
10306 s += n;
10307 }
10308 return s - start;
10309}
10310
10311/*
10312 * call-seq:
10313 * lstrip!(*selectors) -> self or nil
10314 *
10315 * Like String#lstrip, except that:
10316 *
10317 * - Performs stripping in +self+ (not in a copy of +self+).
10318 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10319 *
10320 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10321 */
10322
10323static VALUE
10324rb_str_lstrip_bang(int argc, VALUE *argv, VALUE str)
10325{
10326 rb_encoding *enc;
10327 char *start, *s;
10328 long olen, loffset;
10329
10330 str_modify_keep_cr(str);
10331 enc = STR_ENC_GET(str);
10332 RSTRING_GETMEM(str, start, olen);
10333 if (argc > 0) {
10334 char table[TR_TABLE_SIZE];
10335 VALUE del = 0, nodel = 0;
10336
10337 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10338 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10339 }
10340 else {
10341 loffset = lstrip_offset(str, start, start+olen, enc);
10342 }
10343
10344 if (loffset > 0) {
10345 long len = olen-loffset;
10346 s = start + loffset;
10347 memmove(start, s, len);
10348 STR_SET_LEN(str, len);
10349 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10350 return str;
10351 }
10352 return Qnil;
10353}
10354
10355
10356/*
10357 * call-seq:
10358 * lstrip(*selectors) -> new_string
10359 *
10360 * Returns a copy of +self+ with leading whitespace removed;
10361 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10362 *
10363 * whitespace = "\x00\t\n\v\f\r "
10364 * s = whitespace + 'abc' + whitespace
10365 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10366 * s.lstrip
10367 * # => "abc\u0000\t\n\v\f\r "
10368 *
10369 * If +selectors+ are given, removes characters of +selectors+ from the beginning of +self+:
10370 *
10371 * s = "---abc+++"
10372 * s.lstrip("-") # => "abc+++"
10373 *
10374 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10375 * and may use any of its valid forms, including negation, ranges, and escapes:
10376 *
10377 * "01234abc56789".lstrip("0-9") # "abc56789"
10378 * "01234abc56789".lstrip("0-9", "^4-6") # "4abc56789"
10379 *
10380 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10381 */
10382
10383static VALUE
10384rb_str_lstrip(int argc, VALUE *argv, VALUE str)
10385{
10386 char *start;
10387 long len, loffset;
10388
10389 RSTRING_GETMEM(str, start, len);
10390 if (argc > 0) {
10391 char table[TR_TABLE_SIZE];
10392 VALUE del = 0, nodel = 0;
10393
10394 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10395 loffset = lstrip_offset_table(str, start, start+len, STR_ENC_GET(str), table, del, nodel);
10396 }
10397 else {
10398 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10399 }
10400 if (loffset <= 0) return str_duplicate(rb_cString, str);
10401 return rb_str_subseq(str, loffset, len - loffset);
10402}
10403
10404static long
10405rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10406{
10407 const char *t;
10408
10409 rb_str_check_dummy_enc(enc);
10410 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
10411 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10412 }
10413 if (!s || s >= e) return 0;
10414 t = e;
10415
10416 /* remove trailing spaces or '\0's */
10417 if (single_byte_optimizable(str)) {
10418 unsigned char c;
10419 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10420 }
10421 else {
10422 char *tp;
10423
10424 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10425 unsigned int c = rb_enc_codepoint(tp, e, enc);
10426 if (c && !rb_isspace(c)) break;
10427 t = tp;
10428 }
10429 }
10430 return e - t;
10431}
10432
10433static long
10434rstrip_offset_table(VALUE str, const char *s, const char *e, rb_encoding *enc,
10435 char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
10436{
10437 const char *t;
10438 char *tp;
10439
10440 rb_str_check_dummy_enc(enc);
10441 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
10442 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10443 }
10444 if (!s || s >= e) return 0;
10445 t = e;
10446
10447 /* remove trailing characters in the table */
10448 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10449 unsigned int c = rb_enc_codepoint(tp, e, enc);
10450 if (!tr_find(c, table, del, nodel)) break;
10451 t = tp;
10452 }
10453
10454 return e - t;
10455}
10456
10457/*
10458 * call-seq:
10459 * rstrip!(*selectors) -> self or nil
10460 *
10461 * Like String#rstrip, except that:
10462 *
10463 * - Performs stripping in +self+ (not in a copy of +self+).
10464 * - Returns +self+ if any characters are stripped, +nil+ otherwise.
10465 *
10466 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10467 */
10468
10469static VALUE
10470rb_str_rstrip_bang(int argc, VALUE *argv, VALUE str)
10471{
10472 rb_encoding *enc;
10473 char *start;
10474 long olen, roffset;
10475
10476 str_modify_keep_cr(str);
10477 enc = STR_ENC_GET(str);
10478 RSTRING_GETMEM(str, start, olen);
10479 if (argc > 0) {
10480 char table[TR_TABLE_SIZE];
10481 VALUE del = 0, nodel = 0;
10482
10483 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10484 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10485 }
10486 else {
10487 roffset = rstrip_offset(str, start, start+olen, enc);
10488 }
10489 if (roffset > 0) {
10490 long len = olen - roffset;
10491
10492 STR_SET_LEN(str, len);
10493 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10494 return str;
10495 }
10496 return Qnil;
10497}
10498
10499
10500/*
10501 * call-seq:
10502 * rstrip(*selectors) -> new_string
10503 *
10504 * Returns a copy of +self+ with trailing whitespace removed;
10505 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10506 *
10507 * whitespace = "\x00\t\n\v\f\r "
10508 * s = whitespace + 'abc' + whitespace
10509 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10510 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10511 *
10512 * If +selectors+ are given, removes characters of +selectors+ from the end of +self+:
10513 *
10514 * s = "---abc+++"
10515 * s.rstrip("+") # => "---abc"
10516 *
10517 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10518 * and may use any of its valid forms, including negation, ranges, and escapes:
10519 *
10520 * "01234abc56789".rstrip("0-9") # "01234abc"
10521 * "01234abc56789".rstrip("0-9", "^4-6") # "01234abc56"
10522 *
10523 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10524 */
10525
10526static VALUE
10527rb_str_rstrip(int argc, VALUE *argv, VALUE str)
10528{
10529 rb_encoding *enc;
10530 char *start;
10531 long olen, roffset;
10532
10533 enc = STR_ENC_GET(str);
10534 RSTRING_GETMEM(str, start, olen);
10535 if (argc > 0) {
10536 char table[TR_TABLE_SIZE];
10537 VALUE del = 0, nodel = 0;
10538
10539 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10540 roffset = rstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10541 }
10542 else {
10543 roffset = rstrip_offset(str, start, start+olen, enc);
10544 }
10545 if (roffset <= 0) return str_duplicate(rb_cString, str);
10546 return rb_str_subseq(str, 0, olen-roffset);
10547}
10548
10549
10550/*
10551 * call-seq:
10552 * strip!(*selectors) -> self or nil
10553 *
10554 * Like String#strip, except that:
10555 *
10556 * - Any modifications are made to +self+.
10557 * - Returns +self+ if any modifications are made, +nil+ otherwise.
10558 *
10559 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10560 */
10561
10562static VALUE
10563rb_str_strip_bang(int argc, VALUE *argv, VALUE str)
10564{
10565 char *start;
10566 long olen, loffset, roffset;
10567 rb_encoding *enc;
10568
10569 str_modify_keep_cr(str);
10570 enc = STR_ENC_GET(str);
10571 RSTRING_GETMEM(str, start, olen);
10572
10573 if (argc > 0) {
10574 char table[TR_TABLE_SIZE];
10575 VALUE del = 0, nodel = 0;
10576
10577 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10578 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10579 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10580 }
10581 else {
10582 loffset = lstrip_offset(str, start, start+olen, enc);
10583 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10584 }
10585
10586 if (loffset > 0 || roffset > 0) {
10587 long len = olen-roffset;
10588 if (loffset > 0) {
10589 len -= loffset;
10590 memmove(start, start + loffset, len);
10591 }
10592 STR_SET_LEN(str, len);
10593 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10594 return str;
10595 }
10596 return Qnil;
10597}
10598
10599
10600/*
10601 * call-seq:
10602 * strip(*selectors) -> new_string
10603 *
10604 * Returns a copy of +self+ with leading and trailing whitespace removed;
10605 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10606 *
10607 * whitespace = "\x00\t\n\v\f\r "
10608 * s = whitespace + 'abc' + whitespace
10609 * # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10610 * s.strip # => "abc"
10611 *
10612 * If +selectors+ are given, removes characters of +selectors+ from both ends of +self+:
10613 *
10614 * s = "---abc+++"
10615 * s.strip("-+") # => "abc"
10616 * s.strip("+-") # => "abc"
10617 *
10618 * +selectors+ must be valid character selectors (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
10619 * and may use any of its valid forms, including negation, ranges, and escapes:
10620 *
10621 * "01234abc56789".strip("0-9") # "abc"
10622 * "01234abc56789".strip("0-9", "^4-6") # "4abc56"
10623 *
10624 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
10625 */
10626
10627static VALUE
10628rb_str_strip(int argc, VALUE *argv, VALUE str)
10629{
10630 char *start;
10631 long olen, loffset, roffset;
10632 rb_encoding *enc = STR_ENC_GET(str);
10633
10634 RSTRING_GETMEM(str, start, olen);
10635
10636 if (argc > 0) {
10637 char table[TR_TABLE_SIZE];
10638 VALUE del = 0, nodel = 0;
10639
10640 tr_setup_table_multi(table, &del, &nodel, str, argc, argv);
10641 loffset = lstrip_offset_table(str, start, start+olen, enc, table, del, nodel);
10642 roffset = rstrip_offset_table(str, start+loffset, start+olen, enc, table, del, nodel);
10643 }
10644 else {
10645 loffset = lstrip_offset(str, start, start+olen, enc);
10646 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10647 }
10648
10649 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10650 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10651}
10652
10653static VALUE
10654scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10655{
10656 VALUE result = Qnil;
10657 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10658 if (pos >= 0) {
10659 VALUE match;
10660 struct re_registers *regs;
10661 if (BUILTIN_TYPE(pat) == T_STRING) {
10662 regs = NULL;
10663 end = pos + RSTRING_LEN(pat);
10664 }
10665 else {
10666 match = rb_backref_get();
10667 regs = RMATCH_REGS(match);
10668 pos = BEG(0);
10669 end = END(0);
10670 }
10671
10672 if (pos == end) {
10673 rb_encoding *enc = STR_ENC_GET(str);
10674 /*
10675 * Always consume at least one character of the input string
10676 */
10677 if (RSTRING_LEN(str) > end)
10678 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10679 RSTRING_END(str), enc);
10680 else
10681 *start = end + 1;
10682 }
10683 else {
10684 *start = end;
10685 }
10686
10687 if (!regs || regs->num_regs == 1) {
10688 result = rb_str_subseq(str, pos, end - pos);
10689 return result;
10690 }
10691 else {
10692 result = rb_ary_new2(regs->num_regs);
10693 for (int i = 1; i < regs->num_regs; i++) {
10694 VALUE s = Qnil;
10695 if (BEG(i) >= 0) {
10696 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10697 }
10698
10699 rb_ary_push(result, s);
10700 }
10701 }
10702
10703 RB_GC_GUARD(match);
10704 }
10705
10706 return result;
10707}
10708
10709
10710/*
10711 * call-seq:
10712 * scan(pattern) -> array_of_results
10713 * scan(pattern) {|result| ... } -> self
10714 *
10715 * :include: doc/string/scan.rdoc
10716 *
10717 */
10718
10719static VALUE
10720rb_str_scan(VALUE str, VALUE pat)
10721{
10722 VALUE result;
10723 long start = 0;
10724 long last = -1, prev = 0;
10725 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10726
10727 pat = get_pat_quoted(pat, 1);
10728 mustnot_broken(str);
10729 if (!rb_block_given_p()) {
10730 VALUE ary = rb_ary_new();
10731
10732 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10733 last = prev;
10734 prev = start;
10735 rb_ary_push(ary, result);
10736 }
10737 if (last >= 0) rb_pat_search(pat, str, last, 1);
10738 else rb_backref_set(Qnil);
10739 return ary;
10740 }
10741
10742 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10743 last = prev;
10744 prev = start;
10745 rb_yield(result);
10746 str_mod_check(str, p, len);
10747 }
10748 if (last >= 0) rb_pat_search(pat, str, last, 1);
10749 return str;
10750}
10751
10752
10753/*
10754 * call-seq:
10755 * hex -> integer
10756 *
10757 * Interprets the leading substring of +self+ as hexadecimal, possibly signed;
10758 * returns its value as an integer.
10759 *
10760 * The leading substring is interpreted as hexadecimal when it begins with:
10761 *
10762 * - One or more characters representing hexadecimal digits
10763 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10764 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit:
10765 *
10766 * 'f'.hex # => 15
10767 * '11'.hex # => 17
10768 * 'FFF'.hex # => 4095
10769 * 'fffg'.hex # => 4095
10770 * 'foo'.hex # => 15 # 'f' hexadecimal, 'oo' not.
10771 * 'bar'.hex # => 186 # 'ba' hexadecimal, 'r' not.
10772 * 'deadbeef'.hex # => 3735928559
10773 *
10774 * - <tt>'0x'</tt> or <tt>'0X'</tt>, followed by one or more hexadecimal digits:
10775 *
10776 * '0xfff'.hex # => 4095
10777 * '0xfffg'.hex # => 4095
10778 *
10779 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10780 *
10781 * '-fff'.hex # => -4095
10782 * '-0xFFF'.hex # => -4095
10783 *
10784 * For any substring not described above, returns zero:
10785 *
10786 * 'xxx'.hex # => 0
10787 * ''.hex # => 0
10788 *
10789 * Note that, unlike #oct, this method interprets only hexadecimal,
10790 * and not binary, octal, or decimal notations:
10791 *
10792 * '0b111'.hex # => 45329
10793 * '0o777'.hex # => 0
10794 * '0d999'.hex # => 55705
10795 *
10796 * Related: See {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10797 */
10798
10799static VALUE
10800rb_str_hex(VALUE str)
10801{
10802 return rb_str_to_inum(str, 16, FALSE);
10803}
10804
10805
10806/*
10807 * call-seq:
10808 * oct -> integer
10809 *
10810 * Interprets the leading substring of +self+ as octal, binary, decimal, or hexadecimal, possibly signed;
10811 * returns its value as an integer.
10812 *
10813 * In brief:
10814 *
10815 * # Interpreted as octal.
10816 * '777'.oct # => 511
10817 * '777x'.oct # => 511
10818 * '0777'.oct # => 511
10819 * '0o777'.oct # => 511
10820 * '-777'.oct # => -511
10821 * # Not interpreted as octal.
10822 * '0b111'.oct # => 7 # Interpreted as binary.
10823 * '0d999'.oct # => 999 # Interpreted as decimal.
10824 * '0xfff'.oct # => 4095 # Interpreted as hexadecimal.
10825 *
10826 * The leading substring is interpreted as octal when it begins with:
10827 *
10828 * - One or more characters representing octal digits
10829 * (each in the range <tt>'0'..'7'</tt>);
10830 * the string to be interpreted ends at the first character that does not represent an octal digit:
10831 *
10832 * '7'.oct # => 7
10833 * '11'.oct # => 9
10834 * '777'.oct # => 511
10835 * '0777'.oct # => 511
10836 * '7778'.oct # => 511
10837 * '777x'.oct # => 511
10838 *
10839 * - <tt>'0o'</tt>, followed by one or more octal digits:
10840 *
10841 * '0o777'.oct # => 511
10842 * '0o7778'.oct # => 511
10843 *
10844 * The leading substring is _not_ interpreted as octal when it begins with:
10845 *
10846 * - <tt>'0b'</tt>, followed by one or more characters representing binary digits
10847 * (each in the range <tt>'0'..'1'</tt>);
10848 * the string to be interpreted ends at the first character that does not represent a binary digit.
10849 * the string is interpreted as binary digits (base 2):
10850 *
10851 * '0b111'.oct # => 7
10852 * '0b1112'.oct # => 7
10853 *
10854 * - <tt>'0d'</tt>, followed by one or more characters representing decimal digits
10855 * (each in the range <tt>'0'..'9'</tt>);
10856 * the string to be interpreted ends at the first character that does not represent a decimal digit.
10857 * the string is interpreted as decimal digits (base 10):
10858 *
10859 * '0d999'.oct # => 999
10860 * '0d999x'.oct # => 999
10861 *
10862 * - <tt>'0x'</tt>, followed by one or more characters representing hexadecimal digits
10863 * (each in one of the ranges <tt>'0'..'9'</tt>, <tt>'a'..'f'</tt>, or <tt>'A'..'F'</tt>);
10864 * the string to be interpreted ends at the first character that does not represent a hexadecimal digit.
10865 * the string is interpreted as hexadecimal digits (base 16):
10866 *
10867 * '0xfff'.oct # => 4095
10868 * '0xfffg'.oct # => 4095
10869 *
10870 * Any of the above may be prefixed with <tt>'-'</tt>, which negates the interpreted value:
10871 *
10872 * '-777'.oct # => -511
10873 * '-0777'.oct # => -511
10874 * '-0b111'.oct # => -7
10875 * '-0xfff'.oct # => -4095
10876 *
10877 * For any substring not described above, returns zero:
10878 *
10879 * 'foo'.oct # => 0
10880 * ''.oct # => 0
10881 *
10882 * Related: see {Converting to Non-String}[rdoc-ref:String@Converting+to+Non-String].
10883 */
10884
10885static VALUE
10886rb_str_oct(VALUE str)
10887{
10888 return rb_str_to_inum(str, -8, FALSE);
10889}
10890
10891#ifndef HAVE_CRYPT_R
10892# include "ruby/thread_native.h"
10893# include "ruby/atomic.h"
10894
10895static struct {
10896 rb_nativethread_lock_t lock;
10897} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10898#endif
10899
10900/*
10901 * call-seq:
10902 * crypt(salt_str) -> new_string
10903 *
10904 * Returns the string generated by calling <code>crypt(3)</code>
10905 * standard library function with <code>str</code> and
10906 * <code>salt_str</code>, in this order, as its arguments. Please do
10907 * not use this method any longer. It is legacy; provided only for
10908 * backward compatibility with ruby scripts in earlier days. It is
10909 * bad to use in contemporary programs for several reasons:
10910 *
10911 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10912 * run. The generated string lacks data portability.
10913 *
10914 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10915 * (i.e. silently ends up in unexpected results).
10916 *
10917 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10918 * thread safe.
10919 *
10920 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10921 * very very weak. According to its manpage, Linux's traditional
10922 * <code>crypt(3)</code> output has only 2**56 variations; too
10923 * easy to brute force today. And this is the default behaviour.
10924 *
10925 * * In order to make things robust some OSes implement so-called
10926 * "modular" usage. To go through, you have to do a complex
10927 * build-up of the <code>salt_str</code> parameter, by hand.
10928 * Failure in generation of a proper salt string tends not to
10929 * yield any errors; typos in parameters are normally not
10930 * detectable.
10931 *
10932 * * For instance, in the following example, the second invocation
10933 * of String#crypt is wrong; it has a typo in "round=" (lacks
10934 * "s"). However the call does not fail and something unexpected
10935 * is generated.
10936 *
10937 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10938 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10939 *
10940 * * Even in the "modular" mode, some hash functions are considered
10941 * archaic and no longer recommended at all; for instance module
10942 * <code>$1$</code> is officially abandoned by its author: see
10943 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10944 * instance module <code>$3$</code> is considered completely
10945 * broken: see the manpage of FreeBSD.
10946 *
10947 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10948 * written above, <code>crypt(3)</code> on Mac OS never fails.
10949 * This means even if you build up a proper salt string it
10950 * generates a traditional DES hash anyways, and there is no way
10951 * for you to be aware of.
10952 *
10953 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10954 *
10955 * If for some reason you cannot migrate to other secure contemporary
10956 * password hashing algorithms, install the string-crypt gem and
10957 * <code>require 'string/crypt'</code> to continue using it.
10958 */
10959
10960static VALUE
10961rb_str_crypt(VALUE str, VALUE salt)
10962{
10963#ifdef HAVE_CRYPT_R
10964 VALUE databuf;
10965 struct crypt_data *data;
10966# define CRYPT_END() ALLOCV_END(databuf)
10967#else
10968 char *tmp_buf;
10969 extern char *crypt(const char *, const char *);
10970# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10971#endif
10972 VALUE result;
10973 const char *s, *saltp;
10974 char *res;
10975#ifdef BROKEN_CRYPT
10976 char salt_8bit_clean[3];
10977#endif
10978
10979 StringValue(salt);
10980 mustnot_wchar(str);
10981 mustnot_wchar(salt);
10982 s = StringValueCStr(str);
10983 saltp = RSTRING_PTR(salt);
10984 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10985 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10986 }
10987
10988#ifdef BROKEN_CRYPT
10989 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10990 salt_8bit_clean[0] = saltp[0] & 0x7f;
10991 salt_8bit_clean[1] = saltp[1] & 0x7f;
10992 salt_8bit_clean[2] = '\0';
10993 saltp = salt_8bit_clean;
10994 }
10995#endif
10996#ifdef HAVE_CRYPT_R
10997 data = ALLOCV(databuf, sizeof(struct crypt_data));
10998# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10999 data->initialized = 0;
11000# endif
11001 res = crypt_r(s, saltp, data);
11002#else
11003 rb_nativethread_lock_lock(&crypt_mutex.lock);
11004 res = crypt(s, saltp);
11005#endif
11006 if (!res) {
11007 int err = errno;
11008 CRYPT_END();
11009 rb_syserr_fail(err, "crypt");
11010 }
11011#ifdef HAVE_CRYPT_R
11012 result = rb_str_new_cstr(res);
11013 CRYPT_END();
11014#else
11015 // We need to copy this buffer because it's static and we need to unlock the mutex
11016 // before allocating a new object (the string to be returned). If we allocate while
11017 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
11018 // if other ractors are waiting on this lock.
11019 size_t res_size = strlen(res)+1;
11020 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
11021 memcpy(tmp_buf, res, res_size);
11022 res = tmp_buf;
11023 CRYPT_END();
11024 result = rb_str_new_cstr(res);
11025#endif
11026 return result;
11027}
11028
11029
11030/*
11031 * call-seq:
11032 * ord -> integer
11033 *
11034 * :include: doc/string/ord.rdoc
11035 *
11036 */
11037
11038static VALUE
11039rb_str_ord(VALUE s)
11040{
11041 unsigned int c;
11042
11043 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
11044 return UINT2NUM(c);
11045}
11046/*
11047 * call-seq:
11048 * sum(n = 16) -> integer
11049 *
11050 * :include: doc/string/sum.rdoc
11051 *
11052 */
11053
11054static VALUE
11055rb_str_sum(int argc, VALUE *argv, VALUE str)
11056{
11057 int bits = 16;
11058 char *ptr, *p, *pend;
11059 long len;
11060 VALUE sum = INT2FIX(0);
11061 unsigned long sum0 = 0;
11062
11063 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
11064 bits = 0;
11065 }
11066 ptr = p = RSTRING_PTR(str);
11067 len = RSTRING_LEN(str);
11068 pend = p + len;
11069
11070 while (p < pend) {
11071 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
11072 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11073 str_mod_check(str, ptr, len);
11074 sum0 = 0;
11075 }
11076 sum0 += (unsigned char)*p;
11077 p++;
11078 }
11079
11080 if (bits == 0) {
11081 if (sum0) {
11082 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11083 }
11084 }
11085 else {
11086 if (sum == INT2FIX(0)) {
11087 if (bits < (int)sizeof(long)*CHAR_BIT) {
11088 sum0 &= (((unsigned long)1)<<bits)-1;
11089 }
11090 sum = LONG2FIX(sum0);
11091 }
11092 else {
11093 VALUE mod;
11094
11095 if (sum0) {
11096 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
11097 }
11098
11099 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
11100 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
11101 sum = rb_funcall(sum, '&', 1, mod);
11102 }
11103 }
11104 return sum;
11105}
11106
11107static VALUE
11108rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
11109{
11110 rb_encoding *enc;
11111 VALUE w;
11112 long width, len, flen = 1, fclen = 1;
11113 VALUE res;
11114 char *p;
11115 const char *f = " ";
11116 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
11117 VALUE pad;
11118 int singlebyte = 1, cr;
11119 int termlen;
11120
11121 rb_scan_args(argc, argv, "11", &w, &pad);
11122 enc = STR_ENC_GET(str);
11123 termlen = rb_enc_mbminlen(enc);
11124 width = NUM2LONG(w);
11125 if (argc == 2) {
11126 StringValue(pad);
11127 enc = rb_enc_check(str, pad);
11128 f = RSTRING_PTR(pad);
11129 flen = RSTRING_LEN(pad);
11130 fclen = str_strlen(pad, enc); /* rb_enc_check */
11131 singlebyte = single_byte_optimizable(pad);
11132 if (flen == 0 || fclen == 0) {
11133 rb_raise(rb_eArgError, "zero width padding");
11134 }
11135 }
11136 len = str_strlen(str, enc); /* rb_enc_check */
11137 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11138 n = width - len;
11139 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11140 rlen = n - llen;
11141 cr = ENC_CODERANGE(str);
11142 if (flen > 1) {
11143 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11144 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11145 }
11146 size = RSTRING_LEN(str);
11147 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11148 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11149 (len += llen2 + rlen2) >= LONG_MAX - size) {
11150 rb_raise(rb_eArgError, "argument too big");
11151 }
11152 len += size;
11153 res = str_enc_new(rb_cString, 0, len, enc);
11154 p = RSTRING_PTR(res);
11155 if (flen <= 1) {
11156 memset(p, *f, llen);
11157 p += llen;
11158 }
11159 else {
11160 while (llen >= fclen) {
11161 memcpy(p,f,flen);
11162 p += flen;
11163 llen -= fclen;
11164 }
11165 if (llen > 0) {
11166 memcpy(p, f, llen2);
11167 p += llen2;
11168 }
11169 }
11170 memcpy(p, RSTRING_PTR(str), size);
11171 p += size;
11172 if (flen <= 1) {
11173 memset(p, *f, rlen);
11174 p += rlen;
11175 }
11176 else {
11177 while (rlen >= fclen) {
11178 memcpy(p,f,flen);
11179 p += flen;
11180 rlen -= fclen;
11181 }
11182 if (rlen > 0) {
11183 memcpy(p, f, rlen2);
11184 p += rlen2;
11185 }
11186 }
11187 TERM_FILL(p, termlen);
11188 STR_SET_LEN(res, p-RSTRING_PTR(res));
11189
11190 if (argc == 2)
11191 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11192 if (cr != ENC_CODERANGE_BROKEN)
11193 ENC_CODERANGE_SET(res, cr);
11194
11195 RB_GC_GUARD(pad);
11196 return res;
11197}
11198
11199
11200/*
11201 * call-seq:
11202 * ljust(width, pad_string = ' ') -> new_string
11203 *
11204 * :include: doc/string/ljust.rdoc
11205 *
11206 */
11207
11208static VALUE
11209rb_str_ljust(int argc, VALUE *argv, VALUE str)
11210{
11211 return rb_str_justify(argc, argv, str, 'l');
11212}
11213
11214/*
11215 * call-seq:
11216 * rjust(width, pad_string = ' ') -> new_string
11217 *
11218 * :include: doc/string/rjust.rdoc
11219 *
11220 */
11221
11222static VALUE
11223rb_str_rjust(int argc, VALUE *argv, VALUE str)
11224{
11225 return rb_str_justify(argc, argv, str, 'r');
11226}
11227
11228
11229/*
11230 * call-seq:
11231 * center(size, pad_string = ' ') -> new_string
11232 *
11233 * :include: doc/string/center.rdoc
11234 *
11235 */
11236
11237static VALUE
11238rb_str_center(int argc, VALUE *argv, VALUE str)
11239{
11240 return rb_str_justify(argc, argv, str, 'c');
11241}
11242
11243/*
11244 * call-seq:
11245 * partition(pattern) -> [pre_match, first_match, post_match]
11246 *
11247 * :include: doc/string/partition.rdoc
11248 *
11249 */
11250
11251static VALUE
11252rb_str_partition(VALUE str, VALUE sep)
11253{
11254 long pos;
11255
11256 sep = get_pat_quoted(sep, 0);
11257 if (RB_TYPE_P(sep, T_REGEXP)) {
11258 if (rb_reg_search(sep, str, 0, 0) < 0) {
11259 goto failed;
11260 }
11261 VALUE match = rb_backref_get();
11262 struct re_registers *regs = RMATCH_REGS(match);
11263
11264 pos = BEG(0);
11265 sep = rb_str_subseq(str, pos, END(0) - pos);
11266 }
11267 else {
11268 pos = rb_str_index(str, sep, 0);
11269 if (pos < 0) goto failed;
11270 }
11271 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11272 sep,
11273 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11274 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11275
11276 failed:
11277 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11278}
11279
11280/*
11281 * call-seq:
11282 * rpartition(pattern) -> [pre_match, last_match, post_match]
11283 *
11284 * :include: doc/string/rpartition.rdoc
11285 *
11286 */
11287
11288static VALUE
11289rb_str_rpartition(VALUE str, VALUE sep)
11290{
11291 long pos = RSTRING_LEN(str);
11292
11293 sep = get_pat_quoted(sep, 0);
11294 if (RB_TYPE_P(sep, T_REGEXP)) {
11295 if (rb_reg_search(sep, str, pos, 1) < 0) {
11296 goto failed;
11297 }
11298 VALUE match = rb_backref_get();
11299 struct re_registers *regs = RMATCH_REGS(match);
11300
11301 pos = BEG(0);
11302 sep = rb_str_subseq(str, pos, END(0) - pos);
11303 }
11304 else {
11305 pos = rb_str_sublen(str, pos);
11306 pos = rb_str_rindex(str, sep, pos);
11307 if (pos < 0) {
11308 goto failed;
11309 }
11310 }
11311
11312 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11313 sep,
11314 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11315 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11316 failed:
11317 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11318}
11319
11320/*
11321 * call-seq:
11322 * start_with?(*patterns) -> true or false
11323 *
11324 * :include: doc/string/start_with_p.rdoc
11325 *
11326 */
11327
11328static VALUE
11329rb_str_start_with(int argc, VALUE *argv, VALUE str)
11330{
11331 int i;
11332
11333 for (i=0; i<argc; i++) {
11334 VALUE tmp = argv[i];
11335 if (RB_TYPE_P(tmp, T_REGEXP)) {
11336 if (rb_reg_start_with_p(tmp, str))
11337 return Qtrue;
11338 }
11339 else {
11340 const char *p, *s, *e;
11341 long slen, tlen;
11342 rb_encoding *enc;
11343
11344 StringValue(tmp);
11345 enc = rb_enc_check(str, tmp);
11346 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11347 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11348 p = RSTRING_PTR(str);
11349 e = p + slen;
11350 s = p + tlen;
11351 if (!at_char_right_boundary(p, s, e, enc))
11352 continue;
11353 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11354 return Qtrue;
11355 }
11356 }
11357 return Qfalse;
11358}
11359
11360/*
11361 * call-seq:
11362 * end_with?(*strings) -> true or false
11363 *
11364 * :include: doc/string/end_with_p.rdoc
11365 *
11366 */
11367
11368static VALUE
11369rb_str_end_with(int argc, VALUE *argv, VALUE str)
11370{
11371 int i;
11372
11373 for (i=0; i<argc; i++) {
11374 VALUE tmp = argv[i];
11375 const char *p, *s, *e;
11376 long slen, tlen;
11377 rb_encoding *enc;
11378
11379 StringValue(tmp);
11380 enc = rb_enc_check(str, tmp);
11381 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11382 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11383 p = RSTRING_PTR(str);
11384 e = p + slen;
11385 s = e - tlen;
11386 if (!at_char_boundary(p, s, e, enc))
11387 continue;
11388 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11389 return Qtrue;
11390 }
11391 return Qfalse;
11392}
11393
11403static long
11404deleted_prefix_length(VALUE str, VALUE prefix)
11405{
11406 const char *strptr, *prefixptr;
11407 long olen, prefixlen;
11408 rb_encoding *enc = rb_enc_get(str);
11409
11410 StringValue(prefix);
11411
11412 if (!is_broken_string(prefix) ||
11413 !rb_enc_asciicompat(enc) ||
11414 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11415 enc = rb_enc_check(str, prefix);
11416 }
11417
11418 /* return 0 if not start with prefix */
11419 prefixlen = RSTRING_LEN(prefix);
11420 if (prefixlen <= 0) return 0;
11421 olen = RSTRING_LEN(str);
11422 if (olen < prefixlen) return 0;
11423 strptr = RSTRING_PTR(str);
11424 prefixptr = RSTRING_PTR(prefix);
11425 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11426 if (is_broken_string(prefix)) {
11427 if (!is_broken_string(str)) {
11428 /* prefix in a valid string cannot be broken */
11429 return 0;
11430 }
11431 const char *strend = strptr + olen;
11432 const char *after_prefix = strptr + prefixlen;
11433 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11434 /* prefix does not end at char-boundary */
11435 return 0;
11436 }
11437 }
11438 /* prefix part in `str` also should be valid. */
11439
11440 return prefixlen;
11441}
11442
11443/*
11444 * call-seq:
11445 * delete_prefix!(prefix) -> self or nil
11446 *
11447 * Like String#delete_prefix, except that +self+ is modified in place;
11448 * returns +self+ if the prefix is removed, +nil+ otherwise.
11449 *
11450 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11451 */
11452
11453static VALUE
11454rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11455{
11456 long prefixlen;
11457 str_modify_keep_cr(str);
11458
11459 prefixlen = deleted_prefix_length(str, prefix);
11460 if (prefixlen <= 0) return Qnil;
11461
11462 return rb_str_drop_bytes(str, prefixlen);
11463}
11464
11465/*
11466 * call-seq:
11467 * delete_prefix(prefix) -> new_string
11468 *
11469 * :include: doc/string/delete_prefix.rdoc
11470 *
11471 */
11472
11473static VALUE
11474rb_str_delete_prefix(VALUE str, VALUE prefix)
11475{
11476 long prefixlen;
11477
11478 prefixlen = deleted_prefix_length(str, prefix);
11479 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11480
11481 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11482}
11483
11493static long
11494deleted_suffix_length(VALUE str, VALUE suffix)
11495{
11496 const char *strptr, *suffixptr;
11497 long olen, suffixlen;
11498 rb_encoding *enc;
11499
11500 StringValue(suffix);
11501 if (is_broken_string(suffix)) return 0;
11502 enc = rb_enc_check(str, suffix);
11503
11504 /* return 0 if not start with suffix */
11505 suffixlen = RSTRING_LEN(suffix);
11506 if (suffixlen <= 0) return 0;
11507 olen = RSTRING_LEN(str);
11508 if (olen < suffixlen) return 0;
11509 strptr = RSTRING_PTR(str);
11510 suffixptr = RSTRING_PTR(suffix);
11511 const char *strend = strptr + olen;
11512 const char *before_suffix = strend - suffixlen;
11513 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11514 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11515
11516 return suffixlen;
11517}
11518
11519/*
11520 * call-seq:
11521 * delete_suffix!(suffix) -> self or nil
11522 *
11523 * Like String#delete_suffix, except that +self+ is modified in place;
11524 * returns +self+ if the suffix is removed, +nil+ otherwise.
11525 *
11526 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11527 */
11528
11529static VALUE
11530rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11531{
11532 long olen, suffixlen, len;
11533 str_modifiable(str);
11534
11535 suffixlen = deleted_suffix_length(str, suffix);
11536 if (suffixlen <= 0) return Qnil;
11537
11538 olen = RSTRING_LEN(str);
11539 str_modify_keep_cr(str);
11540 len = olen - suffixlen;
11541 STR_SET_LEN(str, len);
11542 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11543 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11545 }
11546 return str;
11547}
11548
11549/*
11550 * call-seq:
11551 * delete_suffix(suffix) -> new_string
11552 *
11553 * :include: doc/string/delete_suffix.rdoc
11554 *
11555 */
11556
11557static VALUE
11558rb_str_delete_suffix(VALUE str, VALUE suffix)
11559{
11560 long suffixlen;
11561
11562 suffixlen = deleted_suffix_length(str, suffix);
11563 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11564
11565 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11566}
11567
11568void
11569rb_str_setter(VALUE val, ID id, VALUE *var)
11570{
11571 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11572 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11573 }
11574 *var = val;
11575}
11576
11577static void
11578nil_setter_warning(ID id)
11579{
11580 rb_warn_deprecated("non-nil '%"PRIsVALUE"'", NULL, rb_id2str(id));
11581}
11582
11583void
11584rb_deprecated_str_setter(VALUE val, ID id, VALUE *var)
11585{
11586 rb_str_setter(val, id, var);
11587 if (!NIL_P(*var)) {
11588 nil_setter_warning(id);
11589 }
11590}
11591
11592static void
11593rb_fs_setter(VALUE val, ID id, VALUE *var)
11594{
11595 val = rb_fs_check(val);
11596 if (!val) {
11597 rb_raise(rb_eTypeError,
11598 "value of %"PRIsVALUE" must be String or Regexp",
11599 rb_id2str(id));
11600 }
11601 if (!NIL_P(val)) {
11602 nil_setter_warning(id);
11603 }
11604 *var = val;
11605}
11606
11607
11608/*
11609 * call-seq:
11610 * force_encoding(encoding) -> self
11611 *
11612 * :include: doc/string/force_encoding.rdoc
11613 *
11614 */
11615
11616static VALUE
11617rb_str_force_encoding(VALUE str, VALUE enc)
11618{
11619 str_modifiable(str);
11620
11621 rb_encoding *encoding = rb_to_encoding(enc);
11622 int idx = rb_enc_to_index(encoding);
11623
11624 // If the encoding is unchanged, we do nothing.
11625 if (ENCODING_GET(str) == idx) {
11626 return str;
11627 }
11628
11629 rb_enc_associate_index(str, idx);
11630
11631 // If the coderange was 7bit and the new encoding is ASCII-compatible
11632 // we can keep the coderange.
11633 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11634 return str;
11635 }
11636
11638 return str;
11639}
11640
11641/*
11642 * call-seq:
11643 * b -> new_string
11644 *
11645 * :include: doc/string/b.rdoc
11646 *
11647 */
11648
11649static VALUE
11650rb_str_b(VALUE str)
11651{
11652 VALUE str2;
11653 if (STR_EMBED_P(str)) {
11654 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11655 }
11656 else {
11657 str2 = str_alloc_heap(rb_cString);
11658 }
11659 str_replace_shared_without_enc(str2, str);
11660
11661 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11662 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11663 // If we know the receiver's code range then we know the result's code range.
11664 int cr = ENC_CODERANGE(str);
11665 switch (cr) {
11666 case ENC_CODERANGE_7BIT:
11668 break;
11672 break;
11673 default:
11674 ENC_CODERANGE_CLEAR(str2);
11675 break;
11676 }
11677 }
11678
11679 return str2;
11680}
11681
11682/*
11683 * call-seq:
11684 * valid_encoding? -> true or false
11685 *
11686 * :include: doc/string/valid_encoding_p.rdoc
11687 *
11688 */
11689
11690static VALUE
11691rb_str_valid_encoding_p(VALUE str)
11692{
11693 int cr = rb_enc_str_coderange(str);
11694
11695 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11696}
11697
11698/*
11699 * call-seq:
11700 * ascii_only? -> true or false
11701 *
11702 * Returns whether +self+ contains only ASCII characters:
11703 *
11704 * 'abc'.ascii_only? # => true
11705 * "abc\u{6666}".ascii_only? # => false
11706 *
11707 * Related: see {Querying}[rdoc-ref:String@Querying].
11708 */
11709
11710static VALUE
11711rb_str_is_ascii_only_p(VALUE str)
11712{
11713 int cr = rb_enc_str_coderange(str);
11714
11715 return RBOOL(cr == ENC_CODERANGE_7BIT);
11716}
11717
11718VALUE
11720{
11721 static const char ellipsis[] = "...";
11722 const long ellipsislen = sizeof(ellipsis) - 1;
11723 rb_encoding *const enc = rb_enc_get(str);
11724 const long blen = RSTRING_LEN(str);
11725 const char *const p = RSTRING_PTR(str), *e = p + blen;
11726 VALUE estr, ret = 0;
11727
11728 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11729 if (len * rb_enc_mbminlen(enc) >= blen ||
11730 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11731 ret = str;
11732 }
11733 else if (len <= ellipsislen ||
11734 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11735 if (rb_enc_asciicompat(enc)) {
11736 ret = rb_str_new(ellipsis, len);
11737 rb_enc_associate(ret, enc);
11738 }
11739 else {
11740 estr = rb_usascii_str_new(ellipsis, len);
11741 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11742 }
11743 }
11744 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11745 rb_str_cat(ret, ellipsis, ellipsislen);
11746 }
11747 else {
11748 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11749 rb_enc_from_encoding(enc), 0, Qnil);
11750 rb_str_append(ret, estr);
11751 }
11752 return ret;
11753}
11754
11755static VALUE
11756str_compat_and_valid(VALUE str, rb_encoding *enc)
11757{
11758 int cr;
11759 str = StringValue(str);
11760 cr = rb_enc_str_coderange(str);
11761 if (cr == ENC_CODERANGE_BROKEN) {
11762 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11763 }
11764 else {
11765 rb_encoding *e = STR_ENC_GET(str);
11766 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11767 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11768 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11769 }
11770 }
11771 return str;
11772}
11773
11774static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11775
11776VALUE
11778{
11779 rb_encoding *enc = STR_ENC_GET(str);
11780 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11781}
11782
11783VALUE
11784rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11785{
11786 int cr = ENC_CODERANGE_UNKNOWN;
11787 if (enc == STR_ENC_GET(str)) {
11788 /* cached coderange makes sense only when enc equals the
11789 * actual encoding of str */
11790 cr = ENC_CODERANGE(str);
11791 }
11792 return enc_str_scrub(enc, str, repl, cr);
11793}
11794
11795static VALUE
11796enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11797{
11798 int encidx;
11799 VALUE buf = Qnil;
11800 const char *rep, *p, *e, *p1, *sp;
11801 long replen = -1;
11802 long slen;
11803
11804 if (rb_block_given_p()) {
11805 if (!NIL_P(repl))
11806 rb_raise(rb_eArgError, "both of block and replacement given");
11807 replen = 0;
11808 }
11809
11810 if (ENC_CODERANGE_CLEAN_P(cr))
11811 return Qnil;
11812
11813 if (!NIL_P(repl)) {
11814 repl = str_compat_and_valid(repl, enc);
11815 }
11816
11817 if (rb_enc_dummy_p(enc)) {
11818 return Qnil;
11819 }
11820 encidx = rb_enc_to_index(enc);
11821
11822#define DEFAULT_REPLACE_CHAR(str) do { \
11823 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11824 rep = replace; replen = (int)sizeof(replace); \
11825 } while (0)
11826
11827 slen = RSTRING_LEN(str);
11828 p = RSTRING_PTR(str);
11829 e = RSTRING_END(str);
11830 p1 = p;
11831 sp = p;
11832
11833 if (rb_enc_asciicompat(enc)) {
11834 int rep7bit_p;
11835 if (!replen) {
11836 rep = NULL;
11837 rep7bit_p = FALSE;
11838 }
11839 else if (!NIL_P(repl)) {
11840 rep = RSTRING_PTR(repl);
11841 replen = RSTRING_LEN(repl);
11842 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11843 }
11844 else if (encidx == rb_utf8_encindex()) {
11845 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11846 rep7bit_p = FALSE;
11847 }
11848 else {
11849 DEFAULT_REPLACE_CHAR("?");
11850 rep7bit_p = TRUE;
11851 }
11852 cr = ENC_CODERANGE_7BIT;
11853
11854 p = search_nonascii(p, e);
11855 if (!p) {
11856 p = e;
11857 }
11858 while (p < e) {
11859 int ret = rb_enc_precise_mbclen(p, e, enc);
11860 if (MBCLEN_NEEDMORE_P(ret)) {
11861 break;
11862 }
11863 else if (MBCLEN_CHARFOUND_P(ret)) {
11865 p += MBCLEN_CHARFOUND_LEN(ret);
11866 }
11867 else if (MBCLEN_INVALID_P(ret)) {
11868 /*
11869 * p1~p: valid ascii/multibyte chars
11870 * p ~e: invalid bytes + unknown bytes
11871 */
11872 long clen = rb_enc_mbmaxlen(enc);
11873 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11874 if (p > p1) {
11875 rb_str_buf_cat(buf, p1, p - p1);
11876 }
11877
11878 if (e - p < clen) clen = e - p;
11879 if (clen <= 2) {
11880 clen = 1;
11881 }
11882 else {
11883 const char *q = p;
11884 clen--;
11885 for (; clen > 1; clen--) {
11886 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11887 if (MBCLEN_NEEDMORE_P(ret)) break;
11888 if (MBCLEN_INVALID_P(ret)) continue;
11890 }
11891 }
11892 if (rep) {
11893 rb_str_buf_cat(buf, rep, replen);
11894 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11895 }
11896 else {
11897 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11898 str_mod_check(str, sp, slen);
11899 repl = str_compat_and_valid(repl, enc);
11900 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11903 }
11904 p += clen;
11905 p1 = p;
11906 p = search_nonascii(p, e);
11907 if (!p) {
11908 p = e;
11909 break;
11910 }
11911 }
11912 else {
11914 }
11915 }
11916 if (NIL_P(buf)) {
11917 if (p == e) {
11918 ENC_CODERANGE_SET(str, cr);
11919 return Qnil;
11920 }
11921 buf = rb_str_buf_new(RSTRING_LEN(str));
11922 }
11923 if (p1 < p) {
11924 rb_str_buf_cat(buf, p1, p - p1);
11925 }
11926 if (p < e) {
11927 if (rep) {
11928 rb_str_buf_cat(buf, rep, replen);
11929 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11930 }
11931 else {
11932 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11933 str_mod_check(str, sp, slen);
11934 repl = str_compat_and_valid(repl, enc);
11935 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11938 }
11939 }
11940 }
11941 else {
11942 /* ASCII incompatible */
11943 long mbminlen = rb_enc_mbminlen(enc);
11944 if (!replen) {
11945 rep = NULL;
11946 }
11947 else if (!NIL_P(repl)) {
11948 rep = RSTRING_PTR(repl);
11949 replen = RSTRING_LEN(repl);
11950 }
11951 else if (encidx == ENCINDEX_UTF_16BE) {
11952 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11953 }
11954 else if (encidx == ENCINDEX_UTF_16LE) {
11955 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11956 }
11957 else if (encidx == ENCINDEX_UTF_32BE) {
11958 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11959 }
11960 else if (encidx == ENCINDEX_UTF_32LE) {
11961 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11962 }
11963 else {
11964 DEFAULT_REPLACE_CHAR("?");
11965 }
11966
11967 while (p < e) {
11968 int ret = rb_enc_precise_mbclen(p, e, enc);
11969 if (MBCLEN_NEEDMORE_P(ret)) {
11970 break;
11971 }
11972 else if (MBCLEN_CHARFOUND_P(ret)) {
11973 p += MBCLEN_CHARFOUND_LEN(ret);
11974 }
11975 else if (MBCLEN_INVALID_P(ret)) {
11976 const char *q = p;
11977 long clen = rb_enc_mbmaxlen(enc);
11978 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11979 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11980
11981 if (e - p < clen) clen = e - p;
11982 if (clen <= mbminlen * 2) {
11983 clen = mbminlen;
11984 }
11985 else {
11986 clen -= mbminlen;
11987 for (; clen > mbminlen; clen-=mbminlen) {
11988 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11989 if (MBCLEN_NEEDMORE_P(ret)) break;
11990 if (MBCLEN_INVALID_P(ret)) continue;
11992 }
11993 }
11994 if (rep) {
11995 rb_str_buf_cat(buf, rep, replen);
11996 }
11997 else {
11998 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11999 str_mod_check(str, sp, slen);
12000 repl = str_compat_and_valid(repl, enc);
12001 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12002 }
12003 p += clen;
12004 p1 = p;
12005 }
12006 else {
12008 }
12009 }
12010 if (NIL_P(buf)) {
12011 if (p == e) {
12013 return Qnil;
12014 }
12015 buf = rb_str_buf_new(RSTRING_LEN(str));
12016 }
12017 if (p1 < p) {
12018 rb_str_buf_cat(buf, p1, p - p1);
12019 }
12020 if (p < e) {
12021 if (rep) {
12022 rb_str_buf_cat(buf, rep, replen);
12023 }
12024 else {
12025 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
12026 str_mod_check(str, sp, slen);
12027 repl = str_compat_and_valid(repl, enc);
12028 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
12029 }
12030 }
12032 }
12033 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
12034 return buf;
12035}
12036
12037/*
12038 * call-seq:
12039 * scrub(replacement_string = default_replacement_string) -> new_string
12040 * scrub{|sequence| ... } -> new_string
12041 *
12042 * :include: doc/string/scrub.rdoc
12043 *
12044 */
12045static VALUE
12046str_scrub(int argc, VALUE *argv, VALUE str)
12047{
12048 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12049 VALUE new = rb_str_scrub(str, repl);
12050 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
12051}
12052
12053/*
12054 * call-seq:
12055 * scrub!(replacement_string = default_replacement_string) -> self
12056 * scrub!{|sequence| ... } -> self
12057 *
12058 * Like String#scrub, except that:
12059 *
12060 * - Any replacements are made in +self+.
12061 * - Returns +self+.
12062 *
12063 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12064 *
12065 */
12066static VALUE
12067str_scrub_bang(int argc, VALUE *argv, VALUE str)
12068{
12069 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
12070 VALUE new = rb_str_scrub(str, repl);
12071 if (!NIL_P(new)) rb_str_replace(str, new);
12072 return str;
12073}
12074
12075static ID id_normalize;
12076static ID id_normalized_p;
12077static VALUE mUnicodeNormalize;
12078
12079static VALUE
12080unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
12081{
12082 static int UnicodeNormalizeRequired = 0;
12083 VALUE argv2[2];
12084
12085 if (!UnicodeNormalizeRequired) {
12086 rb_require("unicode_normalize/normalize.rb");
12087 UnicodeNormalizeRequired = 1;
12088 }
12089 argv2[0] = str;
12090 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
12091 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
12092}
12093
12094/*
12095 * call-seq:
12096 * unicode_normalize(form = :nfc) -> string
12097 *
12098 * :include: doc/string/unicode_normalize.rdoc
12099 *
12100 */
12101static VALUE
12102rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
12103{
12104 return unicode_normalize_common(argc, argv, str, id_normalize);
12105}
12106
12107/*
12108 * call-seq:
12109 * unicode_normalize!(form = :nfc) -> self
12110 *
12111 * Like String#unicode_normalize, except that the normalization
12112 * is performed on +self+ (not on a copy of +self+).
12113 *
12114 * Related: see {Modifying}[rdoc-ref:String@Modifying].
12115 *
12116 */
12117static VALUE
12118rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12119{
12120 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12121}
12122
12123/* call-seq:
12124 * unicode_normalized?(form = :nfc) -> true or false
12125 *
12126 * Returns whether +self+ is in the given +form+ of Unicode normalization;
12127 * see String#unicode_normalize.
12128 *
12129 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12130 *
12131 * Examples:
12132 *
12133 * "a\u0300".unicode_normalized? # => false
12134 * "a\u0300".unicode_normalized?(:nfd) # => true
12135 * "\u00E0".unicode_normalized? # => true
12136 * "\u00E0".unicode_normalized?(:nfd) # => false
12137 *
12138 *
12139 * Raises an exception if +self+ is not in a Unicode encoding:
12140 *
12141 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12142 * s.unicode_normalized? # Raises Encoding::CompatibilityError
12143 *
12144 * Related: see {Querying}[rdoc-ref:String@Querying].
12145 */
12146static VALUE
12147rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12148{
12149 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12150}
12151
12152/**********************************************************************
12153 * Document-class: Symbol
12154 *
12155 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12156 *
12157 * You can create a +Symbol+ object explicitly with:
12158 *
12159 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12160 *
12161 * The same +Symbol+ object will be
12162 * created for a given name or string for the duration of a program's
12163 * execution, regardless of the context or meaning of that name. Thus
12164 * if <code>Fred</code> is a constant in one context, a method in
12165 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12166 * will be the same object in all three contexts.
12167 *
12168 * module One
12169 * class Fred
12170 * end
12171 * $f1 = :Fred
12172 * end
12173 * module Two
12174 * Fred = 1
12175 * $f2 = :Fred
12176 * end
12177 * def Fred()
12178 * end
12179 * $f3 = :Fred
12180 * $f1.object_id #=> 2514190
12181 * $f2.object_id #=> 2514190
12182 * $f3.object_id #=> 2514190
12183 *
12184 * Constant, method, and variable names are returned as symbols:
12185 *
12186 * module One
12187 * Two = 2
12188 * def three; 3 end
12189 * @four = 4
12190 * @@five = 5
12191 * $six = 6
12192 * end
12193 * seven = 7
12194 *
12195 * One.constants
12196 * # => [:Two]
12197 * One.instance_methods(true)
12198 * # => [:three]
12199 * One.instance_variables
12200 * # => [:@four]
12201 * One.class_variables
12202 * # => [:@@five]
12203 * global_variables.grep(/six/)
12204 * # => [:$six]
12205 * local_variables
12206 * # => [:seven]
12207 *
12208 * A +Symbol+ object differs from a String object in that
12209 * a +Symbol+ object represents an identifier, while a String object
12210 * represents text or data.
12211 *
12212 * == What's Here
12213 *
12214 * First, what's elsewhere. Class +Symbol+:
12215 *
12216 * - Inherits from {class Object}[rdoc-ref:Object@Whats+Here].
12217 * - Includes {module Comparable}[rdoc-ref:Comparable@Whats+Here].
12218 *
12219 * Here, class +Symbol+ provides methods that are useful for:
12220 *
12221 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12222 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12223 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12224 *
12225 * === Methods for Querying
12226 *
12227 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12228 * - #=~: Returns the index of the first substring in symbol that matches a
12229 * given Regexp or other object; returns +nil+ if no match is found.
12230 * - #[], #slice: Returns a substring of symbol
12231 * determined by a given index, start/length, range, regexp, or string.
12232 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12233 * - #encoding: Returns the Encoding object that represents the encoding
12234 * of symbol.
12235 * - #end_with?: Returns +true+ if symbol ends with
12236 * any of the given strings.
12237 * - #match: Returns a MatchData object if symbol
12238 * matches a given Regexp; +nil+ otherwise.
12239 * - #match?: Returns +true+ if symbol
12240 * matches a given Regexp; +false+ otherwise.
12241 * - #length, #size: Returns the number of characters in symbol.
12242 * - #start_with?: Returns +true+ if symbol starts with
12243 * any of the given strings.
12244 *
12245 * === Methods for Comparing
12246 *
12247 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12248 * or larger than symbol.
12249 * - #==, #===: Returns +true+ if a given symbol has the same content and
12250 * encoding.
12251 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12252 * symbol is smaller than, equal to, or larger than symbol.
12253 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12254 * after Unicode case folding; +false+ otherwise.
12255 *
12256 * === Methods for Converting
12257 *
12258 * - #capitalize: Returns symbol with the first character upcased
12259 * and all other characters downcased.
12260 * - #downcase: Returns symbol with all characters downcased.
12261 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12262 * - #name: Returns the frozen string corresponding to symbol.
12263 * - #succ, #next: Returns the symbol that is the successor to symbol.
12264 * - #swapcase: Returns symbol with all upcase characters downcased
12265 * and all downcase characters upcased.
12266 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12267 * - #to_s, #id2name: Returns the string corresponding to +self+.
12268 * - #to_sym, #intern: Returns +self+.
12269 * - #upcase: Returns symbol with all characters upcased.
12270 *
12271 */
12272
12273
12274/*
12275 * call-seq:
12276 * self == other -> true or false
12277 *
12278 * Returns whether +other+ is the same object as +self+.
12279 */
12280
#define sym_equal rb_obj_equal
/* sym_equal is purely an alias: wherever it is used, rb_obj_equal is what
 * actually gets called. */
12282
12283static int
12284sym_printable(const char *s, const char *send, rb_encoding *enc)
12285{
12286 while (s < send) {
12287 int n;
12288 int c = rb_enc_precise_mbclen(s, send, enc);
12289
12290 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12291 n = MBCLEN_CHARFOUND_LEN(c);
12292 c = rb_enc_mbc_to_codepoint(s, send, enc);
12293 if (!rb_enc_isprint(c, enc)) return FALSE;
12294 s += n;
12295 }
12296 return TRUE;
12297}
12298
12299int
12300rb_str_symname_p(VALUE sym)
12301{
12302 rb_encoding *enc;
12303 const char *ptr;
12304 long len;
12305 rb_encoding *resenc = rb_default_internal_encoding();
12306
12307 if (resenc == NULL) resenc = rb_default_external_encoding();
12308 enc = STR_ENC_GET(sym);
12309 ptr = RSTRING_PTR(sym);
12310 len = RSTRING_LEN(sym);
12311 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12312 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12313 return FALSE;
12314 }
12315 return TRUE;
12316}
12317
12318VALUE
12319rb_str_quote_unprintable(VALUE str)
12320{
12321 rb_encoding *enc;
12322 const char *ptr;
12323 long len;
12324 rb_encoding *resenc;
12325
12326 Check_Type(str, T_STRING);
12327 resenc = rb_default_internal_encoding();
12328 if (resenc == NULL) resenc = rb_default_external_encoding();
12329 enc = STR_ENC_GET(str);
12330 ptr = RSTRING_PTR(str);
12331 len = RSTRING_LEN(str);
12332 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12333 !sym_printable(ptr, ptr + len, enc)) {
12334 return rb_str_escape(str);
12335 }
12336 return str;
12337}
12338
12339VALUE
12340rb_id_quote_unprintable(ID id)
12341{
12342 VALUE str = rb_id2str(id);
12343 if (!rb_str_symname_p(str)) {
12344 return rb_str_escape(str);
12345 }
12346 return str;
12347}
12348
12349/*
12350 * call-seq:
12351 * inspect -> string
12352 *
12353 * Returns a string representation of +self+ (including the leading colon):
12354 *
12355 * :foo.inspect # => ":foo"
12356 *
12357 * Related: Symbol#to_s, Symbol#name.
12358 *
12359 */
12360
12361static VALUE
12362sym_inspect(VALUE sym)
12363{
12364 VALUE str = rb_sym2str(sym);
12365 const char *ptr;
12366 long len;
12367 char *dest;
12368
12369 if (!rb_str_symname_p(str)) {
12370 str = rb_str_inspect(str);
12371 len = RSTRING_LEN(str);
12372 rb_str_resize(str, len + 1);
12373 dest = RSTRING_PTR(str);
12374 memmove(dest + 1, dest, len);
12375 }
12376 else {
12377 rb_encoding *enc = STR_ENC_GET(str);
12378 VALUE orig_str = str;
12379
12380 len = RSTRING_LEN(orig_str);
12381 str = rb_enc_str_new(0, len + 1, enc);
12382
12383 // Get data pointer after allocation
12384 ptr = RSTRING_PTR(orig_str);
12385 dest = RSTRING_PTR(str);
12386 memcpy(dest + 1, ptr, len);
12387
12388 RB_GC_GUARD(orig_str);
12389 }
12390 dest[0] = ':';
12391
12393
12394 return str;
12395}
12396
12397VALUE
12399{
12400 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12401 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12402 return str;
12403}
12404
12405VALUE
12406rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12407{
12408 VALUE obj;
12409
12410 if (argc < 1) {
12411 rb_raise(rb_eArgError, "no receiver given");
12412 }
12413 obj = argv[0];
12414 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12415}
12416
12417/*
12418 * call-seq:
12419 * succ
12420 *
12421 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12422 *
12423 * :foo.succ # => :fop
12424 *
12425 * Related: String#succ.
12426 */
12427
12428static VALUE
12429sym_succ(VALUE sym)
12430{
12431 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12432}
12433
12434/*
12435 * call-seq:
12436 * self <=> other -> -1, 0, 1, or nil
12437 *
12438 * Compares +self+ and +other+, using String#<=>.
12439 *
12440 * Returns:
12441 *
12442 * - <tt>self.to_s <=> other.to_s</tt>, if +other+ is a symbol.
12443 * - +nil+, otherwise.
12444 *
12445 * Examples:
12446 *
12447 * :bar <=> :foo # => -1
12448 * :foo <=> :foo # => 0
12449 * :foo <=> :bar # => 1
12450 * :foo <=> 'bar' # => nil
12451 *
12452 * \Class \Symbol includes module Comparable,
12453 * each of whose methods uses Symbol#<=> for comparison.
12454 *
12455 * Related: String#<=>.
12456 */
12457
12458static VALUE
12459sym_cmp(VALUE sym, VALUE other)
12460{
12461 if (!SYMBOL_P(other)) {
12462 return Qnil;
12463 }
12464 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12465}
12466
12467/*
12468 * call-seq:
12469 * casecmp(object) -> -1, 0, 1, or nil
12470 *
12471 * :include: doc/symbol/casecmp.rdoc
12472 *
12473 */
12474
12475static VALUE
12476sym_casecmp(VALUE sym, VALUE other)
12477{
12478 if (!SYMBOL_P(other)) {
12479 return Qnil;
12480 }
12481 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12482}
12483
12484/*
12485 * call-seq:
12486 * casecmp?(object) -> true, false, or nil
12487 *
12488 * :include: doc/symbol/casecmp_p.rdoc
12489 *
12490 */
12491
12492static VALUE
12493sym_casecmp_p(VALUE sym, VALUE other)
12494{
12495 if (!SYMBOL_P(other)) {
12496 return Qnil;
12497 }
12498 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12499}
12500
12501/*
12502 * call-seq:
12503 * self =~ other -> integer or nil
12504 *
12505 * Equivalent to <tt>self.to_s =~ other</tt>,
12506 * including possible updates to global variables;
12507 * see String#=~.
12508 *
12509 */
12510
12511static VALUE
12512sym_match(VALUE sym, VALUE other)
12513{
12514 return rb_str_match(rb_sym2str(sym), other);
12515}
12516
12517/*
12518 * call-seq:
12519 * match(pattern, offset = 0) -> matchdata or nil
12520 * match(pattern, offset = 0) {|matchdata| } -> object
12521 *
12522 * Equivalent to <tt>self.to_s.match</tt>,
12523 * including possible updates to global variables;
12524 * see String#match.
12525 *
12526 */
12527
12528static VALUE
12529sym_match_m(int argc, VALUE *argv, VALUE sym)
12530{
12531 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12532}
12533
12534/*
12535 * call-seq:
12536 * match?(pattern, offset = 0) -> true or false
12537 *
12538 * Equivalent to <tt>sym.to_s.match?</tt>;
12539 * see String#match?.
12540 *
12541 */
12542
12543static VALUE
12544sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12545{
12546 return rb_str_match_m_p(argc, argv, sym);
12547}
12548
12549/*
12550 * call-seq:
12551 * self[offset] -> string or nil
12552 * self[offset, size] -> string or nil
12553 * self[range] -> string or nil
12554 * self[regexp, capture = 0] -> string or nil
12555 * self[substring] -> string or nil
12556 *
12557 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12558 *
12559 */
12560
12561static VALUE
12562sym_aref(int argc, VALUE *argv, VALUE sym)
12563{
12564 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12565}
12566
12567/*
12568 * call-seq:
12569 * length -> integer
12570 *
12571 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12572 */
12573
12574static VALUE
12575sym_length(VALUE sym)
12576{
12577 return rb_str_length(rb_sym2str(sym));
12578}
12579
12580/*
12581 * call-seq:
12582 * empty? -> true or false
12583 *
12584 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12585 *
12586 */
12587
12588static VALUE
12589sym_empty(VALUE sym)
12590{
12591 return rb_str_empty(rb_sym2str(sym));
12592}
12593
12594/*
12595 * call-seq:
12596 * upcase(mapping) -> symbol
12597 *
12598 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12599 *
12600 * See String#upcase.
12601 *
12602 */
12603
12604static VALUE
12605sym_upcase(int argc, VALUE *argv, VALUE sym)
12606{
12607 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12608}
12609
12610/*
12611 * call-seq:
12612 * downcase(mapping) -> symbol
12613 *
12614 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12615 *
12616 * See String#downcase.
12617 *
12618 * Related: Symbol#upcase.
12619 *
12620 */
12621
12622static VALUE
12623sym_downcase(int argc, VALUE *argv, VALUE sym)
12624{
12625 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12626}
12627
12628/*
12629 * call-seq:
12630 * capitalize(mapping) -> symbol
12631 *
12632 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12633 *
12634 * See String#capitalize.
12635 *
12636 */
12637
12638static VALUE
12639sym_capitalize(int argc, VALUE *argv, VALUE sym)
12640{
12641 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12642}
12643
12644/*
12645 * call-seq:
12646 * swapcase(mapping) -> symbol
12647 *
12648 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12649 *
12650 * See String#swapcase.
12651 *
12652 */
12653
12654static VALUE
12655sym_swapcase(int argc, VALUE *argv, VALUE sym)
12656{
12657 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12658}
12659
12660/*
12661 * call-seq:
12662 * start_with?(*string_or_regexp) -> true or false
12663 *
12664 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12665 *
12666 */
12667
12668static VALUE
12669sym_start_with(int argc, VALUE *argv, VALUE sym)
12670{
12671 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12672}
12673
12674/*
12675 * call-seq:
12676 * end_with?(*strings) -> true or false
12677 *
12678 *
12679 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12680 *
12681 */
12682
12683static VALUE
12684sym_end_with(int argc, VALUE *argv, VALUE sym)
12685{
12686 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12687}
12688
12689/*
12690 * call-seq:
12691 * encoding -> encoding
12692 *
12693 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12694 *
12695 */
12696
12697static VALUE
12698sym_encoding(VALUE sym)
12699{
12700 return rb_obj_encoding(rb_sym2str(sym));
12701}
12702
12703static VALUE
12704string_for_symbol(VALUE name)
12705{
12706 if (!RB_TYPE_P(name, T_STRING)) {
12707 VALUE tmp = rb_check_string_type(name);
12708 if (NIL_P(tmp)) {
12709 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12710 name);
12711 }
12712 name = tmp;
12713 }
12714 return name;
12715}
12716
12717ID
12719{
12720 if (SYMBOL_P(name)) {
12721 return SYM2ID(name);
12722 }
12723 name = string_for_symbol(name);
12724 return rb_intern_str(name);
12725}
12726
12727VALUE
12729{
12730 if (SYMBOL_P(name)) {
12731 return name;
12732 }
12733 name = string_for_symbol(name);
12734 return rb_str_intern(name);
12735}
12736
12737/*
12738 * call-seq:
12739 * Symbol.all_symbols -> array_of_symbols
12740 *
12741 * Returns an array of all symbols currently in Ruby's symbol table:
12742 *
12743 * Symbol.all_symbols.size # => 9334
12744 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12745 *
12746 */
12747
12748static VALUE
12749sym_all_symbols(VALUE _)
12750{
12751 return rb_sym_all_symbols();
12752}
12753
12754VALUE
12755rb_str_to_interned_str(VALUE str)
12756{
12757 return rb_fstring(str);
12758}
12759
12760VALUE
12761rb_interned_str(const char *ptr, long len)
12762{
12763 struct RString fake_str = {RBASIC_INIT};
12764 int encidx = ENCINDEX_US_ASCII;
12765 int coderange = ENC_CODERANGE_7BIT;
12766 if (len > 0 && search_nonascii(ptr, ptr + len)) {
12767 encidx = ENCINDEX_ASCII_8BIT;
12768 coderange = ENC_CODERANGE_VALID;
12769 }
12770 VALUE str = setup_fake_str(&fake_str, ptr, len, encidx);
12771 ENC_CODERANGE_SET(str, coderange);
12772 return register_fstring(str, true, false);
12773}
12774
12775VALUE
12777{
12778 return rb_interned_str(ptr, strlen(ptr));
12779}
12780
12781VALUE
12782rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12783{
12784 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12785 rb_enc_autoload(enc);
12786 }
12787
12788 struct RString fake_str = {RBASIC_INIT};
12789 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12790}
12791
12792VALUE
12793rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12794{
12795 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12796 rb_enc_autoload(enc);
12797 }
12798
12799 struct RString fake_str = {RBASIC_INIT};
12800 VALUE str = register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12801 RUBY_ASSERT(RB_OBJ_SHAREABLE_P(str) && (rb_gc_verify_shareable(str), 1));
12802 return str;
12803}
12804
12805VALUE
12807{
12808 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12809}
12810
#if USE_YJIT || USE_ZJIT
/*
 * JIT helper for appending an integer codepoint to a string.
 *
 * Fast path: when +str+ is ASCII-8BIT and the codepoint fits in one byte
 * (0x00..0xff inclusive), the byte is appended directly.  Everything else,
 * including other encodings and out-of-range values, is delegated to
 * rb_str_concat().
 */
void
rb_jit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t code = RB_NUM2SSIZE(codepoint);

        /* 0xff is a valid single byte too, so the bound is inclusive. */
        if (RB_LIKELY(code >= 0 && code <= 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12827
12828static int
12829fstring_set_class_i(VALUE *str, void *data)
12830{
12831 RBASIC_SET_CLASS(*str, rb_cString);
12832
12833 return ST_CONTINUE;
12834}
12835
12836void
12837Init_String(void)
12838{
12840
12841 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12842
12844 rb_define_alloc_func(rb_cString, empty_str_alloc);
12845 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12846 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12847 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12849 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12850 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12853 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12854 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12855 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12856 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12859 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12860 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12861 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12862 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12865 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12866 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12867 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12868 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12869 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12871 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12873 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12874 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12875 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12876 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12877 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12878 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12879 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12880 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12881 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12882 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12883 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12884 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12885 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12886 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12888 rb_define_method(rb_cString, "+@", str_uplus, 0);
12889 rb_define_method(rb_cString, "-@", str_uminus, 0);
12890 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12891 rb_define_alias(rb_cString, "dedup", "-@");
12892
12893 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12894 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12895 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12896 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12899 rb_define_method(rb_cString, "undump", str_undump, 0);
12900
12901 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12902 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12903 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12904 sym_fold = ID2SYM(rb_intern_const("fold"));
12905
12906 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12907 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12908 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12909 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12910
12911 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12912 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12913 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12914 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12915
12916 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12917 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12918 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12919 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12920 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12921 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12922 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12923 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12924 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12925 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12926 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12927 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12929 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12930 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12931 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12932 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12933 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12934
12935 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12936 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12937 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12938
12939 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12940
12941 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12942 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12943 rb_define_method(rb_cString, "center", rb_str_center, -1);
12944
12945 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12946 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12947 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12948 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12949 rb_define_method(rb_cString, "strip", rb_str_strip, -1);
12950 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, -1);
12951 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, -1);
12952 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12953 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12954
12955 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12956 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12957 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12958 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12959 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, -1);
12960 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, -1);
12961 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, -1);
12962 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12963 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12964
12965 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12966 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12967 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12968 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12969 rb_define_method(rb_cString, "count", rb_str_count, -1);
12970
12971 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12972 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12973 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12974 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12975
12976 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12977 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12978 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12979 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12980 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12981
12982 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12983
12984 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12985 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12986
12987 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12988 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12989
12990 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12991 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12992 rb_define_method(rb_cString, "b", rb_str_b, 0);
12993 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12994 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12995
12996 /* define UnicodeNormalize module here so that we don't have to look it up */
12997 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12998 id_normalize = rb_intern_const("normalize");
12999 id_normalized_p = rb_intern_const("normalized?");
13000
13001 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
13002 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
13003 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
13004
13005 rb_fs = Qnil;
13006 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
13007 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
13008 rb_gc_register_address(&rb_fs);
13009
13014 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
13015
13016 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
13017 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
13018 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
13019 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
13020 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
13021 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
13022
13023 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
13024 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
13025 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
13026 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
13027
13028 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
13029 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
13030 rb_define_method(rb_cSymbol, "length", sym_length, 0);
13031 rb_define_method(rb_cSymbol, "size", sym_length, 0);
13032 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
13033 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
13034 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
13035
13036 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
13037 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
13038 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
13039 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
13040
13041 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
13042 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
13043
13044 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
13045}
13046
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:696
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:404
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1704
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1497
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1610
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2864
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2674
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3154
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:1018
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2943
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:130
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:133
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:131
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:128
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:125
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:122
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:127
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:65
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:129
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:126
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:134
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:477
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:661
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3967
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1431
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1427
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1434
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1425
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1429
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2254
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2272
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1325
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3650
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:235
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:85
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:141
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1313
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3334
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1349
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1214
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:3061
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1233
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12782
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:255
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2341
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3778
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1162
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1454
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1355
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:973
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12806
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:829
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:755
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2714
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2977
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1742
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1120
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1207
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:208
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:242
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:712
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:2043
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1091
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:2049
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1949
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1236
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4294
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3786
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1490
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1917
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1759
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1519
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2494
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1584
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:946
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:940
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3843
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1430
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12398
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2567
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1406
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1753
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3089
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5388
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4206
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3196
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11719
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1791
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1795
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1682
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1196
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1533
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:1008
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1525
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:2003
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4192
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3611
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2430
VALUE rb_str_resurrect(VALUE str)
Like rb_str_dup(), but always create an instance of rb_cString regardless of the given object's class...
Definition string.c:2021
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1640
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1568
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6602
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:3204
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1147
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:12776
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1436
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1605
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3809
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3136
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4313
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3430
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7281
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2799
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12761
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4260
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4080
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4235
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1693
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3785
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3321
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5875
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11777
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1626
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1709
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:632
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2983
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3293
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1657
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3412
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1208
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1550
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2753
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7388
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1418
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1725
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2444
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5790
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9395
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1202
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:968
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1857
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2047
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2126
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3474
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1731
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1024
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12728
ID rb_to_id(VALUE str)
Identical to rb_intern_str(), except it tries to convert the parameter object to an instance of rb_cS...
Definition string.c:12718
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define RB_OBJ_SHAREABLE_P(obj)
Queries if the passed object has previously been classified as shareable or not.
Definition ractor.h:235
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1856
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3496
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4538
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1375
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:166
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1448
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2960
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:438
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:450
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2818
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1442
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2831
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1786
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:122
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:531
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1466
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RString::@53::@55 embed
Embedded contents.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@53 as
String's specific fields.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@53::@54 heap
Strings that use separated memory region for contents use this pattern.
union RString::@53::@54::@56 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:229
Definition string.c:8275
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:308
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113