Ruby 3.5.0dev (2025-07-18 revision bd50545f3cc14790f5fe9530690a3ad67afe6f92)
string.c (bd50545f3cc14790f5fe9530690a3ad67afe6f92)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/concurrent_set.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/hash.h"
36#include "internal/numeric.h"
37#include "internal/object.h"
38#include "internal/proc.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
64
65#undef rb_str_new
66#undef rb_usascii_str_new
67#undef rb_utf8_str_new
68#undef rb_enc_str_new
69#undef rb_str_new_cstr
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
77#undef rb_str_buf_cat
78#undef rb_str_buf_cat2
79#undef rb_str_cat2
80#undef rb_str_cat_cstr
81#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents is
94 * on a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
130#define RUBY_MAX_CHAR_LEN 16
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
137
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
141} while (0)
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
143
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
146} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
175} while (0)
176
/* Resize the buffer of +str+ so that it can hold +capacity+ bytes plus the
 * terminator required by the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of +str+ to +capacity+ bytes plus +termlen+ terminator
 * bytes.  An embedded string is moved to a freshly allocated heap buffer
 * only when the embedded area is too small; a heap string is reallocated
 * in place and must not be shared.  The macro arguments are parenthesized
 * wherever they take part in arithmetic so that callers may pass arbitrary
 * expressions. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
210 } \
211} while (0)
212
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
215/* TODO: include the terminator size in capa. */
216
217#define STR_ENC_GET(str) get_encoding(str)
218
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
221#endif
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#else
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
226#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/* Can a string of +len+ bytes plus a +termlen+ byte terminator be
 * allocated as an embedded object? */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
275
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/* Debugging aid: reports on stderr that the function named +func+ is about
 * to return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
364#else
365#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* Options handed by register_fstring() to fstring_concurrent_set_create()
 * through the table's opaque data pointer. */
struct fstr_create_arg {
    /* copy the bytes of a fake string instead of referencing them */
    bool copy;
    /* when copying, try to allocate room for and store the hash */
    bool force_precompute_hash;
};
482
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
551static const struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
555};
556
557void
558Init_fstring_table(void)
559{
560 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
561 rb_gc_register_address(&fstring_table_obj);
562}
563
564static VALUE
565register_fstring(VALUE str, bool copy, bool force_precompute_hash)
566{
567 struct fstr_create_arg args = {
568 .copy = copy,
569 .force_precompute_hash = force_precompute_hash
570 };
571
572#if SIZEOF_VOIDP == SIZEOF_LONG
573 if (FL_TEST_RAW(str, STR_FAKESTR)) {
574 // if the string hasn't been interned, we'll need the hash twice, so we
575 // compute it once and store it in capa
576 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
577 }
578#endif
579
580 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
581
582 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
584 RUBY_ASSERT(OBJ_FROZEN(result));
585 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
587
588 return result;
589}
590
591bool
592rb_obj_is_fstring_table(VALUE obj)
593{
594 ASSERT_vm_locking();
595
596 return obj == fstring_table_obj;
597}
598
599void
600rb_gc_free_fstring(VALUE obj)
601{
602 // Assume locking and barrier (which there is no assert for)
603 ASSERT_vm_locking();
604
605 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
606
607 RB_DEBUG_COUNTER_INC(obj_str_fstr);
608
609 FL_UNSET(obj, RSTRING_FSTR);
610}
611
612void
613rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
614{
615 if (fstring_table_obj) {
616 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
617 }
618}
619
620static VALUE
621setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
622{
623 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
624 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
625
626 if (!name) {
628 name = "";
629 }
630
631 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
632
633 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
634 fake_str->len = len;
635 fake_str->as.heap.ptr = (char *)name;
636 fake_str->as.heap.aux.capa = len;
637 return (VALUE)fake_str;
638}
639
640/*
641 * set up a fake string which refers a static string literal.
642 */
643VALUE
644rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
645{
646 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
647}
648
649/*
650 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
651 * shared string which refers a static string literal. `ptr` must
652 * point a constant string.
653 */
654VALUE
655rb_fstring_new(const char *ptr, long len)
656{
657 struct RString fake_str;
658 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
659}
660
661VALUE
662rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
663{
664 struct RString fake_str;
665 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
666}
667
668VALUE
669rb_fstring_cstr(const char *ptr)
670{
671 return rb_fstring_new(ptr, strlen(ptr));
672}
673
674static inline bool
675single_byte_optimizable(VALUE str)
676{
677 int encindex = ENCODING_GET(str);
678 switch (encindex) {
679 case ENCINDEX_ASCII_8BIT:
680 case ENCINDEX_US_ASCII:
681 return true;
682 case ENCINDEX_UTF_8:
683 // For UTF-8 it's worth scanning the string coderange when unknown.
685 }
686 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
687 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
688 return true;
689 }
690
691 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
692 return true;
693 }
694
695 /* Conservative. Possibly single byte.
696 * "\xa1" in Shift_JIS for example. */
697 return false;
698}
699
701
/*
 * Returns a pointer to the first byte in [p, e) that has its high bit
 * (0x80) set, or NULL when there is none.
 *
 * When the range is at least one pointer-sized word long (or unaligned
 * word access is allowed) the bytes are tested a word at a time against
 * NONASCII_MASK; the bytes before the first aligned word and the trailing
 * bytes are tested one by one in the two switch statements.
 */
702static inline const char *
703search_nonascii(const char *p, const char *e)
704{
705 const uintptr_t *s, *t;
706
707#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
708# if SIZEOF_UINTPTR_T == 8
709# define NONASCII_MASK UINT64_C(0x8080808080808080)
710# elif SIZEOF_UINTPTR_T == 4
711# define NONASCII_MASK UINT32_C(0x80808080)
712# else
713# error "don't know what to do."
714# endif
715#else
716# if SIZEOF_UINTPTR_T == 8
717# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
718# elif SIZEOF_UINTPTR_T == 4
719# define NONASCII_MASK 0x80808080UL /* or...? */
720# else
721# error "don't know what to do."
722# endif
723#endif
724
725 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
726#if !UNALIGNED_WORD_ACCESS
727 if ((uintptr_t)p % SIZEOF_VOIDP) {
728 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
729 p += l;
730 switch (l) {
731 default: UNREACHABLE;
732#if SIZEOF_VOIDP > 4
733 case 7: if (p[-7]&0x80) return p-7;
734 case 6: if (p[-6]&0x80) return p-6;
735 case 5: if (p[-5]&0x80) return p-5;
736 case 4: if (p[-4]&0x80) return p-4;
737#endif
738 case 3: if (p[-3]&0x80) return p-3;
739 case 2: if (p[-2]&0x80) return p-2;
740 case 1: if (p[-1]&0x80) return p-1;
741 case 0: break;
742 }
743 }
744#endif
745#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
746#define aligned_ptr(value) \
747 __builtin_assume_aligned((value), sizeof(uintptr_t))
748#else
749#define aligned_ptr(value) (uintptr_t *)(value)
750#endif
751 s = aligned_ptr(p);
752 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
753#undef aligned_ptr
754 for (;s < t; s++) {
755 if (*s & NONASCII_MASK) {
756#ifdef WORDS_BIGENDIAN
757 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
758#else
759 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
760#endif
761 }
762 }
763 p = (const char *)s;
764 }
765
766 switch (e - p) {
767 default: UNREACHABLE;
768#if SIZEOF_VOIDP > 4
769 case 7: if (e[-7]&0x80) return e-7;
770 case 6: if (e[-6]&0x80) return e-6;
771 case 5: if (e[-5]&0x80) return e-5;
772 case 4: if (e[-4]&0x80) return e-4;
773#endif
774 case 3: if (e[-3]&0x80) return e-3;
775 case 2: if (e[-2]&0x80) return e-2;
776 case 1: if (e[-1]&0x80) return e-1;
777 case 0: return NULL;
778 }
779}
780
781static int
782coderange_scan(const char *p, long len, rb_encoding *enc)
783{
784 const char *e = p + len;
785
786 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
787 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
788 p = search_nonascii(p, e);
790 }
791
792 if (rb_enc_asciicompat(enc)) {
793 p = search_nonascii(p, e);
794 if (!p) return ENC_CODERANGE_7BIT;
795 for (;;) {
796 int ret = rb_enc_precise_mbclen(p, e, enc);
798 p += MBCLEN_CHARFOUND_LEN(ret);
799 if (p == e) break;
800 p = search_nonascii(p, e);
801 if (!p) break;
802 }
803 }
804 else {
805 while (p < e) {
806 int ret = rb_enc_precise_mbclen(p, e, enc);
808 p += MBCLEN_CHARFOUND_LEN(ret);
809 }
810 }
811 return ENC_CODERANGE_VALID;
812}
813
814long
815rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
816{
817 const char *p = s;
818
819 if (*cr == ENC_CODERANGE_BROKEN)
820 return e - s;
821
822 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
823 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
824 if (*cr == ENC_CODERANGE_VALID) return e - s;
825 p = search_nonascii(p, e);
827 return e - s;
828 }
829 else if (rb_enc_asciicompat(enc)) {
830 p = search_nonascii(p, e);
831 if (!p) {
832 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
833 return e - s;
834 }
835 for (;;) {
836 int ret = rb_enc_precise_mbclen(p, e, enc);
837 if (!MBCLEN_CHARFOUND_P(ret)) {
839 return p - s;
840 }
841 p += MBCLEN_CHARFOUND_LEN(ret);
842 if (p == e) break;
843 p = search_nonascii(p, e);
844 if (!p) break;
845 }
846 }
847 else {
848 while (p < e) {
849 int ret = rb_enc_precise_mbclen(p, e, enc);
850 if (!MBCLEN_CHARFOUND_P(ret)) {
852 return p - s;
853 }
854 p += MBCLEN_CHARFOUND_LEN(ret);
855 }
856 }
858 return e - s;
859}
860
861static inline void
862str_enc_copy(VALUE str1, VALUE str2)
863{
864 rb_enc_set_index(str1, ENCODING_GET(str2));
865}
866
867/* Like str_enc_copy, but does not check frozen status of str1.
868 * You should use this only if you're certain that str1 is not frozen. */
869static inline void
870str_enc_copy_direct(VALUE str1, VALUE str2)
871{
872 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
873 if (inlined_encoding == ENCODING_INLINE_MAX) {
874 rb_enc_set_index(str1, rb_enc_get_index(str2));
875 }
876 else {
877 ENCODING_SET_INLINED(str1, inlined_encoding);
878 }
879}
880
881static void
882rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
883{
884 /* this function is designed for copying encoding and coderange
885 * from src to new string "dest" which is made from the part of src.
886 */
887 str_enc_copy(dest, src);
888 if (RSTRING_LEN(dest) == 0) {
889 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
891 else
893 return;
894 }
895 switch (ENC_CODERANGE(src)) {
898 break;
900 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
901 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
903 else
905 break;
906 default:
907 break;
908 }
909}
910
911static void
912rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
913{
914 str_enc_copy(dest, src);
916}
917
918static int
919enc_coderange_scan(VALUE str, rb_encoding *enc)
920{
921 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
922}
923
924int
925rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
926{
927 return enc_coderange_scan(str, enc);
928}
929
930int
932{
933 int cr = ENC_CODERANGE(str);
934
935 if (cr == ENC_CODERANGE_UNKNOWN) {
936 cr = enc_coderange_scan(str, get_encoding(str));
937 ENC_CODERANGE_SET(str, cr);
938 }
939 return cr;
940}
941
942static inline bool
943rb_enc_str_asciicompat(VALUE str)
944{
945 int encindex = ENCODING_GET_INLINED(str);
946 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
947}
948
949int
951{
952 switch(ENC_CODERANGE(str)) {
954 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
956 return true;
957 default:
958 return false;
959 }
960}
961
962static inline void
963str_mod_check(VALUE s, const char *p, long len)
964{
965 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
966 rb_raise(rb_eRuntimeError, "string modified");
967 }
968}
969
970static size_t
971str_capacity(VALUE str, const int termlen)
972{
973 if (STR_EMBED_P(str)) {
974 return str_embed_capa(str) - termlen;
975 }
976 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
977 return RSTRING(str)->len;
978 }
979 else {
980 return RSTRING(str)->as.heap.aux.capa;
981 }
982}
983
984size_t
986{
987 return str_capacity(str, TERM_LEN(str));
988}
989
990static inline void
991must_not_null(const char *ptr)
992{
993 if (!ptr) {
994 rb_raise(rb_eArgError, "NULL pointer given");
995 }
996}
997
998static inline VALUE
999str_alloc_embed(VALUE klass, size_t capa)
1000{
1001 size_t size = rb_str_embed_size(capa);
1002 RUBY_ASSERT(size > 0);
1003 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1004
1005 NEWOBJ_OF(str, struct RString, klass,
1007
1008 return (VALUE)str;
1009}
1010
1011static inline VALUE
1012str_alloc_heap(VALUE klass)
1013{
1014 NEWOBJ_OF(str, struct RString, klass,
1015 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1016
1017 return (VALUE)str;
1018}
1019
1020static inline VALUE
1021empty_str_alloc(VALUE klass)
1022{
1023 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1024 VALUE str = str_alloc_embed(klass, 0);
1025 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1027 return str;
1028}
1029
1030static VALUE
1031str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (enc == NULL) {
1040 enc = rb_ascii8bit_encoding();
1041 }
1042
1043 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1044
1045 int termlen = rb_enc_mbminlen(enc);
1046
1047 if (STR_EMBEDDABLE_P(len, termlen)) {
1048 str = str_alloc_embed(klass, len + termlen);
1049 if (len == 0) {
1050 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1051 }
1052 }
1053 else {
1054 str = str_alloc_heap(klass);
1055 RSTRING(str)->as.heap.aux.capa = len;
1056 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1057 * integer overflow. If we can STATIC_ASSERT that, the following
1058 * mul_add_mul can be reverted to a simple ALLOC_N. */
1059 RSTRING(str)->as.heap.ptr =
1060 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1061 }
1062
1063 rb_enc_raw_set(str, enc);
1064
1065 if (ptr) {
1066 memcpy(RSTRING_PTR(str), ptr, len);
1067 }
1068
1069 STR_SET_LEN(str, len);
1070 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1071 return str;
1072}
1073
1074static VALUE
1075str_new(VALUE klass, const char *ptr, long len)
1076{
1077 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1078}
1079
1080VALUE
1081rb_str_new(const char *ptr, long len)
1082{
1083 return str_new(rb_cString, ptr, len);
1084}
1085
1086VALUE
1087rb_usascii_str_new(const char *ptr, long len)
1088{
1089 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1090}
1091
1092VALUE
1093rb_utf8_str_new(const char *ptr, long len)
1094{
1095 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1096}
1097
1098VALUE
1099rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1100{
1101 return str_enc_new(rb_cString, ptr, len, enc);
1102}
1103
1104VALUE
1106{
1107 must_not_null(ptr);
1108 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1109 * memory regions, and that cannot be detected by the MSAN. Just
1110 * trust the programmer that the argument passed here is a sane C
1111 * string. */
1112 __msan_unpoison_string(ptr);
1113 return rb_str_new(ptr, strlen(ptr));
1114}
1115
1116VALUE
1118{
1119 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1120}
1121
1122VALUE
1124{
1125 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 if (rb_enc_mbminlen(enc) != 1) {
1133 rb_raise(rb_eArgError, "wchar encoding given");
1134 }
1135 return rb_enc_str_new(ptr, strlen(ptr), enc);
1136}
1137
1138static VALUE
1139str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1140{
1141 VALUE str;
1142
1143 if (len < 0) {
1144 rb_raise(rb_eArgError, "negative string size (or size too big)");
1145 }
1146
1147 if (!ptr) {
1148 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1149 }
1150 else {
1151 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1152 str = str_alloc_heap(klass);
1153 RSTRING(str)->len = len;
1154 RSTRING(str)->as.heap.ptr = (char *)ptr;
1155 RSTRING(str)->as.heap.aux.capa = len;
1156 RBASIC(str)->flags |= STR_NOFREE;
1157 rb_enc_associate_index(str, encindex);
1158 }
1159 return str;
1160}
1161
1162VALUE
1163rb_str_new_static(const char *ptr, long len)
1164{
1165 return str_new_static(rb_cString, ptr, len, 0);
1166}
1167
1168VALUE
1170{
1171 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1172}
1173
1174VALUE
1176{
1177 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1178}
1179
1180VALUE
1182{
1183 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1184}
1185
1186static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1187 rb_encoding *from, rb_encoding *to,
1188 int ecflags, VALUE ecopts);
1189
1190static inline bool
1191is_enc_ascii_string(VALUE str, rb_encoding *enc)
1192{
1193 int encidx = rb_enc_to_index(enc);
1194 if (rb_enc_get_index(str) == encidx)
1195 return is_ascii_string(str);
1196 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1197}
1198
1199VALUE
1200rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1201{
1202 long len;
1203 const char *ptr;
1204 VALUE newstr;
1205
1206 if (!to) return str;
1207 if (!from) from = rb_enc_get(str);
1208 if (from == to) return str;
1209 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1210 rb_is_ascii8bit_enc(to)) {
1211 if (STR_ENC_GET(str) != to) {
1212 str = rb_str_dup(str);
1213 rb_enc_associate(str, to);
1214 }
1215 return str;
1216 }
1217
1218 RSTRING_GETMEM(str, ptr, len);
1219 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1220 from, to, ecflags, ecopts);
1221 if (NIL_P(newstr)) {
1222 /* some error, return original */
1223 return str;
1224 }
1225 return newstr;
1226}
1227
1228VALUE
1229rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1230 rb_encoding *from, int ecflags, VALUE ecopts)
1231{
1232 long olen;
1233
1234 olen = RSTRING_LEN(newstr);
1235 if (ofs < -olen || olen < ofs)
1236 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1237 if (ofs < 0) ofs += olen;
1238 if (!from) {
1239 STR_SET_LEN(newstr, ofs);
1240 return rb_str_cat(newstr, ptr, len);
1241 }
1242
1243 rb_str_modify(newstr);
1244 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1245 rb_enc_get(newstr),
1246 ecflags, ecopts);
1247}
1248
1249VALUE
1250rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1251{
1252 STR_SET_LEN(str, 0);
1253 rb_enc_associate(str, enc);
1254 rb_str_cat(str, ptr, len);
1255 return str;
1256}
1257
1258static VALUE
1259str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1260 rb_encoding *from, rb_encoding *to,
1261 int ecflags, VALUE ecopts)
1262{
1263 rb_econv_t *ec;
1265 long olen;
1266 VALUE econv_wrapper;
1267 const unsigned char *start, *sp;
1268 unsigned char *dest, *dp;
1269 size_t converted_output = (size_t)ofs;
1270
1271 olen = rb_str_capacity(newstr);
1272
1273 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1274 RBASIC_CLEAR_CLASS(econv_wrapper);
1275 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1276 if (!ec) return Qnil;
1277 DATA_PTR(econv_wrapper) = ec;
1278
1279 sp = (unsigned char*)ptr;
1280 start = sp;
1281 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1282 (dp = dest + converted_output),
1283 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1285 /* destination buffer short */
1286 size_t converted_input = sp - start;
1287 size_t rest = len - converted_input;
1288 converted_output = dp - dest;
1289 rb_str_set_len(newstr, converted_output);
1290 if (converted_input && converted_output &&
1291 rest < (LONG_MAX / converted_output)) {
1292 rest = (rest * converted_output) / converted_input;
1293 }
1294 else {
1295 rest = olen;
1296 }
1297 olen += rest < 2 ? 2 : rest;
1298 rb_str_resize(newstr, olen);
1299 }
1300 DATA_PTR(econv_wrapper) = 0;
1301 RB_GC_GUARD(econv_wrapper);
1302 rb_econv_close(ec);
1303 switch (ret) {
1304 case econv_finished:
1305 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1306 rb_str_set_len(newstr, len);
1307 rb_enc_associate(newstr, to);
1308 return newstr;
1309
1310 default:
1311 return Qnil;
1312 }
1313}
1314
1315VALUE
1317{
1318 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1319}
1320
1321VALUE
1323{
1324 rb_encoding *ienc;
1325 VALUE str;
1326 const int eidx = rb_enc_to_index(eenc);
1327
1328 if (!ptr) {
1329 return rb_enc_str_new(ptr, len, eenc);
1330 }
1331
1332 /* ASCII-8BIT case, no conversion */
1333 if ((eidx == rb_ascii8bit_encindex()) ||
1334 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1335 return rb_str_new(ptr, len);
1336 }
1337 /* no default_internal or same encoding, no conversion */
1338 ienc = rb_default_internal_encoding();
1339 if (!ienc || eenc == ienc) {
1340 return rb_enc_str_new(ptr, len, eenc);
1341 }
1342 /* ASCII compatible, and ASCII only string, no conversion in
1343 * default_internal */
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex()) ||
1346 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1347 return rb_enc_str_new(ptr, len, ienc);
1348 }
1349 /* convert from the given encoding to default_internal */
1350 str = rb_enc_str_new(NULL, 0, ienc);
1351 /* when the conversion failed for some reason, just ignore the
1352 * default_internal and result in the given encoding as-is. */
1353 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1354 rb_str_initialize(str, ptr, len, eenc);
1355 }
1356 return str;
1357}
1358
1359VALUE
1360rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1361{
1362 int eidx = rb_enc_to_index(eenc);
1363 if (eidx == rb_usascii_encindex() &&
1364 !is_ascii_string(str)) {
1365 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1366 return str;
1367 }
1368 rb_enc_associate_index(str, eidx);
1369 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1370}
1371
1372VALUE
1373rb_external_str_new(const char *ptr, long len)
1374{
1375 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1376}
1377
1378VALUE
1380{
1381 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1382}
1383
1384VALUE
1385rb_locale_str_new(const char *ptr, long len)
1386{
1387 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1388}
1389
1390VALUE
1392{
1393 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1394}
1395
1396VALUE
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1406}
1407
1408VALUE
1410{
1411 return rb_str_export_to_enc(str, rb_default_external_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_str_export_to_enc(str, rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1424}
1425
/*
 * Makes str2 hold the same bytes as str, leaving str2's encoding flags
 * untouched.  Contents that fit str2's embedded capacity are copied;
 * otherwise str2 is pointed into the buffer of a frozen root string and
 * marked as sharing it.  Returns str2.
 */
1426 static VALUE
1427 str_replace_shared_without_enc(VALUE str2, VALUE str)
1428 {
1429 const int termlen = TERM_LEN(str);
1430 char *ptr;
1431 long len;
1432
1433 RSTRING_GETMEM(str, ptr, len);
/* Bytes plus terminator fit inside str2 itself: copy instead of sharing. */
1434 if (str_embed_capa(str2) >= len + termlen) {
1435 char *ptr2 = RSTRING(str2)->as.embed.ary;
1436 STR_SET_EMBED(str2);
1437 memcpy(ptr2, RSTRING_PTR(str), len);
1438 TERM_FILL(ptr2+len, termlen);
1439 }
1440 else {
1441 VALUE root;
/* Reuse str's existing root when it has one, else freeze str to obtain one. */
1442 if (STR_SHARED_P(str)) {
1443 root = RSTRING(str)->as.heap.aux.shared;
1444 RSTRING_GETMEM(str, ptr, len);
1445 }
1446 else {
1447 root = rb_str_new_frozen(str);
1448 RSTRING_GETMEM(root, ptr, len);
1449 }
1450 RUBY_ASSERT(OBJ_FROZEN(root));
1451
/* Release the heap buffer str2 holds, unless it is the very buffer
 * str2 is about to point at. */
1452 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1453 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1454 rb_fatal("about to free a possible shared root");
1455 }
1456 char *ptr2 = STR_HEAP_PTR(str2);
1457 if (ptr2 != ptr) {
1458 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1459 }
1460 }
1461 FL_SET(str2, STR_NOEMBED);
1462 RSTRING(str2)->as.heap.ptr = ptr;
1463 STR_SET_SHARED(str2, root);
1464 }
1465
1466 STR_SET_LEN(str2, len);
1467
1468 return str2;
1469 }
1470
1471static VALUE
1472str_replace_shared(VALUE str2, VALUE str)
1473{
1474 str_replace_shared_without_enc(str2, str);
1475 rb_enc_cr_str_exact_copy(str2, str);
1476 return str2;
1477}
1478
1479static VALUE
1480str_new_shared(VALUE klass, VALUE str)
1481{
1482 return str_replace_shared(str_alloc_heap(klass), str);
1483}
1484
1485VALUE
1487{
1488 return str_new_shared(rb_obj_class(str), str);
1489}
1490
1491VALUE
1493{
1494 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1495 return str_new_frozen(rb_obj_class(orig), orig);
1496}
1497
1498static VALUE
1499rb_str_new_frozen_String(VALUE orig)
1500{
1501 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1502 return str_new_frozen(rb_cString, orig);
1503}
1504
1505
1506VALUE
1507rb_str_frozen_bare_string(VALUE orig)
1508{
1509 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1510 return str_new_frozen(rb_cString, orig);
1511}
1512
1513VALUE
1514rb_str_tmp_frozen_acquire(VALUE orig)
1515{
1516 if (OBJ_FROZEN_RAW(orig)) return orig;
1517 return str_new_frozen_buffer(0, orig, FALSE);
1518}
1519
/*
 * Variant of rb_str_tmp_frozen_acquire() whose result has a heap buffer.
 * Returns orig itself when it is frozen, not embedded and not
 * re-embeddable; defers to rb_str_tmp_frozen_acquire() when orig shares a
 * non-embedded root; otherwise builds a classless frozen string that either
 * copies orig's bytes or takes over orig's heap buffer.
 */
1520 VALUE
1521 rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1522 {
1523 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1524 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1525
/* Class 0: the temporary is an internal, classless object. */
1526 VALUE str = str_alloc_heap(0);
1527 OBJ_FREEZE(str);
1528 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1529 FL_SET(str, STR_SHARED_ROOT);
1530
1531 size_t capa = str_capacity(orig, TERM_LEN(orig));
1532
1533 /* If the string is embedded then we want to create a copy that is heap
1534 * allocated. If the string is shared then the shared root must be
1535 * embedded, so we want to create a copy. If the string is a shared root
1536 * then it must be embedded, so we want to create a copy. */
1537 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1538 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1539 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1540 }
1541 else {
1542 /* orig must be heap allocated and not shared, so we can safely transfer
1543 * the pointer to str. */
1544 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* STR_NOFREE moves with the buffer: copied onto str, cleared on orig. */
1545 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1546 RBASIC(orig)->flags &= ~STR_NOFREE;
1547 STR_SET_SHARED(orig, str);
1548 }
1549
1550 RSTRING(str)->len = RSTRING(orig)->len;
1551 RSTRING(str)->as.heap.aux.capa = capa;
1552
1553 return str;
1554 }
1555
/*
 * Counterpart of the rb_str_tmp_frozen_*acquire() functions.  Does nothing
 * unless tmp is a classless temporary.  When orig is unfrozen, temp-locked,
 * not flagged STR_SHARED, and tmp is the un-borrowed root recorded in orig,
 * the buffer is handed back to orig and tmp is emptied.
 */
1556 void
1557 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1558 {
/* A non-zero class means tmp is not a classless temporary: leave it alone. */
1559 if (RBASIC_CLASS(tmp) != 0)
1560 return;
1561
/* NOTE(review): this branch is empty here and the embedded numbering skips
 * 1563 -- a statement may have been lost in extraction; confirm. */
1562 if (STR_EMBED_P(tmp)) {
1564 }
1565 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1566 !OBJ_FROZEN_RAW(orig)) {
1567 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1568
1569 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1570 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1571 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1572
1573 /* Unshare orig since the root (tmp) only has this one child. */
1574 FL_UNSET_RAW(orig, STR_SHARED);
1575 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1576 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1578
1579 /* Make tmp embedded and empty so it is safe for sweeping. */
1580 STR_SET_EMBED(tmp);
1581 STR_SET_LEN(tmp, 0);
1582 }
1583 }
1584 }
1585
1586static VALUE
1587str_new_frozen(VALUE klass, VALUE orig)
1588{
1589 return str_new_frozen_buffer(klass, orig, TRUE);
1590}
1591
1592static VALUE
1593heap_str_make_shared(VALUE klass, VALUE orig)
1594{
1595 RUBY_ASSERT(!STR_EMBED_P(orig));
1596 RUBY_ASSERT(!STR_SHARED_P(orig));
1597
1598 VALUE str = str_alloc_heap(klass);
1599 STR_SET_LEN(str, RSTRING_LEN(orig));
1600 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1601 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1602 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1603 RBASIC(orig)->flags &= ~STR_NOFREE;
1604 STR_SET_SHARED(orig, str);
1605 if (klass == 0)
1606 FL_UNSET_RAW(str, STR_BORROWED);
1607 return str;
1608}
1609
/*
 * Returns a frozen string of class klass holding the bytes of orig.
 * copy_encoding selects whether orig's encoding and code range are carried
 * over (otherwise ASCII-8BIT with a one-byte terminator is used).
 * Short contents are copied; longer ones share a buffer with orig, reusing
 * orig's existing shared root when it matches exactly.
 */
1610 static VALUE
1611 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1612 {
1613 VALUE str;
1614
1615 long len = RSTRING_LEN(orig);
1616 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1617 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1618
/* Embedded, or small enough to embed: make an independent copy. */
1619 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1620 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1621 RUBY_ASSERT(STR_EMBED_P(str));
1622 }
1623 else {
1624 if (FL_TEST_RAW(orig, STR_SHARED)) {
1625 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs/rest: bytes of the root lying before/after orig's window. */
1626 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1627 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1628 RUBY_ASSERT(ofs >= 0);
1629 RUBY_ASSERT(rest >= 0);
1630 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1632
/* The root is returned as-is only when orig spans all of it and class
 * and encoding agree; otherwise a new sharer is cut to orig's window. */
1633 if ((ofs > 0) || (rest > 0) ||
1634 (klass != RBASIC(shared)->klass) ||
1635 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1636 str = str_new_shared(klass, shared);
1637 RUBY_ASSERT(!STR_EMBED_P(str));
1638 RSTRING(str)->as.heap.ptr += ofs;
1639 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1640 }
1641 else {
1642 if (RBASIC_CLASS(shared) == 0)
1643 FL_SET_RAW(shared, STR_BORROWED);
1644 return shared;
1645 }
1646 }
1647 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1648 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1649 STR_SET_EMBED(str);
1650 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1651 STR_SET_LEN(str, RSTRING_LEN(orig));
1652 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1653 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1654 }
1655 else {
/* Unshared heap string: hand its buffer to a new root. */
1656 str = heap_str_make_shared(klass, orig);
1657 }
1658 }
1659
1660 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1661 OBJ_FREEZE(str);
1662 return str;
1663 }
1664
1665VALUE
1666rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1667{
1668 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1669}
1670
1671static VALUE
1672str_new_empty_String(VALUE str)
1673{
1674 VALUE v = rb_str_new(0, 0);
1675 rb_enc_copy(v, str);
1676 return v;
1677}
1678
/* Lower bound that rb_str_init() applies to an explicitly requested capacity. */
1679 #define STR_BUF_MIN_SIZE 63
1680
1681VALUE
1683{
1684 if (STR_EMBEDDABLE_P(capa, 1)) {
1685 return str_alloc_embed(rb_cString, capa + 1);
1686 }
1687
1688 VALUE str = str_alloc_heap(rb_cString);
1689
1690 RSTRING(str)->as.heap.aux.capa = capa;
1691 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1692 RSTRING(str)->as.heap.ptr[0] = '\0';
1693
1694 return str;
1695}
1696
1697VALUE
1699{
1700 VALUE str;
1701 long len = strlen(ptr);
1702
1703 str = rb_str_buf_new(len);
1704 rb_str_buf_cat(str, ptr, len);
1705
1706 return str;
1707}
1708
1709VALUE
1711{
1712 return str_new(0, 0, len);
1713}
1714
1715void
1717{
1718 if (STR_EMBED_P(str)) {
1719 RB_DEBUG_COUNTER_INC(obj_str_embed);
1720 }
1721 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1722 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1724 }
1725 else {
1726 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1727 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1728 }
1729}
1730
1731size_t
1732rb_str_memsize(VALUE str)
1733{
1734 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1735 return STR_HEAP_SIZE(str);
1736 }
1737 else {
1738 return 0;
1739 }
1740}
1741
1742VALUE
1744{
1745 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1746}
1747
1748static inline void str_discard(VALUE str);
1749static void str_shared_replace(VALUE str, VALUE str2);
1750
1751void
1753{
1754 if (str != str2) str_shared_replace(str, str2);
1755}
1756
/*
 * Moves the contents, encoding and code range of str2 into str.  Contents
 * that fit str's embedded capacity are copied and str2 is left as it was;
 * otherwise str takes over str2's buffer (or its shared root) and str2 is
 * reset to an empty embedded string.  str and str2 must be distinct objects.
 */
1757 static void
1758 str_shared_replace(VALUE str, VALUE str2)
1759 {
1760 rb_encoding *enc;
1761 int cr;
1762 int termlen;
1763
1764 RUBY_ASSERT(str2 != str);
1765 enc = STR_ENC_GET(str2);
1766 cr = ENC_CODERANGE(str2);
1767 str_discard(str);
1768 termlen = rb_enc_mbminlen(enc);
1769
1770 STR_SET_LEN(str, RSTRING_LEN(str2));
1771
/* Fits inside str: copy bytes and terminator. */
1772 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1773 STR_SET_EMBED(str);
1774 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1775 rb_enc_associate(str, enc);
1776 ENC_CODERANGE_SET(str, cr);
1777 }
1778 else {
/* str2 is embedded but str cannot embed it: first move str2's bytes to a
 * heap buffer so that the pointer can be handed over below. */
1779 if (STR_EMBED_P(str2)) {
1780 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1781 long len = RSTRING_LEN(str2);
1782 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1783
1784 char *new_ptr = ALLOC_N(char, len + termlen);
1785 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1786 RSTRING(str2)->as.heap.ptr = new_ptr;
1787 STR_SET_LEN(str2, len);
1788 RSTRING(str2)->as.heap.aux.capa = len;
1789 STR_SET_NOEMBED(str2);
1790 }
1791
1792 STR_SET_NOEMBED(str);
1793 FL_UNSET(str, STR_SHARED);
1794 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1795
/* Carry over either the shared root or the capacity. */
1796 if (FL_TEST(str2, STR_SHARED)) {
1797 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1798 STR_SET_SHARED(str, shared);
1799 }
1800 else {
1801 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1802 }
1803
1804 /* abandon str2 */
1805 STR_SET_EMBED(str2);
1806 RSTRING_PTR(str2)[0] = 0;
1807 STR_SET_LEN(str2, 0);
1808 rb_enc_associate(str, enc);
1809 ENC_CODERANGE_SET(str, cr);
1810 }
1811 }
1812
1813VALUE
1815{
1816 VALUE str;
1817
1818 if (RB_TYPE_P(obj, T_STRING)) {
1819 return obj;
1820 }
1821 str = rb_funcall(obj, idTo_s, 0);
1822 return rb_obj_as_string_result(str, obj);
1823}
1824
1825VALUE
1826rb_obj_as_string_result(VALUE str, VALUE obj)
1827{
1828 if (!RB_TYPE_P(str, T_STRING))
1829 return rb_any_to_s(obj);
1830 return str;
1831}
1832
1833static VALUE
1834str_replace(VALUE str, VALUE str2)
1835{
1836 long len;
1837
1838 len = RSTRING_LEN(str2);
1839 if (STR_SHARED_P(str2)) {
1840 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1842 STR_SET_NOEMBED(str);
1843 STR_SET_LEN(str, len);
1844 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1845 STR_SET_SHARED(str, shared);
1846 rb_enc_cr_str_exact_copy(str, str2);
1847 }
1848 else {
1849 str_replace_shared(str, str2);
1850 }
1851
1852 return str;
1853}
1854
1855static inline VALUE
1856ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1857{
1858 size_t size = rb_str_embed_size(capa);
1859 RUBY_ASSERT(size > 0);
1860 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1861
1862 NEWOBJ_OF(str, struct RString, klass,
1864
1865 return (VALUE)str;
1866}
1867
1868static inline VALUE
1869ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1870{
1871 NEWOBJ_OF(str, struct RString, klass,
1872 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1873
1874 return (VALUE)str;
1875}
1876
1877static inline VALUE
1878str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1879{
1880 int encidx = 0;
1881 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1882 encidx = rb_enc_get_index(str);
1883 flags &= ~ENCODING_MASK;
1884 }
1885 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1886 if (encidx) rb_enc_associate_index(dup, encidx);
1887 return dup;
1888}
1889
/* Flag bits (code range, encoding, frozen) that the str_duplicate_setup_*
 * helpers read from the source string. */
1890 static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1891
1892static inline VALUE
1893str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1894{
1895 VALUE flags = FL_TEST_RAW(str, flag_mask);
1896 long len = RSTRING_LEN(str);
1897
1898 RUBY_ASSERT(STR_EMBED_P(dup));
1899 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1900 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1901 STR_SET_LEN(dup, RSTRING_LEN(str));
1902 return str_duplicate_setup_encoding(str, dup, flags);
1903}
1904
/*
 * Fills the already-allocated heap string dup so that it shares a buffer
 * with str: dup points at str's bytes and records a shared root.  An
 * unshared, unfrozen str is first replaced by a frozen copy that becomes
 * the root.  Returns dup.
 */
1905 static inline VALUE
1906 str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1907 {
1908 VALUE flags = FL_TEST_RAW(str, flag_mask);
1909 VALUE root = str;
1910 if (FL_TEST_RAW(str, STR_SHARED)) {
1911 root = RSTRING(str)->as.heap.aux.shared;
1912 }
1913 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
/* From here on str names the frozen copy, so the flags are re-read. */
1914 root = str = str_new_frozen(klass, str);
1915 flags = FL_TEST_RAW(str, flag_mask);
1916 }
1917 RUBY_ASSERT(!STR_SHARED_P(root));
1919
1920 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1921 FL_SET(root, STR_SHARED_ROOT);
1922 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1923 flags |= RSTRING_NOEMBED | STR_SHARED;
1924
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1927 }
1928
1929static inline VALUE
1930str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1931{
1932 if (STR_EMBED_P(str)) {
1933 return str_duplicate_setup_embed(klass, str, dup);
1934 }
1935 else {
1936 return str_duplicate_setup_heap(klass, str, dup);
1937 }
1938}
1939
1940static inline VALUE
1941str_duplicate(VALUE klass, VALUE str)
1942{
1943 VALUE dup;
1944 if (STR_EMBED_P(str)) {
1945 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1946 }
1947 else {
1948 dup = str_alloc_heap(klass);
1949 }
1950
1951 return str_duplicate_setup(klass, str, dup);
1952}
1953
1954VALUE
1956{
1957 return str_duplicate(rb_obj_class(str), str);
1958}
1959
1960/* :nodoc: */
1961VALUE
1962rb_str_dup_m(VALUE str)
1963{
1964 if (LIKELY(BARE_STRING_P(str))) {
1965 return str_duplicate(rb_cString, str);
1966 }
1967 else {
1968 return rb_obj_dup(str);
1969 }
1970}
1971
1972VALUE
1974{
1975 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1976 return str_duplicate(rb_cString, str);
1977}
1978
1979VALUE
1980rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1981{
1982 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1983 VALUE new_str, klass = rb_cString;
1984
1985 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1986 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 str_duplicate_setup_embed(klass, str, new_str);
1988 }
1989 else {
1990 new_str = ec_str_alloc_heap(ec, klass);
1991 str_duplicate_setup_heap(klass, str, new_str);
1992 }
1993 if (chilled) {
1994 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1995 }
1996 return new_str;
1997}
1998
1999VALUE
2000rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2001{
2002 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2003 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2004 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2005 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2006 return rb_str_freeze(str);
2007}
2008
2009/*
2010 * The documentation block below uses an include (instead of inline text)
2011 * because the included text has non-ASCII characters (which are not allowed in a C file).
2012 */
2013
2014/*
2015 *
2016 * call-seq:
2017 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2018 *
2019 * :include: doc/string/new.rdoc
2020 *
2021 */
2022
/*
 * Implementation behind String#initialize.  Accepts an optional source
 * string plus the keyword options +encoding+ and +capacity+.  With a
 * capacity, str is forced onto a heap buffer of at least
 * max(capacity, STR_BUF_MIN_SIZE, source length) bytes; without one the
 * source (if any) simply replaces str.  Returns str.
 */
2023 static VALUE
2024 rb_str_init(int argc, VALUE *argv, VALUE str)
2025 {
2026 static ID keyword_ids[2];
2027 VALUE orig, opt, venc, vcapa;
2028 VALUE kwargs[2];
2029 rb_encoding *enc = 0;
2030 int n;
2031
/* Keyword IDs are looked up once and cached in the static array. */
2032 if (!keyword_ids[0]) {
2033 keyword_ids[0] = rb_id_encoding();
2034 CONST_ID(keyword_ids[1], "capacity");
2035 }
2036
2037 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2038 if (!NIL_P(opt)) {
2039 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2040 venc = kwargs[0];
2041 vcapa = kwargs[1];
2042 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2043 enc = rb_to_encoding(venc);
2044 }
2045 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2046 long capa = NUM2LONG(vcapa);
2047 long len = 0;
2048 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2049
2050 if (capa < STR_BUF_MIN_SIZE) {
2051 capa = STR_BUF_MIN_SIZE;
2052 }
2053 if (n == 1) {
2054 StringValue(orig);
2055 len = RSTRING_LEN(orig);
2056 if (capa < len) {
2057 capa = len;
2058 }
/* Initializing from itself: keep the length but skip the copy below. */
2059 if (orig == str) n = 0;
2060 }
2061 str_modifiable(str);
2062 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2063 /* make noembed always */
2064 const size_t size = (size_t)capa + termlen;
2065 const char *const old_ptr = RSTRING_PTR(str);
2066 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2067 char *new_ptr = ALLOC_N(char, size);
2068 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2069 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2070 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2071 RSTRING(str)->as.heap.ptr = new_ptr;
2072 }
/* str already owns a heap buffer: resize it if the size differs. */
2073 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2074 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2075 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2076 }
2077 STR_SET_LEN(str, len);
2078 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2079 if (n == 1) {
2080 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2081 rb_enc_cr_str_exact_copy(str, orig);
2082 }
2083 FL_SET(str, STR_NOEMBED);
2084 RSTRING(str)->as.heap.aux.capa = capa;
2085 }
2086 else if (n == 1) {
2087 rb_str_replace(str, orig);
2088 }
/* NOTE(review): the embedded numbering skips 2091 inside this branch -- a
 * statement may have been lost in extraction; confirm. */
2089 if (enc) {
2090 rb_enc_associate(str, enc);
2092 }
2093 }
2094 else if (n == 1) {
2095 rb_str_replace(str, orig);
2096 }
2097 return str;
2098 }
2099
2100/* :nodoc: */
/*
 * Implementation behind String.new.  Calls for subclasses, and calls with
 * no keyword options, go through rb_class_new_instance_pass_kw().  For
 * String itself with keywords, the result is built directly: a duplicate of
 * the source when no capacity is given, otherwise a fresh buffer of the
 * requested capacity with the source appended.
 */
2101 static VALUE
2102 rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2103 {
2104 if (klass != rb_cString) {
2105 return rb_class_new_instance_pass_kw(argc, argv, klass);
2106 }
2107
2108 static ID keyword_ids[2];
2109 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2110 VALUE kwargs[2];
2111 rb_encoding *enc = NULL;
2112
2113 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2114 if (NIL_P(opt)) {
2115 return rb_class_new_instance_pass_kw(argc, argv, klass);
2116 }
2117
2118 keyword_ids[0] = rb_id_encoding();
2119 CONST_ID(keyword_ids[1], "capacity");
2120 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2121 encoding = kwargs[0];
2122 capacity = kwargs[1];
2123
2124 if (n == 1) {
2125 orig = StringValue(orig);
2126 }
2127 else {
2128 orig = Qnil;
2129 }
2130
/* No explicit encoding: inherit the source string's encoding, if any. */
2131 if (UNDEF_P(encoding)) {
2132 if (!NIL_P(orig)) {
2133 encoding = rb_obj_encoding(orig);
2134 }
2135 }
2136
2137 if (!UNDEF_P(encoding)) {
2138 enc = rb_to_encoding(encoding);
2139 }
2140
2141 // If capacity is nil, we're basically just duping `orig`.
2142 if (UNDEF_P(capacity)) {
2143 if (NIL_P(orig)) {
2144 VALUE empty_str = str_new(klass, "", 0);
2145 if (enc) {
2146 rb_enc_associate(empty_str, enc);
2147 }
2148 return empty_str;
2149 }
2150 VALUE copy = str_duplicate(klass, orig);
2151 rb_enc_associate(copy, enc);
2152 ENC_CODERANGE_CLEAR(copy);
2153 return copy;
2154 }
2155
/* Negative capacities are clamped to zero. */
2156 long capa = 0;
2157 capa = NUM2LONG(capacity);
2158 if (capa < 0) {
2159 capa = 0;
2160 }
2161
/* Never allocate less than the source string's own capacity. */
2162 if (!NIL_P(orig)) {
2163 long orig_capa = rb_str_capacity(orig);
2164 if (orig_capa > capa) {
2165 capa = orig_capa;
2166 }
2167 }
2168
/* Allocate with length capa, then reset the length to zero. */
2169 VALUE str = str_enc_new(klass, NULL, capa, enc);
2170 STR_SET_LEN(str, 0);
2171 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2172
2173 if (!NIL_P(orig)) {
2174 rb_str_buf_append(str, orig);
2175 }
2176
2177 return str;
2178 }
2179
2180#ifdef NONASCII_MASK
2181#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2182
2183/*
2184 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2185 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2186 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2187 *
2188 * if (!(byte & 0x80))
2189 * byte |= 0x40; // turn on bit6
2190 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2191 *
2192 * This function calculates whether a byte is leading or not for all bytes
2193 * in the argument word by concurrently using the above logic, and then
2194 * adds up the number of leading bytes in the word.
2195 */
/* Returns the number of UTF-8 leading bytes in the word that s points to. */
2196 static inline uintptr_t
2197 count_utf8_lead_bytes_with_word(const uintptr_t *s)
2198 {
2199 uintptr_t d = *s;
2200
2201 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2202 d = (d>>6) | (~d>>7);
2203 d &= NONASCII_MASK >> 7;
2204
2205 /* Gather all bytes. */
2206 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2207 /* use only if it can use POPCNT */
2208 return rb_popcount_intptr(d);
2209 #else
/* Fold the per-byte flags together by repeated shift-and-add; the total is
 * read from the low four bits. */
2210 d += (d>>8);
2211 d += (d>>16);
2212 # if SIZEOF_VOIDP == 8
2213 d += (d>>32);
2214 # endif
2215 return (d&0xF);
2216 #endif
2217 }
2218#endif
2219
/*
 * Counts the characters in the byte range [p, e) for encoding enc, using
 * the code range hint cr to choose the cheapest strategy:
 *  - fixed-width encodings: byte length divided by the width, rounded up;
 *  - valid UTF-8 (when NONASCII_MASK is defined): count leading bytes, a
 *    word at a time over the aligned middle part;
 *  - other ASCII-compatible encodings: skip ASCII runs in bulk and step over
 *    the remaining characters one by one;
 *  - everything else: step character by character.
 */
2220 static inline long
2221 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2222 {
2223 long c;
2224 const char *q;
2225
2226 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2227 long diff = (long)(e - p);
/* A trailing partial character still counts as one. */
2228 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2229 }
2230 #ifdef NONASCII_MASK
2231 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2232 uintptr_t len = 0;
2233 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2234 const uintptr_t *s, *t;
2235 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2236 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2237 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2238 while (p < (const char *)s) {
2239 if (is_utf8_lead_byte(*p)) len++;
2240 p++;
2241 }
2242 while (s < t) {
2243 len += count_utf8_lead_bytes_with_word(s);
2244 s++;
2245 }
2246 p = (const char *)s;
2247 }
/* Remaining bytes after the last whole word (or the entire short input). */
2248 while (p < e) {
2249 if (is_utf8_lead_byte(*p)) len++;
2250 p++;
2251 }
2252 return (long)len;
2253 }
2254 #endif
2255 else if (rb_enc_asciicompat(enc)) {
2256 c = 0;
/* The two loops differ only in the length function used. */
2257 if (ENC_CODERANGE_CLEAN_P(cr)) {
2258 while (p < e) {
2259 if (ISASCII(*p)) {
2260 q = search_nonascii(p, e);
2261 if (!q)
2262 return c + (e - p);
2263 c += q - p;
2264 p = q;
2265 }
2266 p += rb_enc_fast_mbclen(p, e, enc);
2267 c++;
2268 }
2269 }
2270 else {
2271 while (p < e) {
2272 if (ISASCII(*p)) {
2273 q = search_nonascii(p, e);
2274 if (!q)
2275 return c + (e - p);
2276 c += q - p;
2277 p = q;
2278 }
2279 p += rb_enc_mbclen(p, e, enc);
2280 c++;
2281 }
2282 }
2283 return c;
2284 }
2285
2286 for (c=0; p<e; c++) {
2287 p += rb_enc_mbclen(p, e, enc);
2288 }
2289 return c;
2290 }
2291
2292long
2293rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2294{
2295 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2296}
2297
2298/* To get strlen with cr
2299 * Note that given cr is not used.
2300 */
2301long
2302rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2303{
2304 long c;
2305 const char *q;
2306 int ret;
2307
2308 *cr = 0;
2309 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2310 long diff = (long)(e - p);
2311 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2312 }
2313 else if (rb_enc_asciicompat(enc)) {
2314 c = 0;
2315 while (p < e) {
2316 if (ISASCII(*p)) {
2317 q = search_nonascii(p, e);
2318 if (!q) {
2319 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2320 return c + (e - p);
2321 }
2322 c += q - p;
2323 p = q;
2324 }
2325 ret = rb_enc_precise_mbclen(p, e, enc);
2326 if (MBCLEN_CHARFOUND_P(ret)) {
2327 *cr |= ENC_CODERANGE_VALID;
2328 p += MBCLEN_CHARFOUND_LEN(ret);
2329 }
2330 else {
2332 p++;
2333 }
2334 c++;
2335 }
2336 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2337 return c;
2338 }
2339
2340 for (c=0; p<e; c++) {
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2342 if (MBCLEN_CHARFOUND_P(ret)) {
2343 *cr |= ENC_CODERANGE_VALID;
2344 p += MBCLEN_CHARFOUND_LEN(ret);
2345 }
2346 else {
2348 if (p + rb_enc_mbminlen(enc) <= e)
2349 p += rb_enc_mbminlen(enc);
2350 else
2351 p = e;
2352 }
2353 }
2354 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2355 return c;
2356}
2357
2358/* enc must be str's enc or rb_enc_check(str, str2) */
2359static long
2360str_strlen(VALUE str, rb_encoding *enc)
2361{
2362 const char *p, *e;
2363 int cr;
2364
2365 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2366 if (!enc) enc = STR_ENC_GET(str);
2367 p = RSTRING_PTR(str);
2368 e = RSTRING_END(str);
2369 cr = ENC_CODERANGE(str);
2370
2371 if (cr == ENC_CODERANGE_UNKNOWN) {
2372 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2373 if (cr) ENC_CODERANGE_SET(str, cr);
2374 return n;
2375 }
2376 else {
2377 return enc_strlen(p, e, enc, cr);
2378 }
2379}
2380
2381long
2383{
2384 return str_strlen(str, NULL);
2385}
2386
2387/*
2388 * call-seq:
2389 * length -> integer
2390 *
2391 * :include: doc/string/length.rdoc
2392 *
2393 */
2394
2395VALUE
2397{
2398 return LONG2NUM(str_strlen(str, NULL));
2399}
2400
2401/*
2402 * call-seq:
2403 * bytesize -> integer
2404 *
2405 * :include: doc/string/bytesize.rdoc
2406 *
2407 */
2408
2409VALUE
2410rb_str_bytesize(VALUE str)
2411{
2412 return LONG2NUM(RSTRING_LEN(str));
2413}
2414
2415/*
2416 * call-seq:
2417 * empty? -> true or false
2418 *
2419 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2420 *
2421 * "hello".empty? # => false
2422 * " ".empty? # => false
2423 * "".empty? # => true
2424 *
2425 */
2426
2427static VALUE
2428rb_str_empty(VALUE str)
2429{
2430 return RBOOL(RSTRING_LEN(str) == 0);
2431}
2432
2433/*
2434 * call-seq:
2435 * self + other_string -> new_string
2436 *
2437 * Returns a new string containing +other_string+ concatenated to +self+:
2438 *
2439 * 'Hello from ' + self.to_s # => "Hello from main"
2440 *
2441 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2442 */
2443
2444VALUE
2446{
2447 VALUE str3;
2448 rb_encoding *enc;
2449 char *ptr1, *ptr2, *ptr3;
2450 long len1, len2;
2451 int termlen;
2452
2453 StringValue(str2);
2454 enc = rb_enc_check_str(str1, str2);
2455 RSTRING_GETMEM(str1, ptr1, len1);
2456 RSTRING_GETMEM(str2, ptr2, len2);
2457 termlen = rb_enc_mbminlen(enc);
2458 if (len1 > LONG_MAX - len2) {
2459 rb_raise(rb_eArgError, "string size too big");
2460 }
2461 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2462 ptr3 = RSTRING_PTR(str3);
2463 memcpy(ptr3, ptr1, len1);
2464 memcpy(ptr3+len1, ptr2, len2);
2465 TERM_FILL(&ptr3[len1+len2], termlen);
2466
2467 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2469 RB_GC_GUARD(str1);
2470 RB_GC_GUARD(str2);
2471 return str3;
2472}
2473
/* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2475VALUE
2476rb_str_opt_plus(VALUE str1, VALUE str2)
2477{
2480 long len1, len2;
2481 MAYBE_UNUSED(char) *ptr1, *ptr2;
2482 RSTRING_GETMEM(str1, ptr1, len1);
2483 RSTRING_GETMEM(str2, ptr2, len2);
2484 int enc1 = rb_enc_get_index(str1);
2485 int enc2 = rb_enc_get_index(str2);
2486
2487 if (enc1 < 0) {
2488 return Qundef;
2489 }
2490 else if (enc2 < 0) {
2491 return Qundef;
2492 }
2493 else if (enc1 != enc2) {
2494 return Qundef;
2495 }
2496 else if (len1 > LONG_MAX - len2) {
2497 return Qundef;
2498 }
2499 else {
2500 return rb_str_plus(str1, str2);
2501 }
2502
2503}
2504
2505/*
2506 * call-seq:
2507 * self * n -> new_string
2508 *
2509 * Returns a new string containing +n+ copies of +self+:
2510 *
2511 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2512 * 'No!' * 0 # => ""
2513 *
2514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2515 */
2516
2517VALUE
2519{
2520 VALUE str2;
2521 long n, len;
2522 char *ptr2;
2523 int termlen;
2524
2525 if (times == INT2FIX(1)) {
2526 return str_duplicate(rb_cString, str);
2527 }
2528 if (times == INT2FIX(0)) {
2529 str2 = str_alloc_embed(rb_cString, 0);
2530 rb_enc_copy(str2, str);
2531 return str2;
2532 }
2533 len = NUM2LONG(times);
2534 if (len < 0) {
2535 rb_raise(rb_eArgError, "negative argument");
2536 }
2537 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2538 if (STR_EMBEDDABLE_P(len, 1)) {
2539 str2 = str_alloc_embed(rb_cString, len + 1);
2540 memset(RSTRING_PTR(str2), 0, len + 1);
2541 }
2542 else {
2543 str2 = str_alloc_heap(rb_cString);
2544 RSTRING(str2)->as.heap.aux.capa = len;
2545 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2546 }
2547 STR_SET_LEN(str2, len);
2548 rb_enc_copy(str2, str);
2549 return str2;
2550 }
2551 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2552 rb_raise(rb_eArgError, "argument too big");
2553 }
2554
2555 len *= RSTRING_LEN(str);
2556 termlen = TERM_LEN(str);
2557 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2558 ptr2 = RSTRING_PTR(str2);
2559 if (len) {
2560 n = RSTRING_LEN(str);
2561 memcpy(ptr2, RSTRING_PTR(str), n);
2562 while (n <= len/2) {
2563 memcpy(ptr2 + n, ptr2, n);
2564 n *= 2;
2565 }
2566 memcpy(ptr2 + n, ptr2, len-n);
2567 }
2568 STR_SET_LEN(str2, len);
2569 TERM_FILL(&ptr2[len], termlen);
2570 rb_enc_cr_str_copy_for_substr(str2, str);
2571
2572 return str2;
2573}
2574
2575/*
2576 * call-seq:
2577 * self % object -> new_string
2578 *
2579 * Returns the result of formatting +object+ into the format specifications
2580 * contained in +self+
2581 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2582 *
2583 * '%05d' % 123 # => "00123"
2584 *
2585 * If +self+ contains multiple format specifications,
2586 * +object+ must be an array or hash containing the objects to be formatted:
2587 *
2588 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2589 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2590 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2591 *
2592 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2593 */
2594
2595static VALUE
2596rb_str_format_m(VALUE str, VALUE arg)
2597{
2598 VALUE tmp = rb_check_array_type(arg);
2599
2600 if (!NIL_P(tmp)) {
2601 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2602 }
2603 return rb_str_format(1, &arg, str);
2604}
2605
2606static inline void
2607rb_check_lockedtmp(VALUE str)
2608{
2609 if (FL_TEST(str, STR_TMPLOCK)) {
2610 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2611 }
2612}
2613
2614// If none of these flags are set, we know we have an modifiable string.
2615// If any is set, we need to do more detailed checks.
2616#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2617static inline void
2618str_modifiable(VALUE str)
2619{
2620 RUBY_ASSERT(ruby_thread_has_gvl_p());
2621
2622 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2623 if (CHILLED_STRING_P(str)) {
2624 CHILLED_STRING_MUTATED(str);
2625 }
2626 rb_check_lockedtmp(str);
2627 rb_check_frozen(str);
2628 }
2629}
2630
2631static inline int
2632str_dependent_p(VALUE str)
2633{
2634 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2635 return FALSE;
2636 }
2637 else {
2638 return TRUE;
2639 }
2640}
2641
2642// If none of these flags are set, we know we have an independent string.
2643// If any is set, we need to do more detailed checks.
2644#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2645static inline int
2646str_independent(VALUE str)
2647{
2648 RUBY_ASSERT(ruby_thread_has_gvl_p());
2649
2650 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2651 str_modifiable(str);
2652 return !str_dependent_p(str);
2653 }
2654 return TRUE;
2655}
2656
2657static void
2658str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2659{
2660 RUBY_ASSERT(ruby_thread_has_gvl_p());
2661
2662 char *ptr;
2663 char *oldptr;
2664 long capa = len + expand;
2665
2666 if (len > capa) len = capa;
2667
2668 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2669 ptr = RSTRING(str)->as.heap.ptr;
2670 STR_SET_EMBED(str);
2671 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2672 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2673 STR_SET_LEN(str, len);
2674 return;
2675 }
2676
2677 ptr = ALLOC_N(char, (size_t)capa + termlen);
2678 oldptr = RSTRING_PTR(str);
2679 if (oldptr) {
2680 memcpy(ptr, oldptr, len);
2681 }
2682 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2683 xfree(oldptr);
2684 }
2685 STR_SET_NOEMBED(str);
2686 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2687 TERM_FILL(ptr + len, termlen);
2688 RSTRING(str)->as.heap.ptr = ptr;
2689 STR_SET_LEN(str, len);
2690 RSTRING(str)->as.heap.aux.capa = capa;
2691}
2692
2693void
2694rb_str_modify(VALUE str)
2695{
2696 if (!str_independent(str))
2697 str_make_independent(str);
2699}
2700
2701void
2703{
2704 RUBY_ASSERT(ruby_thread_has_gvl_p());
2705
2706 int termlen = TERM_LEN(str);
2707 long len = RSTRING_LEN(str);
2708
2709 if (expand < 0) {
2710 rb_raise(rb_eArgError, "negative expanding string size");
2711 }
2712 if (expand >= LONG_MAX - len) {
2713 rb_raise(rb_eArgError, "string size too big");
2714 }
2715
2716 if (!str_independent(str)) {
2717 str_make_independent_expand(str, len, expand, termlen);
2718 }
2719 else if (expand > 0) {
2720 RESIZE_CAPA_TERM(str, len + expand, termlen);
2721 }
2723}
2724
2725/* As rb_str_modify(), but don't clear coderange */
2726static void
2727str_modify_keep_cr(VALUE str)
2728{
2729 if (!str_independent(str))
2730 str_make_independent(str);
2732 /* Force re-scan later */
2734}
2735
2736static inline void
2737str_discard(VALUE str)
2738{
2739 str_modifiable(str);
2740 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2741 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2742 RSTRING(str)->as.heap.ptr = 0;
2743 STR_SET_LEN(str, 0);
2744 }
2745}
2746
2747void
2749{
2750 int encindex = rb_enc_get_index(str);
2751
2752 if (RB_UNLIKELY(encindex == -1)) {
2753 rb_raise(rb_eTypeError, "not encoding capable object");
2754 }
2755
2756 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2757 return;
2758 }
2759
2760 rb_encoding *enc = rb_enc_from_index(encindex);
2761 if (!rb_enc_asciicompat(enc)) {
2762 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2763 }
2764}
2765
2766VALUE
2768{
2769 RUBY_ASSERT(ruby_thread_has_gvl_p());
2770
2771 VALUE s = *ptr;
2772 if (!RB_TYPE_P(s, T_STRING)) {
2773 s = rb_str_to_str(s);
2774 *ptr = s;
2775 }
2776 return s;
2777}
2778
2779char *
2781{
2782 VALUE str = rb_string_value(ptr);
2783 return RSTRING_PTR(str);
2784}
2785
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 otherwise.
 * A non-positive +n+ inspects nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2794
2795static const char *
2796str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2797{
2798 const char *e = s + len;
2799
2800 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2801 if (zero_filled(s, minlen)) return s;
2802 }
2803 return 0;
2804}
2805
2806static char *
2807str_fill_term(VALUE str, char *s, long len, int termlen)
2808{
2809 /* This function assumes that (capa + termlen) bytes of memory
2810 * is allocated, like many other functions in this file.
2811 */
2812 if (str_dependent_p(str)) {
2813 if (!zero_filled(s + len, termlen))
2814 str_make_independent_expand(str, len, 0L, termlen);
2815 }
2816 else {
2817 TERM_FILL(s + len, termlen);
2818 return s;
2819 }
2820 return RSTRING_PTR(str);
2821}
2822
2823void
2824rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2825{
2826 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2827 long len = RSTRING_LEN(str);
2828
2829 RUBY_ASSERT(capa >= len);
2830 if (capa - len < termlen) {
2831 rb_check_lockedtmp(str);
2832 str_make_independent_expand(str, len, 0L, termlen);
2833 }
2834 else if (str_dependent_p(str)) {
2835 if (termlen > oldtermlen)
2836 str_make_independent_expand(str, len, 0L, termlen);
2837 }
2838 else {
2839 if (!STR_EMBED_P(str)) {
2840 /* modify capa instead of realloc */
2841 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2842 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2843 }
2844 if (termlen > oldtermlen) {
2845 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2846 }
2847 }
2848
2849 return;
2850}
2851
2852static char *
2853str_null_check(VALUE str, int *w)
2854{
2855 char *s = RSTRING_PTR(str);
2856 long len = RSTRING_LEN(str);
2857 rb_encoding *enc = rb_enc_get(str);
2858 const int minlen = rb_enc_mbminlen(enc);
2859
2860 if (minlen > 1) {
2861 *w = 1;
2862 if (str_null_char(s, len, minlen, enc)) {
2863 return NULL;
2864 }
2865 return str_fill_term(str, s, len, minlen);
2866 }
2867 *w = 0;
2868 if (!s || memchr(s, 0, len)) {
2869 return NULL;
2870 }
2871 if (s[len]) {
2872 s = str_fill_term(str, s, len, minlen);
2873 }
2874 return s;
2875}
2876
2877char *
2878rb_str_to_cstr(VALUE str)
2879{
2880 int w;
2881 return str_null_check(str, &w);
2882}
2883
2884char *
2886{
2887 VALUE str = rb_string_value(ptr);
2888 int w;
2889 char *s = str_null_check(str, &w);
2890 if (!s) {
2891 if (w) {
2892 rb_raise(rb_eArgError, "string contains null char");
2893 }
2894 rb_raise(rb_eArgError, "string contains null byte");
2895 }
2896 return s;
2897}
2898
2899char *
2900rb_str_fill_terminator(VALUE str, const int newminlen)
2901{
2902 char *s = RSTRING_PTR(str);
2903 long len = RSTRING_LEN(str);
2904 return str_fill_term(str, s, len, newminlen);
2905}
2906
2907VALUE
2909{
2910 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2911 return str;
2912}
2913
2914/*
2915 * call-seq:
2916 * String.try_convert(object) -> object, new_string, or nil
2917 *
2918 * Attempts to convert the given +object+ to a string.
2919 *
2920 * If +object+ is already a string, returns +object+, unmodified.
2921 *
2922 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2923 * calls <tt>object.to_str</tt> and returns the result.
2924 *
2925 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2926 *
2927 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2928 */
2929static VALUE
2930rb_str_s_try_convert(VALUE dummy, VALUE str)
2931{
2932 return rb_check_string_type(str);
2933}
2934
2935static char*
2936str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2937{
2938 long nth = *nthp;
2939 if (rb_enc_mbmaxlen(enc) == 1) {
2940 p += nth;
2941 }
2942 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2943 p += nth * rb_enc_mbmaxlen(enc);
2944 }
2945 else if (rb_enc_asciicompat(enc)) {
2946 const char *p2, *e2;
2947 int n;
2948
2949 while (p < e && 0 < nth) {
2950 e2 = p + nth;
2951 if (e < e2) {
2952 *nthp = nth;
2953 return (char *)e;
2954 }
2955 if (ISASCII(*p)) {
2956 p2 = search_nonascii(p, e2);
2957 if (!p2) {
2958 nth -= e2 - p;
2959 *nthp = nth;
2960 return (char *)e2;
2961 }
2962 nth -= p2 - p;
2963 p = p2;
2964 }
2965 n = rb_enc_mbclen(p, e, enc);
2966 p += n;
2967 nth--;
2968 }
2969 *nthp = nth;
2970 if (nth != 0) {
2971 return (char *)e;
2972 }
2973 return (char *)p;
2974 }
2975 else {
2976 while (p < e && nth--) {
2977 p += rb_enc_mbclen(p, e, enc);
2978 }
2979 }
2980 if (p > e) p = e;
2981 *nthp = nth;
2982 return (char*)p;
2983}
2984
2985char*
2986rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2987{
2988 return str_nth_len(p, e, &nth, enc);
2989}
2990
2991static char*
2992str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2993{
2994 if (singlebyte)
2995 p += nth;
2996 else {
2997 p = str_nth_len(p, e, &nth, enc);
2998 }
2999 if (!p) return 0;
3000 if (p > e) p = e;
3001 return (char *)p;
3002}
3003
3004/* char offset to byte offset */
3005static long
3006str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3007{
3008 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3009 if (!pp) return e - p;
3010 return pp - p;
3011}
3012
3013long
3014rb_str_offset(VALUE str, long pos)
3015{
3016 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3017 STR_ENC_GET(str), single_byte_optimizable(str));
3018}
3019
3020#ifdef NONASCII_MASK
3021static char *
3022str_utf8_nth(const char *p, const char *e, long *nthp)
3023{
3024 long nth = *nthp;
3025 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3026 const uintptr_t *s, *t;
3027 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3028 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3029 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3030 while (p < (const char *)s) {
3031 if (is_utf8_lead_byte(*p)) nth--;
3032 p++;
3033 }
3034 do {
3035 nth -= count_utf8_lead_bytes_with_word(s);
3036 s++;
3037 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3038 p = (char *)s;
3039 }
3040 while (p < e) {
3041 if (is_utf8_lead_byte(*p)) {
3042 if (nth == 0) break;
3043 nth--;
3044 }
3045 p++;
3046 }
3047 *nthp = nth;
3048 return (char *)p;
3049}
3050
3051static long
3052str_utf8_offset(const char *p, const char *e, long nth)
3053{
3054 const char *pp = str_utf8_nth(p, e, &nth);
3055 return pp - p;
3056}
3057#endif
3058
3059/* byte offset to char offset */
3060long
3061rb_str_sublen(VALUE str, long pos)
3062{
3063 if (single_byte_optimizable(str) || pos < 0)
3064 return pos;
3065 else {
3066 char *p = RSTRING_PTR(str);
3067 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3068 }
3069}
3070
3071static VALUE
3072str_subseq(VALUE str, long beg, long len)
3073{
3074 VALUE str2;
3075
3076 RUBY_ASSERT(beg >= 0);
3077 RUBY_ASSERT(len >= 0);
3078 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3079
3080 const int termlen = TERM_LEN(str);
3081 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3082 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3083 RB_GC_GUARD(str);
3084 return str2;
3085 }
3086
3087 str2 = str_alloc_heap(rb_cString);
3088 if (str_embed_capa(str2) >= len + termlen) {
3089 char *ptr2 = RSTRING(str2)->as.embed.ary;
3090 STR_SET_EMBED(str2);
3091 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3092 TERM_FILL(ptr2+len, termlen);
3093
3094 STR_SET_LEN(str2, len);
3095 RB_GC_GUARD(str);
3096 }
3097 else {
3098 str_replace_shared(str2, str);
3099 RUBY_ASSERT(!STR_EMBED_P(str2));
3100 ENC_CODERANGE_CLEAR(str2);
3101 RSTRING(str2)->as.heap.ptr += beg;
3102 if (RSTRING_LEN(str2) > len) {
3103 STR_SET_LEN(str2, len);
3104 }
3105 }
3106
3107 return str2;
3108}
3109
3110VALUE
3111rb_str_subseq(VALUE str, long beg, long len)
3112{
3113 VALUE str2 = str_subseq(str, beg, len);
3114 rb_enc_cr_str_copy_for_substr(str2, str);
3115 return str2;
3116}
3117
3118char *
3119rb_str_subpos(VALUE str, long beg, long *lenp)
3120{
3121 long len = *lenp;
3122 long slen = -1L;
3123 const long blen = RSTRING_LEN(str);
3124 rb_encoding *enc = STR_ENC_GET(str);
3125 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3126
3127 if (len < 0) return 0;
3128 if (beg < 0 && -beg < 0) return 0;
3129 if (!blen) {
3130 len = 0;
3131 }
3132 if (single_byte_optimizable(str)) {
3133 if (beg > blen) return 0;
3134 if (beg < 0) {
3135 beg += blen;
3136 if (beg < 0) return 0;
3137 }
3138 if (len > blen - beg)
3139 len = blen - beg;
3140 if (len < 0) return 0;
3141 p = s + beg;
3142 goto end;
3143 }
3144 if (beg < 0) {
3145 if (len > -beg) len = -beg;
3146 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3147 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3148 beg = -beg;
3149 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3150 p = e;
3151 if (!p) return 0;
3152 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3153 if (!p) return 0;
3154 len = e - p;
3155 goto end;
3156 }
3157 else {
3158 slen = str_strlen(str, enc);
3159 beg += slen;
3160 if (beg < 0) return 0;
3161 p = s + beg;
3162 if (len == 0) goto end;
3163 }
3164 }
3165 else if (beg > 0 && beg > blen) {
3166 return 0;
3167 }
3168 if (len == 0) {
3169 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3170 p = s + beg;
3171 }
3172#ifdef NONASCII_MASK
3173 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3174 enc == rb_utf8_encoding()) {
3175 p = str_utf8_nth(s, e, &beg);
3176 if (beg > 0) return 0;
3177 len = str_utf8_offset(p, e, len);
3178 }
3179#endif
3180 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3181 int char_sz = rb_enc_mbmaxlen(enc);
3182
3183 p = s + beg * char_sz;
3184 if (p > e) {
3185 return 0;
3186 }
3187 else if (len * char_sz > e - p)
3188 len = e - p;
3189 else
3190 len *= char_sz;
3191 }
3192 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3193 if (beg > 0) return 0;
3194 len = 0;
3195 }
3196 else {
3197 len = str_offset(p, e, len, enc, 0);
3198 }
3199 end:
3200 *lenp = len;
3201 RB_GC_GUARD(str);
3202 return p;
3203}
3204
3205static VALUE str_substr(VALUE str, long beg, long len, int empty);
3206
3207VALUE
3208rb_str_substr(VALUE str, long beg, long len)
3209{
3210 return str_substr(str, beg, len, TRUE);
3211}
3212
3213VALUE
3214rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3215{
3216 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3217}
3218
3219static VALUE
3220str_substr(VALUE str, long beg, long len, int empty)
3221{
3222 char *p = rb_str_subpos(str, beg, &len);
3223
3224 if (!p) return Qnil;
3225 if (!len && !empty) return Qnil;
3226
3227 beg = p - RSTRING_PTR(str);
3228
3229 VALUE str2 = str_subseq(str, beg, len);
3230 rb_enc_cr_str_copy_for_substr(str2, str);
3231 return str2;
3232}
3233
3234/* :nodoc: */
3235VALUE
3237{
3238 if (CHILLED_STRING_P(str)) {
3239 FL_UNSET_RAW(str, STR_CHILLED);
3240 }
3241
3242 if (OBJ_FROZEN(str)) return str;
3243 rb_str_resize(str, RSTRING_LEN(str));
3244 return rb_obj_freeze(str);
3245}
3246
3247/*
3248 * call-seq:
3249 * +string -> new_string or self
3250 *
3251 * Returns +self+ if +self+ is not frozen and can be mutated
3252 * without warning issuance.
3253 *
3254 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3255 *
3256 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3257 */
3258static VALUE
3259str_uplus(VALUE str)
3260{
3261 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3262 return rb_str_dup(str);
3263 }
3264 else {
3265 return str;
3266 }
3267}
3268
3269/*
3270 * call-seq:
3271 * -self -> frozen_string
3272 *
3273 * Returns a frozen string equal to +self+.
3274 *
3275 * The returned string is +self+ if and only if all of the following are true:
3276 *
3277 * - +self+ is already frozen.
3278 * - +self+ is an instance of \String (rather than of a subclass of \String)
3279 * - +self+ has no instance variables set on it.
3280 *
3281 * Otherwise, the returned string is a frozen copy of +self+.
3282 *
3283 * Returning +self+, when possible, saves duplicating +self+;
3284 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3285 *
3286 * It may also save duplicating other, already-existing, strings:
3287 *
3288 * s0 = 'foo'
3289 * s1 = 'foo'
3290 * s0.object_id == s1.object_id # => false
3291 * (-s0).object_id == (-s1).object_id # => true
3292 *
3293 * Note that method #-@ is convenient for defining a constant:
3294 *
3295 * FileName = -'config/database.yml'
3296 *
3297 * While its alias #dedup is better suited for chaining:
3298 *
3299 * 'foo'.dedup.gsub!('o')
3300 *
3301 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3302 */
3303static VALUE
3304str_uminus(VALUE str)
3305{
3306 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3307 str = rb_str_dup(str);
3308 }
3309 return rb_fstring(str);
3310}
3311
3312RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3313#define rb_str_dup_frozen rb_str_new_frozen
3314
3315VALUE
3317{
3318 rb_check_frozen(str);
3319 if (FL_TEST(str, STR_TMPLOCK)) {
3320 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3321 }
3322 FL_SET(str, STR_TMPLOCK);
3323 return str;
3324}
3325
3326VALUE
3328{
3329 rb_check_frozen(str);
3330 if (!FL_TEST(str, STR_TMPLOCK)) {
3331 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3332 }
3333 FL_UNSET(str, STR_TMPLOCK);
3334 return str;
3335}
3336
3337VALUE
3338rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3339{
3340 rb_str_locktmp(str);
3341 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3342}
3343
3344void
3346{
3347 RUBY_ASSERT(ruby_thread_has_gvl_p());
3348
3349 long capa;
3350 const int termlen = TERM_LEN(str);
3351
3352 str_modifiable(str);
3353 if (STR_SHARED_P(str)) {
3354 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3355 }
3356 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3357 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3358 }
3359
3360 int cr = ENC_CODERANGE(str);
3361 if (len == 0) {
3362 /* Empty string does not contain non-ASCII */
3364 }
3365 else if (cr == ENC_CODERANGE_UNKNOWN) {
3366 /* Leave unknown. */
3367 }
3368 else if (len > RSTRING_LEN(str)) {
3369 if (ENC_CODERANGE_CLEAN_P(cr)) {
3370 /* Update the coderange regarding the extended part. */
3371 const char *const prev_end = RSTRING_END(str);
3372 const char *const new_end = RSTRING_PTR(str) + len;
3373 rb_encoding *enc = rb_enc_get(str);
3374 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3375 ENC_CODERANGE_SET(str, cr);
3376 }
3377 else if (cr == ENC_CODERANGE_BROKEN) {
3378 /* May be valid now, by appended part. */
3380 }
3381 }
3382 else if (len < RSTRING_LEN(str)) {
3383 if (cr != ENC_CODERANGE_7BIT) {
3384 /* ASCII-only string is keeping after truncated. Valid
3385 * and broken may be invalid or valid, leave unknown. */
3387 }
3388 }
3389
3390 STR_SET_LEN(str, len);
3391 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3392}
3393
3394VALUE
3395rb_str_resize(VALUE str, long len)
3396{
3397 if (len < 0) {
3398 rb_raise(rb_eArgError, "negative string size (or size too big)");
3399 }
3400
3401 int independent = str_independent(str);
3402 long slen = RSTRING_LEN(str);
3403 const int termlen = TERM_LEN(str);
3404
3405 if (slen > len || (termlen != 1 && slen < len)) {
3407 }
3408
3409 {
3410 long capa;
3411 if (STR_EMBED_P(str)) {
3412 if (len == slen) return str;
3413 if (str_embed_capa(str) >= len + termlen) {
3414 STR_SET_LEN(str, len);
3415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3416 return str;
3417 }
3418 str_make_independent_expand(str, slen, len - slen, termlen);
3419 }
3420 else if (str_embed_capa(str) >= len + termlen) {
3421 char *ptr = STR_HEAP_PTR(str);
3422 STR_SET_EMBED(str);
3423 if (slen > len) slen = len;
3424 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3425 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3426 STR_SET_LEN(str, len);
3427 if (independent) ruby_xfree(ptr);
3428 return str;
3429 }
3430 else if (!independent) {
3431 if (len == slen) return str;
3432 str_make_independent_expand(str, slen, len - slen, termlen);
3433 }
3434 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3435 (capa - len) > (len < 1024 ? len : 1024)) {
3436 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3437 (size_t)len + termlen, STR_HEAP_SIZE(str));
3438 RSTRING(str)->as.heap.aux.capa = len;
3439 }
3440 else if (len == slen) return str;
3441 STR_SET_LEN(str, len);
3442 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3443 }
3444 return str;
3445}
3446
3447static void
3448str_ensure_available_capa(VALUE str, long len)
3449{
3450 str_modify_keep_cr(str);
3451
3452 const int termlen = TERM_LEN(str);
3453 long olen = RSTRING_LEN(str);
3454
3455 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3456 rb_raise(rb_eArgError, "string sizes too big");
3457 }
3458
3459 long total = olen + len;
3460 long capa = str_capacity(str, termlen);
3461
3462 if (capa < total) {
3463 if (total >= LONG_MAX / 2) {
3464 capa = total;
3465 }
3466 while (total > capa) {
3467 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3468 }
3469 RESIZE_CAPA_TERM(str, capa, termlen);
3470 }
3471}
3472
3473static VALUE
3474str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3475{
3476 if (keep_cr) {
3477 str_modify_keep_cr(str);
3478 }
3479 else {
3480 rb_str_modify(str);
3481 }
3482 if (len == 0) return 0;
3483
3484 long total, olen, off = -1;
3485 char *sptr;
3486 const int termlen = TERM_LEN(str);
3487
3488 RSTRING_GETMEM(str, sptr, olen);
3489 if (ptr >= sptr && ptr <= sptr + olen) {
3490 off = ptr - sptr;
3491 }
3492
3493 long capa = str_capacity(str, termlen);
3494
3495 if (olen > LONG_MAX - len) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498 total = olen + len;
3499 if (capa < total) {
3500 if (total >= LONG_MAX / 2) {
3501 capa = total;
3502 }
3503 while (total > capa) {
3504 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3505 }
3506 RESIZE_CAPA_TERM(str, capa, termlen);
3507 sptr = RSTRING_PTR(str);
3508 }
3509 if (off != -1) {
3510 ptr = sptr + off;
3511 }
3512 memcpy(sptr + olen, ptr, len);
3513 STR_SET_LEN(str, total);
3514 TERM_FILL(sptr + total, termlen); /* sentinel */
3515
3516 return str;
3517}
3518
3519#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3520#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3521
3522VALUE
3523rb_str_cat(VALUE str, const char *ptr, long len)
3524{
3525 if (len == 0) return str;
3526 if (len < 0) {
3527 rb_raise(rb_eArgError, "negative string size (or size too big)");
3528 }
3529 return str_buf_cat(str, ptr, len);
3530}
3531
3532VALUE
3533rb_str_cat_cstr(VALUE str, const char *ptr)
3534{
3535 must_not_null(ptr);
3536 return rb_str_buf_cat(str, ptr, strlen(ptr));
3537}
3538
3539static void
3540rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3541{
3542 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3543
3544 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3545 if (UNLIKELY(!str_independent(str))) {
3546 str_make_independent(str);
3547 }
3548
3549 long string_length = -1;
3550 const int null_terminator_length = 1;
3551 char *sptr;
3552 RSTRING_GETMEM(str, sptr, string_length);
3553
3554 // Ensure the resulting string wouldn't be too long.
3555 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3556 rb_raise(rb_eArgError, "string sizes too big");
3557 }
3558
3559 long string_capacity = str_capacity(str, null_terminator_length);
3560
3561 // Get the code range before any modifications since those might clear the code range.
3562 int cr = ENC_CODERANGE(str);
3563
3564 // Check if the string has spare string_capacity to write the new byte.
3565 if (LIKELY(string_capacity >= string_length + 1)) {
3566 // In fast path we can write the new byte and note the string's new length.
3567 sptr[string_length] = byte;
3568 STR_SET_LEN(str, string_length + 1);
3569 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3570 }
3571 else {
3572 // If there's not enough string_capacity, make a call into the general string concatenation function.
3573 str_buf_cat(str, (char *)&byte, 1);
3574 }
3575
3576 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3577 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3578 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3579 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3580 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3581 if (ISASCII(byte)) {
3583 }
3584 else {
3586
3587 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3588 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3589 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3590 }
3591 }
3592 }
3593}
3594
3595RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3596RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3597RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3598
3599static VALUE
3600rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3601 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3602{
3603 int str_encindex = ENCODING_GET(str);
3604 int res_encindex;
3605 int str_cr, res_cr;
3606 rb_encoding *str_enc, *ptr_enc;
3607
3608 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3609
3610 if (str_encindex == ptr_encindex) {
3611 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3612 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3613 }
3614 }
3615 else {
3616 str_enc = rb_enc_from_index(str_encindex);
3617 ptr_enc = rb_enc_from_index(ptr_encindex);
3618 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3619 if (len == 0)
3620 return str;
3621 if (RSTRING_LEN(str) == 0) {
3622 rb_str_buf_cat(str, ptr, len);
3623 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3624 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3625 return str;
3626 }
3627 goto incompatible;
3628 }
3629 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3630 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3631 }
3632 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3633 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3634 str_cr = rb_enc_str_coderange(str);
3635 }
3636 }
3637 }
3638 if (ptr_cr_ret)
3639 *ptr_cr_ret = ptr_cr;
3640
3641 if (str_encindex != ptr_encindex &&
3642 str_cr != ENC_CODERANGE_7BIT &&
3643 ptr_cr != ENC_CODERANGE_7BIT) {
3644 str_enc = rb_enc_from_index(str_encindex);
3645 ptr_enc = rb_enc_from_index(ptr_encindex);
3646 goto incompatible;
3647 }
3648
3649 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3650 res_encindex = str_encindex;
3651 res_cr = ENC_CODERANGE_UNKNOWN;
3652 }
3653 else if (str_cr == ENC_CODERANGE_7BIT) {
3654 if (ptr_cr == ENC_CODERANGE_7BIT) {
3655 res_encindex = str_encindex;
3656 res_cr = ENC_CODERANGE_7BIT;
3657 }
3658 else {
3659 res_encindex = ptr_encindex;
3660 res_cr = ptr_cr;
3661 }
3662 }
3663 else if (str_cr == ENC_CODERANGE_VALID) {
3664 res_encindex = str_encindex;
3665 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3666 res_cr = str_cr;
3667 else
3668 res_cr = ptr_cr;
3669 }
3670 else { /* str_cr == ENC_CODERANGE_BROKEN */
3671 res_encindex = str_encindex;
3672 res_cr = str_cr;
3673 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3674 }
3675
3676 if (len < 0) {
3677 rb_raise(rb_eArgError, "negative string size (or size too big)");
3678 }
3679 str_buf_cat(str, ptr, len);
3680 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3681 return str;
3682
3683 incompatible:
3684 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3685 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3687}
3688
3689VALUE
3690rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3691{
3692 return rb_enc_cr_str_buf_cat(str, ptr, len,
3693 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3694}
3695
3696VALUE
3698{
3699 /* ptr must reference NUL terminated ASCII string. */
3700 int encindex = ENCODING_GET(str);
3701 rb_encoding *enc = rb_enc_from_index(encindex);
3702 if (rb_enc_asciicompat(enc)) {
3703 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3704 encindex, ENC_CODERANGE_7BIT, 0);
3705 }
3706 else {
3707 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3708 while (*ptr) {
3709 unsigned int c = (unsigned char)*ptr;
3710 int len = rb_enc_codelen(c, enc);
3711 rb_enc_mbcput(c, buf, enc);
3712 rb_enc_cr_str_buf_cat(str, buf, len,
3713 encindex, ENC_CODERANGE_VALID, 0);
3714 ptr++;
3715 }
3716 return str;
3717 }
3718}
3719
3720VALUE
3722{
3723 int str2_cr = rb_enc_str_coderange(str2);
3724
3725 if (str_enc_fastpath(str)) {
3726 switch (str2_cr) {
3727 case ENC_CODERANGE_7BIT:
3728 // If RHS is 7bit we can do simple concatenation
3729 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3730 RB_GC_GUARD(str2);
3731 return str;
3733 // If RHS is valid, we can do simple concatenation if encodings are the same
3734 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3735 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3736 int str_cr = ENC_CODERANGE(str);
3737 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3738 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3739 }
3740 RB_GC_GUARD(str2);
3741 return str;
3742 }
3743 }
3744 }
3745
3746 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3747 ENCODING_GET(str2), str2_cr, &str2_cr);
3748
3749 ENC_CODERANGE_SET(str2, str2_cr);
3750
3751 return str;
3752}
3753
3754VALUE
3756{
3757 StringValue(str2);
3758 return rb_str_buf_append(str, str2);
3759}
3760
3761VALUE
3762rb_str_concat_literals(size_t num, const VALUE *strary)
3763{
3764 VALUE str;
3765 size_t i, s = 0;
3766 unsigned long len = 1;
3767
3768 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3769 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3770
3771 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3772 str = rb_str_buf_new(len);
3773 str_enc_copy_direct(str, strary[0]);
3774
3775 for (i = s; i < num; ++i) {
3776 const VALUE v = strary[i];
3777 int encidx = ENCODING_GET(v);
3778
3779 rb_str_buf_append(str, v);
3780 if (encidx != ENCINDEX_US_ASCII) {
3781 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3782 rb_enc_set_index(str, encidx);
3783 }
3784 }
3785 return str;
3786}
3787
3788/*
3789 * call-seq:
3790 * concat(*objects) -> string
3791 *
3792 * :include: doc/string/concat.rdoc
3793 */
3794static VALUE
3795rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3796{
3797 str_modifiable(str);
3798
3799 if (argc == 1) {
3800 return rb_str_concat(str, argv[0]);
3801 }
3802 else if (argc > 1) {
3803 int i;
3804 VALUE arg_str = rb_str_tmp_new(0);
3805 rb_enc_copy(arg_str, str);
3806 for (i = 0; i < argc; i++) {
3807 rb_str_concat(arg_str, argv[i]);
3808 }
3809 rb_str_buf_append(str, arg_str);
3810 }
3811
3812 return str;
3813}
3814
3815/*
3816 * call-seq:
3817 * append_as_bytes(*objects) -> self
3818 *
3819 * Concatenates each object in +objects+ into +self+; returns +self+;
3820 * performs no encoding validation or conversion:
3821 *
3822 * s = 'foo'
3823 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3824 * s.valid_encoding? # => false
3825 * s.append_as_bytes("\xAC 12")
3826 * s.valid_encoding? # => true
3827 *
3828 * When a given object is an integer,
3829 * the value is considered an 8-bit byte;
3830 * if the integer occupies more than one byte (i.e., is greater than 255),
3831 * appends only the low-order byte (similar to String#setbyte):
3832 *
3833 * s = ""
3834 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3835 * s.bytesize # => 2
3836 *
3837 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3838 */
3839
3840VALUE
3841rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3842{
3843 long needed_capacity = 0;
3844 volatile VALUE t0;
3845 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3846
3847 for (int index = 0; index < argc; index++) {
3848 VALUE obj = argv[index];
3849 enum ruby_value_type type = types[index] = rb_type(obj);
3850 switch (type) {
3851 case T_FIXNUM:
3852 case T_BIGNUM:
3853 needed_capacity++;
3854 break;
3855 case T_STRING:
3856 needed_capacity += RSTRING_LEN(obj);
3857 break;
3858 default:
3859 rb_raise(
3861 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3862 rb_obj_class(obj)
3863 );
3864 break;
3865 }
3866 }
3867
3868 str_ensure_available_capa(str, needed_capacity);
3869 char *sptr = RSTRING_END(str);
3870
3871 for (int index = 0; index < argc; index++) {
3872 VALUE obj = argv[index];
3873 enum ruby_value_type type = types[index];
3874 switch (type) {
3875 case T_FIXNUM:
3876 case T_BIGNUM: {
3877 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3878 char byte = (char)(NUM2INT(obj) & 0xFF);
3879 *sptr = byte;
3880 sptr++;
3881 break;
3882 }
3883 case T_STRING: {
3884 const char *ptr;
3885 long len;
3886 RSTRING_GETMEM(obj, ptr, len);
3887 memcpy(sptr, ptr, len);
3888 sptr += len;
3889 break;
3890 }
3891 default:
3892 rb_bug("append_as_bytes arguments should have been validated");
3893 }
3894 }
3895
3896 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3897 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3898
3899 int cr = ENC_CODERANGE(str);
3900 switch (cr) {
3901 case ENC_CODERANGE_7BIT: {
3902 for (int index = 0; index < argc; index++) {
3903 VALUE obj = argv[index];
3904 enum ruby_value_type type = types[index];
3905 switch (type) {
3906 case T_FIXNUM:
3907 case T_BIGNUM: {
3908 if (!ISASCII(NUM2INT(obj))) {
3909 goto clear_cr;
3910 }
3911 break;
3912 }
3913 case T_STRING: {
3914 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3915 goto clear_cr;
3916 }
3917 break;
3918 }
3919 default:
3920 rb_bug("append_as_bytes arguments should have been validated");
3921 }
3922 }
3923 break;
3924 }
3926 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3927 goto keep_cr;
3928 }
3929 else {
3930 goto clear_cr;
3931 }
3932 break;
3933 default:
3934 goto clear_cr;
3935 break;
3936 }
3937
3938 RB_GC_GUARD(t0);
3939
3940 clear_cr:
3941 // If no fast path was hit, we clear the coderange.
3942 // append_as_bytes is predominently meant to be used in
3943 // buffering situation, hence it's likely the coderange
3944 // will never be scanned, so it's not worth spending time
3945 // precomputing the coderange except for simple and common
3946 // situations.
3948 keep_cr:
3949 return str;
3950}
3951
3952/*
3953 * call-seq:
3954 * self << object -> self
3955 *
3956 * Appends a string representation of +object+ to +self+;
3957 * returns +self+.
3958 *
3959 * If +object+ is a string, appends it to +self+:
3960 *
3961 * s = 'foo'
3962 * s << 'bar' # => "foobar"
3963 * s # => "foobar"
3964 *
3965 * If +object+ is an integer,
3966 * its value is considered a codepoint;
3967 * converts the value to a character before concatenating:
3968 *
3969 * s = 'foo'
3970 * s << 33 # => "foo!"
3971 *
3972 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3973 * and the encoding of +self+ is Encoding::US_ASCII,
3974 * changes the encoding to Encoding::ASCII_8BIT:
3975 *
3976 * s = 'foo'.encode(Encoding::US_ASCII)
3977 * s.encoding # => #<Encoding:US-ASCII>
3978 * s << 0xff # => "foo\xFF"
3979 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3980 *
3981 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3982 *
3983 * s = 'foo'
3984 * s.encoding # => <Encoding:UTF-8>
3985 * s << 0x00110000 # 1114112 out of char range (RangeError)
3986 * s = 'foo'.encode(Encoding::EUC_JP)
3987 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3988 *
3989 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3990 */
3991VALUE
3993{
3994 unsigned int code;
3995 rb_encoding *enc = STR_ENC_GET(str1);
3996 int encidx;
3997
3998 if (RB_INTEGER_TYPE_P(str2)) {
3999 if (rb_num_to_uint(str2, &code) == 0) {
4000 }
4001 else if (FIXNUM_P(str2)) {
4002 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4003 }
4004 else {
4005 rb_raise(rb_eRangeError, "bignum out of char range");
4006 }
4007 }
4008 else {
4009 return rb_str_append(str1, str2);
4010 }
4011
4012 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4013
4014 if (encidx >= 0) {
4015 rb_str_buf_cat_byte(str1, (unsigned char)code);
4016 }
4017 else {
4018 long pos = RSTRING_LEN(str1);
4019 int cr = ENC_CODERANGE(str1);
4020 int len;
4021 char *buf;
4022
4023 switch (len = rb_enc_codelen(code, enc)) {
4024 case ONIGERR_INVALID_CODE_POINT_VALUE:
4025 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4026 break;
4027 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4028 case 0:
4029 rb_raise(rb_eRangeError, "%u out of char range", code);
4030 break;
4031 }
4032 buf = ALLOCA_N(char, len + 1);
4033 rb_enc_mbcput(code, buf, enc);
4034 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4035 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4036 }
4037 rb_str_resize(str1, pos+len);
4038 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4039 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4041 }
4042 else if (cr == ENC_CODERANGE_BROKEN) {
4044 }
4045 ENC_CODERANGE_SET(str1, cr);
4046 }
4047 return str1;
4048}
4049
4050int
4051rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4052{
4053 int encidx = rb_enc_to_index(enc);
4054
4055 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4056 /* US-ASCII automatically extended to ASCII-8BIT */
4057 if (code > 0xFF) {
4058 rb_raise(rb_eRangeError, "%u out of char range", code);
4059 }
4060 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4061 return ENCINDEX_ASCII_8BIT;
4062 }
4063 return encidx;
4064 }
4065 else {
4066 return -1;
4067 }
4068}
4069
4070/*
4071 * call-seq:
4072 * prepend(*other_strings) -> string
4073 *
4074 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4075 *
4076 * s = 'foo'
4077 * s.prepend('bar', 'baz') # => "barbazfoo"
4078 * s # => "barbazfoo"
4079 *
4080 * Related: String#concat.
4081 */
4082
4083static VALUE
4084rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4085{
4086 str_modifiable(str);
4087
4088 if (argc == 1) {
4089 rb_str_update(str, 0L, 0L, argv[0]);
4090 }
4091 else if (argc > 1) {
4092 int i;
4093 VALUE arg_str = rb_str_tmp_new(0);
4094 rb_enc_copy(arg_str, str);
4095 for (i = 0; i < argc; i++) {
4096 rb_str_append(arg_str, argv[i]);
4097 }
4098 rb_str_update(str, 0L, 0L, arg_str);
4099 }
4100
4101 return str;
4102}
4103
4104st_index_t
4106{
4107 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4108 st_index_t precomputed_hash;
4109 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4110
4111 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4112 return precomputed_hash;
4113 }
4114
4115 return str_do_hash(str);
4116}
4117
4118int
4120{
4121 long len1, len2;
4122 const char *ptr1, *ptr2;
4123 RSTRING_GETMEM(str1, ptr1, len1);
4124 RSTRING_GETMEM(str2, ptr2, len2);
4125 return (len1 != len2 ||
4126 !rb_str_comparable(str1, str2) ||
4127 memcmp(ptr1, ptr2, len1) != 0);
4128}
4129
4130/*
4131 * call-seq:
4132 * hash -> integer
4133 *
4134 * Returns the integer hash value for +self+.
4135 * The value is based on the length, content and encoding of +self+.
4136 *
4137 * Related: Object#hash.
4138 */
4139
4140static VALUE
4141rb_str_hash_m(VALUE str)
4142{
4143 st_index_t hval = rb_str_hash(str);
4144 return ST2FIX(hval);
4145}
4146
/* Yields the smaller of a and b.  Function-like macro: the argument that is
 * selected is evaluated twice, so avoid arguments with side effects. */
4147#define lesser(a,b) (((a)>(b))?(b):(a))
4148
4149int
4151{
4152 int idx1, idx2;
4153 int rc1, rc2;
4154
4155 if (RSTRING_LEN(str1) == 0) return TRUE;
4156 if (RSTRING_LEN(str2) == 0) return TRUE;
4157 idx1 = ENCODING_GET(str1);
4158 idx2 = ENCODING_GET(str2);
4159 if (idx1 == idx2) return TRUE;
4160 rc1 = rb_enc_str_coderange(str1);
4161 rc2 = rb_enc_str_coderange(str2);
4162 if (rc1 == ENC_CODERANGE_7BIT) {
4163 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4164 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4165 return TRUE;
4166 }
4167 if (rc2 == ENC_CODERANGE_7BIT) {
4168 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4169 return TRUE;
4170 }
4171 return FALSE;
4172}
4173
4174int
4176{
4177 long len1, len2;
4178 const char *ptr1, *ptr2;
4179 int retval;
4180
4181 if (str1 == str2) return 0;
4182 RSTRING_GETMEM(str1, ptr1, len1);
4183 RSTRING_GETMEM(str2, ptr2, len2);
4184 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4185 if (len1 == len2) {
4186 if (!rb_str_comparable(str1, str2)) {
4187 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4188 return 1;
4189 return -1;
4190 }
4191 return 0;
4192 }
4193 if (len1 > len2) return 1;
4194 return -1;
4195 }
4196 if (retval > 0) return 1;
4197 return -1;
4198}
4199
4200/*
4201 * call-seq:
4202 * self == object -> true or false
4203 *
4204 * Returns whether +object+ is equal to +self+.
4205 *
4206 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4207 *
4208 * s = 'foo'
4209 * s == 'foo' # => true
4210 * s == 'food' # => false
4211 * s == 'FOO' # => false
4212 *
4213 * Returns +false+ if the two strings' encodings are not compatible:
4214 *
4215 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4216 *
4217 * When +object+ is not a string:
4218 *
4219 * - If +object+ responds to method <tt>to_str</tt>,
4220 * <tt>object == self</tt> is called and its return value is returned.
4221 * - If +object+ does not respond to <tt>to_str</tt>,
4222 * +false+ is returned.
4223 *
4224 * Related: {Comparing}[rdoc-ref:String@Comparing].
4225 */
4226
4227VALUE
4229{
4230 if (str1 == str2) return Qtrue;
4231 if (!RB_TYPE_P(str2, T_STRING)) {
4232 if (!rb_respond_to(str2, idTo_str)) {
4233 return Qfalse;
4234 }
4235 return rb_equal(str2, str1);
4236 }
4237 return rb_str_eql_internal(str1, str2);
4238}
4239
4240/*
4241 * call-seq:
4242 * eql?(object) -> true or false
4243 *
4244 * Returns +true+ if +object+ has the same length and content
4245 * as +self+; +false+ otherwise:
4246 *
4247 * s = 'foo'
4248 * s.eql?('foo') # => true
4249 * s.eql?('food') # => false
4250 * s.eql?('FOO') # => false
4251 *
4252 * Returns +false+ if the two strings' encodings are not compatible:
4253 *
4254 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4255 *
4256 */
4257
4258VALUE
4259rb_str_eql(VALUE str1, VALUE str2)
4260{
4261 if (str1 == str2) return Qtrue;
4262 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4263 return rb_str_eql_internal(str1, str2);
4264}
4265
4266/*
4267 * call-seq:
4268 * self <=> other_string -> -1, 0, 1, or nil
4269 *
4270 * Compares +self+ and +other_string+, returning:
4271 *
4272 * - -1 if +other_string+ is larger.
4273 * - 0 if the two are equal.
4274 * - 1 if +other_string+ is smaller.
4275 * - +nil+ if the two are incomparable.
4276 *
4277 * Examples:
4278 *
4279 * 'foo' <=> 'foo' # => 0
4280 * 'foo' <=> 'food' # => -1
4281 * 'food' <=> 'foo' # => 1
4282 * 'FOO' <=> 'foo' # => -1
4283 * 'foo' <=> 'FOO' # => 1
4284 * 'foo' <=> 1 # => nil
4285 *
4286 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4287 */
4288
4289static VALUE
4290rb_str_cmp_m(VALUE str1, VALUE str2)
4291{
4292 int result;
4293 VALUE s = rb_check_string_type(str2);
4294 if (NIL_P(s)) {
4295 return rb_invcmp(str1, str2);
4296 }
4297 result = rb_str_cmp(str1, s);
4298 return INT2FIX(result);
4299}
4300
/* Forward declarations: both helpers are called by method wrappers that
 * appear before their definitions in this file. */
4301static VALUE str_casecmp(VALUE str1, VALUE str2);
4302static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4303
4304/*
4305 * call-seq:
4306 * casecmp(other_string) -> -1, 0, 1, or nil
4307 *
4308 * Ignoring case, compares +self+ and +other_string+; returns:
4309 *
4310 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4311 * - 0 if the two are equal.
4312 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4313 * - +nil+ if the two are incomparable.
4314 *
4315 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4316 *
4317 * Examples:
4318 *
4319 * 'foo'.casecmp('goo') # => -1
4320 * 'goo'.casecmp('foo') # => 1
4321 * 'foo'.casecmp('food') # => -1
4322 * 'food'.casecmp('foo') # => 1
4323 * 'FOO'.casecmp('foo') # => 0
4324 * 'foo'.casecmp('FOO') # => 0
4325 * 'foo'.casecmp(1) # => nil
4326 *
4327 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4328 */
4329
4330static VALUE
4331rb_str_casecmp(VALUE str1, VALUE str2)
4332{
4333 VALUE s = rb_check_string_type(str2);
4334 if (NIL_P(s)) {
4335 return Qnil;
4336 }
4337 return str_casecmp(str1, s);
4338}
4339
4340static VALUE
4341str_casecmp(VALUE str1, VALUE str2)
4342{
4343 long len;
4344 rb_encoding *enc;
4345 const char *p1, *p1end, *p2, *p2end;
4346
4347 enc = rb_enc_compatible(str1, str2);
4348 if (!enc) {
4349 return Qnil;
4350 }
4351
4352 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4353 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4354 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4355 while (p1 < p1end && p2 < p2end) {
4356 if (*p1 != *p2) {
4357 unsigned int c1 = TOLOWER(*p1 & 0xff);
4358 unsigned int c2 = TOLOWER(*p2 & 0xff);
4359 if (c1 != c2)
4360 return INT2FIX(c1 < c2 ? -1 : 1);
4361 }
4362 p1++;
4363 p2++;
4364 }
4365 }
4366 else {
4367 while (p1 < p1end && p2 < p2end) {
4368 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4369 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4370
4371 if (0 <= c1 && 0 <= c2) {
4372 c1 = TOLOWER(c1);
4373 c2 = TOLOWER(c2);
4374 if (c1 != c2)
4375 return INT2FIX(c1 < c2 ? -1 : 1);
4376 }
4377 else {
4378 int r;
4379 l1 = rb_enc_mbclen(p1, p1end, enc);
4380 l2 = rb_enc_mbclen(p2, p2end, enc);
4381 len = l1 < l2 ? l1 : l2;
4382 r = memcmp(p1, p2, len);
4383 if (r != 0)
4384 return INT2FIX(r < 0 ? -1 : 1);
4385 if (l1 != l2)
4386 return INT2FIX(l1 < l2 ? -1 : 1);
4387 }
4388 p1 += l1;
4389 p2 += l2;
4390 }
4391 }
4392 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4393 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4394 return INT2FIX(-1);
4395}
4396
4397/*
4398 * call-seq:
4399 * casecmp?(other_string) -> true, false, or nil
4400 *
4401 * Returns +true+ if +self+ and +other_string+ are equal after
4402 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4403 *
4404 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4405 *
4406 * Examples:
4407 *
4408 * 'foo'.casecmp?('goo') # => false
4409 * 'goo'.casecmp?('foo') # => false
4410 * 'foo'.casecmp?('food') # => false
4411 * 'food'.casecmp?('foo') # => false
4412 * 'FOO'.casecmp?('foo') # => true
4413 * 'foo'.casecmp?('FOO') # => true
4414 * 'foo'.casecmp?(1) # => nil
4415 *
4416 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4417 */
4418
4419static VALUE
4420rb_str_casecmp_p(VALUE str1, VALUE str2)
4421{
4422 VALUE s = rb_check_string_type(str2);
4423 if (NIL_P(s)) {
4424 return Qnil;
4425 }
4426 return str_casecmp_p(str1, s);
4427}
4428
4429static VALUE
4430str_casecmp_p(VALUE str1, VALUE str2)
4431{
4432 rb_encoding *enc;
4433 VALUE folded_str1, folded_str2;
4434 VALUE fold_opt = sym_fold;
4435
4436 enc = rb_enc_compatible(str1, str2);
4437 if (!enc) {
4438 return Qnil;
4439 }
4440
4441 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4442 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4443
4444 return rb_str_eql(folded_str1, folded_str2);
4445}
4446
4447static long
4448strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4449 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4450{
4451 const char *search_start = str_ptr;
4452 long pos, search_len = str_len - offset;
4453
4454 for (;;) {
4455 const char *t;
4456 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4457 if (pos < 0) return pos;
4458 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4459 if (t == search_start + pos) break;
4460 search_len -= t - search_start;
4461 if (search_len <= 0) return -1;
4462 offset += t - search_start;
4463 search_start = t;
4464 }
4465 return pos + offset;
4466}
4467
4468/* found index in byte */
/* Both wrap rb_strseq_index: rb_str_index passes in_byte = 0 (offset is a
 * character offset), rb_str_byteindex passes in_byte = 1 (offset is a byte
 * offset). */
4469#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4470#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4471
4472static long
4473rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4474{
4475 const char *str_ptr, *str_ptr_end, *sub_ptr;
4476 long str_len, sub_len;
4477 rb_encoding *enc;
4478
4479 enc = rb_enc_check(str, sub);
4480 if (is_broken_string(sub)) return -1;
4481
4482 str_ptr = RSTRING_PTR(str);
4483 str_ptr_end = RSTRING_END(str);
4484 str_len = RSTRING_LEN(str);
4485 sub_ptr = RSTRING_PTR(sub);
4486 sub_len = RSTRING_LEN(sub);
4487
4488 if (str_len < sub_len) return -1;
4489
4490 if (offset != 0) {
4491 long str_len_char, sub_len_char;
4492 int single_byte = single_byte_optimizable(str);
4493 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4494 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4495 if (offset < 0) {
4496 offset += str_len_char;
4497 if (offset < 0) return -1;
4498 }
4499 if (str_len_char - offset < sub_len_char) return -1;
4500 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4501 str_ptr += offset;
4502 }
4503 if (sub_len == 0) return offset;
4504
4505 /* need proceed one character at a time */
4506 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4507}
4508
4509
4510/*
4511 * call-seq:
4512 * index(substring, offset = 0) -> integer or nil
4513 * index(regexp, offset = 0) -> integer or nil
4514 *
4515 * :include: doc/string/index.rdoc
4516 *
4517 */
4518
4519static VALUE
4520rb_str_index_m(int argc, VALUE *argv, VALUE str)
4521{
4522 VALUE sub;
4523 VALUE initpos;
4524 rb_encoding *enc = STR_ENC_GET(str);
4525 long pos;
4526
4527 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4528 long slen = str_strlen(str, enc); /* str's enc */
4529 pos = NUM2LONG(initpos);
4530 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4531 if (RB_TYPE_P(sub, T_REGEXP)) {
4533 }
4534 return Qnil;
4535 }
4536 }
4537 else {
4538 pos = 0;
4539 }
4540
4541 if (RB_TYPE_P(sub, T_REGEXP)) {
4542 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4543 enc, single_byte_optimizable(str));
4544
4545 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4546 VALUE match = rb_backref_get();
4547 struct re_registers *regs = RMATCH_REGS(match);
4548 pos = rb_str_sublen(str, BEG(0));
4549 return LONG2NUM(pos);
4550 }
4551 }
4552 else {
4553 StringValue(sub);
4554 pos = rb_str_index(str, sub, pos);
4555 if (pos >= 0) {
4556 pos = rb_str_sublen(str, pos);
4557 return LONG2NUM(pos);
4558 }
4559 }
4560 return Qnil;
4561}
4562
4563/* Ensure that the given pos is a valid character boundary.
4564 * Note that in this function, "character" means a code point
4565 * (Unicode scalar value), not a grapheme cluster.
4566 */
4567static void
4568str_ensure_byte_pos(VALUE str, long pos)
4569{
4570 if (!single_byte_optimizable(str)) {
4571 const char *s = RSTRING_PTR(str);
4572 const char *e = RSTRING_END(str);
4573 const char *p = s + pos;
4574 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4575 rb_raise(rb_eIndexError,
4576 "offset %ld does not land on character boundary", pos);
4577 }
4578 }
4579}
4580
4581/*
4582 * call-seq:
4583 * byteindex(object, offset = 0) -> integer or nil
4584 *
4585 * Returns the 0-based integer index of a substring of +self+
4586 * specified by +object+ (a string or Regexp) and +offset+,
4587 * or +nil+ if there is no such substring;
4588 * the returned index is the count of _bytes_ (not characters).
4589 *
4590 * When +object+ is a string,
4591 * returns the index of the first found substring equal to +object+:
4592 *
4593 * s = 'foo' # => "foo"
4594 * s.size # => 3 # Three 1-byte characters.
4595 * s.bytesize # => 3 # Three bytes.
4596 * s.byteindex('f') # => 0
4597 * s.byteindex('o') # => 1
4598 * s.byteindex('oo') # => 1
4599 * s.byteindex('ooo') # => nil
4600 *
4601 * When +object+ is a Regexp,
4602 * returns the index of the first found substring matching +object+;
4603 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4604 *
4605 * s = 'foo'
4606 * s.byteindex(/f/) # => 0
4607 * $~ # => #<MatchData "f">
4608 * s.byteindex(/o/) # => 1
4609 * s.byteindex(/oo/) # => 1
4610 * s.byteindex(/ooo/) # => nil
4611 * $~ # => nil
4612 *
4613 * \Integer argument +offset+, if given, specifies the 0-based index
4614 * of the byte where searching is to begin.
4615 *
4616 * When +offset+ is non-negative,
4617 * searching begins at byte position +offset+:
4618 *
4619 * s = 'foo'
4620 * s.byteindex('o', 1) # => 1
4621 * s.byteindex('o', 2) # => 2
4622 * s.byteindex('o', 3) # => nil
4623 *
4624 * When +offset+ is negative, counts backward from the end of +self+:
4625 *
4626 * s = 'foo'
4627 * s.byteindex('o', -1) # => 2
4628 * s.byteindex('o', -2) # => 1
4629 * s.byteindex('o', -3) # => 1
4630 * s.byteindex('o', -4) # => nil
4631 *
4632 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4633 *
4634 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4635 * s.size # => 2 # Two 3-byte characters.
4636 * s.bytesize # => 6 # Six bytes.
4637 * s.byteindex("\uFFFF") # => 0
4638 * s.byteindex("\uFFFF", 1) # Raises IndexError
4639 * s.byteindex("\uFFFF", 2) # Raises IndexError
4640 * s.byteindex("\uFFFF", 3) # => 3
4641 * s.byteindex("\uFFFF", 4) # Raises IndexError
4642 * s.byteindex("\uFFFF", 5) # Raises IndexError
4643 * s.byteindex("\uFFFF", 6) # => nil
4644 *
4645 * Related: see {Querying}[rdoc-ref:String@Querying].
4646 */
4647
4648static VALUE
4649rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4650{
4651 VALUE sub;
4652 VALUE initpos;
4653 long pos;
4654
4655 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4656 long slen = RSTRING_LEN(str);
4657 pos = NUM2LONG(initpos);
4658 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4659 if (RB_TYPE_P(sub, T_REGEXP)) {
4661 }
4662 return Qnil;
4663 }
4664 }
4665 else {
4666 pos = 0;
4667 }
4668
4669 str_ensure_byte_pos(str, pos);
4670
4671 if (RB_TYPE_P(sub, T_REGEXP)) {
4672 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4673 VALUE match = rb_backref_get();
4674 struct re_registers *regs = RMATCH_REGS(match);
4675 pos = BEG(0);
4676 return LONG2NUM(pos);
4677 }
4678 }
4679 else {
4680 StringValue(sub);
4681 pos = rb_str_byteindex(str, sub, pos);
4682 if (pos >= 0) return LONG2NUM(pos);
4683 }
4684 return Qnil;
4685}
4686
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr: returns a pointer to the last
 * byte equal to +chr+ within the first +search_len+ bytes of +search_str+,
 * or NULL when there is none (including when +search_len+ is not positive).
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    /* like the libc function, compare against chr converted to
     * unsigned char so values outside 0..255 behave the same way */
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4699
4700static long
4701str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4702{
4703 char *hit, *adjusted;
4704 int c;
4705 long slen, searchlen;
4706 char *sbeg, *e, *t;
4707
4708 sbeg = RSTRING_PTR(str);
4709 slen = RSTRING_LEN(sub);
4710 if (slen == 0) return s - sbeg;
4711 e = RSTRING_END(str);
4712 t = RSTRING_PTR(sub);
4713 c = *t & 0xff;
4714 searchlen = s - sbeg + 1;
4715
4716 if (memcmp(s, t, slen) == 0) {
4717 return s - sbeg;
4718 }
4719
4720 do {
4721 hit = memrchr(sbeg, c, searchlen);
4722 if (!hit) break;
4723 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4724 if (hit != adjusted) {
4725 searchlen = adjusted - sbeg;
4726 continue;
4727 }
4728 if (memcmp(hit, t, slen) == 0)
4729 return hit - sbeg;
4730 searchlen = adjusted - sbeg;
4731 } while (searchlen > 0);
4732
4733 return -1;
4734}
4735
4736/* found index in byte */
4737static long
4738rb_str_rindex(VALUE str, VALUE sub, long pos)
4739{
4740 long len, slen;
4741 char *sbeg, *s;
4742 rb_encoding *enc;
4743 int singlebyte;
4744
4745 enc = rb_enc_check(str, sub);
4746 if (is_broken_string(sub)) return -1;
4747 singlebyte = single_byte_optimizable(str);
4748 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4749 slen = str_strlen(sub, enc); /* rb_enc_check */
4750
4751 /* substring longer than string */
4752 if (len < slen) return -1;
4753 if (len - pos < slen) pos = len - slen;
4754 if (len == 0) return pos;
4755
4756 sbeg = RSTRING_PTR(str);
4757
4758 if (pos == 0) {
4759 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4760 return 0;
4761 else
4762 return -1;
4763 }
4764
4765 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4766 return str_rindex(str, sub, s, enc);
4767}
4768
4769/*
4770 * call-seq:
4771 * rindex(substring, offset = self.length) -> integer or nil
4772 * rindex(regexp, offset = self.length) -> integer or nil
4773 *
4774 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4775 * or +nil+ if none found:
4776 *
4777 * 'foo'.rindex('f') # => 0
4778 * 'foo'.rindex('o') # => 2
4779 * 'foo'.rindex('oo') # => 1
4780 * 'foo'.rindex('ooo') # => nil
4781 *
4782 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4783 * or +nil+ if none found:
4784 *
4785 * 'foo'.rindex(/f/) # => 0
4786 * 'foo'.rindex(/o/) # => 2
4787 * 'foo'.rindex(/oo/) # => 1
4788 * 'foo'.rindex(/ooo/) # => nil
4789 *
4790 * The _last_ match means starting at the possible last position, not
4791 * the last of longest matches.
4792 *
4793 * 'foo'.rindex(/o+/) # => 2
4794 * $~ #=> #<MatchData "o">
4795 *
4796 * To get the last longest match, combine the pattern with a negative
4797 * lookbehind:
4798 *
4799 * 'foo'.rindex(/(?<!o)o+/) # => 1
4800 * $~ #=> #<MatchData "oo">
4801 *
4802 * Or String#index with negative lookforward.
4803 *
4804 * 'foo'.index(/o+(?!.*o)/) # => 1
4805 * $~ #=> #<MatchData "oo">
4806 *
4807 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4808 * string to _end_ the search:
4809 *
4810 * 'foo'.rindex('o', 0) # => nil
4811 * 'foo'.rindex('o', 1) # => 1
4812 * 'foo'.rindex('o', 2) # => 2
4813 * 'foo'.rindex('o', 3) # => 2
4814 *
4815 * If +offset+ is a negative Integer, the maximum starting position in the
4816 * string to _end_ the search is the sum of the string's length and +offset+:
4817 *
4818 * 'foo'.rindex('o', -1) # => 2
4819 * 'foo'.rindex('o', -2) # => 1
4820 * 'foo'.rindex('o', -3) # => nil
4821 * 'foo'.rindex('o', -4) # => nil
4822 *
4823 * Related: String#index.
4824 */
4825
4826static VALUE
4827rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4828{
4829 VALUE sub;
4830 VALUE initpos;
4831 rb_encoding *enc = STR_ENC_GET(str);
4832 long pos, len = str_strlen(str, enc); /* str's enc */
4833
4834 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4835 pos = NUM2LONG(initpos);
4836 if (pos < 0 && (pos += len) < 0) {
4837 if (RB_TYPE_P(sub, T_REGEXP)) {
4839 }
4840 return Qnil;
4841 }
4842 if (pos > len) pos = len;
4843 }
4844 else {
4845 pos = len;
4846 }
4847
4848 if (RB_TYPE_P(sub, T_REGEXP)) {
4849 /* enc = rb_enc_check(str, sub); */
4850 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4851 enc, single_byte_optimizable(str));
4852
4853 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4854 VALUE match = rb_backref_get();
4855 struct re_registers *regs = RMATCH_REGS(match);
4856 pos = rb_str_sublen(str, BEG(0));
4857 return LONG2NUM(pos);
4858 }
4859 }
4860 else {
4861 StringValue(sub);
4862 pos = rb_str_rindex(str, sub, pos);
4863 if (pos >= 0) {
4864 pos = rb_str_sublen(str, pos);
4865 return LONG2NUM(pos);
4866 }
4867 }
4868 return Qnil;
4869}
4870
4871static long
4872rb_str_byterindex(VALUE str, VALUE sub, long pos)
4873{
4874 long len, slen;
4875 char *sbeg, *s;
4876 rb_encoding *enc;
4877
4878 enc = rb_enc_check(str, sub);
4879 if (is_broken_string(sub)) return -1;
4880 len = RSTRING_LEN(str);
4881 slen = RSTRING_LEN(sub);
4882
4883 /* substring longer than string */
4884 if (len < slen) return -1;
4885 if (len - pos < slen) pos = len - slen;
4886 if (len == 0) return pos;
4887
4888 sbeg = RSTRING_PTR(str);
4889
4890 if (pos == 0) {
4891 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4892 return 0;
4893 else
4894 return -1;
4895 }
4896
4897 s = sbeg + pos;
4898 return str_rindex(str, sub, s, enc);
4899}
4900
4901/*
4902 * call-seq:
4903 * byterindex(object, offset = self.bytesize) -> integer or nil
4904 *
4905 * Returns the 0-based integer index of a substring of +self+
4906 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4907 * or +nil+ if there is no such substring;
4908 * the returned index is the count of _bytes_ (not characters).
4909 *
4910 * When +object+ is a string,
4911 * returns the index of the _last_ found substring equal to +object+:
4912 *
4913 * s = 'foo' # => "foo"
4914 * s.size # => 3 # Three 1-byte characters.
4915 * s.bytesize # => 3 # Three bytes.
4916 * s.byterindex('f') # => 0
4917 * s.byterindex('o') # => 2
4918 * s.byterindex('oo') # => 1
4919 * s.byterindex('ooo') # => nil
4920 *
4921 * When +object+ is a Regexp,
4922 * returns the index of the last found substring matching +object+;
4923 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4924 *
4925 * s = 'foo'
4926 * s.byterindex(/f/) # => 0
4927 * $~ # => #<MatchData "f">
4928 * s.byterindex(/o/) # => 2
4929 * s.byterindex(/oo/) # => 1
4930 * s.byterindex(/ooo/) # => nil
4931 * $~ # => nil
4932 *
4933 * The last match means starting at the possible last position,
4934 * not the last of the longest matches:
4935 *
4936 * s = 'foo'
4937 * s.byterindex(/o+/) # => 2
4938 * $~ #=> #<MatchData "o">
4939 *
4940 * To get the last longest match, use a negative lookbehind:
4941 *
4942 * s = 'foo'
4943 * s.byterindex(/(?<!o)o+/) # => 1
4944 * $~ # => #<MatchData "oo">
4945 *
4946 * Or use method #byteindex with negative lookahead:
4947 *
4948 * s = 'foo'
4949 * s.byteindex(/o+(?!.*o)/) # => 1
4950 * $~ #=> #<MatchData "oo">
4951 *
4952 * \Integer argument +offset+, if given, specifies the 0-based index
4953 * of the byte where searching is to end.
4954 *
4955 * When +offset+ is non-negative,
4956 * searching ends at byte position +offset+:
4957 *
4958 * s = 'foo'
4959 * s.byterindex('o', 0) # => nil
4960 * s.byterindex('o', 1) # => 1
4961 * s.byterindex('o', 2) # => 2
4962 * s.byterindex('o', 3) # => 2
4963 *
4964 * When +offset+ is negative, counts backward from the end of +self+:
4965 *
4966 * s = 'foo'
4967 * s.byterindex('o', -1) # => 2
4968 * s.byterindex('o', -2) # => 1
4969 * s.byterindex('o', -3) # => nil
4970 *
4971 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4972 *
4973 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4974 * s.size # => 2 # Two 3-byte characters.
4975 * s.bytesize # => 6 # Six bytes.
4976 * s.byterindex("\uFFFF") # => 3
4977 * s.byterindex("\uFFFF", 1) # Raises IndexError
4978 * s.byterindex("\uFFFF", 2) # Raises IndexError
4979 * s.byterindex("\uFFFF", 3) # => 3
4980 * s.byterindex("\uFFFF", 4) # Raises IndexError
4981 * s.byterindex("\uFFFF", 5) # Raises IndexError
4982 * s.byterindex("\uFFFF", 6) # => nil
4983 *
4984 * Related: see {Querying}[rdoc-ref:String@Querying].
4985 */
4986
4987static VALUE
4988rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4989{
4990 VALUE sub;
4991 VALUE initpos;
4992 long pos, len = RSTRING_LEN(str);
4993
4994 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4995 pos = NUM2LONG(initpos);
4996 if (pos < 0 && (pos += len) < 0) {
4997 if (RB_TYPE_P(sub, T_REGEXP)) {
4999 }
5000 return Qnil;
5001 }
5002 if (pos > len) pos = len;
5003 }
5004 else {
5005 pos = len;
5006 }
5007
5008 str_ensure_byte_pos(str, pos);
5009
5010 if (RB_TYPE_P(sub, T_REGEXP)) {
5011 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5012 VALUE match = rb_backref_get();
5013 struct re_registers *regs = RMATCH_REGS(match);
5014 pos = BEG(0);
5015 return LONG2NUM(pos);
5016 }
5017 }
5018 else {
5019 StringValue(sub);
5020 pos = rb_str_byterindex(str, sub, pos);
5021 if (pos >= 0) return LONG2NUM(pos);
5022 }
5023 return Qnil;
5024}
5025
5026/*
5027 * call-seq:
5028 * self =~ object -> integer or nil
5029 *
5030 * When +object+ is a Regexp, returns the index of the first substring in +self+
5031 * matched by +object+,
5032 * or +nil+ if no match is found;
5033 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5034 *
5035 * 'foo' =~ /f/ # => 0
5036 * $~ # => #<MatchData "f">
5037 * 'foo' =~ /o/ # => 1
5038 * $~ # => #<MatchData "o">
5039 * 'foo' =~ /x/ # => nil
5040 * $~ # => nil
5041 *
5042 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5043 * (see Regexp#=~):
5044 *
5045 * number = nil
5046 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5047 * number # => nil # Not assigned.
5048 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5049 * number # => "9" # Assigned.
5050 *
5051 * If +object+ is not a Regexp, returns the value
5052 * returned by <tt>object =~ self</tt>.
5053 *
5054 * Related: see {Querying}[rdoc-ref:String@Querying].
5055 */
5056
5057static VALUE
5058rb_str_match(VALUE x, VALUE y)
5059{
5060 switch (OBJ_BUILTIN_TYPE(y)) {
5061 case T_STRING:
5062 rb_raise(rb_eTypeError, "type mismatch: String given");
5063
5064 case T_REGEXP:
5065 return rb_reg_match(y, x);
5066
5067 default:
5068 return rb_funcall(y, idEqTilde, 1, x);
5069 }
5070}
5071
5072
5073static VALUE get_pat(VALUE);
5074
5075
5076/*
5077 * call-seq:
5078 * match(pattern, offset = 0) -> matchdata or nil
5079 * match(pattern, offset = 0) {|matchdata| ... } -> object
5080 *
5081 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5082 *
5083 * Note: also updates Regexp@Global+Variables.
5084 *
5085 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5086 * regexp = Regexp.new(pattern)
5087 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5088 * (see Regexp#match):
5089 * matchdata = regexp.match(self)
5090 *
5091 * With no block given, returns the computed +matchdata+:
5092 *
5093 * 'foo'.match('f') # => #<MatchData "f">
5094 * 'foo'.match('o') # => #<MatchData "o">
5095 * 'foo'.match('x') # => nil
5096 *
5097 * If Integer argument +offset+ is given, the search begins at index +offset+:
5098 *
5099 * 'foo'.match('f', 1) # => nil
5100 * 'foo'.match('o', 1) # => #<MatchData "o">
5101 *
5102 * With a block given, calls the block with the computed +matchdata+
5103 * and returns the block's return value:
5104 *
5105 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5106 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5107 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5108 *
5109 */
5110
5111static VALUE
5112rb_str_match_m(int argc, VALUE *argv, VALUE str)
5113{
5114 VALUE re, result;
5115 if (argc < 1)
5116 rb_check_arity(argc, 1, 2);
5117 re = argv[0];
5118 argv[0] = str;
5119 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5120 if (!NIL_P(result) && rb_block_given_p()) {
5121 return rb_yield(result);
5122 }
5123 return result;
5124}
5125
5126/*
5127 * call-seq:
5128 * match?(pattern, offset = 0) -> true or false
5129 *
5130 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5131 *
5132 * Note: does not update Regexp@Global+Variables.
5133 *
5134 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5135 * regexp = Regexp.new(pattern)
5136 *
5137 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5138 * +false+ otherwise:
5139 *
5140 * 'foo'.match?(/o/) # => true
5141 * 'foo'.match?('o') # => true
5142 * 'foo'.match?(/x/) # => false
5143 *
5144 * If Integer argument +offset+ is given, the search begins at index +offset+:
5145 * 'foo'.match?('f', 1) # => false
5146 * 'foo'.match?('o', 1) # => true
5147 *
5148 */
5149
5150static VALUE
5151rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5152{
5153 VALUE re;
5154 rb_check_arity(argc, 1, 2);
5155 re = get_pat(argv[0]);
5156 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5157}
5158
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    /* the bytes are not a character that can be stepped */
    NEIGHBOR_NOT_CHAR,
    /* the buffer now holds the neighboring character */
    NEIGHBOR_FOUND,
    /* no neighbor of the same length exists; the step wrapped around */
    NEIGHBOR_WRAPPED
};
5164
5165static enum neighbor_char
5166enc_succ_char(char *p, long len, rb_encoding *enc)
5167{
5168 long i;
5169 int l;
5170
5171 if (rb_enc_mbminlen(enc) > 1) {
5172 /* wchar, trivial case */
5173 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5174 if (!MBCLEN_CHARFOUND_P(r)) {
5175 return NEIGHBOR_NOT_CHAR;
5176 }
5177 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5178 l = rb_enc_code_to_mbclen(c, enc);
5179 if (!l) return NEIGHBOR_NOT_CHAR;
5180 if (l != len) return NEIGHBOR_WRAPPED;
5181 rb_enc_mbcput(c, p, enc);
5182 r = rb_enc_precise_mbclen(p, p + len, enc);
5183 if (!MBCLEN_CHARFOUND_P(r)) {
5184 return NEIGHBOR_NOT_CHAR;
5185 }
5186 return NEIGHBOR_FOUND;
5187 }
5188 while (1) {
5189 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5190 p[i] = '\0';
5191 if (i < 0)
5192 return NEIGHBOR_WRAPPED;
5193 ++((unsigned char*)p)[i];
5194 l = rb_enc_precise_mbclen(p, p+len, enc);
5195 if (MBCLEN_CHARFOUND_P(l)) {
5196 l = MBCLEN_CHARFOUND_LEN(l);
5197 if (l == len) {
5198 return NEIGHBOR_FOUND;
5199 }
5200 else {
5201 memset(p+l, 0xff, len-l);
5202 }
5203 }
5204 if (MBCLEN_INVALID_P(l) && i < len-1) {
5205 long len2;
5206 int l2;
5207 for (len2 = len-1; 0 < len2; len2--) {
5208 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5209 if (!MBCLEN_INVALID_P(l2))
5210 break;
5211 }
5212 memset(p+len2+1, 0xff, len-(len2+1));
5213 }
5214 }
5215}
5216
5217static enum neighbor_char
5218enc_pred_char(char *p, long len, rb_encoding *enc)
5219{
5220 long i;
5221 int l;
5222 if (rb_enc_mbminlen(enc) > 1) {
5223 /* wchar, trivial case */
5224 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5225 if (!MBCLEN_CHARFOUND_P(r)) {
5226 return NEIGHBOR_NOT_CHAR;
5227 }
5228 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5229 if (!c) return NEIGHBOR_NOT_CHAR;
5230 --c;
5231 l = rb_enc_code_to_mbclen(c, enc);
5232 if (!l) return NEIGHBOR_NOT_CHAR;
5233 if (l != len) return NEIGHBOR_WRAPPED;
5234 rb_enc_mbcput(c, p, enc);
5235 r = rb_enc_precise_mbclen(p, p + len, enc);
5236 if (!MBCLEN_CHARFOUND_P(r)) {
5237 return NEIGHBOR_NOT_CHAR;
5238 }
5239 return NEIGHBOR_FOUND;
5240 }
5241 while (1) {
5242 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5243 p[i] = '\xff';
5244 if (i < 0)
5245 return NEIGHBOR_WRAPPED;
5246 --((unsigned char*)p)[i];
5247 l = rb_enc_precise_mbclen(p, p+len, enc);
5248 if (MBCLEN_CHARFOUND_P(l)) {
5249 l = MBCLEN_CHARFOUND_LEN(l);
5250 if (l == len) {
5251 return NEIGHBOR_FOUND;
5252 }
5253 else {
5254 memset(p+l, 0, len-l);
5255 }
5256 }
5257 if (MBCLEN_INVALID_P(l) && i < len-1) {
5258 long len2;
5259 int l2;
5260 for (len2 = len-1; 0 < len2; len2--) {
5261 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5262 if (!MBCLEN_INVALID_P(l2))
5263 break;
5264 }
5265 memset(p+len2+1, 0, len-(len2+1));
5266 }
5267 }
5268}
5269
5270/*
5271 overwrite +p+ by succeeding letter in +enc+ and returns
5272 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5273 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5274 assuming each ranges are successive, and mbclen
5275 never change in each ranges.
5276 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5277 character.
5278 */
5279static enum neighbor_char
5280enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5281{
5282 enum neighbor_char ret;
5283 unsigned int c;
5284 int ctype;
5285 int range;
5286 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5287
5288 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5289 int try;
5290 const int max_gaps = 1;
5291
5292 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5293 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5294 ctype = ONIGENC_CTYPE_DIGIT;
5295 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5296 ctype = ONIGENC_CTYPE_ALPHA;
5297 else
5298 return NEIGHBOR_NOT_CHAR;
5299
5300 MEMCPY(save, p, char, len);
5301 for (try = 0; try <= max_gaps; ++try) {
5302 ret = enc_succ_char(p, len, enc);
5303 if (ret == NEIGHBOR_FOUND) {
5304 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5305 if (rb_enc_isctype(c, ctype, enc))
5306 return NEIGHBOR_FOUND;
5307 }
5308 }
5309 MEMCPY(p, save, char, len);
5310 range = 1;
5311 while (1) {
5312 MEMCPY(save, p, char, len);
5313 ret = enc_pred_char(p, len, enc);
5314 if (ret == NEIGHBOR_FOUND) {
5315 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5316 if (!rb_enc_isctype(c, ctype, enc)) {
5317 MEMCPY(p, save, char, len);
5318 break;
5319 }
5320 }
5321 else {
5322 MEMCPY(p, save, char, len);
5323 break;
5324 }
5325 range++;
5326 }
5327 if (range == 1) {
5328 return NEIGHBOR_NOT_CHAR;
5329 }
5330
5331 if (ctype != ONIGENC_CTYPE_DIGIT) {
5332 MEMCPY(carry, p, char, len);
5333 return NEIGHBOR_WRAPPED;
5334 }
5335
5336 MEMCPY(carry, p, char, len);
5337 enc_succ_char(carry, len, enc);
5338 return NEIGHBOR_WRAPPED;
5339}
5340
5341
5342static VALUE str_succ(VALUE str);
5343
5344/*
5345 * call-seq:
5346 * succ -> new_str
5347 *
5348 * Returns the successor to +self+. The successor is calculated by
5349 * incrementing characters.
5350 *
5351 * The first character to be incremented is the rightmost alphanumeric,
5352 * or, if no alphanumerics, the rightmost character:
5353 *
5354 * 'THX1138'.succ # => "THX1139"
5355 * '<<koala>>'.succ # => "<<koalb>>"
5356 * '***'.succ # => '**+'
5357 *
5358 * The successor to a digit is another digit, "carrying" to the next-left
5359 * character for a "rollover" from 9 to 0, and prepending another digit
5360 * if necessary:
5361 *
5362 * '00'.succ # => "01"
5363 * '09'.succ # => "10"
5364 * '99'.succ # => "100"
5365 *
5366 * The successor to a letter is another letter of the same case,
5367 * carrying to the next-left character for a rollover,
5368 * and prepending another same-case letter if necessary:
5369 *
5370 * 'aa'.succ # => "ab"
5371 * 'az'.succ # => "ba"
5372 * 'zz'.succ # => "aaa"
5373 * 'AA'.succ # => "AB"
5374 * 'AZ'.succ # => "BA"
5375 * 'ZZ'.succ # => "AAA"
5376 *
5377 * The successor to a non-alphanumeric character is the next character
5378 * in the underlying character set's collating sequence,
5379 * carrying to the next-left character for a rollover,
5380 * and prepending another character if necessary:
5381 *
5382 * s = 0.chr * 3
5383 * s # => "\x00\x00\x00"
5384 * s.succ # => "\x00\x00\x01"
5385 * s = 255.chr * 3
5386 * s # => "\xFF\xFF\xFF"
5387 * s.succ # => "\x01\x00\x00\x00"
5388 *
5389 * Carrying can occur between and among mixtures of alphanumeric characters:
5390 *
5391 * s = 'zz99zz99'
5392 * s.succ # => "aaa00aa00"
5393 * s = '99zz99zz'
5394 * s.succ # => "100aa00aa"
5395 *
5396 * The successor to an empty +String+ is a new empty +String+:
5397 *
5398 * ''.succ # => ""
5399 *
5400 */
5401
5402VALUE
5404{
5405 VALUE str;
5406 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5407 rb_enc_cr_str_copy_for_substr(str, orig);
5408 return str_succ(str);
5409}
5410
5411static VALUE
5412str_succ(VALUE str)
5413{
5414 rb_encoding *enc;
5415 char *sbeg, *s, *e, *last_alnum = 0;
5416 int found_alnum = 0;
5417 long l, slen;
5418 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5419 long carry_pos = 0, carry_len = 1;
5420 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5421
5422 slen = RSTRING_LEN(str);
5423 if (slen == 0) return str;
5424
5425 enc = STR_ENC_GET(str);
5426 sbeg = RSTRING_PTR(str);
5427 s = e = sbeg + slen;
5428
5429 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5430 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5431 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5432 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5433 break;
5434 }
5435 }
5436 l = rb_enc_precise_mbclen(s, e, enc);
5437 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5438 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5439 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5440 switch (neighbor) {
5441 case NEIGHBOR_NOT_CHAR:
5442 continue;
5443 case NEIGHBOR_FOUND:
5444 return str;
5445 case NEIGHBOR_WRAPPED:
5446 last_alnum = s;
5447 break;
5448 }
5449 found_alnum = 1;
5450 carry_pos = s - sbeg;
5451 carry_len = l;
5452 }
5453 if (!found_alnum) { /* str contains no alnum */
5454 s = e;
5455 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5456 enum neighbor_char neighbor;
5457 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5458 l = rb_enc_precise_mbclen(s, e, enc);
5459 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5460 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5461 MEMCPY(tmp, s, char, l);
5462 neighbor = enc_succ_char(tmp, l, enc);
5463 switch (neighbor) {
5464 case NEIGHBOR_FOUND:
5465 MEMCPY(s, tmp, char, l);
5466 return str;
5467 break;
5468 case NEIGHBOR_WRAPPED:
5469 MEMCPY(s, tmp, char, l);
5470 break;
5471 case NEIGHBOR_NOT_CHAR:
5472 break;
5473 }
5474 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5475 /* wrapped to \0...\0. search next valid char. */
5476 enc_succ_char(s, l, enc);
5477 }
5478 if (!rb_enc_asciicompat(enc)) {
5479 MEMCPY(carry, s, char, l);
5480 carry_len = l;
5481 }
5482 carry_pos = s - sbeg;
5483 }
5485 }
5486 RESIZE_CAPA(str, slen + carry_len);
5487 sbeg = RSTRING_PTR(str);
5488 s = sbeg + carry_pos;
5489 memmove(s + carry_len, s, slen - carry_pos);
5490 memmove(s, carry, carry_len);
5491 slen += carry_len;
5492 STR_SET_LEN(str, slen);
5493 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5495 return str;
5496}
5497
5498
5499/*
5500 * call-seq:
5501 * succ! -> self
5502 *
5503 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5504 */
5505
5506static VALUE
5507rb_str_succ_bang(VALUE str)
5508{
5509 rb_str_modify(str);
5510 str_succ(str);
5511 return str;
5512}
5513
/*
 * Returns 1 if each of the first +len+ bytes at +s+ satisfies ISDIGIT,
 * 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5523
5524static int
5525str_upto_i(VALUE str, VALUE arg)
5526{
5527 rb_yield(str);
5528 return 0;
5529}
5530
5531/*
5532 * call-seq:
5533 * upto(other_string, exclusive = false) {|string| ... } -> self
5534 * upto(other_string, exclusive = false) -> new_enumerator
5535 *
5536 * With a block given, calls the block with each +String+ value
5537 * returned by successive calls to String#succ;
5538 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5539 * the sequence terminates when value +other_string+ is reached;
5540 * returns +self+:
5541 *
5542 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5543 * Output:
5544 *
5545 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5546 *
5547 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5548 *
5549 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5550 *
5551 * Output:
5552 *
5553 * a8 a9 b0 b1 b2 b3 b4 b5
5554 *
5555 * If +other_string+ would not be reached, does not call the block:
5556 *
5557 * '25'.upto('5') {|s| fail s }
5558 * 'aa'.upto('a') {|s| fail s }
5559 *
5560 * With no block given, returns a new Enumerator:
5561 *
5562 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5563 *
5564 */
5565
5566static VALUE
5567rb_str_upto(int argc, VALUE *argv, VALUE beg)
5568{
5569 VALUE end, exclusive;
5570
5571 rb_scan_args(argc, argv, "11", &end, &exclusive);
5572 RETURN_ENUMERATOR(beg, argc, argv);
5573 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5574}
5575
5576VALUE
5577rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5578{
5579 VALUE current, after_end;
5580 ID succ;
5581 int n, ascii;
5582 rb_encoding *enc;
5583
5584 CONST_ID(succ, "succ");
5585 StringValue(end);
5586 enc = rb_enc_check(beg, end);
5587 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5588 /* single character */
5589 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5590 char c = RSTRING_PTR(beg)[0];
5591 char e = RSTRING_PTR(end)[0];
5592
5593 if (c > e || (excl && c == e)) return beg;
5594 for (;;) {
5595 VALUE str = rb_enc_str_new(&c, 1, enc);
5597 if ((*each)(str, arg)) break;
5598 if (!excl && c == e) break;
5599 c++;
5600 if (excl && c == e) break;
5601 }
5602 return beg;
5603 }
5604 /* both edges are all digits */
5605 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5606 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5607 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5608 VALUE b, e;
5609 int width;
5610
5611 width = RSTRING_LENINT(beg);
5612 b = rb_str_to_inum(beg, 10, FALSE);
5613 e = rb_str_to_inum(end, 10, FALSE);
5614 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5615 long bi = FIX2LONG(b);
5616 long ei = FIX2LONG(e);
5617 rb_encoding *usascii = rb_usascii_encoding();
5618
5619 while (bi <= ei) {
5620 if (excl && bi == ei) break;
5621 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5622 bi++;
5623 }
5624 }
5625 else {
5626 ID op = excl ? '<' : idLE;
5627 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5628
5629 args[0] = INT2FIX(width);
5630 while (rb_funcall(b, op, 1, e)) {
5631 args[1] = b;
5632 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5633 b = rb_funcallv(b, succ, 0, 0);
5634 }
5635 }
5636 return beg;
5637 }
5638 /* normal case */
5639 n = rb_str_cmp(beg, end);
5640 if (n > 0 || (excl && n == 0)) return beg;
5641
5642 after_end = rb_funcallv(end, succ, 0, 0);
5643 current = str_duplicate(rb_cString, beg);
5644 while (!rb_str_equal(current, after_end)) {
5645 VALUE next = Qnil;
5646 if (excl || !rb_str_equal(current, end))
5647 next = rb_funcallv(current, succ, 0, 0);
5648 if ((*each)(current, arg)) break;
5649 if (NIL_P(next)) break;
5650 current = next;
5651 StringValue(current);
5652 if (excl && rb_str_equal(current, end)) break;
5653 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5654 break;
5655 }
5656
5657 return beg;
5658}
5659
5660VALUE
5661rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5662{
5663 VALUE current;
5664 ID succ;
5665
5666 CONST_ID(succ, "succ");
5667 /* both edges are all digits */
5668 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5669 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5670 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5671 int width = RSTRING_LENINT(beg);
5672 b = rb_str_to_inum(beg, 10, FALSE);
5673 if (FIXNUM_P(b)) {
5674 long bi = FIX2LONG(b);
5675 rb_encoding *usascii = rb_usascii_encoding();
5676
5677 while (FIXABLE(bi)) {
5678 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5679 bi++;
5680 }
5681 b = LONG2NUM(bi);
5682 }
5683 args[0] = INT2FIX(width);
5684 while (1) {
5685 args[1] = b;
5686 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5687 b = rb_funcallv(b, succ, 0, 0);
5688 }
5689 }
5690 /* normal case */
5691 current = str_duplicate(rb_cString, beg);
5692 while (1) {
5693 VALUE next = rb_funcallv(current, succ, 0, 0);
5694 if ((*each)(current, arg)) break;
5695 current = next;
5696 StringValue(current);
5697 if (RSTRING_LEN(current) == 0)
5698 break;
5699 }
5700
5701 return beg;
5702}
5703
5704static int
5705include_range_i(VALUE str, VALUE arg)
5706{
5707 VALUE *argp = (VALUE *)arg;
5708 if (!rb_equal(str, *argp)) return 0;
5709 *argp = Qnil;
5710 return 1;
5711}
5712
5713VALUE
5714rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5715{
5716 beg = rb_str_new_frozen(beg);
5717 StringValue(end);
5718 end = rb_str_new_frozen(end);
5719 if (NIL_P(val)) return Qfalse;
5720 val = rb_check_string_type(val);
5721 if (NIL_P(val)) return Qfalse;
5722 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5723 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5724 rb_enc_asciicompat(STR_ENC_GET(val))) {
5725 const char *bp = RSTRING_PTR(beg);
5726 const char *ep = RSTRING_PTR(end);
5727 const char *vp = RSTRING_PTR(val);
5728 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5729 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5730 return Qfalse;
5731 else {
5732 char b = *bp;
5733 char e = *ep;
5734 char v = *vp;
5735
5736 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5737 if (b <= v && v < e) return Qtrue;
5738 return RBOOL(!RTEST(exclusive) && v == e);
5739 }
5740 }
5741 }
5742#if 0
5743 /* both edges are all digits */
5744 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5745 all_digits_p(bp, RSTRING_LEN(beg)) &&
5746 all_digits_p(ep, RSTRING_LEN(end))) {
5747 /* TODO */
5748 }
5749#endif
5750 }
5751 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5752
5753 return RBOOL(NIL_P(val));
5754}
5755
5756static VALUE
5757rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5758{
5759 if (rb_reg_search(re, str, 0, 0) >= 0) {
5760 VALUE match = rb_backref_get();
5761 int nth = rb_reg_backref_number(match, backref);
5762 return rb_reg_nth_match(nth, match);
5763 }
5764 return Qnil;
5765}
5766
5767static VALUE
5768rb_str_aref(VALUE str, VALUE indx)
5769{
5770 long idx;
5771
5772 if (FIXNUM_P(indx)) {
5773 idx = FIX2LONG(indx);
5774 }
5775 else if (RB_TYPE_P(indx, T_REGEXP)) {
5776 return rb_str_subpat(str, indx, INT2FIX(0));
5777 }
5778 else if (RB_TYPE_P(indx, T_STRING)) {
5779 if (rb_str_index(str, indx, 0) != -1)
5780 return str_duplicate(rb_cString, indx);
5781 return Qnil;
5782 }
5783 else {
5784 /* check if indx is Range */
5785 long beg, len = str_strlen(str, NULL);
5786 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5787 case Qfalse:
5788 break;
5789 case Qnil:
5790 return Qnil;
5791 default:
5792 return rb_str_substr(str, beg, len);
5793 }
5794 idx = NUM2LONG(indx);
5795 }
5796
5797 return str_substr(str, idx, 1, FALSE);
5798}
5799
5800
5801/*
5802 * call-seq:
5803 * self[index] -> new_string or nil
5804 * self[start, length] -> new_string or nil
5805 * self[range] -> new_string or nil
5806 * self[regexp, capture = 0] -> new_string or nil
5807 * self[substring] -> new_string or nil
5808 *
5809 * Returns the substring of +self+ specified by the arguments.
5810 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5811 *
5812 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5813 */
5814
5815static VALUE
5816rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5817{
5818 if (argc == 2) {
5819 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5820 return rb_str_subpat(str, argv[0], argv[1]);
5821 }
5822 else {
5823 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5824 }
5825 }
5826 rb_check_arity(argc, 1, 2);
5827 return rb_str_aref(str, argv[0]);
5828}
5829
5830VALUE
5832{
5833 char *ptr = RSTRING_PTR(str);
5834 long olen = RSTRING_LEN(str), nlen;
5835
5836 str_modifiable(str);
5837 if (len > olen) len = olen;
5838 nlen = olen - len;
5839 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5840 char *oldptr = ptr;
5841 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5842 STR_SET_EMBED(str);
5843 ptr = RSTRING(str)->as.embed.ary;
5844 memmove(ptr, oldptr + len, nlen);
5845 if (fl == STR_NOEMBED) xfree(oldptr);
5846 }
5847 else {
5848 if (!STR_SHARED_P(str)) {
5849 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5850 rb_enc_cr_str_exact_copy(shared, str);
5851 OBJ_FREEZE(shared);
5852 }
5853 ptr = RSTRING(str)->as.heap.ptr += len;
5854 }
5855 STR_SET_LEN(str, nlen);
5856
5857 if (!SHARABLE_MIDDLE_SUBSTRING) {
5858 TERM_FILL(ptr + nlen, TERM_LEN(str));
5859 }
5861 return str;
5862}
5863
5864static void
5865rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5866{
5867 char *sptr;
5868 long slen;
5869 int cr;
5870
5871 if (beg == 0 && vlen == 0) {
5872 rb_str_drop_bytes(str, len);
5873 return;
5874 }
5875
5876 str_modify_keep_cr(str);
5877 RSTRING_GETMEM(str, sptr, slen);
5878 if (len < vlen) {
5879 /* expand string */
5880 RESIZE_CAPA(str, slen + vlen - len);
5881 sptr = RSTRING_PTR(str);
5882 }
5883
5885 cr = rb_enc_str_coderange(val);
5886 else
5888
5889 if (vlen != len) {
5890 memmove(sptr + beg + vlen,
5891 sptr + beg + len,
5892 slen - (beg + len));
5893 }
5894 if (vlen < beg && len < 0) {
5895 MEMZERO(sptr + slen, char, -len);
5896 }
5897 if (vlen > 0) {
5898 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5899 }
5900 slen += vlen - len;
5901 STR_SET_LEN(str, slen);
5902 TERM_FILL(&sptr[slen], TERM_LEN(str));
5903 ENC_CODERANGE_SET(str, cr);
5904}
5905
5906static inline void
5907rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5908{
5909 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5910}
5911
5912void
5913rb_str_update(VALUE str, long beg, long len, VALUE val)
5914{
5915 long slen;
5916 char *p, *e;
5917 rb_encoding *enc;
5918 int singlebyte = single_byte_optimizable(str);
5919 int cr;
5920
5921 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5922
5923 StringValue(val);
5924 enc = rb_enc_check(str, val);
5925 slen = str_strlen(str, enc); /* rb_enc_check */
5926
5927 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5928 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5929 }
5930 if (beg < 0) {
5931 beg += slen;
5932 }
5933 RUBY_ASSERT(beg >= 0);
5934 RUBY_ASSERT(beg <= slen);
5935
5936 if (len > slen - beg) {
5937 len = slen - beg;
5938 }
5939 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5940 if (!p) p = RSTRING_END(str);
5941 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5942 if (!e) e = RSTRING_END(str);
5943 /* error check */
5944 beg = p - RSTRING_PTR(str); /* physical position */
5945 len = e - p; /* physical length */
5946 rb_str_update_0(str, beg, len, val);
5947 rb_enc_associate(str, enc);
5949 if (cr != ENC_CODERANGE_BROKEN)
5950 ENC_CODERANGE_SET(str, cr);
5951}
5952
5953static void
5954rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5955{
5956 int nth;
5957 VALUE match;
5958 long start, end, len;
5959 rb_encoding *enc;
5960 struct re_registers *regs;
5961
5962 if (rb_reg_search(re, str, 0, 0) < 0) {
5963 rb_raise(rb_eIndexError, "regexp not matched");
5964 }
5965 match = rb_backref_get();
5966 nth = rb_reg_backref_number(match, backref);
5967 regs = RMATCH_REGS(match);
5968 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5969 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5970 }
5971 if (nth < 0) {
5972 nth += regs->num_regs;
5973 }
5974
5975 start = BEG(nth);
5976 if (start == -1) {
5977 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5978 }
5979 end = END(nth);
5980 len = end - start;
5981 StringValue(val);
5982 enc = rb_enc_check_str(str, val);
5983 rb_str_update_0(str, start, len, val);
5984 rb_enc_associate(str, enc);
5985}
5986
5987static VALUE
5988rb_str_aset(VALUE str, VALUE indx, VALUE val)
5989{
5990 long idx, beg;
5991
5992 switch (TYPE(indx)) {
5993 case T_REGEXP:
5994 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5995 return val;
5996
5997 case T_STRING:
5998 beg = rb_str_index(str, indx, 0);
5999 if (beg < 0) {
6000 rb_raise(rb_eIndexError, "string not matched");
6001 }
6002 beg = rb_str_sublen(str, beg);
6003 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6004 return val;
6005
6006 default:
6007 /* check if indx is Range */
6008 {
6009 long beg, len;
6010 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6011 rb_str_update(str, beg, len, val);
6012 return val;
6013 }
6014 }
6015 /* FALLTHROUGH */
6016
6017 case T_FIXNUM:
6018 idx = NUM2LONG(indx);
6019 rb_str_update(str, idx, 1, val);
6020 return val;
6021 }
6022}
6023
6024/*
6025 * call-seq:
6026 * self[index] = new_string
6027 * self[start, length] = new_string
6028 * self[range] = new_string
6029 * self[regexp, capture = 0] = new_string
6030 * self[substring] = new_string
6031 *
6032 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6033 * See {String Slices}[rdoc-ref:String@String+Slices].
6034 *
6035 * A few examples:
6036 *
6037 * s = 'foo'
6038 * s[2] = 'rtune' # => "rtune"
6039 * s # => "fortune"
6040 * s[1, 5] = 'init' # => "init"
6041 * s # => "finite"
6042 * s[3..4] = 'al' # => "al"
6043 * s # => "finale"
6044 * s[/e$/] = 'ly' # => "ly"
6045 * s # => "finally"
6046 * s['lly'] = 'ncial' # => "ncial"
6047 * s # => "financial"
6048 *
6049 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6050 */
6051
6052static VALUE
6053rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6054{
6055 if (argc == 3) {
6056 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6057 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6058 }
6059 else {
6060 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6061 }
6062 return argv[2];
6063 }
6064 rb_check_arity(argc, 2, 3);
6065 return rb_str_aset(str, argv[0], argv[1]);
6066}
6067
6068/*
6069 * call-seq:
6070 * insert(index, other_string) -> self
6071 *
6072 * Inserts the given +other_string+ into +self+; returns +self+.
6073 *
6074 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
6075 *
6076 * 'foo'.insert(1, 'bar') # => "fbaroo"
6077 *
6078 * If the Integer +index+ is negative, counts backward from the end of +self+
6079 * and inserts +other_string+ at offset <tt>index+1</tt>
6080 * (that is, _after_ <tt>self[index]</tt>):
6081 *
6082 * 'foo'.insert(-2, 'bar') # => "fobaro"
6083 *
6084 */
6085
6086static VALUE
6087rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6088{
6089 long pos = NUM2LONG(idx);
6090
6091 if (pos == -1) {
6092 return rb_str_append(str, str2);
6093 }
6094 else if (pos < 0) {
6095 pos++;
6096 }
6097 rb_str_update(str, pos, 0, str2);
6098 return str;
6099}
6100
6101
6102/*
6103 * call-seq:
6104 * slice!(index) -> new_string or nil
6105 * slice!(start, length) -> new_string or nil
6106 * slice!(range) -> new_string or nil
6107 * slice!(regexp, capture = 0) -> new_string or nil
6108 * slice!(substring) -> new_string or nil
6109 *
6110 * Removes and returns the substring of +self+ specified by the arguments.
6111 * See {String Slices}[rdoc-ref:String@String+Slices].
6112 *
6113 * A few examples:
6114 *
6115 * string = "This is a string"
6116 * string.slice!(2) #=> "i"
6117 * string.slice!(3..6) #=> " is "
6118 * string.slice!(/s.*t/) #=> "sa st"
6119 * string.slice!("r") #=> "r"
6120 * string #=> "Thing"
6121 *
6122 */
6123
6124static VALUE
6125rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6126{
6127 VALUE result = Qnil;
6128 VALUE indx;
6129 long beg, len = 1;
6130 char *p;
6131
6132 rb_check_arity(argc, 1, 2);
6133 str_modify_keep_cr(str);
6134 indx = argv[0];
6135 if (RB_TYPE_P(indx, T_REGEXP)) {
6136 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6137 VALUE match = rb_backref_get();
6138 struct re_registers *regs = RMATCH_REGS(match);
6139 int nth = 0;
6140 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6141 if ((nth += regs->num_regs) <= 0) return Qnil;
6142 }
6143 else if (nth >= regs->num_regs) return Qnil;
6144 beg = BEG(nth);
6145 len = END(nth) - beg;
6146 goto subseq;
6147 }
6148 else if (argc == 2) {
6149 beg = NUM2LONG(indx);
6150 len = NUM2LONG(argv[1]);
6151 goto num_index;
6152 }
6153 else if (FIXNUM_P(indx)) {
6154 beg = FIX2LONG(indx);
6155 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6156 if (!len) return Qnil;
6157 beg = p - RSTRING_PTR(str);
6158 goto subseq;
6159 }
6160 else if (RB_TYPE_P(indx, T_STRING)) {
6161 beg = rb_str_index(str, indx, 0);
6162 if (beg == -1) return Qnil;
6163 len = RSTRING_LEN(indx);
6164 result = str_duplicate(rb_cString, indx);
6165 goto squash;
6166 }
6167 else {
6168 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6169 case Qnil:
6170 return Qnil;
6171 case Qfalse:
6172 beg = NUM2LONG(indx);
6173 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6174 if (!len) return Qnil;
6175 beg = p - RSTRING_PTR(str);
6176 goto subseq;
6177 default:
6178 goto num_index;
6179 }
6180 }
6181
6182 num_index:
6183 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6184 beg = p - RSTRING_PTR(str);
6185
6186 subseq:
6187 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6188 rb_enc_cr_str_copy_for_substr(result, str);
6189
6190 squash:
6191 if (len > 0) {
6192 if (beg == 0) {
6193 rb_str_drop_bytes(str, len);
6194 }
6195 else {
6196 char *sptr = RSTRING_PTR(str);
6197 long slen = RSTRING_LEN(str);
6198 if (beg + len > slen) /* pathological check */
6199 len = slen - beg;
6200 memmove(sptr + beg,
6201 sptr + beg + len,
6202 slen - (beg + len));
6203 slen -= len;
6204 STR_SET_LEN(str, slen);
6205 TERM_FILL(&sptr[slen], TERM_LEN(str));
6206 }
6207 }
6208 return result;
6209}
6210
6211static VALUE
6212get_pat(VALUE pat)
6213{
6214 VALUE val;
6215
6216 switch (OBJ_BUILTIN_TYPE(pat)) {
6217 case T_REGEXP:
6218 return pat;
6219
6220 case T_STRING:
6221 break;
6222
6223 default:
6224 val = rb_check_string_type(pat);
6225 if (NIL_P(val)) {
6226 Check_Type(pat, T_REGEXP);
6227 }
6228 pat = val;
6229 }
6230
6231 return rb_reg_regcomp(pat);
6232}
6233
6234static VALUE
6235get_pat_quoted(VALUE pat, int check)
6236{
6237 VALUE val;
6238
6239 switch (OBJ_BUILTIN_TYPE(pat)) {
6240 case T_REGEXP:
6241 return pat;
6242
6243 case T_STRING:
6244 break;
6245
6246 default:
6247 val = rb_check_string_type(pat);
6248 if (NIL_P(val)) {
6249 Check_Type(pat, T_REGEXP);
6250 }
6251 pat = val;
6252 }
6253 if (check && is_broken_string(pat)) {
6254 rb_exc_raise(rb_reg_check_preprocess(pat));
6255 }
6256 return pat;
6257}
6258
6259static long
6260rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6261{
6262 if (BUILTIN_TYPE(pat) == T_STRING) {
6263 pos = rb_str_byteindex(str, pat, pos);
6264 if (set_backref_str) {
6265 if (pos >= 0) {
6266 str = rb_str_new_frozen_String(str);
6267 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6268 if (match) {
6269 *match = match_data;
6270 }
6271 }
6272 else {
6274 }
6275 }
6276 return pos;
6277 }
6278 else {
6279 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6280 }
6281}
6282
6283static long
6284rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6285{
6286 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6287}
6288
6289
6290/*
6291 * call-seq:
6292 * sub!(pattern, replacement) -> self or nil
6293 * sub!(pattern) {|match| ... } -> self or nil
6294 *
6295 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6296 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6297 *
6298 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6299 *
6300 * Related: String#sub, String#gsub, String#gsub!.
6301 *
6302 */
6303
6304static VALUE
6305rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6306{
6307 VALUE pat, repl, hash = Qnil;
6308 int iter = 0;
6309 long plen;
6310 int min_arity = rb_block_given_p() ? 1 : 2;
6311 long beg;
6312
6313 rb_check_arity(argc, min_arity, 2);
6314 if (argc == 1) {
6315 iter = 1;
6316 }
6317 else {
6318 repl = argv[1];
6319 hash = rb_check_hash_type(argv[1]);
6320 if (NIL_P(hash)) {
6321 StringValue(repl);
6322 }
6323 }
6324
6325 pat = get_pat_quoted(argv[0], 1);
6326
6327 str_modifiable(str);
6328 beg = rb_pat_search(pat, str, 0, 1);
6329 if (beg >= 0) {
6330 rb_encoding *enc;
6331 int cr = ENC_CODERANGE(str);
6332 long beg0, end0;
6333 VALUE match, match0 = Qnil;
6334 struct re_registers *regs;
6335 char *p, *rp;
6336 long len, rlen;
6337
6338 match = rb_backref_get();
6339 regs = RMATCH_REGS(match);
6340 if (RB_TYPE_P(pat, T_STRING)) {
6341 beg0 = beg;
6342 end0 = beg0 + RSTRING_LEN(pat);
6343 match0 = pat;
6344 }
6345 else {
6346 beg0 = BEG(0);
6347 end0 = END(0);
6348 if (iter) match0 = rb_reg_nth_match(0, match);
6349 }
6350
6351 if (iter || !NIL_P(hash)) {
6352 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6353
6354 if (iter) {
6355 repl = rb_obj_as_string(rb_yield(match0));
6356 }
6357 else {
6358 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6359 repl = rb_obj_as_string(repl);
6360 }
6361 str_mod_check(str, p, len);
6362 rb_check_frozen(str);
6363 }
6364 else {
6365 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6366 }
6367
6368 enc = rb_enc_compatible(str, repl);
6369 if (!enc) {
6370 rb_encoding *str_enc = STR_ENC_GET(str);
6371 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6372 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6373 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6374 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6375 rb_enc_inspect_name(str_enc),
6376 rb_enc_inspect_name(STR_ENC_GET(repl)));
6377 }
6378 enc = STR_ENC_GET(repl);
6379 }
6380 rb_str_modify(str);
6381 rb_enc_associate(str, enc);
6383 int cr2 = ENC_CODERANGE(repl);
6384 if (cr2 == ENC_CODERANGE_BROKEN ||
6385 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6387 else
6388 cr = cr2;
6389 }
6390 plen = end0 - beg0;
6391 rlen = RSTRING_LEN(repl);
6392 len = RSTRING_LEN(str);
6393 if (rlen > plen) {
6394 RESIZE_CAPA(str, len + rlen - plen);
6395 }
6396 p = RSTRING_PTR(str);
6397 if (rlen != plen) {
6398 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6399 }
6400 rp = RSTRING_PTR(repl);
6401 memmove(p + beg0, rp, rlen);
6402 len += rlen - plen;
6403 STR_SET_LEN(str, len);
6404 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6405 ENC_CODERANGE_SET(str, cr);
6406
6407 RB_GC_GUARD(match);
6408
6409 return str;
6410 }
6411 return Qnil;
6412}
6413
6414
6415/*
6416 * call-seq:
6417 * sub(pattern, replacement) -> new_string
6418 * sub(pattern) {|match| ... } -> new_string
6419 *
6420 * Returns a copy of +self+ with only the first occurrence
6421 * (not all occurrences) of the given +pattern+ replaced.
6422 *
6423 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6424 *
6425 * Related: String#sub!, String#gsub, String#gsub!.
6426 *
6427 */
6428
6429static VALUE
6430rb_str_sub(int argc, VALUE *argv, VALUE str)
6431{
6432 str = str_duplicate(rb_cString, str);
6433 rb_str_sub_bang(argc, argv, str);
6434 return str;
6435}
6436
6437static VALUE
6438str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6439{
6440 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6441 long beg, beg0, end0;
6442 long offset, blen, slen, len, last;
6443 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6444 char *sp, *cp;
6445 int need_backref_str = -1;
6446 rb_encoding *str_enc;
6447
6448 switch (argc) {
6449 case 1:
6450 RETURN_ENUMERATOR(str, argc, argv);
6451 mode = ITER;
6452 break;
6453 case 2:
6454 repl = argv[1];
6455 hash = rb_check_hash_type(argv[1]);
6456 if (NIL_P(hash)) {
6457 StringValue(repl);
6458 }
6459 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6460 mode = FAST_MAP;
6461 }
6462 else {
6463 mode = MAP;
6464 }
6465 break;
6466 default:
6467 rb_error_arity(argc, 1, 2);
6468 }
6469
6470 pat = get_pat_quoted(argv[0], 1);
6471 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6472
6473 if (beg < 0) {
6474 if (bang) return Qnil; /* no match, no substitution */
6475 return str_duplicate(rb_cString, str);
6476 }
6477
6478 offset = 0;
6479 blen = RSTRING_LEN(str) + 30; /* len + margin */
6480 dest = rb_str_buf_new(blen);
6481 sp = RSTRING_PTR(str);
6482 slen = RSTRING_LEN(str);
6483 cp = sp;
6484 str_enc = STR_ENC_GET(str);
6485 rb_enc_associate(dest, str_enc);
6486 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6487
6488 do {
6489 struct re_registers *regs = RMATCH_REGS(match);
6490 if (RB_TYPE_P(pat, T_STRING)) {
6491 beg0 = beg;
6492 end0 = beg0 + RSTRING_LEN(pat);
6493 match0 = pat;
6494 }
6495 else {
6496 beg0 = BEG(0);
6497 end0 = END(0);
6498 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6499 }
6500
6501 if (mode != STR) {
6502 if (mode == ITER) {
6503 val = rb_obj_as_string(rb_yield(match0));
6504 }
6505 else {
6506 struct RString fake_str;
6507 VALUE key;
6508 if (mode == FAST_MAP) {
6509 // It is safe to use a fake_str here because we established that it won't escape,
6510 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6511 // default proc.
6512 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6513 }
6514 else {
6515 key = rb_str_subseq(str, beg0, end0 - beg0);
6516 }
6517 val = rb_hash_aref(hash, key);
6518 val = rb_obj_as_string(val);
6519 }
6520 str_mod_check(str, sp, slen);
6521 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6522 rb_raise(rb_eRuntimeError, "block should not cheat");
6523 }
6524 }
6525 else if (need_backref_str) {
6526 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6527 if (need_backref_str < 0) {
6528 need_backref_str = val != repl;
6529 }
6530 }
6531 else {
6532 val = repl;
6533 }
6534
6535 len = beg0 - offset; /* copy pre-match substr */
6536 if (len) {
6537 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6538 }
6539
6540 rb_str_buf_append(dest, val);
6541
6542 last = offset;
6543 offset = end0;
6544 if (beg0 == end0) {
6545 /*
6546 * Always consume at least one character of the input string
6547 * in order to prevent infinite loops.
6548 */
6549 if (RSTRING_LEN(str) <= end0) break;
6550 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6551 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6552 offset = end0 + len;
6553 }
6554 cp = RSTRING_PTR(str) + offset;
6555 if (offset > RSTRING_LEN(str)) break;
6556
6557 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6558 if (mode != FAST_MAP && mode != STR) {
6559 match = Qnil;
6560 }
6561 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6562
6563 RB_GC_GUARD(match);
6564 } while (beg >= 0);
6565
6566 if (RSTRING_LEN(str) > offset) {
6567 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6568 }
6569 rb_pat_search0(pat, str, last, 1, &match);
6570 if (bang) {
6571 str_shared_replace(str, dest);
6572 }
6573 else {
6574 str = dest;
6575 }
6576
6577 return str;
6578}
6579
6580
6581/*
6582 * call-seq:
6583 * gsub!(pattern, replacement) -> self or nil
6584 * gsub!(pattern) {|match| ... } -> self or nil
6585 * gsub!(pattern) -> an_enumerator
6586 *
6587 * Performs the specified substring replacement(s) on +self+;
6588 * returns +self+ if any replacement occurred, +nil+ otherwise.
6589 *
6590 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6591 *
6592 * Returns an Enumerator if no +replacement+ and no block given.
6593 *
6594 * Related: String#sub, String#gsub, String#sub!.
6595 *
6596 */
6597
6598static VALUE
6599rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6600{
6601 str_modify_keep_cr(str);
6602 return str_gsub(argc, argv, str, 1);
6603}
6604
6605
6606/*
6607 * call-seq:
6608 * gsub(pattern, replacement) -> new_string
6609 * gsub(pattern) {|match| ... } -> new_string
6610 * gsub(pattern) -> enumerator
6611 *
6612 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6613 *
6614 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6615 *
6616 * Returns an Enumerator if no +replacement+ and no block given.
6617 *
6618 * Related: String#sub, String#sub!, String#gsub!.
6619 *
6620 */
6621
6622static VALUE
6623rb_str_gsub(int argc, VALUE *argv, VALUE str)
6624{
6625 return str_gsub(argc, argv, str, 0);
6626}
6627
6628
6629/*
6630 * call-seq:
6631 * replace(other_string) -> self
6632 *
6633 * Replaces the contents of +self+ with the contents of +other_string+:
6634 *
6635 * s = 'foo' # => "foo"
6636 * s.replace('bar') # => "bar"
6637 *
6638 */
6639
6640VALUE
6642{
6643 str_modifiable(str);
6644 if (str == str2) return str;
6645
6646 StringValue(str2);
6647 str_discard(str);
6648 return str_replace(str, str2);
6649}
6650
6651/*
6652 * call-seq:
6653 * clear -> self
6654 *
6655 * Removes the contents of +self+:
6656 *
6657 * s = 'foo'
6658 * s.clear # => ""
6659 * s # => ""
6660 *
6661 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6662 */
6663
6664static VALUE
6665rb_str_clear(VALUE str)
6666{
6667 str_discard(str);
6668 STR_SET_EMBED(str);
6669 STR_SET_LEN(str, 0);
6670 RSTRING_PTR(str)[0] = 0;
6671 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6673 else
6675 return str;
6676}
6677
6678/*
6679 * call-seq:
6680 * chr -> string
6681 *
6682 * :include: doc/string/chr.rdoc
6683 *
6684 */
6685
6686static VALUE
6687rb_str_chr(VALUE str)
6688{
6689 return rb_str_substr(str, 0, 1);
6690}
6691
6692/*
6693 * call-seq:
6694 * getbyte(index) -> integer or nil
6695 *
6696 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6697 *
6698 * s = 'abcde' # => "abcde"
6699 * s.getbyte(0) # => 97
6700 * s.getbyte(-1) # => 101
6701 * s.getbyte(5) # => nil
6702 *
6703 * Related: String#setbyte.
6704 */
6705VALUE
6706rb_str_getbyte(VALUE str, VALUE index)
6707{
6708 long pos = NUM2LONG(index);
6709
6710 if (pos < 0)
6711 pos += RSTRING_LEN(str);
6712 if (pos < 0 || RSTRING_LEN(str) <= pos)
6713 return Qnil;
6714
6715 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6716}
6717
6718/*
6719 * call-seq:
6720 * setbyte(index, integer) -> integer
6721 *
6722 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6723 *
6724 * s = 'abcde' # => "abcde"
6725 * s.setbyte(0, 98) # => 98
6726 * s # => "bbcde"
6727 *
6728 * Related: String#getbyte.
6729 */
6730VALUE
6731rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6732{
6733 long pos = NUM2LONG(index);
6734 long len = RSTRING_LEN(str);
6735 char *ptr, *head, *left = 0;
6736 rb_encoding *enc;
6737 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6738
6739 if (pos < -len || len <= pos)
6740 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6741 if (pos < 0)
6742 pos += len;
6743
6744 VALUE v = rb_to_int(value);
6745 VALUE w = rb_int_and(v, INT2FIX(0xff));
6746 char byte = (char)(NUM2INT(w) & 0xFF);
6747
6748 if (!str_independent(str))
6749 str_make_independent(str);
6750 enc = STR_ENC_GET(str);
6751 head = RSTRING_PTR(str);
6752 ptr = &head[pos];
6753 if (!STR_EMBED_P(str)) {
6754 cr = ENC_CODERANGE(str);
6755 switch (cr) {
6756 case ENC_CODERANGE_7BIT:
6757 left = ptr;
6758 *ptr = byte;
6759 if (ISASCII(byte)) goto end;
6760 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6761 if (!MBCLEN_CHARFOUND_P(nlen))
6763 else
6765 goto end;
6767 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6768 width = rb_enc_precise_mbclen(left, head+len, enc);
6769 *ptr = byte;
6770 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6771 if (!MBCLEN_CHARFOUND_P(nlen))
6773 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6775 goto end;
6776 }
6777 }
6779 *ptr = byte;
6780
6781 end:
6782 return value;
6783}
6784
6785static VALUE
6786str_byte_substr(VALUE str, long beg, long len, int empty)
6787{
6788 long n = RSTRING_LEN(str);
6789
6790 if (beg > n || len < 0) return Qnil;
6791 if (beg < 0) {
6792 beg += n;
6793 if (beg < 0) return Qnil;
6794 }
6795 if (len > n - beg)
6796 len = n - beg;
6797 if (len <= 0) {
6798 if (!empty) return Qnil;
6799 len = 0;
6800 }
6801
6802 VALUE str2 = str_subseq(str, beg, len);
6803
6804 str_enc_copy_direct(str2, str);
6805
6806 if (RSTRING_LEN(str2) == 0) {
6807 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6809 else
6811 }
6812 else {
6813 switch (ENC_CODERANGE(str)) {
6814 case ENC_CODERANGE_7BIT:
6816 break;
6817 default:
6819 break;
6820 }
6821 }
6822
6823 return str2;
6824}
6825
6826VALUE
6827rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6828{
6829 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6830}
6831
6832static VALUE
6833str_byte_aref(VALUE str, VALUE indx)
6834{
6835 long idx;
6836 if (FIXNUM_P(indx)) {
6837 idx = FIX2LONG(indx);
6838 }
6839 else {
6840 /* check if indx is Range */
6841 long beg, len = RSTRING_LEN(str);
6842
6843 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6844 case Qfalse:
6845 break;
6846 case Qnil:
6847 return Qnil;
6848 default:
6849 return str_byte_substr(str, beg, len, TRUE);
6850 }
6851
6852 idx = NUM2LONG(indx);
6853 }
6854 return str_byte_substr(str, idx, 1, FALSE);
6855}
6856
6857/*
6858 * call-seq:
6859 * byteslice(offset, length = 1) -> string or nil
6860 * byteslice(range) -> string or nil
6861 *
6862 * :include: doc/string/byteslice.rdoc
6863 */
6864
6865static VALUE
6866rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6867{
6868 if (argc == 2) {
6869 long beg = NUM2LONG(argv[0]);
6870 long len = NUM2LONG(argv[1]);
6871 return str_byte_substr(str, beg, len, TRUE);
6872 }
6873 rb_check_arity(argc, 1, 2);
6874 return str_byte_aref(str, argv[0]);
6875}
6876
6877static void
6878str_check_beg_len(VALUE str, long *beg, long *len)
6879{
6880 long end, slen = RSTRING_LEN(str);
6881
6882 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6883 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6884 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6885 }
6886 if (*beg < 0) {
6887 *beg += slen;
6888 }
6889 RUBY_ASSERT(*beg >= 0);
6890 RUBY_ASSERT(*beg <= slen);
6891
6892 if (*len > slen - *beg) {
6893 *len = slen - *beg;
6894 }
6895 end = *beg + *len;
6896 str_ensure_byte_pos(str, *beg);
6897 str_ensure_byte_pos(str, end);
6898}
6899
6900/*
6901 * call-seq:
6902 * bytesplice(offset, length, str) -> self
6903 * bytesplice(offset, length, str, str_offset, str_length) -> self
6904 * bytesplice(range, str) -> self
6905 * bytesplice(range, str, str_range) -> self
6906 *
6907 * :include: doc/string/bytesplice.rdoc
6908 */
6909
6910static VALUE
6911rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6912{
6913 long beg, len, vbeg, vlen;
6914 VALUE val;
6915 int cr;
6916
6917 rb_check_arity(argc, 2, 5);
6918 if (!(argc == 2 || argc == 3 || argc == 5)) {
6919 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6920 }
6921 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6922 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6923 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6924 rb_builtin_class_name(argv[0]));
6925 }
6926 val = argv[1];
6927 StringValue(val);
6928 if (argc == 2) {
6929 /* bytesplice(range, str) */
6930 vbeg = 0;
6931 vlen = RSTRING_LEN(val);
6932 }
6933 else {
6934 /* bytesplice(range, str, str_range) */
6935 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6936 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6937 rb_builtin_class_name(argv[2]));
6938 }
6939 }
6940 }
6941 else {
6942 beg = NUM2LONG(argv[0]);
6943 len = NUM2LONG(argv[1]);
6944 val = argv[2];
6945 StringValue(val);
6946 if (argc == 3) {
6947 /* bytesplice(index, length, str) */
6948 vbeg = 0;
6949 vlen = RSTRING_LEN(val);
6950 }
6951 else {
6952 /* bytesplice(index, length, str, str_index, str_length) */
6953 vbeg = NUM2LONG(argv[3]);
6954 vlen = NUM2LONG(argv[4]);
6955 }
6956 }
6957 str_check_beg_len(str, &beg, &len);
6958 str_check_beg_len(val, &vbeg, &vlen);
6959 str_modify_keep_cr(str);
6960
6961 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6962 rb_enc_associate(str, rb_enc_check(str, val));
6963 }
6964
6965 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6967 if (cr != ENC_CODERANGE_BROKEN)
6968 ENC_CODERANGE_SET(str, cr);
6969 return str;
6970}
6971
6972/*
6973 * call-seq:
6974 * reverse -> string
6975 *
6976 * Returns a new string with the characters from +self+ in reverse order.
6977 *
6978 * 'stressed'.reverse # => "desserts"
6979 *
6980 */
6981
6982static VALUE
6983rb_str_reverse(VALUE str)
6984{
6985 rb_encoding *enc;
6986 VALUE rev;
6987 char *s, *e, *p;
6988 int cr;
6989
6990 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6991 enc = STR_ENC_GET(str);
6992 rev = rb_str_new(0, RSTRING_LEN(str));
6993 s = RSTRING_PTR(str); e = RSTRING_END(str);
6994 p = RSTRING_END(rev);
6995 cr = ENC_CODERANGE(str);
6996
6997 if (RSTRING_LEN(str) > 1) {
6998 if (single_byte_optimizable(str)) {
6999 while (s < e) {
7000 *--p = *s++;
7001 }
7002 }
7003 else if (cr == ENC_CODERANGE_VALID) {
7004 while (s < e) {
7005 int clen = rb_enc_fast_mbclen(s, e, enc);
7006
7007 p -= clen;
7008 memcpy(p, s, clen);
7009 s += clen;
7010 }
7011 }
7012 else {
7013 cr = rb_enc_asciicompat(enc) ?
7015 while (s < e) {
7016 int clen = rb_enc_mbclen(s, e, enc);
7017
7018 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7019 p -= clen;
7020 memcpy(p, s, clen);
7021 s += clen;
7022 }
7023 }
7024 }
7025 STR_SET_LEN(rev, RSTRING_LEN(str));
7026 str_enc_copy_direct(rev, str);
7027 ENC_CODERANGE_SET(rev, cr);
7028
7029 return rev;
7030}
7031
7032
7033/*
7034 * call-seq:
7035 * reverse! -> self
7036 *
7037 * Returns +self+ with its characters reversed:
7038 *
7039 * s = 'stressed'
7040 * s.reverse! # => "desserts"
7041 * s # => "desserts"
7042 *
7043 */
7044
7045static VALUE
7046rb_str_reverse_bang(VALUE str)
7047{
7048 if (RSTRING_LEN(str) > 1) {
7049 if (single_byte_optimizable(str)) {
7050 char *s, *e, c;
7051
7052 str_modify_keep_cr(str);
7053 s = RSTRING_PTR(str);
7054 e = RSTRING_END(str) - 1;
7055 while (s < e) {
7056 c = *s;
7057 *s++ = *e;
7058 *e-- = c;
7059 }
7060 }
7061 else {
7062 str_shared_replace(str, rb_str_reverse(str));
7063 }
7064 }
7065 else {
7066 str_modify_keep_cr(str);
7067 }
7068 return str;
7069}
7070
7071
7072/*
7073 * call-seq:
7074 * include?(other_string) -> true or false
7075 *
7076 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7077 *
7078 * s = 'foo'
7079 * s.include?('f') # => true
7080 * s.include?('fo') # => true
7081 * s.include?('food') # => false
7082 *
7083 */
7084
7085VALUE
7086rb_str_include(VALUE str, VALUE arg)
7087{
7088 long i;
7089
7090 StringValue(arg);
7091 i = rb_str_index(str, arg, 0);
7092
7093 return RBOOL(i != -1);
7094}
7095
7096
7097/*
7098 * call-seq:
7099 * to_i(base = 10) -> integer
7100 *
7101 * Returns the result of interpreting leading characters in +self+
7102 * as an integer in the given +base+ (which must be in (0, 2..36)):
7103 *
7104 * '123456'.to_i # => 123456
7105 * '123def'.to_i(16) # => 1195503
7106 *
7107 * With +base+ zero, +self+ may contain leading characters
7108 * to specify the actual base:
7109 *
7110 * '123def'.to_i(0) # => 123
7111 * '0123def'.to_i(0) # => 83
7112 * '0b123def'.to_i(0) # => 1
7113 * '0o123def'.to_i(0) # => 83
7114 * '0d123def'.to_i(0) # => 123
7115 * '0x123def'.to_i(0) # => 1195503
7116 *
7117 * Characters past a leading valid number (in the given +base+) are ignored:
7118 *
7119 * '12.345'.to_i # => 12
7120 * '12345'.to_i(2) # => 1
7121 *
7122 * Returns zero if there is no leading valid number:
7123 *
7124 * 'abcdef'.to_i # => 0
7125 * '2'.to_i(2) # => 0
7126 *
7127 */
7128
7129static VALUE
7130rb_str_to_i(int argc, VALUE *argv, VALUE str)
7131{
7132 int base = 10;
7133
7134 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7135 rb_raise(rb_eArgError, "invalid radix %d", base);
7136 }
7137 return rb_str_to_inum(str, base, FALSE);
7138}
7139
7140
7141/*
7142 * call-seq:
7143 * to_f -> float
7144 *
7145 * Returns the result of interpreting leading characters in +self+ as a Float:
7146 *
7147 * '3.14159'.to_f # => 3.14159
7148 * '1.234e-2'.to_f # => 0.01234
7149 *
7150 * Characters past a leading valid number are ignored:
7151 *
7152 * '3.14 (pi to two places)'.to_f # => 3.14
7153 *
7154 * Returns zero if there is no leading valid number:
7155 *
7156 * 'abcdef'.to_f # => 0.0
7157 *
7158 */
7159
7160static VALUE
7161rb_str_to_f(VALUE str)
7162{
7163 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7164}
7165
7166
7167/*
7168 * call-seq:
7169 * to_s -> self or string
7170 *
7171 * Returns +self+ if +self+ is a +String+,
7172 * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7173 */
7174
7175static VALUE
7176rb_str_to_s(VALUE str)
7177{
7178 if (rb_obj_class(str) != rb_cString) {
7179 return str_duplicate(rb_cString, str);
7180 }
7181 return str;
7182}
7183
#if 0
/* Disabled: appends code point c, encoded in enc, to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7195
7196#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7197
7198int
7199rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7200{
7201 char buf[CHAR_ESC_LEN + 1];
7202 int l;
7203
7204#if SIZEOF_INT > 4
7205 c &= 0xffffffff;
7206#endif
7207 if (unicode_p) {
7208 if (c < 0x7F && ISPRINT(c)) {
7209 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7210 }
7211 else if (c < 0x10000) {
7212 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7213 }
7214 else {
7215 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7216 }
7217 }
7218 else {
7219 if (c < 0x100) {
7220 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7221 }
7222 else {
7223 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7224 }
7225 }
7226 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7227 rb_str_buf_cat(result, buf, l);
7228 return l;
7229}
7230
/*
 * Returns the backslash escape sequence (as a static C string) used for
 * control character +c+, or NULL when +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } table[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    const size_t count = sizeof(table) / sizeof(table[0]);

    for (size_t i = 0; i < count; i++) {
        if (table[i].code == c) return table[i].escape;
    }
    return NULL;
}
7248
7249VALUE
7250rb_str_escape(VALUE str)
7251{
7252 int encidx = ENCODING_GET(str);
7253 rb_encoding *enc = rb_enc_from_index(encidx);
7254 const char *p = RSTRING_PTR(str);
7255 const char *pend = RSTRING_END(str);
7256 const char *prev = p;
7257 char buf[CHAR_ESC_LEN + 1];
7258 VALUE result = rb_str_buf_new(0);
7259 int unicode_p = rb_enc_unicode_p(enc);
7260 int asciicompat = rb_enc_asciicompat(enc);
7261
7262 while (p < pend) {
7263 unsigned int c;
7264 const char *cc;
7265 int n = rb_enc_precise_mbclen(p, pend, enc);
7266 if (!MBCLEN_CHARFOUND_P(n)) {
7267 if (p > prev) str_buf_cat(result, prev, p - prev);
7268 n = rb_enc_mbminlen(enc);
7269 if (pend < p + n)
7270 n = (int)(pend - p);
7271 while (n--) {
7272 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7273 str_buf_cat(result, buf, strlen(buf));
7274 prev = ++p;
7275 }
7276 continue;
7277 }
7278 n = MBCLEN_CHARFOUND_LEN(n);
7279 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7280 p += n;
7281 cc = ruby_escaped_char(c);
7282 if (cc) {
7283 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7284 str_buf_cat(result, cc, strlen(cc));
7285 prev = p;
7286 }
7287 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7288 }
7289 else {
7290 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7291 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7292 prev = p;
7293 }
7294 }
7295 if (p > prev) str_buf_cat(result, prev, p - prev);
7296 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7297
7298 return result;
7299}
7300
7301/*
7302 * call-seq:
7303 * inspect -> string
7304 *
7305 * Returns a printable version of +self+, enclosed in double-quotes,
7306 * and with special characters escaped:
7307 *
7308 * s = "foo\tbar\tbaz\n"
7309 * s.inspect
7310 * # => "\"foo\\tbar\\tbaz\\n\""
7311 *
7312 */
7313
7314VALUE
7316{
7317 int encidx = ENCODING_GET(str);
7318 rb_encoding *enc = rb_enc_from_index(encidx);
7319 const char *p, *pend, *prev;
7320 char buf[CHAR_ESC_LEN + 1];
7321 VALUE result = rb_str_buf_new(0);
7322 rb_encoding *resenc = rb_default_internal_encoding();
7323 int unicode_p = rb_enc_unicode_p(enc);
7324 int asciicompat = rb_enc_asciicompat(enc);
7325
7326 if (resenc == NULL) resenc = rb_default_external_encoding();
7327 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7328 rb_enc_associate(result, resenc);
7329 str_buf_cat2(result, "\"");
7330
7331 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7332 prev = p;
7333 while (p < pend) {
7334 unsigned int c, cc;
7335 int n;
7336
7337 n = rb_enc_precise_mbclen(p, pend, enc);
7338 if (!MBCLEN_CHARFOUND_P(n)) {
7339 if (p > prev) str_buf_cat(result, prev, p - prev);
7340 n = rb_enc_mbminlen(enc);
7341 if (pend < p + n)
7342 n = (int)(pend - p);
7343 while (n--) {
7344 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7345 str_buf_cat(result, buf, strlen(buf));
7346 prev = ++p;
7347 }
7348 continue;
7349 }
7350 n = MBCLEN_CHARFOUND_LEN(n);
7351 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7352 p += n;
7353 if ((asciicompat || unicode_p) &&
7354 (c == '"'|| c == '\\' ||
7355 (c == '#' &&
7356 p < pend &&
7357 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7358 (cc = rb_enc_codepoint(p,pend,enc),
7359 (cc == '$' || cc == '@' || cc == '{'))))) {
7360 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7361 str_buf_cat2(result, "\\");
7362 if (asciicompat || enc == resenc) {
7363 prev = p - n;
7364 continue;
7365 }
7366 }
7367 switch (c) {
7368 case '\n': cc = 'n'; break;
7369 case '\r': cc = 'r'; break;
7370 case '\t': cc = 't'; break;
7371 case '\f': cc = 'f'; break;
7372 case '\013': cc = 'v'; break;
7373 case '\010': cc = 'b'; break;
7374 case '\007': cc = 'a'; break;
7375 case 033: cc = 'e'; break;
7376 default: cc = 0; break;
7377 }
7378 if (cc) {
7379 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7380 buf[0] = '\\';
7381 buf[1] = (char)cc;
7382 str_buf_cat(result, buf, 2);
7383 prev = p;
7384 continue;
7385 }
7386 /* The special casing of 0x85 (NEXT_LINE) here is because
7387 * Oniguruma historically treats it as printable, but it
7388 * doesn't match the print POSIX bracket class or character
7389 * property in regexps.
7390 *
7391 * See Ruby Bug #16842 for details:
7392 * https://bugs.ruby-lang.org/issues/16842
7393 */
7394 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7395 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7396 continue;
7397 }
7398 else {
7399 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7400 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7401 prev = p;
7402 continue;
7403 }
7404 }
7405 if (p > prev) str_buf_cat(result, prev, p - prev);
7406 str_buf_cat2(result, "\"");
7407
7408 return result;
7409}
7410
/* True when [ptr, end) is non-empty and begins with '$', '@' or '{',
 * i.e. a preceding '#' would start an interpolation.  Both arguments are
 * evaluated more than once. */
#define IS_EVSTR(ptr, end) \
    ((ptr) < (end) && (*(ptr) == '$' || *(ptr) == '@' || *(ptr) == '{'))
7412
7413/*
7414 * call-seq:
7415 * dump -> string
7416 *
7417 * Returns a printable version of +self+, enclosed in double-quotes,
7418 * with special characters escaped, and with non-printing characters
7419 * replaced by hexadecimal notation:
7420 *
7421 * "hello \n ''".dump # => "\"hello \\n ''\""
7422 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7423 *
7424 * Related: String#undump (inverse of String#dump).
7425 *
7426 */
7427
7428VALUE
7430{
7431 int encidx = rb_enc_get_index(str);
7432 rb_encoding *enc = rb_enc_from_index(encidx);
7433 long len;
7434 const char *p, *pend;
7435 char *q, *qend;
7436 VALUE result;
7437 int u8 = (encidx == rb_utf8_encindex());
7438 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7439
7440 len = 2; /* "" */
7441 if (!rb_enc_asciicompat(enc)) {
7442 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7443 len += strlen(enc->name);
7444 }
7445
7446 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7447 while (p < pend) {
7448 int clen;
7449 unsigned char c = *p++;
7450
7451 switch (c) {
7452 case '"': case '\\':
7453 case '\n': case '\r':
7454 case '\t': case '\f':
7455 case '\013': case '\010': case '\007': case '\033':
7456 clen = 2;
7457 break;
7458
7459 case '#':
7460 clen = IS_EVSTR(p, pend) ? 2 : 1;
7461 break;
7462
7463 default:
7464 if (ISPRINT(c)) {
7465 clen = 1;
7466 }
7467 else {
7468 if (u8 && c > 0x7F) { /* \u notation */
7469 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7470 if (MBCLEN_CHARFOUND_P(n)) {
7471 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7472 if (cc <= 0xFFFF)
7473 clen = 6; /* \uXXXX */
7474 else if (cc <= 0xFFFFF)
7475 clen = 9; /* \u{XXXXX} */
7476 else
7477 clen = 10; /* \u{XXXXXX} */
7478 p += MBCLEN_CHARFOUND_LEN(n)-1;
7479 break;
7480 }
7481 }
7482 clen = 4; /* \xNN */
7483 }
7484 break;
7485 }
7486
7487 if (clen > LONG_MAX - len) {
7488 rb_raise(rb_eRuntimeError, "string size too big");
7489 }
7490 len += clen;
7491 }
7492
7493 result = rb_str_new(0, len);
7494 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7495 q = RSTRING_PTR(result); qend = q + len + 1;
7496
7497 *q++ = '"';
7498 while (p < pend) {
7499 unsigned char c = *p++;
7500
7501 if (c == '"' || c == '\\') {
7502 *q++ = '\\';
7503 *q++ = c;
7504 }
7505 else if (c == '#') {
7506 if (IS_EVSTR(p, pend)) *q++ = '\\';
7507 *q++ = '#';
7508 }
7509 else if (c == '\n') {
7510 *q++ = '\\';
7511 *q++ = 'n';
7512 }
7513 else if (c == '\r') {
7514 *q++ = '\\';
7515 *q++ = 'r';
7516 }
7517 else if (c == '\t') {
7518 *q++ = '\\';
7519 *q++ = 't';
7520 }
7521 else if (c == '\f') {
7522 *q++ = '\\';
7523 *q++ = 'f';
7524 }
7525 else if (c == '\013') {
7526 *q++ = '\\';
7527 *q++ = 'v';
7528 }
7529 else if (c == '\010') {
7530 *q++ = '\\';
7531 *q++ = 'b';
7532 }
7533 else if (c == '\007') {
7534 *q++ = '\\';
7535 *q++ = 'a';
7536 }
7537 else if (c == '\033') {
7538 *q++ = '\\';
7539 *q++ = 'e';
7540 }
7541 else if (ISPRINT(c)) {
7542 *q++ = c;
7543 }
7544 else {
7545 *q++ = '\\';
7546 if (u8) {
7547 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7548 if (MBCLEN_CHARFOUND_P(n)) {
7549 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7550 p += n;
7551 if (cc <= 0xFFFF)
7552 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7553 else
7554 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7555 q += strlen(q);
7556 continue;
7557 }
7558 }
7559 snprintf(q, qend-q, "x%02X", c);
7560 q += 3;
7561 }
7562 }
7563 *q++ = '"';
7564 *q = '\0';
7565 if (!rb_enc_asciicompat(enc)) {
7566 snprintf(q, qend-q, nonascii_suffix, enc->name);
7567 encidx = rb_ascii8bit_encindex();
7568 }
7569 /* result from dump is ASCII */
7570 rb_enc_associate_index(result, encidx);
7572 return result;
7573}
7574
/*
 * Maps the letter of a single-letter backslash escape (the 'n' of "\n")
 * to the control character it denotes.
 *
 * Returns the character for n, r, t, f, v, b, a and e; returns -1 for
 * any other letter.  The caller visible in this file only passes the
 * eight supported letters.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* never fall off the end of a non-void function */
        return -1;
    }
}
7598
7599static void
7600undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7601{
7602 const char *s = *ss;
7603 unsigned int c;
7604 int codelen;
7605 size_t hexlen;
7606 unsigned char buf[6];
7607 static rb_encoding *enc_utf8 = NULL;
7608
7609 switch (*s) {
7610 case '\\':
7611 case '"':
7612 case '#':
7613 rb_str_cat(undumped, s, 1); /* cat itself */
7614 s++;
7615 break;
7616 case 'n':
7617 case 'r':
7618 case 't':
7619 case 'f':
7620 case 'v':
7621 case 'b':
7622 case 'a':
7623 case 'e':
7624 *buf = unescape_ascii(*s);
7625 rb_str_cat(undumped, (char *)buf, 1);
7626 s++;
7627 break;
7628 case 'u':
7629 if (*binary) {
7630 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7631 }
7632 *utf8 = true;
7633 if (++s >= s_end) {
7634 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7635 }
7636 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7637 if (*penc != enc_utf8) {
7638 *penc = enc_utf8;
7639 rb_enc_associate(undumped, enc_utf8);
7640 }
7641 if (*s == '{') { /* handle \u{...} form */
7642 s++;
7643 for (;;) {
7644 if (s >= s_end) {
7645 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7646 }
7647 if (*s == '}') {
7648 s++;
7649 break;
7650 }
7651 if (ISSPACE(*s)) {
7652 s++;
7653 continue;
7654 }
7655 c = scan_hex(s, s_end-s, &hexlen);
7656 if (hexlen == 0 || hexlen > 6) {
7657 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7658 }
7659 if (c > 0x10ffff) {
7660 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7661 }
7662 if (0xd800 <= c && c <= 0xdfff) {
7663 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7664 }
7665 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7666 rb_str_cat(undumped, (char *)buf, codelen);
7667 s += hexlen;
7668 }
7669 }
7670 else { /* handle \uXXXX form */
7671 c = scan_hex(s, 4, &hexlen);
7672 if (hexlen != 4) {
7673 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7674 }
7675 if (0xd800 <= c && c <= 0xdfff) {
7676 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7677 }
7678 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7679 rb_str_cat(undumped, (char *)buf, codelen);
7680 s += hexlen;
7681 }
7682 break;
7683 case 'x':
7684 if (*utf8) {
7685 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7686 }
7687 *binary = true;
7688 if (++s >= s_end) {
7689 rb_raise(rb_eRuntimeError, "invalid hex escape");
7690 }
7691 *buf = scan_hex(s, 2, &hexlen);
7692 if (hexlen != 2) {
7693 rb_raise(rb_eRuntimeError, "invalid hex escape");
7694 }
7695 rb_str_cat(undumped, (char *)buf, 1);
7696 s += hexlen;
7697 break;
7698 default:
7699 rb_str_cat(undumped, s-1, 2);
7700 s++;
7701 }
7702
7703 *ss = s;
7704}
7705
7706static VALUE rb_str_is_ascii_only_p(VALUE str);
7707
7708/*
7709 * call-seq:
7710 * undump -> string
7711 *
7712 * Returns an unescaped version of +self+:
7713 *
7714 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7715 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7716 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7717 * s_undumped == s_orig # => true
7718 *
7719 * Related: String#dump (inverse of String#undump).
7720 *
7721 */
7722
7723static VALUE
7724str_undump(VALUE str)
7725{
7726 const char *s = RSTRING_PTR(str);
7727 const char *s_end = RSTRING_END(str);
7728 rb_encoding *enc = rb_enc_get(str);
7729 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7730 bool utf8 = false;
7731 bool binary = false;
7732 int w;
7733
7735 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7736 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7737 }
7738 if (!str_null_check(str, &w)) {
7739 rb_raise(rb_eRuntimeError, "string contains null byte");
7740 }
7741 if (RSTRING_LEN(str) < 2) goto invalid_format;
7742 if (*s != '"') goto invalid_format;
7743
7744 /* strip '"' at the start */
7745 s++;
7746
7747 for (;;) {
7748 if (s >= s_end) {
7749 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7750 }
7751
7752 if (*s == '"') {
7753 /* epilogue */
7754 s++;
7755 if (s == s_end) {
7756 /* ascii compatible dumped string */
7757 break;
7758 }
7759 else {
7760 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7761 static const char dup_suffix[] = ".dup";
7762 const char *encname;
7763 int encidx;
7764 ptrdiff_t size;
7765
7766 /* check separately for strings dumped by older versions */
7767 size = sizeof(dup_suffix) - 1;
7768 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7769
7770 size = sizeof(force_encoding_suffix) - 1;
7771 if (s_end - s <= size) goto invalid_format;
7772 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7773 s += size;
7774
7775 if (utf8) {
7776 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7777 }
7778
7779 encname = s;
7780 s = memchr(s, '"', s_end-s);
7781 size = s - encname;
7782 if (!s) goto invalid_format;
7783 if (s_end - s != 2) goto invalid_format;
7784 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7785
7786 encidx = rb_enc_find_index2(encname, (long)size);
7787 if (encidx < 0) {
7788 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7789 }
7790 rb_enc_associate_index(undumped, encidx);
7791 }
7792 break;
7793 }
7794
7795 if (*s == '\\') {
7796 s++;
7797 if (s >= s_end) {
7798 rb_raise(rb_eRuntimeError, "invalid escape");
7799 }
7800 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7801 }
7802 else {
7803 rb_str_cat(undumped, s++, 1);
7804 }
7805 }
7806
7807 RB_GC_GUARD(str);
7808
7809 return undumped;
7810invalid_format:
7811 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7812}
7813
7814static void
7815rb_str_check_dummy_enc(rb_encoding *enc)
7816{
7817 if (rb_enc_dummy_p(enc)) {
7818 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7819 rb_enc_name(enc));
7820 }
7821}
7822
7823static rb_encoding *
7824str_true_enc(VALUE str)
7825{
7826 rb_encoding *enc = STR_ENC_GET(str);
7827 rb_str_check_dummy_enc(enc);
7828 return enc;
7829}
7830
7831static OnigCaseFoldType
7832check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7833{
7834 if (argc==0)
7835 return flags;
7836 if (argc>2)
7837 rb_raise(rb_eArgError, "too many options");
7838 if (argv[0]==sym_turkic) {
7839 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7840 if (argc==2) {
7841 if (argv[1]==sym_lithuanian)
7842 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7843 else
7844 rb_raise(rb_eArgError, "invalid second option");
7845 }
7846 }
7847 else if (argv[0]==sym_lithuanian) {
7848 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7849 if (argc==2) {
7850 if (argv[1]==sym_turkic)
7851 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7852 else
7853 rb_raise(rb_eArgError, "invalid second option");
7854 }
7855 }
7856 else if (argc>1)
7857 rb_raise(rb_eArgError, "too many options");
7858 else if (argv[0]==sym_ascii)
7859 flags |= ONIGENC_CASE_ASCII_ONLY;
7860 else if (argv[0]==sym_fold) {
7861 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7862 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7863 else
7864 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7865 }
7866 else
7867 rb_raise(rb_eArgError, "invalid option");
7868 return flags;
7869}
7870
7871static inline bool
7872case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7873{
7874 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7875 return true;
7876 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7877}
7878
/* Extra bytes added to the capacity of every case-mapping buffer, meant
 * to absorb any kind of single character length increase. */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Non-zero enables the tuning/diagnostic output of the case mappers. */
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
7884
7885struct mapping_buffer;
7886typedef struct mapping_buffer {
7887 size_t capa;
7888 size_t used;
7889 struct mapping_buffer *next;
7890 OnigUChar space[FLEX_ARY_LEN];
7892
7893static void
7894mapping_buffer_free(void *p)
7895{
7896 mapping_buffer *previous_buffer;
7897 mapping_buffer *current_buffer = p;
7898 while (current_buffer) {
7899 previous_buffer = current_buffer;
7900 current_buffer = current_buffer->next;
7901 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7902 }
7903}
7904
7905static const rb_data_type_t mapping_buffer_type = {
7906 "mapping_buffer",
7907 {0, mapping_buffer_free,},
7908 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7909};
7910
7911static VALUE
7912rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7913{
7914 VALUE target;
7915
7916 const OnigUChar *source_current, *source_end;
7917 int target_length = 0;
7918 VALUE buffer_anchor;
7919 mapping_buffer *current_buffer = 0;
7920 mapping_buffer **pre_buffer;
7921 size_t buffer_count = 0;
7922 int buffer_length_or_invalid;
7923
7924 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7925
7926 source_current = (OnigUChar*)RSTRING_PTR(source);
7927 source_end = (OnigUChar*)RSTRING_END(source);
7928
7929 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7930 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7931 while (source_current < source_end) {
7932 /* increase multiplier using buffer count to converge quickly */
7933 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7934 if (CASEMAP_DEBUG) {
7935 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7936 }
7937 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7938 *pre_buffer = current_buffer;
7939 pre_buffer = &current_buffer->next;
7940 current_buffer->next = NULL;
7941 current_buffer->capa = capa;
7942 buffer_length_or_invalid = enc->case_map(flags,
7943 &source_current, source_end,
7944 current_buffer->space,
7945 current_buffer->space+current_buffer->capa,
7946 enc);
7947 if (buffer_length_or_invalid < 0) {
7948 current_buffer = DATA_PTR(buffer_anchor);
7949 DATA_PTR(buffer_anchor) = 0;
7950 mapping_buffer_free(current_buffer);
7951 rb_raise(rb_eArgError, "input string invalid");
7952 }
7953 target_length += current_buffer->used = buffer_length_or_invalid;
7954 }
7955 if (CASEMAP_DEBUG) {
7956 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7957 }
7958
7959 if (buffer_count==1) {
7960 target = rb_str_new((const char*)current_buffer->space, target_length);
7961 }
7962 else {
7963 char *target_current;
7964
7965 target = rb_str_new(0, target_length);
7966 target_current = RSTRING_PTR(target);
7967 current_buffer = DATA_PTR(buffer_anchor);
7968 while (current_buffer) {
7969 memcpy(target_current, current_buffer->space, current_buffer->used);
7970 target_current += current_buffer->used;
7971 current_buffer = current_buffer->next;
7972 }
7973 }
7974 current_buffer = DATA_PTR(buffer_anchor);
7975 DATA_PTR(buffer_anchor) = 0;
7976 mapping_buffer_free(current_buffer);
7977
7978 RB_GC_GUARD(buffer_anchor);
7979
7980 /* TODO: check about string terminator character */
7981 str_enc_copy_direct(target, source);
7982 /*ENC_CODERANGE_SET(mapped, cr);*/
7983
7984 return target;
7985}
7986
7987static VALUE
7988rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7989{
7990 const OnigUChar *source_current, *source_end;
7991 OnigUChar *target_current, *target_end;
7992 long old_length = RSTRING_LEN(source);
7993 int length_or_invalid;
7994
7995 if (old_length == 0) return Qnil;
7996
7997 source_current = (OnigUChar*)RSTRING_PTR(source);
7998 source_end = (OnigUChar*)RSTRING_END(source);
7999 if (source == target) {
8000 target_current = (OnigUChar*)source_current;
8001 target_end = (OnigUChar*)source_end;
8002 }
8003 else {
8004 target_current = (OnigUChar*)RSTRING_PTR(target);
8005 target_end = (OnigUChar*)RSTRING_END(target);
8006 }
8007
8008 length_or_invalid = onigenc_ascii_only_case_map(flags,
8009 &source_current, source_end,
8010 target_current, target_end, enc);
8011 if (length_or_invalid < 0)
8012 rb_raise(rb_eArgError, "input string invalid");
8013 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8014 fprintf(stderr, "problem with rb_str_ascii_casemap"
8015 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8016 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8017 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8018 }
8019
8020 str_enc_copy(target, source);
8021
8022 return target;
8023}
8024
8025static bool
8026upcase_single(VALUE str)
8027{
8028 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8029 bool modified = false;
8030
8031 while (s < send) {
8032 unsigned int c = *(unsigned char*)s;
8033
8034 if ('a' <= c && c <= 'z') {
8035 *s = 'A' + (c - 'a');
8036 modified = true;
8037 }
8038 s++;
8039 }
8040 return modified;
8041}
8042
8043/*
8044 * call-seq:
8045 * upcase!(mapping) -> self or nil
8046 *
8047 * Upcases the characters in +self+;
8048 * returns +self+ if any changes were made, +nil+ otherwise:
8049 *
8050 * s = 'Hello World!' # => "Hello World!"
8051 * s.upcase! # => "HELLO WORLD!"
8052 * s # => "HELLO WORLD!"
8053 * s.upcase! # => nil
8054 *
8055 * The casing may be affected by the given +mapping+;
8056 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8057 *
8058 * Related: String#upcase, String#downcase, String#downcase!.
8059 *
8060 */
8061
8062static VALUE
8063rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8064{
8065 rb_encoding *enc;
8066 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8067
8068 flags = check_case_options(argc, argv, flags);
8069 str_modify_keep_cr(str);
8070 enc = str_true_enc(str);
8071 if (case_option_single_p(flags, enc, str)) {
8072 if (upcase_single(str))
8073 flags |= ONIGENC_CASE_MODIFIED;
8074 }
8075 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8076 rb_str_ascii_casemap(str, str, &flags, enc);
8077 else
8078 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8079
8080 if (ONIGENC_CASE_MODIFIED&flags) return str;
8081 return Qnil;
8082}
8083
8084
8085/*
8086 * call-seq:
8087 * upcase(mapping) -> string
8088 *
8089 * Returns a string containing the upcased characters in +self+:
8090 *
8091 * s = 'Hello World!' # => "Hello World!"
8092 * s.upcase # => "HELLO WORLD!"
8093 *
8094 * The casing may be affected by the given +mapping+;
8095 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8096 *
8097 * Related: String#upcase!, String#downcase, String#downcase!.
8098 *
8099 */
8100
8101static VALUE
8102rb_str_upcase(int argc, VALUE *argv, VALUE str)
8103{
8104 rb_encoding *enc;
8105 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8106 VALUE ret;
8107
8108 flags = check_case_options(argc, argv, flags);
8109 enc = str_true_enc(str);
8110 if (case_option_single_p(flags, enc, str)) {
8111 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8112 str_enc_copy_direct(ret, str);
8113 upcase_single(ret);
8114 }
8115 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8116 ret = rb_str_new(0, RSTRING_LEN(str));
8117 rb_str_ascii_casemap(str, ret, &flags, enc);
8118 }
8119 else {
8120 ret = rb_str_casemap(str, &flags, enc);
8121 }
8122
8123 return ret;
8124}
8125
8126static bool
8127downcase_single(VALUE str)
8128{
8129 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8130 bool modified = false;
8131
8132 while (s < send) {
8133 unsigned int c = *(unsigned char*)s;
8134
8135 if ('A' <= c && c <= 'Z') {
8136 *s = 'a' + (c - 'A');
8137 modified = true;
8138 }
8139 s++;
8140 }
8141
8142 return modified;
8143}
8144
8145/*
8146 * call-seq:
8147 * downcase!(mapping) -> self or nil
8148 *
8149 * Like String#downcase, except that:
8150 *
8151 * - Changes character casings in +self+ (not in a copy of +self+).
8152 * - Returns +self+ if any changes are made, +nil+ otherwise.
8153 *
8154 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8155 */
8156
8157static VALUE
8158rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8159{
8160 rb_encoding *enc;
8161 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8162
8163 flags = check_case_options(argc, argv, flags);
8164 str_modify_keep_cr(str);
8165 enc = str_true_enc(str);
8166 if (case_option_single_p(flags, enc, str)) {
8167 if (downcase_single(str))
8168 flags |= ONIGENC_CASE_MODIFIED;
8169 }
8170 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8171 rb_str_ascii_casemap(str, str, &flags, enc);
8172 else
8173 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8174
8175 if (ONIGENC_CASE_MODIFIED&flags) return str;
8176 return Qnil;
8177}
8178
8179
8180/*
8181 * call-seq:
8182 * downcase(mapping) -> string
8183 *
8184 * :include: doc/string/downcase.rdoc
8185 *
8186 */
8187
8188static VALUE
8189rb_str_downcase(int argc, VALUE *argv, VALUE str)
8190{
8191 rb_encoding *enc;
8192 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8193 VALUE ret;
8194
8195 flags = check_case_options(argc, argv, flags);
8196 enc = str_true_enc(str);
8197 if (case_option_single_p(flags, enc, str)) {
8198 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8199 str_enc_copy_direct(ret, str);
8200 downcase_single(ret);
8201 }
8202 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8203 ret = rb_str_new(0, RSTRING_LEN(str));
8204 rb_str_ascii_casemap(str, ret, &flags, enc);
8205 }
8206 else {
8207 ret = rb_str_casemap(str, &flags, enc);
8208 }
8209
8210 return ret;
8211}
8212
8213
8214/*
8215 * call-seq:
8216 * capitalize!(mapping = :ascii) -> self or nil
8217 *
8218 * Like String#capitalize, except that:
8219 *
8220 * - Changes character casings in +self+ (not in a copy of +self+).
8221 * - Returns +self+ if any changes are made, +nil+ otherwise.
8222 *
8223 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8224 */
8225
8226static VALUE
8227rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8228{
8229 rb_encoding *enc;
8230 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8231
8232 flags = check_case_options(argc, argv, flags);
8233 str_modify_keep_cr(str);
8234 enc = str_true_enc(str);
8235 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8236 if (flags&ONIGENC_CASE_ASCII_ONLY)
8237 rb_str_ascii_casemap(str, str, &flags, enc);
8238 else
8239 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8240
8241 if (ONIGENC_CASE_MODIFIED&flags) return str;
8242 return Qnil;
8243}
8244
8245
8246/*
8247 * call-seq:
8248 * capitalize(mapping = :ascii) -> string
8249 *
8250 * Returns a string containing the characters in +self+,
8251 * each with possibly changed case:
8252 *
8253 * - The first character is upcased.
8254 * - All other characters are downcased.
8255 *
8256 * Examples:
8257 *
8258 * 'hello world'.capitalize # => "Hello world"
8259 * 'HELLO WORLD'.capitalize # => "Hello world"
8260 *
8261 * Some characters do not have upcase and downcase, and so are not changed;
8262 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8263 *
8264 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8265 *
8266 * The casing is affected by the given +mapping+,
8267 * which may be +:ascii+, +:fold+, or +:turkic+;
8268 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8269 *
8270 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8271 */
8272
8273static VALUE
8274rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8275{
8276 rb_encoding *enc;
8277 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8278 VALUE ret;
8279
8280 flags = check_case_options(argc, argv, flags);
8281 enc = str_true_enc(str);
8282 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8283 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8284 ret = rb_str_new(0, RSTRING_LEN(str));
8285 rb_str_ascii_casemap(str, ret, &flags, enc);
8286 }
8287 else {
8288 ret = rb_str_casemap(str, &flags, enc);
8289 }
8290 return ret;
8291}
8292
8293
8294/*
8295 * call-seq:
8296 * swapcase!(mapping) -> self or nil
8297 *
8298 * Upcases each lowercase character in +self+;
8299 * downcases uppercase character;
8300 * returns +self+ if any changes were made, +nil+ otherwise:
8301 *
8302 * s = 'Hello World!' # => "Hello World!"
8303 * s.swapcase! # => "hELLO wORLD!"
8304 * s # => "hELLO wORLD!"
8305 * ''.swapcase! # => nil
8306 *
8307 * The casing may be affected by the given +mapping+;
8308 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8309 *
8310 * Related: String#swapcase.
8311 *
8312 */
8313
8314static VALUE
8315rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8316{
8317 rb_encoding *enc;
8318 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8319
8320 flags = check_case_options(argc, argv, flags);
8321 str_modify_keep_cr(str);
8322 enc = str_true_enc(str);
8323 if (flags&ONIGENC_CASE_ASCII_ONLY)
8324 rb_str_ascii_casemap(str, str, &flags, enc);
8325 else
8326 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8327
8328 if (ONIGENC_CASE_MODIFIED&flags) return str;
8329 return Qnil;
8330}
8331
8332
8333/*
8334 * call-seq:
8335 * swapcase(mapping) -> string
8336 *
8337 * Returns a string containing the characters in +self+, with cases reversed;
8338 * each uppercase character is downcased;
8339 * each lowercase character is upcased:
8340 *
8341 * s = 'Hello World!' # => "Hello World!"
8342 * s.swapcase # => "hELLO wORLD!"
8343 *
8344 * The casing may be affected by the given +mapping+;
8345 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8346 *
8347 * Related: String#swapcase!.
8348 *
8349 */
8350
8351static VALUE
8352rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8353{
8354 rb_encoding *enc;
8355 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8356 VALUE ret;
8357
8358 flags = check_case_options(argc, argv, flags);
8359 enc = str_true_enc(str);
8360 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8361 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8362 ret = rb_str_new(0, RSTRING_LEN(str));
8363 rb_str_ascii_casemap(str, ret, &flags, enc);
8364 }
8365 else {
8366 ret = rb_str_casemap(str, &flags, enc);
8367 }
8368 return ret;
8369}
8370
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range "a-z" is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* read position in the specification */
    char *pend;         /* end of the specification */
};
8378
8379static unsigned int
8380trnext(struct tr *t, rb_encoding *enc)
8381{
8382 int n;
8383
8384 for (;;) {
8385 nextpart:
8386 if (!t->gen) {
8387 if (t->p == t->pend) return -1;
8388 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8389 t->p += n;
8390 }
8391 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8392 t->p += n;
8393 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8394 t->p += n;
8395 if (t->p < t->pend) {
8396 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8397 t->p += n;
8398 if (t->now > c) {
8399 if (t->now < 0x80 && c < 0x80) {
8400 rb_raise(rb_eArgError,
8401 "invalid range \"%c-%c\" in string transliteration",
8402 t->now, c);
8403 }
8404 else {
8405 rb_raise(rb_eArgError, "invalid range in string transliteration");
8406 }
8407 continue; /* not reached */
8408 }
8409 else if (t->now < c) {
8410 t->gen = 1;
8411 t->max = c;
8412 }
8413 }
8414 }
8415 return t->now;
8416 }
8417 else {
8418 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8419 if (t->now == t->max) {
8420 t->gen = 0;
8421 goto nextpart;
8422 }
8423 }
8424 if (t->now < t->max) {
8425 return t->now;
8426 }
8427 else {
8428 t->gen = 0;
8429 return t->max;
8430 }
8431 }
8432 }
8433}
8434
8435static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8436
8437static VALUE
8438tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8439{
8440 const unsigned int errc = -1;
8441 unsigned int trans[256];
8442 rb_encoding *enc, *e1, *e2;
8443 struct tr trsrc, trrepl;
8444 int cflag = 0;
8445 unsigned int c, c0, last = 0;
8446 int modify = 0, i, l;
8447 unsigned char *s, *send;
8448 VALUE hash = 0;
8449 int singlebyte = single_byte_optimizable(str);
8450 int termlen;
8451 int cr;
8452
8453#define CHECK_IF_ASCII(c) \
8454 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8455 (cr = ENC_CODERANGE_VALID) : 0)
8456
8457 StringValue(src);
8458 StringValue(repl);
8459 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8460 if (RSTRING_LEN(repl) == 0) {
8461 return rb_str_delete_bang(1, &src, str);
8462 }
8463
8464 cr = ENC_CODERANGE(str);
8465 e1 = rb_enc_check(str, src);
8466 e2 = rb_enc_check(str, repl);
8467 if (e1 == e2) {
8468 enc = e1;
8469 }
8470 else {
8471 enc = rb_enc_check(src, repl);
8472 }
8473 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8474 if (RSTRING_LEN(src) > 1 &&
8475 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8476 trsrc.p + l < trsrc.pend) {
8477 cflag = 1;
8478 trsrc.p += l;
8479 }
8480 trrepl.p = RSTRING_PTR(repl);
8481 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8482 trsrc.gen = trrepl.gen = 0;
8483 trsrc.now = trrepl.now = 0;
8484 trsrc.max = trrepl.max = 0;
8485
8486 if (cflag) {
8487 for (i=0; i<256; i++) {
8488 trans[i] = 1;
8489 }
8490 while ((c = trnext(&trsrc, enc)) != errc) {
8491 if (c < 256) {
8492 trans[c] = errc;
8493 }
8494 else {
8495 if (!hash) hash = rb_hash_new();
8496 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8497 }
8498 }
8499 while ((c = trnext(&trrepl, enc)) != errc)
8500 /* retrieve last replacer */;
8501 last = trrepl.now;
8502 for (i=0; i<256; i++) {
8503 if (trans[i] != errc) {
8504 trans[i] = last;
8505 }
8506 }
8507 }
8508 else {
8509 unsigned int r;
8510
8511 for (i=0; i<256; i++) {
8512 trans[i] = errc;
8513 }
8514 while ((c = trnext(&trsrc, enc)) != errc) {
8515 r = trnext(&trrepl, enc);
8516 if (r == errc) r = trrepl.now;
8517 if (c < 256) {
8518 trans[c] = r;
8519 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8520 }
8521 else {
8522 if (!hash) hash = rb_hash_new();
8523 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8524 }
8525 }
8526 }
8527
8528 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8529 cr = ENC_CODERANGE_7BIT;
8530 str_modify_keep_cr(str);
8531 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8532 termlen = rb_enc_mbminlen(enc);
8533 if (sflag) {
8534 int clen, tlen;
8535 long offset, max = RSTRING_LEN(str);
8536 unsigned int save = -1;
8537 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8538
8539 while (s < send) {
8540 int may_modify = 0;
8541
8542 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8543 if (!MBCLEN_CHARFOUND_P(r)) {
8544 xfree(buf);
8545 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8546 }
8547 clen = MBCLEN_CHARFOUND_LEN(r);
8548 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8549
8550 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8551
8552 s += clen;
8553 if (c < 256) {
8554 c = trans[c];
8555 }
8556 else if (hash) {
8557 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8558 if (NIL_P(tmp)) {
8559 if (cflag) c = last;
8560 else c = errc;
8561 }
8562 else if (cflag) c = errc;
8563 else c = NUM2INT(tmp);
8564 }
8565 else {
8566 c = errc;
8567 }
8568 if (c != (unsigned int)-1) {
8569 if (save == c) {
8570 CHECK_IF_ASCII(c);
8571 continue;
8572 }
8573 save = c;
8574 tlen = rb_enc_codelen(c, enc);
8575 modify = 1;
8576 }
8577 else {
8578 save = -1;
8579 c = c0;
8580 if (enc != e1) may_modify = 1;
8581 }
8582 if ((offset = t - buf) + tlen > max) {
8583 size_t MAYBE_UNUSED(old) = max + termlen;
8584 max = offset + tlen + (send - s);
8585 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8586 t = buf + offset;
8587 }
8588 rb_enc_mbcput(c, t, enc);
8589 if (may_modify && memcmp(s, t, tlen) != 0) {
8590 modify = 1;
8591 }
8592 CHECK_IF_ASCII(c);
8593 t += tlen;
8594 }
8595 if (!STR_EMBED_P(str)) {
8596 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8597 }
8598 TERM_FILL((char *)t, termlen);
8599 RSTRING(str)->as.heap.ptr = (char *)buf;
8600 STR_SET_LEN(str, t - buf);
8601 STR_SET_NOEMBED(str);
8602 RSTRING(str)->as.heap.aux.capa = max;
8603 }
8604 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8605 while (s < send) {
8606 c = (unsigned char)*s;
8607 if (trans[c] != errc) {
8608 if (!cflag) {
8609 c = trans[c];
8610 *s = c;
8611 modify = 1;
8612 }
8613 else {
8614 *s = last;
8615 modify = 1;
8616 }
8617 }
8618 CHECK_IF_ASCII(c);
8619 s++;
8620 }
8621 }
8622 else {
8623 int clen, tlen;
8624 long offset, max = (long)((send - s) * 1.2);
8625 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8626
8627 while (s < send) {
8628 int may_modify = 0;
8629
8630 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8631 if (!MBCLEN_CHARFOUND_P(r)) {
8632 xfree(buf);
8633 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8634 }
8635 clen = MBCLEN_CHARFOUND_LEN(r);
8636 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8637
8638 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8639
8640 if (c < 256) {
8641 c = trans[c];
8642 }
8643 else if (hash) {
8644 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8645 if (NIL_P(tmp)) {
8646 if (cflag) c = last;
8647 else c = errc;
8648 }
8649 else if (cflag) c = errc;
8650 else c = NUM2INT(tmp);
8651 }
8652 else {
8653 c = cflag ? last : errc;
8654 }
8655 if (c != errc) {
8656 tlen = rb_enc_codelen(c, enc);
8657 modify = 1;
8658 }
8659 else {
8660 c = c0;
8661 if (enc != e1) may_modify = 1;
8662 }
8663 if ((offset = t - buf) + tlen > max) {
8664 size_t MAYBE_UNUSED(old) = max + termlen;
8665 max = offset + tlen + (long)((send - s) * 1.2);
8666 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8667 t = buf + offset;
8668 }
8669 if (s != t) {
8670 rb_enc_mbcput(c, t, enc);
8671 if (may_modify && memcmp(s, t, tlen) != 0) {
8672 modify = 1;
8673 }
8674 }
8675 CHECK_IF_ASCII(c);
8676 s += clen;
8677 t += tlen;
8678 }
8679 if (!STR_EMBED_P(str)) {
8680 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8681 }
8682 TERM_FILL((char *)t, termlen);
8683 RSTRING(str)->as.heap.ptr = (char *)buf;
8684 STR_SET_LEN(str, t - buf);
8685 STR_SET_NOEMBED(str);
8686 RSTRING(str)->as.heap.aux.capa = max;
8687 }
8688
8689 if (modify) {
8690 if (cr != ENC_CODERANGE_BROKEN)
8691 ENC_CODERANGE_SET(str, cr);
8692 rb_enc_associate(str, enc);
8693 return str;
8694 }
8695 return Qnil;
8696}
8697
8698
8699/*
8700 * call-seq:
8701 * tr!(selector, replacements) -> self or nil
8702 *
8703 * Like String#tr, but modifies +self+ in place.
8704 * Returns +self+ if any changes were made, +nil+ otherwise.
8705 *
8706 */
8707
8708static VALUE
8709rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8710{
8711 return tr_trans(str, src, repl, 0);
8712}
8713
8714
8715/*
8716 * call-seq:
8717 * tr(selector, replacements) -> new_string
8718 *
8719 * Returns a copy of +self+ with each character specified by string +selector+
8720 * translated to the corresponding character in string +replacements+.
8721 * The correspondence is _positional_:
8722 *
8723 * - Each occurrence of the first character specified by +selector+
8724 * is translated to the first character in +replacements+.
8725 * - Each occurrence of the second character specified by +selector+
8726 * is translated to the second character in +replacements+.
8727 * - And so on.
8728 *
8729 * Example:
8730 *
8731 * 'hello'.tr('el', 'ip') #=> "hippo"
8732 *
8733 * If +replacements+ is shorter than +selector+,
8734 * it is implicitly padded with its own last character:
8735 *
8736 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8737 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8738 *
8739 * Arguments +selector+ and +replacements+ must be valid character selectors
8740 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8741 * and may use any of its valid forms, including negation, ranges, and escaping:
8742 *
8743 * # Negation.
8744 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8745 * # Ranges.
8746 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8747 * # Escapes.
8748 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8749 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8750 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8751 *
8752 */
8753
8754static VALUE
8755rb_str_tr(VALUE str, VALUE src, VALUE repl)
8756{
8757 str = str_duplicate(rb_cString, str);
8758 tr_trans(str, src, repl, 0);
8759 return str;
8760}
8761
8762#define TR_TABLE_MAX (UCHAR_MAX+1)
8763#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8764static void
8765tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8766 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8767{
8768 const unsigned int errc = -1;
8769 char buf[TR_TABLE_MAX];
8770 struct tr tr;
8771 unsigned int c;
8772 VALUE table = 0, ptable = 0;
8773 int i, l, cflag = 0;
8774
8775 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8776 tr.gen = tr.now = tr.max = 0;
8777
8778 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8779 cflag = 1;
8780 tr.p += l;
8781 }
8782 if (first) {
8783 for (i=0; i<TR_TABLE_MAX; i++) {
8784 stable[i] = 1;
8785 }
8786 stable[TR_TABLE_MAX] = cflag;
8787 }
8788 else if (stable[TR_TABLE_MAX] && !cflag) {
8789 stable[TR_TABLE_MAX] = 0;
8790 }
8791 for (i=0; i<TR_TABLE_MAX; i++) {
8792 buf[i] = cflag;
8793 }
8794
8795 while ((c = trnext(&tr, enc)) != errc) {
8796 if (c < TR_TABLE_MAX) {
8797 buf[(unsigned char)c] = !cflag;
8798 }
8799 else {
8800 VALUE key = UINT2NUM(c);
8801
8802 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8803 if (cflag) {
8804 ptable = *ctablep;
8805 table = ptable ? ptable : rb_hash_new();
8806 *ctablep = table;
8807 }
8808 else {
8809 table = rb_hash_new();
8810 ptable = *tablep;
8811 *tablep = table;
8812 }
8813 }
8814 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8815 rb_hash_aset(table, key, Qtrue);
8816 }
8817 }
8818 }
8819 for (i=0; i<TR_TABLE_MAX; i++) {
8820 stable[i] = stable[i] && buf[i];
8821 }
8822 if (!table && !cflag) {
8823 *tablep = 0;
8824 }
8825}
8826
8827
8828static int
8829tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8830{
8831 if (c < TR_TABLE_MAX) {
8832 return table[c] != 0;
8833 }
8834 else {
8835 VALUE v = UINT2NUM(c);
8836
8837 if (del) {
8838 if (!NIL_P(rb_hash_lookup(del, v)) &&
8839 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8840 return TRUE;
8841 }
8842 }
8843 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8844 return FALSE;
8845 }
8846 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8847 }
8848}
8849
8850/*
8851 * call-seq:
8852 * delete!(*selectors) -> self or nil
8853 *
8854 * Like String#delete, but modifies +self+ in place;
8855 * returns +self+ if any characters were deleted, +nil+ otherwise.
8856 *
8857 * Related: see {Modifying}[rdoc-ref:String@Modifying].
8858 */
8859
8860static VALUE
8861rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8862{
8863 char squeez[TR_TABLE_SIZE];
8864 rb_encoding *enc = 0;
8865 char *s, *send, *t;
8866 VALUE del = 0, nodel = 0;
8867 int modify = 0;
8868 int i, ascompat, cr;
8869
8870 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8872 for (i=0; i<argc; i++) {
8873 VALUE s = argv[i];
8874
8875 StringValue(s);
8876 enc = rb_enc_check(str, s);
8877 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8878 }
8879
8880 str_modify_keep_cr(str);
8881 ascompat = rb_enc_asciicompat(enc);
8882 s = t = RSTRING_PTR(str);
8883 send = RSTRING_END(str);
8884 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8885 while (s < send) {
8886 unsigned int c;
8887 int clen;
8888
8889 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8890 if (squeez[c]) {
8891 modify = 1;
8892 }
8893 else {
8894 if (t != s) *t = c;
8895 t++;
8896 }
8897 s++;
8898 }
8899 else {
8900 c = rb_enc_codepoint_len(s, send, &clen, enc);
8901
8902 if (tr_find(c, squeez, del, nodel)) {
8903 modify = 1;
8904 }
8905 else {
8906 if (t != s) rb_enc_mbcput(c, t, enc);
8907 t += clen;
8909 }
8910 s += clen;
8911 }
8912 }
8913 TERM_FILL(t, TERM_LEN(str));
8914 STR_SET_LEN(str, t - RSTRING_PTR(str));
8915 ENC_CODERANGE_SET(str, cr);
8916
8917 if (modify) return str;
8918 return Qnil;
8919}
8920
8921
8922/*
8923 * call-seq:
8924 * delete(*selectors) -> new_string
8925 *
8926 * :include: doc/string/delete.rdoc
8927 *
8928 */
8929
8930static VALUE
8931rb_str_delete(int argc, VALUE *argv, VALUE str)
8932{
8933 str = str_duplicate(rb_cString, str);
8934 rb_str_delete_bang(argc, argv, str);
8935 return str;
8936}
8937
8938
8939/*
8940 * call-seq:
8941 * squeeze!(*selectors) -> self or nil
8942 *
8943 * Like String#squeeze, but modifies +self+ in place.
8944 * Returns +self+ if any changes were made, +nil+ otherwise.
8945 */
8946
8947static VALUE
8948rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8949{
8950 char squeez[TR_TABLE_SIZE];
8951 rb_encoding *enc = 0;
8952 VALUE del = 0, nodel = 0;
8953 unsigned char *s, *send, *t;
8954 int i, modify = 0;
8955 int ascompat, singlebyte = single_byte_optimizable(str);
8956 unsigned int save;
8957
8958 if (argc == 0) {
8959 enc = STR_ENC_GET(str);
8960 }
8961 else {
8962 for (i=0; i<argc; i++) {
8963 VALUE s = argv[i];
8964
8965 StringValue(s);
8966 enc = rb_enc_check(str, s);
8967 if (singlebyte && !single_byte_optimizable(s))
8968 singlebyte = 0;
8969 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8970 }
8971 }
8972
8973 str_modify_keep_cr(str);
8974 s = t = (unsigned char *)RSTRING_PTR(str);
8975 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8976 send = (unsigned char *)RSTRING_END(str);
8977 save = -1;
8978 ascompat = rb_enc_asciicompat(enc);
8979
8980 if (singlebyte) {
8981 while (s < send) {
8982 unsigned int c = *s++;
8983 if (c != save || (argc > 0 && !squeez[c])) {
8984 *t++ = save = c;
8985 }
8986 }
8987 }
8988 else {
8989 while (s < send) {
8990 unsigned int c;
8991 int clen;
8992
8993 if (ascompat && (c = *s) < 0x80) {
8994 if (c != save || (argc > 0 && !squeez[c])) {
8995 *t++ = save = c;
8996 }
8997 s++;
8998 }
8999 else {
9000 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9001
9002 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9003 if (t != s) rb_enc_mbcput(c, t, enc);
9004 save = c;
9005 t += clen;
9006 }
9007 s += clen;
9008 }
9009 }
9010 }
9011
9012 TERM_FILL((char *)t, TERM_LEN(str));
9013 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9014 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9015 modify = 1;
9016 }
9017
9018 if (modify) return str;
9019 return Qnil;
9020}
9021
9022
9023/*
9024 * call-seq:
9025 * squeeze(*selectors) -> new_string
9026 *
9027 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9028 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9029 *
9030 * "Squeezed" means that each multiple-character run of a selected character
9031 * is squeezed down to a single character;
9032 * with no arguments given, squeezes all characters:
9033 *
9034 * "yellow moon".squeeze #=> "yelow mon"
9035 * "  now   is  the".squeeze(" ") #=> " now is the"
9036 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9037 *
9038 */
9039
9040static VALUE
9041rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9042{
9043 str = str_duplicate(rb_cString, str);
9044 rb_str_squeeze_bang(argc, argv, str);
9045 return str;
9046}
9047
9048
9049/*
9050 * call-seq:
9051 * tr_s!(selector, replacements) -> self or nil
9052 *
9053 * Like String#tr_s, but modifies +self+ in place.
9054 * Returns +self+ if any changes were made, +nil+ otherwise.
9055 *
9056 * Related: String#squeeze!.
9057 */
9058
9059static VALUE
9060rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9061{
9062 return tr_trans(str, src, repl, 1);
9063}
9064
9065
9066/*
9067 * call-seq:
9068 * tr_s(selector, replacements) -> string
9069 *
9070 * Like String#tr, but also squeezes the modified portions of the translated string;
9071 * returns a new string (translated and squeezed).
9072 *
9073 * 'hello'.tr_s('l', 'r') #=> "hero"
9074 * 'hello'.tr_s('el', '-') #=> "h-o"
9075 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9076 *
9077 * Related: String#squeeze.
9078 *
9079 */
9080
9081static VALUE
9082rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9083{
9084 str = str_duplicate(rb_cString, str);
9085 tr_trans(str, src, repl, 1);
9086 return str;
9087}
9088
9089
9090/*
9091 * call-seq:
9092 * count(*selectors) -> integer
9093 *
9094 * :include: doc/string/count.rdoc
9095 */
9096
9097static VALUE
9098rb_str_count(int argc, VALUE *argv, VALUE str)
9099{
9100 char table[TR_TABLE_SIZE];
9101 rb_encoding *enc = 0;
9102 VALUE del = 0, nodel = 0, tstr;
9103 char *s, *send;
9104 int i;
9105 int ascompat;
9106 size_t n = 0;
9107
9109
9110 tstr = argv[0];
9111 StringValue(tstr);
9112 enc = rb_enc_check(str, tstr);
9113 if (argc == 1) {
9114 const char *ptstr;
9115 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9116 (ptstr = RSTRING_PTR(tstr),
9117 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9118 !is_broken_string(str)) {
9119 int clen;
9120 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9121
9122 s = RSTRING_PTR(str);
9123 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9124 send = RSTRING_END(str);
9125 while (s < send) {
9126 if (*(unsigned char*)s++ == c) n++;
9127 }
9128 return SIZET2NUM(n);
9129 }
9130 }
9131
9132 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9133 for (i=1; i<argc; i++) {
9134 tstr = argv[i];
9135 StringValue(tstr);
9136 enc = rb_enc_check(str, tstr);
9137 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9138 }
9139
9140 s = RSTRING_PTR(str);
9141 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9142 send = RSTRING_END(str);
9143 ascompat = rb_enc_asciicompat(enc);
9144 while (s < send) {
9145 unsigned int c;
9146
9147 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9148 if (table[c]) {
9149 n++;
9150 }
9151 s++;
9152 }
9153 else {
9154 int clen;
9155 c = rb_enc_codepoint_len(s, send, &clen, enc);
9156 if (tr_find(c, table, del, nodel)) {
9157 n++;
9158 }
9159 s += clen;
9160 }
9161 }
9162
9163 return SIZET2NUM(n);
9164}
9165
9166static VALUE
9167rb_fs_check(VALUE val)
9168{
9169 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9170 val = rb_check_string_type(val);
9171 if (NIL_P(val)) return 0;
9172 }
9173 return val;
9174}
9175
/*
 * Byte-indexed whitespace table: 1 for TAB, LF, VT, FF, CR and SPACE,
 * 0 (implicitly) for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* '\t' */
    [0x0a] = 1, /* '\n' */
    [0x0b] = 1, /* '\v' */
    [0x0c] = 1, /* '\f' */
    [0x0d] = 1, /* '\r' */
    [0x20] = 1, /* ' '  */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9196
9197static long
9198split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9199{
9200 if (empty_count >= 0 && len == 0) {
9201 return empty_count + 1;
9202 }
9203 if (empty_count > 0) {
9204 /* make different substrings */
9205 if (result) {
9206 do {
9207 rb_ary_push(result, str_new_empty_String(str));
9208 } while (--empty_count > 0);
9209 }
9210 else {
9211 do {
9212 rb_yield(str_new_empty_String(str));
9213 } while (--empty_count > 0);
9214 }
9215 }
9216 str = rb_str_subseq(str, beg, len);
9217 if (result) {
9218 rb_ary_push(result, str);
9219 }
9220 else {
9221 rb_yield(str);
9222 }
9223 return empty_count;
9224}
9225
9226typedef enum {
9227 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9228} split_type_t;
9229
9230static split_type_t
9231literal_split_pattern(VALUE spat, split_type_t default_type)
9232{
9233 rb_encoding *enc = STR_ENC_GET(spat);
9234 const char *ptr;
9235 long len;
9236 RSTRING_GETMEM(spat, ptr, len);
9237 if (len == 0) {
9238 /* Special case - split into chars */
9239 return SPLIT_TYPE_CHARS;
9240 }
9241 else if (rb_enc_asciicompat(enc)) {
9242 if (len == 1 && ptr[0] == ' ') {
9243 return SPLIT_TYPE_AWK;
9244 }
9245 }
9246 else {
9247 int l;
9248 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9249 return SPLIT_TYPE_AWK;
9250 }
9251 }
9252 return default_type;
9253}
9254
9255/*
9256 * call-seq:
9257 * split(field_sep = $;, limit = 0) -> array
9258 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9259 *
9260 * :include: doc/string/split.rdoc
9261 *
9262 */
9263
9264static VALUE
9265rb_str_split_m(int argc, VALUE *argv, VALUE str)
9266{
9267 rb_encoding *enc;
9268 VALUE spat;
9269 VALUE limit;
9270 split_type_t split_type;
9271 long beg, end, i = 0, empty_count = -1;
9272 int lim = 0;
9273 VALUE result, tmp;
9274
9275 result = rb_block_given_p() ? Qfalse : Qnil;
9276 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9277 lim = NUM2INT(limit);
9278 if (lim <= 0) limit = Qnil;
9279 else if (lim == 1) {
9280 if (RSTRING_LEN(str) == 0)
9281 return result ? rb_ary_new2(0) : str;
9282 tmp = str_duplicate(rb_cString, str);
9283 if (!result) {
9284 rb_yield(tmp);
9285 return str;
9286 }
9287 return rb_ary_new3(1, tmp);
9288 }
9289 i = 1;
9290 }
9291 if (NIL_P(limit) && !lim) empty_count = 0;
9292
9293 enc = STR_ENC_GET(str);
9294 split_type = SPLIT_TYPE_REGEXP;
9295 if (!NIL_P(spat)) {
9296 spat = get_pat_quoted(spat, 0);
9297 }
9298 else if (NIL_P(spat = rb_fs)) {
9299 split_type = SPLIT_TYPE_AWK;
9300 }
9301 else if (!(spat = rb_fs_check(spat))) {
9302 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9303 }
9304 else {
9305 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9306 }
9307 if (split_type != SPLIT_TYPE_AWK) {
9308 switch (BUILTIN_TYPE(spat)) {
9309 case T_REGEXP:
9310 rb_reg_options(spat); /* check if uninitialized */
9311 tmp = RREGEXP_SRC(spat);
9312 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9313 if (split_type == SPLIT_TYPE_AWK) {
9314 spat = tmp;
9315 split_type = SPLIT_TYPE_STRING;
9316 }
9317 break;
9318
9319 case T_STRING:
9320 mustnot_broken(spat);
9321 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9322 break;
9323
9324 default:
9326 }
9327 }
9328
9329#define SPLIT_STR(beg, len) ( \
9330 empty_count = split_string(result, str, beg, len, empty_count), \
9331 str_mod_check(str, str_start, str_len))
9332
9333 beg = 0;
9334 char *ptr = RSTRING_PTR(str);
9335 char *const str_start = ptr;
9336 const long str_len = RSTRING_LEN(str);
9337 char *const eptr = str_start + str_len;
9338 if (split_type == SPLIT_TYPE_AWK) {
9339 char *bptr = ptr;
9340 int skip = 1;
9341 unsigned int c;
9342
9343 if (result) result = rb_ary_new();
9344 end = beg;
9345 if (is_ascii_string(str)) {
9346 while (ptr < eptr) {
9347 c = (unsigned char)*ptr++;
9348 if (skip) {
9349 if (ascii_isspace(c)) {
9350 beg = ptr - bptr;
9351 }
9352 else {
9353 end = ptr - bptr;
9354 skip = 0;
9355 if (!NIL_P(limit) && lim <= i) break;
9356 }
9357 }
9358 else if (ascii_isspace(c)) {
9359 SPLIT_STR(beg, end-beg);
9360 skip = 1;
9361 beg = ptr - bptr;
9362 if (!NIL_P(limit)) ++i;
9363 }
9364 else {
9365 end = ptr - bptr;
9366 }
9367 }
9368 }
9369 else {
9370 while (ptr < eptr) {
9371 int n;
9372
9373 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9374 ptr += n;
9375 if (skip) {
9376 if (rb_isspace(c)) {
9377 beg = ptr - bptr;
9378 }
9379 else {
9380 end = ptr - bptr;
9381 skip = 0;
9382 if (!NIL_P(limit) && lim <= i) break;
9383 }
9384 }
9385 else if (rb_isspace(c)) {
9386 SPLIT_STR(beg, end-beg);
9387 skip = 1;
9388 beg = ptr - bptr;
9389 if (!NIL_P(limit)) ++i;
9390 }
9391 else {
9392 end = ptr - bptr;
9393 }
9394 }
9395 }
9396 }
9397 else if (split_type == SPLIT_TYPE_STRING) {
9398 char *substr_start = ptr;
9399 char *sptr = RSTRING_PTR(spat);
9400 long slen = RSTRING_LEN(spat);
9401
9402 if (result) result = rb_ary_new();
9403 mustnot_broken(str);
9404 enc = rb_enc_check(str, spat);
9405 while (ptr < eptr &&
9406 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9407 /* Check we are at the start of a char */
9408 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9409 if (t != ptr + end) {
9410 ptr = t;
9411 continue;
9412 }
9413 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9414 str_mod_check(spat, sptr, slen);
9415 ptr += end + slen;
9416 substr_start = ptr;
9417 if (!NIL_P(limit) && lim <= ++i) break;
9418 }
9419 beg = ptr - str_start;
9420 }
9421 else if (split_type == SPLIT_TYPE_CHARS) {
9422 int n;
9423
9424 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9425 mustnot_broken(str);
9426 enc = rb_enc_get(str);
9427 while (ptr < eptr &&
9428 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9429 SPLIT_STR(ptr - str_start, n);
9430 ptr += n;
9431 if (!NIL_P(limit) && lim <= ++i) break;
9432 }
9433 beg = ptr - str_start;
9434 }
9435 else {
9436 if (result) result = rb_ary_new();
9437 long len = RSTRING_LEN(str);
9438 long start = beg;
9439 long idx;
9440 int last_null = 0;
9441 struct re_registers *regs;
9442 VALUE match = 0;
9443
9444 for (; rb_reg_search(spat, str, start, 0) >= 0;
9445 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9446 match = rb_backref_get();
9447 if (!result) rb_match_busy(match);
9448 regs = RMATCH_REGS(match);
9449 end = BEG(0);
9450 if (start == end && BEG(0) == END(0)) {
9451 if (!ptr) {
9452 SPLIT_STR(0, 0);
9453 break;
9454 }
9455 else if (last_null == 1) {
9456 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9457 beg = start;
9458 }
9459 else {
9460 if (start == len)
9461 start++;
9462 else
9463 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9464 last_null = 1;
9465 continue;
9466 }
9467 }
9468 else {
9469 SPLIT_STR(beg, end-beg);
9470 beg = start = END(0);
9471 }
9472 last_null = 0;
9473
9474 for (idx=1; idx < regs->num_regs; idx++) {
9475 if (BEG(idx) == -1) continue;
9476 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9477 }
9478 if (!NIL_P(limit) && lim <= ++i) break;
9479 }
9480 if (match) rb_match_unbusy(match);
9481 }
9482 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9483 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9484 }
9485
9486 return result ? result : str;
9487}
9488
9489VALUE
9490rb_str_split(VALUE str, const char *sep0)
9491{
9492 VALUE sep;
9493
9494 StringValue(str);
9495 sep = rb_str_new_cstr(sep0);
9496 return rb_str_split_m(1, &sep, str);
9497}
9498
9499#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9500
9501static inline int
9502enumerator_element(VALUE ary, VALUE e)
9503{
9504 if (ary) {
9505 rb_ary_push(ary, e);
9506 return 0;
9507 }
9508 else {
9509 rb_yield(e);
9510 return 1;
9511 }
9512}
9513
9514#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9515
9516static const char *
9517chomp_newline(const char *p, const char *e, rb_encoding *enc)
9518{
9519 const char *prev = rb_enc_prev_char(p, e, e, enc);
9520 if (rb_enc_is_newline(prev, e, enc)) {
9521 e = prev;
9522 prev = rb_enc_prev_char(p, e, e, enc);
9523 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9524 e = prev;
9525 }
9526 return e;
9527}
9528
9529static VALUE
9530get_rs(void)
9531{
9532 VALUE rs = rb_rs;
9533 if (!NIL_P(rs) &&
9534 (!RB_TYPE_P(rs, T_STRING) ||
9535 RSTRING_LEN(rs) != 1 ||
9536 RSTRING_PTR(rs)[0] != '\n')) {
9537 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9538 }
9539 return rs;
9540}
9541
9542#define rb_rs get_rs()
9543
9544static VALUE
9545rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9546{
9547 rb_encoding *enc;
9548 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9549 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9550 long pos, len, rslen;
9551 int rsnewline = 0;
9552
9553 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9554 rs = rb_rs;
9555 if (!NIL_P(opts)) {
9556 static ID keywords[1];
9557 if (!keywords[0]) {
9558 keywords[0] = rb_intern_const("chomp");
9559 }
9560 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9561 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9562 }
9563
9564 if (NIL_P(rs)) {
9565 if (!ENUM_ELEM(ary, str)) {
9566 return ary;
9567 }
9568 else {
9569 return orig;
9570 }
9571 }
9572
9573 if (!RSTRING_LEN(str)) goto end;
9574 str = rb_str_new_frozen(str);
9575 ptr = subptr = RSTRING_PTR(str);
9576 pend = RSTRING_END(str);
9577 len = RSTRING_LEN(str);
9578 StringValue(rs);
9579 rslen = RSTRING_LEN(rs);
9580
9581 if (rs == rb_default_rs)
9582 enc = rb_enc_get(str);
9583 else
9584 enc = rb_enc_check(str, rs);
9585
9586 if (rslen == 0) {
9587 /* paragraph mode */
9588 int n;
9589 const char *eol = NULL;
9590 subend = subptr;
9591 while (subend < pend) {
9592 long chomp_rslen = 0;
9593 do {
9594 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9595 n = 0;
9596 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9597 if (rb_enc_is_newline(subend + n, pend, enc)) {
9598 if (eol == subend) break;
9599 subend += rslen;
9600 if (subptr) {
9601 eol = subend;
9602 chomp_rslen = -rslen;
9603 }
9604 }
9605 else {
9606 if (!subptr) subptr = subend;
9607 subend += rslen;
9608 }
9609 rslen = 0;
9610 } while (subend < pend);
9611 if (!subptr) break;
9612 if (rslen == 0) chomp_rslen = 0;
9613 line = rb_str_subseq(str, subptr - ptr,
9614 subend - subptr + (chomp ? chomp_rslen : rslen));
9615 if (ENUM_ELEM(ary, line)) {
9616 str_mod_check(str, ptr, len);
9617 }
9618 subptr = eol = NULL;
9619 }
9620 goto end;
9621 }
9622 else {
9623 rsptr = RSTRING_PTR(rs);
9624 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9625 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9626 rsnewline = 1;
9627 }
9628 }
9629
9630 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9631 rs = rb_str_new(rsptr, rslen);
9632 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9633 rsptr = RSTRING_PTR(rs);
9634 rslen = RSTRING_LEN(rs);
9635 }
9636
9637 while (subptr < pend) {
9638 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9639 if (pos < 0) break;
9640 hit = subptr + pos;
9641 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9642 if (hit != adjusted) {
9643 subptr = adjusted;
9644 continue;
9645 }
9646 subend = hit += rslen;
9647 if (chomp) {
9648 if (rsnewline) {
9649 subend = chomp_newline(subptr, subend, enc);
9650 }
9651 else {
9652 subend -= rslen;
9653 }
9654 }
9655 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9656 if (ENUM_ELEM(ary, line)) {
9657 str_mod_check(str, ptr, len);
9658 }
9659 subptr = hit;
9660 }
9661
9662 if (subptr != pend) {
9663 if (chomp) {
9664 if (rsnewline) {
9665 pend = chomp_newline(subptr, pend, enc);
9666 }
9667 else if (pend - subptr >= rslen &&
9668 memcmp(pend - rslen, rsptr, rslen) == 0) {
9669 pend -= rslen;
9670 }
9671 }
9672 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9673 ENUM_ELEM(ary, line);
9674 RB_GC_GUARD(str);
9675 }
9676
9677 end:
9678 if (ary)
9679 return ary;
9680 else
9681 return orig;
9682}
9683
9684/*
9685 * call-seq:
9686 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9687 * each_line(line_sep = $/, chomp: false) -> enumerator
9688 *
9689 * :include: doc/string/each_line.rdoc
9690 *
9691 */
9692
9693static VALUE
9694rb_str_each_line(int argc, VALUE *argv, VALUE str)
9695{
9696 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9697 return rb_str_enumerate_lines(argc, argv, str, 0);
9698}
9699
9700/*
9701 * call-seq:
9702 * lines(line_sep = $/, chomp: false) -> array_of_strings
9703 *
9704 * Forms substrings ("lines") of +self+ according to the given arguments
9705 * (see String#each_line for details); returns the lines in an array.
9706 *
9707 */
9708
9709static VALUE
9710rb_str_lines(int argc, VALUE *argv, VALUE str)
9711{
9712 VALUE ary = WANTARRAY("lines", 0);
9713 return rb_str_enumerate_lines(argc, argv, str, ary);
9714}
9715
9716static VALUE
9717rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9718{
9719 return LONG2FIX(RSTRING_LEN(str));
9720}
9721
9722static VALUE
9723rb_str_enumerate_bytes(VALUE str, VALUE ary)
9724{
9725 long i;
9726
9727 for (i=0; i<RSTRING_LEN(str); i++) {
9728 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9729 }
9730 if (ary)
9731 return ary;
9732 else
9733 return str;
9734}
9735
9736/*
9737 * call-seq:
9738 * each_byte {|byte| ... } -> self
9739 * each_byte -> enumerator
9740 *
9741 * :include: doc/string/each_byte.rdoc
9742 *
9743 */
9744
9745static VALUE
9746rb_str_each_byte(VALUE str)
9747{
9748 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9749 return rb_str_enumerate_bytes(str, 0);
9750}
9751
9752/*
9753 * call-seq:
9754 * bytes -> array_of_bytes
9755 *
9756 * :include: doc/string/bytes.rdoc
9757 *
9758 */
9759
9760static VALUE
9761rb_str_bytes(VALUE str)
9762{
9763 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9764 return rb_str_enumerate_bytes(str, ary);
9765}
9766
9767static VALUE
9768rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9769{
9770 return rb_str_length(str);
9771}
9772
9773static VALUE
9774rb_str_enumerate_chars(VALUE str, VALUE ary)
9775{
9776 VALUE orig = str;
9777 long i, len, n;
9778 const char *ptr;
9779 rb_encoding *enc;
9780
9781 str = rb_str_new_frozen(str);
9782 ptr = RSTRING_PTR(str);
9783 len = RSTRING_LEN(str);
9784 enc = rb_enc_get(str);
9785
9787 for (i = 0; i < len; i += n) {
9788 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9789 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9790 }
9791 }
9792 else {
9793 for (i = 0; i < len; i += n) {
9794 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9795 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9796 }
9797 }
9798 RB_GC_GUARD(str);
9799 if (ary)
9800 return ary;
9801 else
9802 return orig;
9803}
9804
9805/*
9806 * call-seq:
9807 * each_char {|c| ... } -> self
9808 * each_char -> enumerator
9809 *
9810 * :include: doc/string/each_char.rdoc
9811 *
9812 */
9813
9814static VALUE
9815rb_str_each_char(VALUE str)
9816{
9817 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9818 return rb_str_enumerate_chars(str, 0);
9819}
9820
9821/*
9822 * call-seq:
9823 * chars -> array_of_characters
9824 *
9825 * :include: doc/string/chars.rdoc
9826 *
9827 */
9828
9829static VALUE
9830rb_str_chars(VALUE str)
9831{
9832 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9833 return rb_str_enumerate_chars(str, ary);
9834}
9835
9836static VALUE
9837rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9838{
9839 VALUE orig = str;
9840 int n;
9841 unsigned int c;
9842 const char *ptr, *end;
9843 rb_encoding *enc;
9844
9845 if (single_byte_optimizable(str))
9846 return rb_str_enumerate_bytes(str, ary);
9847
9848 str = rb_str_new_frozen(str);
9849 ptr = RSTRING_PTR(str);
9850 end = RSTRING_END(str);
9851 enc = STR_ENC_GET(str);
9852
9853 while (ptr < end) {
9854 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9855 ENUM_ELEM(ary, UINT2NUM(c));
9856 ptr += n;
9857 }
9858 RB_GC_GUARD(str);
9859 if (ary)
9860 return ary;
9861 else
9862 return orig;
9863}
9864
9865/*
9866 * call-seq:
9867 * each_codepoint {|integer| ... } -> self
9868 * each_codepoint -> enumerator
9869 *
9870 * :include: doc/string/each_codepoint.rdoc
9871 *
9872 */
9873
9874static VALUE
9875rb_str_each_codepoint(VALUE str)
9876{
9877 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9878 return rb_str_enumerate_codepoints(str, 0);
9879}
9880
9881/*
9882 * call-seq:
9883 * codepoints -> array_of_integers
9884 *
9885 * :include: doc/string/codepoints.rdoc
9886 *
9887 */
9888
9889static VALUE
9890rb_str_codepoints(VALUE str)
9891{
9892 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9893 return rb_str_enumerate_codepoints(str, ary);
9894}
9895
9896static regex_t *
9897get_reg_grapheme_cluster(rb_encoding *enc)
9898{
9899 int encidx = rb_enc_to_index(enc);
9900
9901 const OnigUChar source_ascii[] = "\\X";
9902 const OnigUChar *source = source_ascii;
9903 size_t source_len = sizeof(source_ascii) - 1;
9904
9905 switch (encidx) {
9906#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9907#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9908#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9909#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9910#define CASE_UTF(e) \
9911 case ENCINDEX_UTF_##e: { \
9912 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9913 source = source_UTF_##e; \
9914 source_len = sizeof(source_UTF_##e); \
9915 break; \
9916 }
9917 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9918#undef CASE_UTF
9919#undef CHARS_16BE
9920#undef CHARS_16LE
9921#undef CHARS_32BE
9922#undef CHARS_32LE
9923 }
9924
9925 regex_t *reg_grapheme_cluster;
9926 OnigErrorInfo einfo;
9927 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9928 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9929 if (r) {
9930 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9931 onig_error_code_to_str(message, r, &einfo);
9932 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9933 }
9934
9935 return reg_grapheme_cluster;
9936}
9937
9938static regex_t *
9939get_cached_reg_grapheme_cluster(rb_encoding *enc)
9940{
9941 int encidx = rb_enc_to_index(enc);
9942 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9943
9944 if (encidx == rb_utf8_encindex()) {
9945 if (!reg_grapheme_cluster_utf8) {
9946 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9947 }
9948
9949 return reg_grapheme_cluster_utf8;
9950 }
9951
9952 return NULL;
9953}
9954
9955static VALUE
9956rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9957{
9958 size_t grapheme_cluster_count = 0;
9959 rb_encoding *enc = get_encoding(str);
9960 const char *ptr, *end;
9961
9962 if (!rb_enc_unicode_p(enc)) {
9963 return rb_str_length(str);
9964 }
9965
9966 bool cached_reg_grapheme_cluster = true;
9967 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9968 if (!reg_grapheme_cluster) {
9969 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9970 cached_reg_grapheme_cluster = false;
9971 }
9972
9973 ptr = RSTRING_PTR(str);
9974 end = RSTRING_END(str);
9975
9976 while (ptr < end) {
9977 OnigPosition len = onig_match(reg_grapheme_cluster,
9978 (const OnigUChar *)ptr, (const OnigUChar *)end,
9979 (const OnigUChar *)ptr, NULL, 0);
9980 if (len <= 0) break;
9981 grapheme_cluster_count++;
9982 ptr += len;
9983 }
9984
9985 if (!cached_reg_grapheme_cluster) {
9986 onig_free(reg_grapheme_cluster);
9987 }
9988
9989 return SIZET2NUM(grapheme_cluster_count);
9990}
9991
9992static VALUE
9993rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9994{
9995 VALUE orig = str;
9996 rb_encoding *enc = get_encoding(str);
9997 const char *ptr0, *ptr, *end;
9998
9999 if (!rb_enc_unicode_p(enc)) {
10000 return rb_str_enumerate_chars(str, ary);
10001 }
10002
10003 if (!ary) str = rb_str_new_frozen(str);
10004
10005 bool cached_reg_grapheme_cluster = true;
10006 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10007 if (!reg_grapheme_cluster) {
10008 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10009 cached_reg_grapheme_cluster = false;
10010 }
10011
10012 ptr0 = ptr = RSTRING_PTR(str);
10013 end = RSTRING_END(str);
10014
10015 while (ptr < end) {
10016 OnigPosition len = onig_match(reg_grapheme_cluster,
10017 (const OnigUChar *)ptr, (const OnigUChar *)end,
10018 (const OnigUChar *)ptr, NULL, 0);
10019 if (len <= 0) break;
10020 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10021 ptr += len;
10022 }
10023
10024 if (!cached_reg_grapheme_cluster) {
10025 onig_free(reg_grapheme_cluster);
10026 }
10027
10028 RB_GC_GUARD(str);
10029 if (ary)
10030 return ary;
10031 else
10032 return orig;
10033}
10034
10035/*
10036 * call-seq:
10037 * each_grapheme_cluster {|gc| ... } -> self
10038 * each_grapheme_cluster -> enumerator
10039 *
10040 * :include: doc/string/each_grapheme_cluster.rdoc
10041 *
10042 */
10043
10044static VALUE
10045rb_str_each_grapheme_cluster(VALUE str)
10046{
10047 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10048 return rb_str_enumerate_grapheme_clusters(str, 0);
10049}
10050
10051/*
10052 * call-seq:
10053 * grapheme_clusters -> array_of_grapheme_clusters
10054 *
10055 * :include: doc/string/grapheme_clusters.rdoc
10056 *
10057 */
10058
10059static VALUE
10060rb_str_grapheme_clusters(VALUE str)
10061{
10062 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10063 return rb_str_enumerate_grapheme_clusters(str, ary);
10064}
10065
10066static long
10067chopped_length(VALUE str)
10068{
10069 rb_encoding *enc = STR_ENC_GET(str);
10070 const char *p, *p2, *beg, *end;
10071
10072 beg = RSTRING_PTR(str);
10073 end = beg + RSTRING_LEN(str);
10074 if (beg >= end) return 0;
10075 p = rb_enc_prev_char(beg, end, end, enc);
10076 if (!p) return 0;
10077 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10078 p2 = rb_enc_prev_char(beg, p, end, enc);
10079 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10080 }
10081 return p - beg;
10082}
10083
10084/*
10085 * call-seq:
10086 * chop! -> self or nil
10087 *
10088 * Like String#chop, except that:
10089 *
10090 * - Removes trailing characters from +self+ (not from a copy of +self+).
10091 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10092 *
10093 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10094 */
10095
10096static VALUE
10097rb_str_chop_bang(VALUE str)
10098{
10099 str_modify_keep_cr(str);
10100 if (RSTRING_LEN(str) > 0) {
10101 long len;
10102 len = chopped_length(str);
10103 STR_SET_LEN(str, len);
10104 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10105 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10107 }
10108 return str;
10109 }
10110 return Qnil;
10111}
10112
10113
10114/*
10115 * call-seq:
10116 * chop -> new_string
10117 *
10118 * :include: doc/string/chop.rdoc
10119 *
10120 */
10121
10122static VALUE
10123rb_str_chop(VALUE str)
10124{
10125 return rb_str_subseq(str, 0, chopped_length(str));
10126}
10127
10128static long
10129smart_chomp(VALUE str, const char *e, const char *p)
10130{
10131 rb_encoding *enc = rb_enc_get(str);
10132 if (rb_enc_mbminlen(enc) > 1) {
10133 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10134 if (rb_enc_is_newline(pp, e, enc)) {
10135 e = pp;
10136 }
10137 pp = e - rb_enc_mbminlen(enc);
10138 if (pp >= p) {
10139 pp = rb_enc_left_char_head(p, pp, e, enc);
10140 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10141 e = pp;
10142 }
10143 }
10144 }
10145 else {
10146 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10147 case '\n':
10148 if (--e > p && *(e-1) == '\r') {
10149 --e;
10150 }
10151 break;
10152 case '\r':
10153 --e;
10154 break;
10155 }
10156 }
10157 return e - p;
10158}
10159
10160static long
10161chompped_length(VALUE str, VALUE rs)
10162{
10163 rb_encoding *enc;
10164 int newline;
10165 char *pp, *e, *rsptr;
10166 long rslen;
10167 char *const p = RSTRING_PTR(str);
10168 long len = RSTRING_LEN(str);
10169
10170 if (len == 0) return 0;
10171 e = p + len;
10172 if (rs == rb_default_rs) {
10173 return smart_chomp(str, e, p);
10174 }
10175
10176 enc = rb_enc_get(str);
10177 RSTRING_GETMEM(rs, rsptr, rslen);
10178 if (rslen == 0) {
10179 if (rb_enc_mbminlen(enc) > 1) {
10180 while (e > p) {
10181 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10182 if (!rb_enc_is_newline(pp, e, enc)) break;
10183 e = pp;
10184 pp -= rb_enc_mbminlen(enc);
10185 if (pp >= p) {
10186 pp = rb_enc_left_char_head(p, pp, e, enc);
10187 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10188 e = pp;
10189 }
10190 }
10191 }
10192 }
10193 else {
10194 while (e > p && *(e-1) == '\n') {
10195 --e;
10196 if (e > p && *(e-1) == '\r')
10197 --e;
10198 }
10199 }
10200 return e - p;
10201 }
10202 if (rslen > len) return len;
10203
10204 enc = rb_enc_get(rs);
10205 newline = rsptr[rslen-1];
10206 if (rslen == rb_enc_mbminlen(enc)) {
10207 if (rslen == 1) {
10208 if (newline == '\n')
10209 return smart_chomp(str, e, p);
10210 }
10211 else {
10212 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10213 return smart_chomp(str, e, p);
10214 }
10215 }
10216
10217 enc = rb_enc_check(str, rs);
10218 if (is_broken_string(rs)) {
10219 return len;
10220 }
10221 pp = e - rslen;
10222 if (p[len-1] == newline &&
10223 (rslen <= 1 ||
10224 memcmp(rsptr, pp, rslen) == 0)) {
10225 if (at_char_boundary(p, pp, e, enc))
10226 return len - rslen;
10227 RB_GC_GUARD(rs);
10228 }
10229 return len;
10230}
10231
10237static VALUE
10238chomp_rs(int argc, const VALUE *argv)
10239{
10240 rb_check_arity(argc, 0, 1);
10241 if (argc > 0) {
10242 VALUE rs = argv[0];
10243 if (!NIL_P(rs)) StringValue(rs);
10244 return rs;
10245 }
10246 else {
10247 return rb_rs;
10248 }
10249}
10250
10251VALUE
10252rb_str_chomp_string(VALUE str, VALUE rs)
10253{
10254 long olen = RSTRING_LEN(str);
10255 long len = chompped_length(str, rs);
10256 if (len >= olen) return Qnil;
10257 str_modify_keep_cr(str);
10258 STR_SET_LEN(str, len);
10259 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10260 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10262 }
10263 return str;
10264}
10265
10266/*
10267 * call-seq:
10268 * chomp!(line_sep = $/) -> self or nil
10269 *
10270 * Like String#chomp, except that:
10271 *
10272 * - Removes trailing characters from +self+ (not from a copy of +self+).
10273 * - Returns +self+ if any characters are removed, +nil+ otherwise.
10274 *
10275 * Related: see {Modifying}[rdoc-ref:String@Modifying].
10276 */
10277
10278static VALUE
10279rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10280{
10281 VALUE rs;
10282 str_modifiable(str);
10283 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10284 rs = chomp_rs(argc, argv);
10285 if (NIL_P(rs)) return Qnil;
10286 return rb_str_chomp_string(str, rs);
10287}
10288
10289
10290/*
10291 * call-seq:
10292 * chomp(line_sep = $/) -> new_string
10293 *
10294 * :include: doc/string/chomp.rdoc
10295 *
10296 */
10297
10298static VALUE
10299rb_str_chomp(int argc, VALUE *argv, VALUE str)
10300{
10301 VALUE rs = chomp_rs(argc, argv);
10302 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10303 return rb_str_subseq(str, 0, chompped_length(str, rs));
10304}
10305
10306static long
10307lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10308{
10309 const char *const start = s;
10310
10311 if (!s || s >= e) return 0;
10312
10313 /* remove spaces at head */
10314 if (single_byte_optimizable(str)) {
10315 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10316 }
10317 else {
10318 while (s < e) {
10319 int n;
10320 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10321
10322 if (cc && !rb_isspace(cc)) break;
10323 s += n;
10324 }
10325 }
10326 return s - start;
10327}
10328
10329/*
10330 * call-seq:
10331 * lstrip! -> self or nil
10332 *
10333 * Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
10335 *
10336 * Related: String#rstrip!, String#strip!.
10337 */
10338
10339static VALUE
10340rb_str_lstrip_bang(VALUE str)
10341{
10342 rb_encoding *enc;
10343 char *start, *s;
10344 long olen, loffset;
10345
10346 str_modify_keep_cr(str);
10347 enc = STR_ENC_GET(str);
10348 RSTRING_GETMEM(str, start, olen);
10349 loffset = lstrip_offset(str, start, start+olen, enc);
10350 if (loffset > 0) {
10351 long len = olen-loffset;
10352 s = start + loffset;
10353 memmove(start, s, len);
10354 STR_SET_LEN(str, len);
10355 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10356 return str;
10357 }
10358 return Qnil;
10359}
10360
10361
10362/*
10363 * call-seq:
10364 * lstrip -> new_string
10365 *
10366 * Returns a copy of +self+ with leading whitespace removed;
10367 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10368 *
10369 * whitespace = "\x00\t\n\v\f\r "
10370 * s = whitespace + 'abc' + whitespace
10371 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10372 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10373 *
10374 * Related: String#rstrip, String#strip.
10375 */
10376
10377static VALUE
10378rb_str_lstrip(VALUE str)
10379{
10380 char *start;
10381 long len, loffset;
10382 RSTRING_GETMEM(str, start, len);
10383 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10384 if (loffset <= 0) return str_duplicate(rb_cString, str);
10385 return rb_str_subseq(str, loffset, len - loffset);
10386}
10387
10388static long
10389rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10390{
10391 const char *t;
10392
10393 rb_str_check_dummy_enc(enc);
10395 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10396 }
10397 if (!s || s >= e) return 0;
10398 t = e;
10399
10400 /* remove trailing spaces or '\0's */
10401 if (single_byte_optimizable(str)) {
10402 unsigned char c;
10403 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10404 }
10405 else {
10406 char *tp;
10407
10408 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10409 unsigned int c = rb_enc_codepoint(tp, e, enc);
10410 if (c && !rb_isspace(c)) break;
10411 t = tp;
10412 }
10413 }
10414 return e - t;
10415}
10416
10417/*
10418 * call-seq:
10419 * rstrip! -> self or nil
10420 *
10421 * Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
10423 *
10424 * Related: String#lstrip!, String#strip!.
10425 */
10426
10427static VALUE
10428rb_str_rstrip_bang(VALUE str)
10429{
10430 rb_encoding *enc;
10431 char *start;
10432 long olen, roffset;
10433
10434 str_modify_keep_cr(str);
10435 enc = STR_ENC_GET(str);
10436 RSTRING_GETMEM(str, start, olen);
10437 roffset = rstrip_offset(str, start, start+olen, enc);
10438 if (roffset > 0) {
10439 long len = olen - roffset;
10440
10441 STR_SET_LEN(str, len);
10442 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10443 return str;
10444 }
10445 return Qnil;
10446}
10447
10448
10449/*
10450 * call-seq:
10451 * rstrip -> new_string
10452 *
10453 * Returns a copy of the receiver with trailing whitespace removed;
10454 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10455 *
10456 * whitespace = "\x00\t\n\v\f\r "
10457 * s = whitespace + 'abc' + whitespace
10458 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10459 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10460 *
10461 * Related: String#lstrip, String#strip.
10462 */
10463
10464static VALUE
10465rb_str_rstrip(VALUE str)
10466{
10467 rb_encoding *enc;
10468 char *start;
10469 long olen, roffset;
10470
10471 enc = STR_ENC_GET(str);
10472 RSTRING_GETMEM(str, start, olen);
10473 roffset = rstrip_offset(str, start, start+olen, enc);
10474
10475 if (roffset <= 0) return str_duplicate(rb_cString, str);
10476 return rb_str_subseq(str, 0, olen-roffset);
10477}
10478
10479
10480/*
10481 * call-seq:
10482 * strip! -> self or nil
10483 *
10484 * Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
10488 */
10489
10490static VALUE
10491rb_str_strip_bang(VALUE str)
10492{
10493 char *start;
10494 long olen, loffset, roffset;
10495 rb_encoding *enc;
10496
10497 str_modify_keep_cr(str);
10498 enc = STR_ENC_GET(str);
10499 RSTRING_GETMEM(str, start, olen);
10500 loffset = lstrip_offset(str, start, start+olen, enc);
10501 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10502
10503 if (loffset > 0 || roffset > 0) {
10504 long len = olen-roffset;
10505 if (loffset > 0) {
10506 len -= loffset;
10507 memmove(start, start + loffset, len);
10508 }
10509 STR_SET_LEN(str, len);
10510 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10511 return str;
10512 }
10513 return Qnil;
10514}
10515
10516
10517/*
10518 * call-seq:
10519 * strip -> new_string
10520 *
10521 * Returns a copy of the receiver with leading and trailing whitespace removed;
10522 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10523 *
10524 * whitespace = "\x00\t\n\v\f\r "
10525 * s = whitespace + 'abc' + whitespace
10526 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10527 * s.strip # => "abc"
10528 *
10529 * Related: String#lstrip, String#rstrip.
10530 */
10531
10532static VALUE
10533rb_str_strip(VALUE str)
10534{
10535 char *start;
10536 long olen, loffset, roffset;
10537 rb_encoding *enc = STR_ENC_GET(str);
10538
10539 RSTRING_GETMEM(str, start, olen);
10540 loffset = lstrip_offset(str, start, start+olen, enc);
10541 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10542
10543 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10544 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10545}
10546
10547static VALUE
10548scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10549{
10550 VALUE result = Qnil;
10551 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10552 if (pos >= 0) {
10553 VALUE match;
10554 struct re_registers *regs;
10555 if (BUILTIN_TYPE(pat) == T_STRING) {
10556 regs = NULL;
10557 end = pos + RSTRING_LEN(pat);
10558 }
10559 else {
10560 match = rb_backref_get();
10561 regs = RMATCH_REGS(match);
10562 pos = BEG(0);
10563 end = END(0);
10564 }
10565
10566 if (pos == end) {
10567 rb_encoding *enc = STR_ENC_GET(str);
10568 /*
10569 * Always consume at least one character of the input string
10570 */
10571 if (RSTRING_LEN(str) > end)
10572 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10573 RSTRING_END(str), enc);
10574 else
10575 *start = end + 1;
10576 }
10577 else {
10578 *start = end;
10579 }
10580
10581 if (!regs || regs->num_regs == 1) {
10582 result = rb_str_subseq(str, pos, end - pos);
10583 return result;
10584 }
10585 else {
10586 result = rb_ary_new2(regs->num_regs);
10587 for (int i = 1; i < regs->num_regs; i++) {
10588 VALUE s = Qnil;
10589 if (BEG(i) >= 0) {
10590 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10591 }
10592
10593 rb_ary_push(result, s);
10594 }
10595 }
10596
10597 RB_GC_GUARD(match);
10598 }
10599
10600 return result;
10601}
10602
10603
10604/*
10605 * call-seq:
10606 * scan(string_or_regexp) -> array
10607 * scan(string_or_regexp) {|matches| ... } -> self
10608 *
10609 * Matches a pattern against +self+; the pattern is:
10610 *
10611 * - +string_or_regexp+ itself, if it is a Regexp.
10612 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10613 *
10614 * Iterates through +self+, generating a collection of matching results:
10615 *
10616 * - If the pattern contains no groups, each result is the
10617 * matched string, <code>$&</code>.
10618 * - If the pattern contains groups, each result is an array
10619 * containing one entry per group.
10620 *
10621 * With no block given, returns an array of the results:
10622 *
10623 * s = 'cruel world'
10624 * s.scan(/\w+/) # => ["cruel", "world"]
10625 * s.scan(/.../) # => ["cru", "el ", "wor"]
10626 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10627 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10628 *
10629 * With a block given, calls the block with each result; returns +self+:
10630 *
10631 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10632 * print "\n"
10633 * s.scan(/(.)(.)/) {|x,y| print y, x }
10634 * print "\n"
10635 *
10636 * Output:
10637 *
10638 * <<cruel>> <<world>>
10639 * rceu lowlr
10640 *
10641 */
10642
10643static VALUE
10644rb_str_scan(VALUE str, VALUE pat)
10645{
10646 VALUE result;
10647 long start = 0;
10648 long last = -1, prev = 0;
10649 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10650
10651 pat = get_pat_quoted(pat, 1);
10652 mustnot_broken(str);
10653 if (!rb_block_given_p()) {
10654 VALUE ary = rb_ary_new();
10655
10656 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10657 last = prev;
10658 prev = start;
10659 rb_ary_push(ary, result);
10660 }
10661 if (last >= 0) rb_pat_search(pat, str, last, 1);
10662 else rb_backref_set(Qnil);
10663 return ary;
10664 }
10665
10666 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10667 last = prev;
10668 prev = start;
10669 rb_yield(result);
10670 str_mod_check(str, p, len);
10671 }
10672 if (last >= 0) rb_pat_search(pat, str, last, 1);
10673 return str;
10674}
10675
10676
10677/*
10678 * call-seq:
10679 * hex -> integer
10680 *
10681 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10682 * (with an optional sign and an optional <code>0x</code>) and returns the
10683 * corresponding number;
10684 * returns zero if there is no such leading substring:
10685 *
10686 * '0x0a'.hex # => 10
10687 * '-1234'.hex # => -4660
10688 * '0'.hex # => 0
10689 * 'non-numeric'.hex # => 0
10690 *
10691 * Related: String#oct.
10692 *
10693 */
10694
10695static VALUE
10696rb_str_hex(VALUE str)
10697{
10698 return rb_str_to_inum(str, 16, FALSE);
10699}
10700
10701
10702/*
10703 * call-seq:
10704 * oct -> integer
10705 *
10706 * Interprets the leading substring of +self+ as a string of octal digits
10707 * (with an optional sign) and returns the corresponding number;
10708 * returns zero if there is no such leading substring:
10709 *
10710 * '123'.oct # => 83
10711 * '-377'.oct # => -255
10712 * '0377non-numeric'.oct # => 255
10713 * 'non-numeric'.oct # => 0
10714 *
10715 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10716 * see Kernel#Integer.
10717 *
10718 * Related: String#hex.
10719 *
10720 */
10721
10722static VALUE
10723rb_str_oct(VALUE str)
10724{
10725 return rb_str_to_inum(str, -8, FALSE);
10726}
10727
10728#ifndef HAVE_CRYPT_R
10729# include "ruby/thread_native.h"
10730# include "ruby/atomic.h"
10731
10732static struct {
10733 rb_nativethread_lock_t lock;
10734} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10735#endif
10736
10737/*
10738 * call-seq:
10739 * crypt(salt_str) -> new_string
10740 *
10741 * Returns the string generated by calling <code>crypt(3)</code>
10742 * standard library function with <code>str</code> and
10743 * <code>salt_str</code>, in this order, as its arguments. Please do
10744 * not use this method any longer. It is legacy; provided only for
10745 * backward compatibility with ruby scripts in earlier days. It is
10746 * bad to use in contemporary programs for several reasons:
10747 *
 *  * Behaviour of C's <code>crypt(3)</code> depends on the OS on which
 *    it is run.  The generated string lacks data portability.
10750 *
10751 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10752 * (i.e. silently ends up in unexpected results).
10753 *
10754 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10755 * thread safe.
10756 *
10757 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10758 * very very weak. According to its manpage, Linux's traditional
10759 * <code>crypt(3)</code> output has only 2**56 variations; too
10760 * easy to brute force today. And this is the default behaviour.
10761 *
10762 * * In order to make things robust some OSes implement so-called
10763 * "modular" usage. To go through, you have to do a complex
10764 * build-up of the <code>salt_str</code> parameter, by hand.
10765 * Failure in generation of a proper salt string tends not to
10766 * yield any errors; typos in parameters are normally not
10767 * detectable.
10768 *
10769 * * For instance, in the following example, the second invocation
10770 * of String#crypt is wrong; it has a typo in "round=" (lacks
10771 * "s"). However the call does not fail and something unexpected
10772 * is generated.
10773 *
10774 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10775 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10776 *
10777 * * Even in the "modular" mode, some hash functions are considered
10778 * archaic and no longer recommended at all; for instance module
10779 * <code>$1$</code> is officially abandoned by its author: see
10780 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10781 * instance module <code>$3$</code> is considered completely
10782 * broken: see the manpage of FreeBSD.
10783 *
 *  * On some OSes such as Mac OS, there is no modular mode. Yet, as
10785 * written above, <code>crypt(3)</code> on Mac OS never fails.
10786 * This means even if you build up a proper salt string it
10787 * generates a traditional DES hash anyways, and there is no way
10788 * for you to be aware of.
10789 *
10790 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10791 *
10792 * If for some reason you cannot migrate to other secure contemporary
10793 * password hashing algorithms, install the string-crypt gem and
10794 * <code>require 'string/crypt'</code> to continue using it.
10795 */
10796
10797static VALUE
10798rb_str_crypt(VALUE str, VALUE salt)
10799{
10800#ifdef HAVE_CRYPT_R
10801 VALUE databuf;
10802 struct crypt_data *data;
10803# define CRYPT_END() ALLOCV_END(databuf)
10804#else
10805 char *tmp_buf;
10806 extern char *crypt(const char *, const char *);
10807# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10808#endif
10809 VALUE result;
10810 const char *s, *saltp;
10811 char *res;
10812#ifdef BROKEN_CRYPT
10813 char salt_8bit_clean[3];
10814#endif
10815
10816 StringValue(salt);
10817 mustnot_wchar(str);
10818 mustnot_wchar(salt);
10819 s = StringValueCStr(str);
10820 saltp = RSTRING_PTR(salt);
10821 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10822 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10823 }
10824
10825#ifdef BROKEN_CRYPT
10826 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10827 salt_8bit_clean[0] = saltp[0] & 0x7f;
10828 salt_8bit_clean[1] = saltp[1] & 0x7f;
10829 salt_8bit_clean[2] = '\0';
10830 saltp = salt_8bit_clean;
10831 }
10832#endif
10833#ifdef HAVE_CRYPT_R
10834 data = ALLOCV(databuf, sizeof(struct crypt_data));
10835# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10836 data->initialized = 0;
10837# endif
10838 res = crypt_r(s, saltp, data);
10839#else
10840 rb_nativethread_lock_lock(&crypt_mutex.lock);
10841 res = crypt(s, saltp);
10842#endif
10843 if (!res) {
10844 int err = errno;
10845 CRYPT_END();
10846 rb_syserr_fail(err, "crypt");
10847 }
10848#ifdef HAVE_CRYPT_R
10849 result = rb_str_new_cstr(res);
10850 CRYPT_END();
10851#else
10852 // We need to copy this buffer because it's static and we need to unlock the mutex
10853 // before allocating a new object (the string to be returned). If we allocate while
10854 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10855 // if other ractors are waiting on this lock.
10856 size_t res_size = strlen(res)+1;
10857 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10858 memcpy(tmp_buf, res, res_size);
10859 res = tmp_buf;
10860 CRYPT_END();
10861 result = rb_str_new_cstr(res);
10862#endif
10863 return result;
10864}
10865
10866
10867/*
10868 * call-seq:
10869 * ord -> integer
10870 *
10871 * :include: doc/string/ord.rdoc
10872 *
10873 */
10874
10875static VALUE
10876rb_str_ord(VALUE s)
10877{
10878 unsigned int c;
10879
10880 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10881 return UINT2NUM(c);
10882}
10883/*
10884 * call-seq:
10885 * sum(n = 16) -> integer
10886 *
10887 * :include: doc/string/sum.rdoc
10888 *
10889 */
10890
10891static VALUE
10892rb_str_sum(int argc, VALUE *argv, VALUE str)
10893{
10894 int bits = 16;
10895 char *ptr, *p, *pend;
10896 long len;
10897 VALUE sum = INT2FIX(0);
10898 unsigned long sum0 = 0;
10899
10900 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10901 bits = 0;
10902 }
10903 ptr = p = RSTRING_PTR(str);
10904 len = RSTRING_LEN(str);
10905 pend = p + len;
10906
10907 while (p < pend) {
10908 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10909 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10910 str_mod_check(str, ptr, len);
10911 sum0 = 0;
10912 }
10913 sum0 += (unsigned char)*p;
10914 p++;
10915 }
10916
10917 if (bits == 0) {
10918 if (sum0) {
10919 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10920 }
10921 }
10922 else {
10923 if (sum == INT2FIX(0)) {
10924 if (bits < (int)sizeof(long)*CHAR_BIT) {
10925 sum0 &= (((unsigned long)1)<<bits)-1;
10926 }
10927 sum = LONG2FIX(sum0);
10928 }
10929 else {
10930 VALUE mod;
10931
10932 if (sum0) {
10933 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10934 }
10935
10936 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10937 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10938 sum = rb_funcall(sum, '&', 1, mod);
10939 }
10940 }
10941 return sum;
10942}
10943
10944static VALUE
10945rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10946{
10947 rb_encoding *enc;
10948 VALUE w;
10949 long width, len, flen = 1, fclen = 1;
10950 VALUE res;
10951 char *p;
10952 const char *f = " ";
10953 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10954 VALUE pad;
10955 int singlebyte = 1, cr;
10956 int termlen;
10957
10958 rb_scan_args(argc, argv, "11", &w, &pad);
10959 enc = STR_ENC_GET(str);
10960 termlen = rb_enc_mbminlen(enc);
10961 width = NUM2LONG(w);
10962 if (argc == 2) {
10963 StringValue(pad);
10964 enc = rb_enc_check(str, pad);
10965 f = RSTRING_PTR(pad);
10966 flen = RSTRING_LEN(pad);
10967 fclen = str_strlen(pad, enc); /* rb_enc_check */
10968 singlebyte = single_byte_optimizable(pad);
10969 if (flen == 0 || fclen == 0) {
10970 rb_raise(rb_eArgError, "zero width padding");
10971 }
10972 }
10973 len = str_strlen(str, enc); /* rb_enc_check */
10974 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10975 n = width - len;
10976 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10977 rlen = n - llen;
10978 cr = ENC_CODERANGE(str);
10979 if (flen > 1) {
10980 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10981 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10982 }
10983 size = RSTRING_LEN(str);
10984 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10985 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10986 (len += llen2 + rlen2) >= LONG_MAX - size) {
10987 rb_raise(rb_eArgError, "argument too big");
10988 }
10989 len += size;
10990 res = str_enc_new(rb_cString, 0, len, enc);
10991 p = RSTRING_PTR(res);
10992 if (flen <= 1) {
10993 memset(p, *f, llen);
10994 p += llen;
10995 }
10996 else {
10997 while (llen >= fclen) {
10998 memcpy(p,f,flen);
10999 p += flen;
11000 llen -= fclen;
11001 }
11002 if (llen > 0) {
11003 memcpy(p, f, llen2);
11004 p += llen2;
11005 }
11006 }
11007 memcpy(p, RSTRING_PTR(str), size);
11008 p += size;
11009 if (flen <= 1) {
11010 memset(p, *f, rlen);
11011 p += rlen;
11012 }
11013 else {
11014 while (rlen >= fclen) {
11015 memcpy(p,f,flen);
11016 p += flen;
11017 rlen -= fclen;
11018 }
11019 if (rlen > 0) {
11020 memcpy(p, f, rlen2);
11021 p += rlen2;
11022 }
11023 }
11024 TERM_FILL(p, termlen);
11025 STR_SET_LEN(res, p-RSTRING_PTR(res));
11026
11027 if (argc == 2)
11028 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11029 if (cr != ENC_CODERANGE_BROKEN)
11030 ENC_CODERANGE_SET(res, cr);
11031
11032 RB_GC_GUARD(pad);
11033 return res;
11034}
11035
11036
11037/*
11038 * call-seq:
11039 * ljust(size, pad_string = ' ') -> new_string
11040 *
11041 * :include: doc/string/ljust.rdoc
11042 *
11043 * Related: String#rjust, String#center.
11044 *
11045 */
11046
11047static VALUE
11048rb_str_ljust(int argc, VALUE *argv, VALUE str)
11049{
11050 return rb_str_justify(argc, argv, str, 'l');
11051}
11052
11053/*
11054 * call-seq:
11055 * rjust(size, pad_string = ' ') -> new_string
11056 *
11057 * :include: doc/string/rjust.rdoc
11058 *
11059 * Related: String#ljust, String#center.
11060 *
11061 */
11062
11063static VALUE
11064rb_str_rjust(int argc, VALUE *argv, VALUE str)
11065{
11066 return rb_str_justify(argc, argv, str, 'r');
11067}
11068
11069
11070/*
11071 * call-seq:
11072 * center(size, pad_string = ' ') -> new_string
11073 *
11074 * :include: doc/string/center.rdoc
11075 *
11076 */
11077
11078static VALUE
11079rb_str_center(int argc, VALUE *argv, VALUE str)
11080{
11081 return rb_str_justify(argc, argv, str, 'c');
11082}
11083
11084/*
11085 * call-seq:
11086 * partition(string_or_regexp) -> [head, match, tail]
11087 *
11088 * :include: doc/string/partition.rdoc
11089 *
11090 */
11091
11092static VALUE
11093rb_str_partition(VALUE str, VALUE sep)
11094{
11095 long pos;
11096
11097 sep = get_pat_quoted(sep, 0);
11098 if (RB_TYPE_P(sep, T_REGEXP)) {
11099 if (rb_reg_search(sep, str, 0, 0) < 0) {
11100 goto failed;
11101 }
11102 VALUE match = rb_backref_get();
11103 struct re_registers *regs = RMATCH_REGS(match);
11104
11105 pos = BEG(0);
11106 sep = rb_str_subseq(str, pos, END(0) - pos);
11107 }
11108 else {
11109 pos = rb_str_index(str, sep, 0);
11110 if (pos < 0) goto failed;
11111 }
11112 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11113 sep,
11114 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11115 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11116
11117 failed:
11118 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11119}
11120
11121/*
11122 * call-seq:
11123 * rpartition(sep) -> [head, match, tail]
11124 *
11125 * :include: doc/string/rpartition.rdoc
11126 *
11127 */
11128
11129static VALUE
11130rb_str_rpartition(VALUE str, VALUE sep)
11131{
11132 long pos = RSTRING_LEN(str);
11133
11134 sep = get_pat_quoted(sep, 0);
11135 if (RB_TYPE_P(sep, T_REGEXP)) {
11136 if (rb_reg_search(sep, str, pos, 1) < 0) {
11137 goto failed;
11138 }
11139 VALUE match = rb_backref_get();
11140 struct re_registers *regs = RMATCH_REGS(match);
11141
11142 pos = BEG(0);
11143 sep = rb_str_subseq(str, pos, END(0) - pos);
11144 }
11145 else {
11146 pos = rb_str_sublen(str, pos);
11147 pos = rb_str_rindex(str, sep, pos);
11148 if (pos < 0) {
11149 goto failed;
11150 }
11151 }
11152
11153 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11154 sep,
11155 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11156 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11157 failed:
11158 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11159}
11160
11161/*
11162 * call-seq:
11163 * start_with?(*string_or_regexp) -> true or false
11164 *
11165 * :include: doc/string/start_with_p.rdoc
11166 *
11167 */
11168
11169static VALUE
11170rb_str_start_with(int argc, VALUE *argv, VALUE str)
11171{
11172 int i;
11173
11174 for (i=0; i<argc; i++) {
11175 VALUE tmp = argv[i];
11176 if (RB_TYPE_P(tmp, T_REGEXP)) {
11177 if (rb_reg_start_with_p(tmp, str))
11178 return Qtrue;
11179 }
11180 else {
11181 const char *p, *s, *e;
11182 long slen, tlen;
11183 rb_encoding *enc;
11184
11185 StringValue(tmp);
11186 enc = rb_enc_check(str, tmp);
11187 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11188 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11189 p = RSTRING_PTR(str);
11190 e = p + slen;
11191 s = p + tlen;
11192 if (!at_char_right_boundary(p, s, e, enc))
11193 continue;
11194 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11195 return Qtrue;
11196 }
11197 }
11198 return Qfalse;
11199}
11200
11201/*
11202 * call-seq:
11203 * end_with?(*strings) -> true or false
11204 *
11205 * :include: doc/string/end_with_p.rdoc
11206 *
11207 */
11208
11209static VALUE
11210rb_str_end_with(int argc, VALUE *argv, VALUE str)
11211{
11212 int i;
11213
11214 for (i=0; i<argc; i++) {
11215 VALUE tmp = argv[i];
11216 const char *p, *s, *e;
11217 long slen, tlen;
11218 rb_encoding *enc;
11219
11220 StringValue(tmp);
11221 enc = rb_enc_check(str, tmp);
11222 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11223 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11224 p = RSTRING_PTR(str);
11225 e = p + slen;
11226 s = e - tlen;
11227 if (!at_char_boundary(p, s, e, enc))
11228 continue;
11229 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11230 return Qtrue;
11231 }
11232 return Qfalse;
11233}
11234
11244static long
11245deleted_prefix_length(VALUE str, VALUE prefix)
11246{
11247 const char *strptr, *prefixptr;
11248 long olen, prefixlen;
11249 rb_encoding *enc = rb_enc_get(str);
11250
11251 StringValue(prefix);
11252
11253 if (!is_broken_string(prefix) ||
11254 !rb_enc_asciicompat(enc) ||
11255 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11256 enc = rb_enc_check(str, prefix);
11257 }
11258
11259 /* return 0 if not start with prefix */
11260 prefixlen = RSTRING_LEN(prefix);
11261 if (prefixlen <= 0) return 0;
11262 olen = RSTRING_LEN(str);
11263 if (olen < prefixlen) return 0;
11264 strptr = RSTRING_PTR(str);
11265 prefixptr = RSTRING_PTR(prefix);
11266 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11267 if (is_broken_string(prefix)) {
11268 if (!is_broken_string(str)) {
11269 /* prefix in a valid string cannot be broken */
11270 return 0;
11271 }
11272 const char *strend = strptr + olen;
11273 const char *after_prefix = strptr + prefixlen;
11274 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11275 /* prefix does not end at char-boundary */
11276 return 0;
11277 }
11278 }
11279 /* prefix part in `str` also should be valid. */
11280
11281 return prefixlen;
11282}
11283
11284/*
11285 * call-seq:
11286 * delete_prefix!(prefix) -> self or nil
11287 *
11288 * Like String#delete_prefix, except that +self+ is modified in place;
11289 * returns +self+ if the prefix is removed, +nil+ otherwise.
11290 *
11291 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11292 */
11293
11294static VALUE
11295rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11296{
11297 long prefixlen;
11298 str_modify_keep_cr(str);
11299
11300 prefixlen = deleted_prefix_length(str, prefix);
11301 if (prefixlen <= 0) return Qnil;
11302
11303 return rb_str_drop_bytes(str, prefixlen);
11304}
11305
11306/*
11307 * call-seq:
11308 * delete_prefix(prefix) -> new_string
11309 *
11310 * :include: doc/string/delete_prefix.rdoc
11311 *
11312 */
11313
11314static VALUE
11315rb_str_delete_prefix(VALUE str, VALUE prefix)
11316{
11317 long prefixlen;
11318
11319 prefixlen = deleted_prefix_length(str, prefix);
11320 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11321
11322 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11323}
11324
11334static long
11335deleted_suffix_length(VALUE str, VALUE suffix)
11336{
11337 const char *strptr, *suffixptr;
11338 long olen, suffixlen;
11339 rb_encoding *enc;
11340
11341 StringValue(suffix);
11342 if (is_broken_string(suffix)) return 0;
11343 enc = rb_enc_check(str, suffix);
11344
11345 /* return 0 if not start with suffix */
11346 suffixlen = RSTRING_LEN(suffix);
11347 if (suffixlen <= 0) return 0;
11348 olen = RSTRING_LEN(str);
11349 if (olen < suffixlen) return 0;
11350 strptr = RSTRING_PTR(str);
11351 suffixptr = RSTRING_PTR(suffix);
11352 const char *strend = strptr + olen;
11353 const char *before_suffix = strend - suffixlen;
11354 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11355 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11356
11357 return suffixlen;
11358}
11359
11360/*
11361 * call-seq:
11362 * delete_suffix!(suffix) -> self or nil
11363 *
11364 * Like String#delete_suffix, except that +self+ is modified in place;
11365 * returns +self+ if the suffix is removed, +nil+ otherwise.
11366 *
11367 * Related: see {Modifying}[rdoc-ref:String@Modifying].
11368 */
11369
11370static VALUE
11371rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11372{
11373 long olen, suffixlen, len;
11374 str_modifiable(str);
11375
11376 suffixlen = deleted_suffix_length(str, suffix);
11377 if (suffixlen <= 0) return Qnil;
11378
11379 olen = RSTRING_LEN(str);
11380 str_modify_keep_cr(str);
11381 len = olen - suffixlen;
11382 STR_SET_LEN(str, len);
11383 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11384 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11386 }
11387 return str;
11388}
11389
11390/*
11391 * call-seq:
11392 * delete_suffix(suffix) -> new_string
11393 *
11394 * :include: doc/string/delete_suffix.rdoc
11395 *
11396 */
11397
11398static VALUE
11399rb_str_delete_suffix(VALUE str, VALUE suffix)
11400{
11401 long suffixlen;
11402
11403 suffixlen = deleted_suffix_length(str, suffix);
11404 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11405
11406 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11407}
11408
11409void
11410rb_str_setter(VALUE val, ID id, VALUE *var)
11411{
11412 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11413 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11414 }
11415 *var = val;
11416}
11417
11418static void
11419rb_fs_setter(VALUE val, ID id, VALUE *var)
11420{
11421 val = rb_fs_check(val);
11422 if (!val) {
11423 rb_raise(rb_eTypeError,
11424 "value of %"PRIsVALUE" must be String or Regexp",
11425 rb_id2str(id));
11426 }
11427 if (!NIL_P(val)) {
11428 rb_warn_deprecated("'$;'", NULL);
11429 }
11430 *var = val;
11431}
11432
11433
11434/*
11435 * call-seq:
11436 * force_encoding(encoding) -> self
11437 *
11438 * :include: doc/string/force_encoding.rdoc
11439 *
11440 */
11441
11442static VALUE
11443rb_str_force_encoding(VALUE str, VALUE enc)
11444{
11445 str_modifiable(str);
11446
11447 rb_encoding *encoding = rb_to_encoding(enc);
11448 int idx = rb_enc_to_index(encoding);
11449
11450 // If the encoding is unchanged, we do nothing.
11451 if (ENCODING_GET(str) == idx) {
11452 return str;
11453 }
11454
11455 rb_enc_associate_index(str, idx);
11456
11457 // If the coderange was 7bit and the new encoding is ASCII-compatible
11458 // we can keep the coderange.
11459 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11460 return str;
11461 }
11462
11464 return str;
11465}
11466
11467/*
11468 * call-seq:
11469 * b -> new_string
11470 *
11471 * :include: doc/string/b.rdoc
11472 *
11473 */
11474
11475static VALUE
11476rb_str_b(VALUE str)
11477{
11478 VALUE str2;
11479 if (STR_EMBED_P(str)) {
11480 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11481 }
11482 else {
11483 str2 = str_alloc_heap(rb_cString);
11484 }
11485 str_replace_shared_without_enc(str2, str);
11486
11487 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11488 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11489 // If we know the receiver's code range then we know the result's code range.
11490 int cr = ENC_CODERANGE(str);
11491 switch (cr) {
11492 case ENC_CODERANGE_7BIT:
11494 break;
11498 break;
11499 default:
11500 ENC_CODERANGE_CLEAR(str2);
11501 break;
11502 }
11503 }
11504
11505 return str2;
11506}
11507
11508/*
11509 * call-seq:
11510 * valid_encoding? -> true or false
11511 *
11512 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11513 *
11514 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11515 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11516 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11517 */
11518
11519static VALUE
11520rb_str_valid_encoding_p(VALUE str)
11521{
11522 int cr = rb_enc_str_coderange(str);
11523
11524 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11525}
11526
11527/*
11528 * call-seq:
11529 * ascii_only? -> true or false
11530 *
11531 * Returns whether +self+ contains only ASCII characters:
11532 *
11533 * 'abc'.ascii_only? # => true
11534 * "abc\u{6666}".ascii_only? # => false
11535 *
11536 * Related: see {Querying}[rdoc-ref:String@Querying].
11537 */
11538
11539static VALUE
11540rb_str_is_ascii_only_p(VALUE str)
11541{
11542 int cr = rb_enc_str_coderange(str);
11543
11544 return RBOOL(cr == ENC_CODERANGE_7BIT);
11545}
11546
11547VALUE
11549{
11550 static const char ellipsis[] = "...";
11551 const long ellipsislen = sizeof(ellipsis) - 1;
11552 rb_encoding *const enc = rb_enc_get(str);
11553 const long blen = RSTRING_LEN(str);
11554 const char *const p = RSTRING_PTR(str), *e = p + blen;
11555 VALUE estr, ret = 0;
11556
11557 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11558 if (len * rb_enc_mbminlen(enc) >= blen ||
11559 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11560 ret = str;
11561 }
11562 else if (len <= ellipsislen ||
11563 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11564 if (rb_enc_asciicompat(enc)) {
11565 ret = rb_str_new(ellipsis, len);
11566 rb_enc_associate(ret, enc);
11567 }
11568 else {
11569 estr = rb_usascii_str_new(ellipsis, len);
11570 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11571 }
11572 }
11573 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11574 rb_str_cat(ret, ellipsis, ellipsislen);
11575 }
11576 else {
11577 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11578 rb_enc_from_encoding(enc), 0, Qnil);
11579 rb_str_append(ret, estr);
11580 }
11581 return ret;
11582}
11583
11584static VALUE
11585str_compat_and_valid(VALUE str, rb_encoding *enc)
11586{
11587 int cr;
11588 str = StringValue(str);
11589 cr = rb_enc_str_coderange(str);
11590 if (cr == ENC_CODERANGE_BROKEN) {
11591 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11592 }
11593 else {
11594 rb_encoding *e = STR_ENC_GET(str);
11595 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11596 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11597 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11598 }
11599 }
11600 return str;
11601}
11602
11603static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11604
11605VALUE
11607{
11608 rb_encoding *enc = STR_ENC_GET(str);
11609 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11610}
11611
11612VALUE
11613rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11614{
11615 int cr = ENC_CODERANGE_UNKNOWN;
11616 if (enc == STR_ENC_GET(str)) {
11617 /* cached coderange makes sense only when enc equals the
11618 * actual encoding of str */
11619 cr = ENC_CODERANGE(str);
11620 }
11621 return enc_str_scrub(enc, str, repl, cr);
11622}
11623
11624static VALUE
11625enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11626{
11627 int encidx;
11628 VALUE buf = Qnil;
11629 const char *rep, *p, *e, *p1, *sp;
11630 long replen = -1;
11631 long slen;
11632
11633 if (rb_block_given_p()) {
11634 if (!NIL_P(repl))
11635 rb_raise(rb_eArgError, "both of block and replacement given");
11636 replen = 0;
11637 }
11638
11639 if (ENC_CODERANGE_CLEAN_P(cr))
11640 return Qnil;
11641
11642 if (!NIL_P(repl)) {
11643 repl = str_compat_and_valid(repl, enc);
11644 }
11645
11646 if (rb_enc_dummy_p(enc)) {
11647 return Qnil;
11648 }
11649 encidx = rb_enc_to_index(enc);
11650
11651#define DEFAULT_REPLACE_CHAR(str) do { \
11652 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11653 rep = replace; replen = (int)sizeof(replace); \
11654 } while (0)
11655
11656 slen = RSTRING_LEN(str);
11657 p = RSTRING_PTR(str);
11658 e = RSTRING_END(str);
11659 p1 = p;
11660 sp = p;
11661
11662 if (rb_enc_asciicompat(enc)) {
11663 int rep7bit_p;
11664 if (!replen) {
11665 rep = NULL;
11666 rep7bit_p = FALSE;
11667 }
11668 else if (!NIL_P(repl)) {
11669 rep = RSTRING_PTR(repl);
11670 replen = RSTRING_LEN(repl);
11671 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11672 }
11673 else if (encidx == rb_utf8_encindex()) {
11674 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11675 rep7bit_p = FALSE;
11676 }
11677 else {
11678 DEFAULT_REPLACE_CHAR("?");
11679 rep7bit_p = TRUE;
11680 }
11681 cr = ENC_CODERANGE_7BIT;
11682
11683 p = search_nonascii(p, e);
11684 if (!p) {
11685 p = e;
11686 }
11687 while (p < e) {
11688 int ret = rb_enc_precise_mbclen(p, e, enc);
11689 if (MBCLEN_NEEDMORE_P(ret)) {
11690 break;
11691 }
11692 else if (MBCLEN_CHARFOUND_P(ret)) {
11694 p += MBCLEN_CHARFOUND_LEN(ret);
11695 }
11696 else if (MBCLEN_INVALID_P(ret)) {
11697 /*
11698 * p1~p: valid ascii/multibyte chars
11699 * p ~e: invalid bytes + unknown bytes
11700 */
11701 long clen = rb_enc_mbmaxlen(enc);
11702 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11703 if (p > p1) {
11704 rb_str_buf_cat(buf, p1, p - p1);
11705 }
11706
11707 if (e - p < clen) clen = e - p;
11708 if (clen <= 2) {
11709 clen = 1;
11710 }
11711 else {
11712 const char *q = p;
11713 clen--;
11714 for (; clen > 1; clen--) {
11715 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11716 if (MBCLEN_NEEDMORE_P(ret)) break;
11717 if (MBCLEN_INVALID_P(ret)) continue;
11719 }
11720 }
11721 if (rep) {
11722 rb_str_buf_cat(buf, rep, replen);
11723 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11724 }
11725 else {
11726 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11727 str_mod_check(str, sp, slen);
11728 repl = str_compat_and_valid(repl, enc);
11729 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11732 }
11733 p += clen;
11734 p1 = p;
11735 p = search_nonascii(p, e);
11736 if (!p) {
11737 p = e;
11738 break;
11739 }
11740 }
11741 else {
11743 }
11744 }
11745 if (NIL_P(buf)) {
11746 if (p == e) {
11747 ENC_CODERANGE_SET(str, cr);
11748 return Qnil;
11749 }
11750 buf = rb_str_buf_new(RSTRING_LEN(str));
11751 }
11752 if (p1 < p) {
11753 rb_str_buf_cat(buf, p1, p - p1);
11754 }
11755 if (p < e) {
11756 if (rep) {
11757 rb_str_buf_cat(buf, rep, replen);
11758 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11759 }
11760 else {
11761 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11762 str_mod_check(str, sp, slen);
11763 repl = str_compat_and_valid(repl, enc);
11764 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11767 }
11768 }
11769 }
11770 else {
11771 /* ASCII incompatible */
11772 long mbminlen = rb_enc_mbminlen(enc);
11773 if (!replen) {
11774 rep = NULL;
11775 }
11776 else if (!NIL_P(repl)) {
11777 rep = RSTRING_PTR(repl);
11778 replen = RSTRING_LEN(repl);
11779 }
11780 else if (encidx == ENCINDEX_UTF_16BE) {
11781 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11782 }
11783 else if (encidx == ENCINDEX_UTF_16LE) {
11784 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11785 }
11786 else if (encidx == ENCINDEX_UTF_32BE) {
11787 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11788 }
11789 else if (encidx == ENCINDEX_UTF_32LE) {
11790 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11791 }
11792 else {
11793 DEFAULT_REPLACE_CHAR("?");
11794 }
11795
11796 while (p < e) {
11797 int ret = rb_enc_precise_mbclen(p, e, enc);
11798 if (MBCLEN_NEEDMORE_P(ret)) {
11799 break;
11800 }
11801 else if (MBCLEN_CHARFOUND_P(ret)) {
11802 p += MBCLEN_CHARFOUND_LEN(ret);
11803 }
11804 else if (MBCLEN_INVALID_P(ret)) {
11805 const char *q = p;
11806 long clen = rb_enc_mbmaxlen(enc);
11807 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11808 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11809
11810 if (e - p < clen) clen = e - p;
11811 if (clen <= mbminlen * 2) {
11812 clen = mbminlen;
11813 }
11814 else {
11815 clen -= mbminlen;
11816 for (; clen > mbminlen; clen-=mbminlen) {
11817 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11818 if (MBCLEN_NEEDMORE_P(ret)) break;
11819 if (MBCLEN_INVALID_P(ret)) continue;
11821 }
11822 }
11823 if (rep) {
11824 rb_str_buf_cat(buf, rep, replen);
11825 }
11826 else {
11827 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11828 str_mod_check(str, sp, slen);
11829 repl = str_compat_and_valid(repl, enc);
11830 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11831 }
11832 p += clen;
11833 p1 = p;
11834 }
11835 else {
11837 }
11838 }
11839 if (NIL_P(buf)) {
11840 if (p == e) {
11842 return Qnil;
11843 }
11844 buf = rb_str_buf_new(RSTRING_LEN(str));
11845 }
11846 if (p1 < p) {
11847 rb_str_buf_cat(buf, p1, p - p1);
11848 }
11849 if (p < e) {
11850 if (rep) {
11851 rb_str_buf_cat(buf, rep, replen);
11852 }
11853 else {
11854 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11855 str_mod_check(str, sp, slen);
11856 repl = str_compat_and_valid(repl, enc);
11857 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11858 }
11859 }
11861 }
11862 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11863 return buf;
11864}
11865
11866/*
11867 * call-seq:
11868 * scrub(replacement_string = default_replacement) -> new_string
11869 * scrub{|bytes| ... } -> new_string
11870 *
11871 * :include: doc/string/scrub.rdoc
11872 *
11873 */
11874static VALUE
11875str_scrub(int argc, VALUE *argv, VALUE str)
11876{
11877 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11878 VALUE new = rb_str_scrub(str, repl);
11879 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11880}
11881
11882/*
11883 * call-seq:
11884 * scrub! -> self
11885 * scrub!(replacement_string = default_replacement) -> self
11886 * scrub!{|bytes| ... } -> self
11887 *
11888 * Like String#scrub, except that any replacements are made in +self+.
11889 *
11890 */
11891static VALUE
11892str_scrub_bang(int argc, VALUE *argv, VALUE str)
11893{
11894 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11895 VALUE new = rb_str_scrub(str, repl);
11896 if (!NIL_P(new)) rb_str_replace(str, new);
11897 return str;
11898}
11899
11900static ID id_normalize;
11901static ID id_normalized_p;
11902static VALUE mUnicodeNormalize;
11903
11904static VALUE
11905unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11906{
11907 static int UnicodeNormalizeRequired = 0;
11908 VALUE argv2[2];
11909
11910 if (!UnicodeNormalizeRequired) {
11911 rb_require("unicode_normalize/normalize.rb");
11912 UnicodeNormalizeRequired = 1;
11913 }
11914 argv2[0] = str;
11915 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11916 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11917}
11918
11919/*
11920 * call-seq:
11921 * unicode_normalize(form = :nfc) -> string
11922 *
11923 * Returns a copy of +self+ with
11924 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11925 *
11926 * Argument +form+ must be one of the following symbols
11927 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11928 *
11929 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11930 * - +:nfd+: Canonical decomposition.
11931 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11932 * - +:nfkd+: Compatibility decomposition.
11933 *
11934 * The encoding of +self+ must be one of:
11935 *
11936 * - Encoding::UTF_8
11937 * - Encoding::UTF_16BE
11938 * - Encoding::UTF_16LE
11939 * - Encoding::UTF_32BE
11940 * - Encoding::UTF_32LE
11941 * - Encoding::GB18030
11942 * - Encoding::UCS_2BE
11943 * - Encoding::UCS_4BE
11944 *
11945 * Examples:
11946 *
 *    "a\u0300".unicode_normalize        # => "\u00E0" (one precomposed character)
 *    "\u00E0".unicode_normalize(:nfd)   # => "a\u0300" (base letter + combining accent)
11949 *
11950 * Related: String#unicode_normalize!, String#unicode_normalized?.
11951 */
11952static VALUE
11953rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11954{
11955 return unicode_normalize_common(argc, argv, str, id_normalize);
11956}
11957
11958/*
11959 * call-seq:
11960 * unicode_normalize!(form = :nfc) -> self
11961 *
11962 * Like String#unicode_normalize, except that the normalization
11963 * is performed on +self+.
11964 *
11965 * Related String#unicode_normalized?.
11966 *
11967 */
11968static VALUE
11969rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11970{
11971 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11972}
11973
11974/* call-seq:
11975 * unicode_normalized?(form = :nfc) -> true or false
11976 *
11977 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11978 * +false+ otherwise.
11979 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11980 *
11981 * Examples:
11982 *
11983 * "a\u0300".unicode_normalized? # => false
11984 * "a\u0300".unicode_normalized?(:nfd) # => true
11985 * "\u00E0".unicode_normalized? # => true
11986 * "\u00E0".unicode_normalized?(:nfd) # => false
11987 *
11988 *
11989 * Raises an exception if +self+ is not in a Unicode encoding:
11990 *
11991 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
11992 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11993 *
11994 * Related: String#unicode_normalize, String#unicode_normalize!.
11995 *
11996 */
11997static VALUE
11998rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11999{
12000 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12001}
12002
12003/**********************************************************************
12004 * Document-class: Symbol
12005 *
12006 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12007 *
12008 * You can create a +Symbol+ object explicitly with:
12009 *
12010 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12011 *
12012 * The same +Symbol+ object will be
12013 * created for a given name or string for the duration of a program's
12014 * execution, regardless of the context or meaning of that name. Thus
12015 * if <code>Fred</code> is a constant in one context, a method in
12016 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12017 * will be the same object in all three contexts.
12018 *
12019 * module One
12020 * class Fred
12021 * end
12022 * $f1 = :Fred
12023 * end
12024 * module Two
12025 * Fred = 1
12026 * $f2 = :Fred
12027 * end
12028 * def Fred()
12029 * end
12030 * $f3 = :Fred
12031 * $f1.object_id #=> 2514190
12032 * $f2.object_id #=> 2514190
12033 * $f3.object_id #=> 2514190
12034 *
12035 * Constant, method, and variable names are returned as symbols:
12036 *
12037 * module One
12038 * Two = 2
12039 * def three; 3 end
12040 * @four = 4
12041 * @@five = 5
12042 * $six = 6
12043 * end
12044 * seven = 7
12045 *
12046 * One.constants
12047 * # => [:Two]
12048 * One.instance_methods(true)
12049 * # => [:three]
12050 * One.instance_variables
12051 * # => [:@four]
12052 * One.class_variables
12053 * # => [:@@five]
12054 * global_variables.grep(/six/)
12055 * # => [:$six]
12056 * local_variables
12057 * # => [:seven]
12058 *
12059 * A +Symbol+ object differs from a String object in that
12060 * a +Symbol+ object represents an identifier, while a String object
12061 * represents text or data.
12062 *
12063 * == What's Here
12064 *
12065 * First, what's elsewhere. Class +Symbol+:
12066 *
12067 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12068 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12069 *
12070 * Here, class +Symbol+ provides methods that are useful for:
12071 *
12072 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12073 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12074 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12075 *
12076 * === Methods for Querying
12077 *
12078 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12079 * - #=~: Returns the index of the first substring in symbol that matches a
12080 * given Regexp or other object; returns +nil+ if no match is found.
12081 * - #[], #slice : Returns a substring of symbol
12082 * determined by a given index, start/length, or range, or string.
12083 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12084 * - #encoding: Returns the Encoding object that represents the encoding
12085 * of symbol.
12086 * - #end_with?: Returns +true+ if symbol ends with
12087 * any of the given strings.
12088 * - #match: Returns a MatchData object if symbol
12089 * matches a given Regexp; +nil+ otherwise.
12090 * - #match?: Returns +true+ if symbol
12091 * matches a given Regexp; +false+ otherwise.
12092 * - #length, #size: Returns the number of characters in symbol.
12093 * - #start_with?: Returns +true+ if symbol starts with
12094 * any of the given strings.
12095 *
12096 * === Methods for Comparing
12097 *
12098 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12099 * or larger than symbol.
12100 * - #==, #===: Returns +true+ if a given symbol has the same content and
12101 * encoding.
12102 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12103 * symbol is smaller than, equal to, or larger than symbol.
12104 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12105 * after Unicode case folding; +false+ otherwise.
12106 *
12107 * === Methods for Converting
12108 *
12109 * - #capitalize: Returns symbol with the first character upcased
12110 * and all other characters downcased.
12111 * - #downcase: Returns symbol with all characters downcased.
12112 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12113 * - #name: Returns the frozen string corresponding to symbol.
12114 * - #succ, #next: Returns the symbol that is the successor to symbol.
12115 * - #swapcase: Returns symbol with all upcase characters downcased
12116 * and all downcase characters upcased.
12117 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12118 * - #to_s, #id2name: Returns the string corresponding to +self+.
12119 * - #to_sym, #intern: Returns +self+.
12120 * - #upcase: Returns symbol with all characters upcased.
12121 *
12122 */
12123
12124
12125/*
12126 * call-seq:
12127 * symbol == object -> true or false
12128 *
12129 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12130 */
12131
12132#define sym_equal rb_obj_equal /* alias only: Init_String registers sym_equal for both Symbol#== and Symbol#===, so both resolve to rb_obj_equal */
12133
12134static int
12135sym_printable(const char *s, const char *send, rb_encoding *enc)
12136{
12137 while (s < send) {
12138 int n;
12139 int c = rb_enc_precise_mbclen(s, send, enc);
12140
12141 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12142 n = MBCLEN_CHARFOUND_LEN(c);
12143 c = rb_enc_mbc_to_codepoint(s, send, enc);
12144 if (!rb_enc_isprint(c, enc)) return FALSE;
12145 s += n;
12146 }
12147 return TRUE;
12148}
12149
12150int
12151rb_str_symname_p(VALUE sym)
12152{
12153 rb_encoding *enc;
12154 const char *ptr;
12155 long len;
12156 rb_encoding *resenc = rb_default_internal_encoding();
12157
12158 if (resenc == NULL) resenc = rb_default_external_encoding();
12159 enc = STR_ENC_GET(sym);
12160 ptr = RSTRING_PTR(sym);
12161 len = RSTRING_LEN(sym);
12162 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12163 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12164 return FALSE;
12165 }
12166 return TRUE;
12167}
12168
12169VALUE
12170rb_str_quote_unprintable(VALUE str)
12171{
12172 rb_encoding *enc;
12173 const char *ptr;
12174 long len;
12175 rb_encoding *resenc;
12176
12177 Check_Type(str, T_STRING);
12178 resenc = rb_default_internal_encoding();
12179 if (resenc == NULL) resenc = rb_default_external_encoding();
12180 enc = STR_ENC_GET(str);
12181 ptr = RSTRING_PTR(str);
12182 len = RSTRING_LEN(str);
12183 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12184 !sym_printable(ptr, ptr + len, enc)) {
12185 return rb_str_escape(str);
12186 }
12187 return str;
12188}
12189
12190VALUE
12191rb_id_quote_unprintable(ID id)
12192{
12193 VALUE str = rb_id2str(id);
12194 if (!rb_str_symname_p(str)) {
12195 return rb_str_escape(str);
12196 }
12197 return str;
12198}
12199
12200/*
12201 * call-seq:
12202 * inspect -> string
12203 *
12204 * Returns a string representation of +self+ (including the leading colon):
12205 *
12206 * :foo.inspect # => ":foo"
12207 *
12208 * Related: Symbol#to_s, Symbol#name.
12209 *
12210 */
12211
12212static VALUE
12213sym_inspect(VALUE sym)
12214{
12215 VALUE str = rb_sym2str(sym);
12216 const char *ptr;
12217 long len;
12218 char *dest;
12219
12220 if (!rb_str_symname_p(str)) {
12221 str = rb_str_inspect(str);
12222 len = RSTRING_LEN(str);
12223 rb_str_resize(str, len + 1);
12224 dest = RSTRING_PTR(str);
12225 memmove(dest + 1, dest, len);
12226 }
12227 else {
12228 rb_encoding *enc = STR_ENC_GET(str);
12229 VALUE orig_str = str;
12230
12231 len = RSTRING_LEN(orig_str);
12232 str = rb_enc_str_new(0, len + 1, enc);
12233
12234 // Get data pointer after allocation
12235 ptr = RSTRING_PTR(orig_str);
12236 dest = RSTRING_PTR(str);
12237 memcpy(dest + 1, ptr, len);
12238
12239 RB_GC_GUARD(orig_str);
12240 }
12241 dest[0] = ':';
12242
12244
12245 return str;
12246}
12247
12248VALUE
12250{
12251 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12252 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12253 return str;
12254}
12255
12256VALUE
12257rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12258{
12259 VALUE obj;
12260
12261 if (argc < 1) {
12262 rb_raise(rb_eArgError, "no receiver given");
12263 }
12264 obj = argv[0];
12265 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12266}
12267
12268/*
12269 * call-seq:
12270 * succ
12271 *
12272 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12273 *
12274 * :foo.succ # => :fop
12275 *
12276 * Related: String#succ.
12277 */
12278
12279static VALUE
12280sym_succ(VALUE sym)
12281{
12282 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12283}
12284
12285/*
12286 * call-seq:
12287 * symbol <=> object -> -1, 0, +1, or nil
12288 *
12289 * If +object+ is a symbol,
12290 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12291 *
12292 * :bar <=> :foo # => -1
12293 * :foo <=> :foo # => 0
12294 * :foo <=> :bar # => 1
12295 *
12296 * Otherwise, returns +nil+:
12297 *
12298 * :foo <=> 'bar' # => nil
12299 *
12300 * Related: String#<=>.
12301 */
12302
12303static VALUE
12304sym_cmp(VALUE sym, VALUE other)
12305{
12306 if (!SYMBOL_P(other)) {
12307 return Qnil;
12308 }
12309 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12310}
12311
12312/*
12313 * call-seq:
12314 * casecmp(object) -> -1, 0, 1, or nil
12315 *
12316 * :include: doc/symbol/casecmp.rdoc
12317 *
12318 */
12319
12320static VALUE
12321sym_casecmp(VALUE sym, VALUE other)
12322{
12323 if (!SYMBOL_P(other)) {
12324 return Qnil;
12325 }
12326 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12327}
12328
12329/*
12330 * call-seq:
12331 * casecmp?(object) -> true, false, or nil
12332 *
12333 * :include: doc/symbol/casecmp_p.rdoc
12334 *
12335 */
12336
12337static VALUE
12338sym_casecmp_p(VALUE sym, VALUE other)
12339{
12340 if (!SYMBOL_P(other)) {
12341 return Qnil;
12342 }
12343 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12344}
12345
12346/*
12347 * call-seq:
12348 * symbol =~ object -> integer or nil
12349 *
12350 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12351 * including possible updates to global variables;
12352 * see String#=~.
12353 *
12354 */
12355
12356static VALUE
12357sym_match(VALUE sym, VALUE other)
12358{
12359 return rb_str_match(rb_sym2str(sym), other);
12360}
12361
12362/*
12363 * call-seq:
12364 * match(pattern, offset = 0) -> matchdata or nil
12365 * match(pattern, offset = 0) {|matchdata| } -> object
12366 *
12367 * Equivalent to <tt>self.to_s.match</tt>,
12368 * including possible updates to global variables;
12369 * see String#match.
12370 *
12371 */
12372
12373static VALUE
12374sym_match_m(int argc, VALUE *argv, VALUE sym)
12375{
12376 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12377}
12378
12379/*
12380 * call-seq:
12381 * match?(pattern, offset) -> true or false
12382 *
12383 * Equivalent to <tt>sym.to_s.match?</tt>;
12384 * see String#match?.
12385 *
12386 */
12387
12388static VALUE
12389sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12390{
12391 return rb_str_match_m_p(argc, argv, sym);
12392}
12393
12394/*
12395 * call-seq:
12396 * symbol[index] -> string or nil
12397 * symbol[start, length] -> string or nil
12398 * symbol[range] -> string or nil
12399 * symbol[regexp, capture = 0] -> string or nil
12400 * symbol[substring] -> string or nil
12401 *
12402 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12403 *
12404 */
12405
12406static VALUE
12407sym_aref(int argc, VALUE *argv, VALUE sym)
12408{
12409 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12410}
12411
12412/*
12413 * call-seq:
12414 * length -> integer
12415 *
12416 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12417 */
12418
12419static VALUE
12420sym_length(VALUE sym)
12421{
12422 return rb_str_length(rb_sym2str(sym));
12423}
12424
12425/*
12426 * call-seq:
12427 * empty? -> true or false
12428 *
12429 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12430 *
12431 */
12432
12433static VALUE
12434sym_empty(VALUE sym)
12435{
12436 return rb_str_empty(rb_sym2str(sym));
12437}
12438
12439/*
12440 * call-seq:
12441 * upcase(mapping) -> symbol
12442 *
12443 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12444 *
12445 * See String#upcase.
12446 *
12447 */
12448
12449static VALUE
12450sym_upcase(int argc, VALUE *argv, VALUE sym)
12451{
12452 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12453}
12454
12455/*
12456 * call-seq:
12457 * downcase(mapping) -> symbol
12458 *
12459 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12460 *
12461 * See String#downcase.
12462 *
12463 * Related: Symbol#upcase.
12464 *
12465 */
12466
12467static VALUE
12468sym_downcase(int argc, VALUE *argv, VALUE sym)
12469{
12470 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12471}
12472
12473/*
12474 * call-seq:
12475 * capitalize(mapping) -> symbol
12476 *
12477 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12478 *
12479 * See String#capitalize.
12480 *
12481 */
12482
12483static VALUE
12484sym_capitalize(int argc, VALUE *argv, VALUE sym)
12485{
12486 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12487}
12488
12489/*
12490 * call-seq:
12491 * swapcase(mapping) -> symbol
12492 *
12493 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12494 *
12495 * See String#swapcase.
12496 *
12497 */
12498
12499static VALUE
12500sym_swapcase(int argc, VALUE *argv, VALUE sym)
12501{
12502 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12503}
12504
12505/*
12506 * call-seq:
12507 * start_with?(*string_or_regexp) -> true or false
12508 *
12509 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12510 *
12511 */
12512
12513static VALUE
12514sym_start_with(int argc, VALUE *argv, VALUE sym)
12515{
12516 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12517}
12518
12519/*
12520 * call-seq:
12521 * end_with?(*strings) -> true or false
12522 *
12523 *
12524 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12525 *
12526 */
12527
12528static VALUE
12529sym_end_with(int argc, VALUE *argv, VALUE sym)
12530{
12531 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12532}
12533
12534/*
12535 * call-seq:
12536 * encoding -> encoding
12537 *
12538 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12539 *
12540 */
12541
12542static VALUE
12543sym_encoding(VALUE sym)
12544{
12545 return rb_obj_encoding(rb_sym2str(sym));
12546}
12547
12548static VALUE
12549string_for_symbol(VALUE name)
12550{
12551 if (!RB_TYPE_P(name, T_STRING)) {
12552 VALUE tmp = rb_check_string_type(name);
12553 if (NIL_P(tmp)) {
12554 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12555 name);
12556 }
12557 name = tmp;
12558 }
12559 return name;
12560}
12561
12562ID
12564{
12565 if (SYMBOL_P(name)) {
12566 return SYM2ID(name);
12567 }
12568 name = string_for_symbol(name);
12569 return rb_intern_str(name);
12570}
12571
12572VALUE
12574{
12575 if (SYMBOL_P(name)) {
12576 return name;
12577 }
12578 name = string_for_symbol(name);
12579 return rb_str_intern(name);
12580}
12581
12582/*
12583 * call-seq:
12584 * Symbol.all_symbols -> array_of_symbols
12585 *
12586 * Returns an array of all symbols currently in Ruby's symbol table:
12587 *
12588 * Symbol.all_symbols.size # => 9334
12589 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12590 *
12591 */
12592
12593static VALUE
12594sym_all_symbols(VALUE _)
12595{
12596 return rb_sym_all_symbols();
12597}
12598
12599VALUE
12600rb_str_to_interned_str(VALUE str)
12601{
12602 return rb_fstring(str);
12603}
12604
12605VALUE
12606rb_interned_str(const char *ptr, long len)
12607{
12608 struct RString fake_str;
12609 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12610}
12611
12612VALUE
12614{
12615 return rb_interned_str(ptr, strlen(ptr));
12616}
12617
12618VALUE
12619rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12620{
12621 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12622 rb_enc_autoload(enc);
12623 }
12624
12625 struct RString fake_str;
12626 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12627}
12628
12629VALUE
12630rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12631{
12632 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12633 rb_enc_autoload(enc);
12634 }
12635
12636 struct RString fake_str;
12637 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12638}
12639
12640VALUE
12642{
12643 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12644}
12645
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  When +str+ is ASCII-8BIT and the code is
 * in 0...0xff, the byte is appended directly; every other case is handed
 * to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int binary = (ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex());

    if (RB_LIKELY(binary)) {
        /* The conversion is attempted only for binary strings. */
        const ssize_t code = RB_NUM2SSIZE(codepoint);
        const int fits = (code >= 0 && code < 0xff);

        /* 0xff itself is not taken here; it goes through rb_str_concat(). */
        if (RB_LIKELY(fits)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12662
12663static int
12664fstring_set_class_i(VALUE *str, void *data)
12665{
12666 RBASIC_SET_CLASS(*str, rb_cString);
12667
12668 return ST_CONTINUE;
12669}
12670
12671void
12672Init_String(void)
12673{
12674 rb_cString = rb_define_class("String", rb_cObject);
12675
12676 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12677
12679 rb_define_alloc_func(rb_cString, empty_str_alloc);
12680 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12681 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12682 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12683 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12684 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12687 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12688 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12689 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12690 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12693 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12694 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12695 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12696 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12699 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12700 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12701 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12702 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12703 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12705 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12707 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12708 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12709 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12710 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12711 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12712 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12714 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12715 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12716 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12717 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12718 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12719 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12720 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12721 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12723 rb_define_method(rb_cString, "+@", str_uplus, 0);
12724 rb_define_method(rb_cString, "-@", str_uminus, 0);
12725 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12726 rb_define_alias(rb_cString, "dedup", "-@");
12727
12728 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12729 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12730 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12731 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12734 rb_define_method(rb_cString, "undump", str_undump, 0);
12735
12736 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12737 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12738 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12739 sym_fold = ID2SYM(rb_intern_const("fold"));
12740
12741 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12742 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12743 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12744 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12745
12746 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12747 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12748 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12749 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12750
12751 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12752 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12753 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12754 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12755 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12756 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12757 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12758 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12759 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12760 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12761 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12762 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12764 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12765 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12766 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12767 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12768 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12769
12770 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12771 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12772 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12773
12774 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12775
12776 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12777 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12778 rb_define_method(rb_cString, "center", rb_str_center, -1);
12779
12780 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12781 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12782 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12783 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12784 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12785 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12786 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12787 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12788 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12789
12790 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12791 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12792 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12793 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12794 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12795 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12796 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12797 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12798 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12799
12800 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12801 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12802 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12803 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12804 rb_define_method(rb_cString, "count", rb_str_count, -1);
12805
12806 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12807 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12808 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12809 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12810
12811 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12812 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12813 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12814 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12815 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12816
12817 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12818
12819 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12820 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12821
12822 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12823 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12824
12825 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12826 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12827 rb_define_method(rb_cString, "b", rb_str_b, 0);
12828 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12829 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12830
12831 /* define UnicodeNormalize module here so that we don't have to look it up */
12832 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12833 id_normalize = rb_intern_const("normalize");
12834 id_normalized_p = rb_intern_const("normalized?");
12835
12836 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12837 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12838 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12839
12840 rb_fs = Qnil;
12841 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12842 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12843 rb_gc_register_address(&rb_fs);
12844
12845 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12849 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12850
12851 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12852 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12853 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12854 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12855 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12856 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12857
12858 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12859 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12860 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12861 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12862
12863 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12864 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12865 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12866 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12867 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12868 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12869 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12870
12871 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12872 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12873 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12874 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12875
12876 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12877 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12878
12879 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12880}
12881
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1696
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1479
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1597
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2663
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:943
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:206
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2125
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2143
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1311
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3539
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:243
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:175
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1299
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3223
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1316
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:931
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1181
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2986
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1200
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12619
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2293
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3690
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1129
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1421
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1322
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:950
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12641
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:815
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:444
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1490
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2670
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2934
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1746
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:700
VALUE rb_default_rs
This is the default value of rb_rs.
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1861
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1071
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1867
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1236
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4225
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3722
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1490
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1927
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1716
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1486
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2445
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3755
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1397
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12249
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2518
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1373
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1710
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3014
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5403
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4119
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3111
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11548
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1768
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1752
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1163
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:985
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1492
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1955
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4105
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3523
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2382
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1973
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6641
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3119
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12613
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1403
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3721
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3061
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4228
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3345
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7315
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2748
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12606
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4175
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3992
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4150
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string to be a NUL terminated ...
Definition string.c:3697
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3236
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5913
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11606
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1666
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2908
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3208
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3327
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1175
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2702
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7429
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1385
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1682
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2396
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5831
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9490
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1169
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:911
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1814
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2094
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2171
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3127
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1420
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:999
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12573
ID rb_to_id(VALUE str)
Definition string.c:12563
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1866
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3501
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4469
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1415
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2885
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2767
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1409
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2780
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1743
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1586
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:203
Definition string.c:8373
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:295
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113