Ruby 3.5.0dev (2025-07-08 revision c913a635d79f405695699594c73fb04cfe47d239)
string.c (c913a635d79f405695699594c73fb04cfe47d239)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/concurrent_set.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
/* Shorthands for the start and end offsets of capture group `no`.
 * Both expand to an access through a variable named `regs`, which must be
 * in scope wherever they are used. */
#define BEG(no)  (regs->beg[(no)])
#define END(no)  (regs->end[(no)])
64
/* Drop any macro definitions of the following names so that this file can
 * use them as ordinary function identifiers.
 * NOTE(review): presumably the public headers provide macro wrappers for
 * these -- confirm against the headers. */
#undef rb_str_new
#undef rb_usascii_str_new
#undef rb_utf8_str_new
#undef rb_enc_str_new
#undef rb_str_new_cstr
#undef rb_usascii_str_new_cstr
#undef rb_utf8_str_new_cstr
#undef rb_enc_str_new_cstr
#undef rb_external_str_new_cstr
#undef rb_locale_str_new_cstr
#undef rb_str_dup_frozen
#undef rb_str_buf_new_cstr
#undef rb_str_buf_cat
#undef rb_str_buf_cat2
#undef rb_str_cat2
#undef rb_str_cat_cstr
#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents are
94 * stored in a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
#define RUBY_MAX_CHAR_LEN 16

/* RString flag bits that are defined in this file; the bit numbers match
 * the "Flags of RString" table. */
#define STR_PRECOMPUTED_HASH  FL_USER4
#define STR_SHARED_ROOT       FL_USER5
#define STR_BORROWED          FL_USER6
#define STR_TMPLOCK           FL_USER7
#define STR_NOFREE            FL_USER18
#define STR_FAKESTR           FL_USER19
137
/* Mark `str` as a plain heap string: set STR_NOEMBED and clear the
 * sharing-related flags. */
#define STR_SET_NOEMBED(str) do { \
    FL_SET((str), STR_NOEMBED); \
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED); \
} while (0)

/* Mark `str` as embedded by clearing the flags that only apply to
 * out-of-line buffers. */
#define STR_SET_EMBED(str) \
    FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store `n` as the byte length of `str`; nothing else is touched. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
/* Terminator width in bytes: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write a `termlen`-byte zero terminator at `ptr`.  The first byte is
 * always stored directly; memset() is only used for wider terminators. */
#define TERM_FILL(ptr, termlen) do { \
    char *const term_fill_dst = (ptr); \
    const int term_fill_width = (termlen); \
    *term_fill_dst = '\0'; \
    if (UNLIKELY(term_fill_width > 1)) \
        memset(term_fill_dst, 0, term_fill_width); \
} while (0)
176
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * required by its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM((str),(capacity),termlen);\
} while (0)

/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents move to a freshly allocated heap buffer and the
 *   string becomes non-embedded.
 * - Heap string: the buffer is reallocated in place.  The string must not
 *   be shared (asserted).
 *
 * Every use of `capacity` and `termlen` is parenthesized so that an
 * expression argument keeps its meaning inside the comparison and the
 * size computations. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
/* Make `str` a shared view onto `shared_str`: record the root in
 * aux.shared (through the write barrier), flag `str` as STR_SHARED and the
 * root as STR_SHARED_ROOT.  A root whose class is 0 is additionally
 * flagged STR_BORROWED.  Nothing is done for fake strings.  The assertions
 * check that the pointer of `str` lies inside the root's buffer. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST((str), STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
212
/* Pointer to the out-of-line buffer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)

/* Byte size used for the out-of-line buffer: the recorded capacity plus
 * the terminator. */
/* TODO: include the terminator size in capa. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))

/* Encoding object of `str`. */
#define STR_ENC_GET(str) get_encoding(str)
218
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0.  With the default, a substring
 * is sharable only when it ends exactly at `end`; when it is enabled any
 * substring is considered sharable. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
# define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/* True when an object embedding `len` content bytes plus `termlen`
 * terminator bytes has a size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t object_size = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(object_size);
}
275
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/* Print a diagnostic on stderr saying that `func` is about to return
 * NULL.  Only reports; it does not abort or raise. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
/* The precomputed hash of a fake string is kept in its `capa` field (a
 * long), which is only done when long and pointers have the same size. */
#if SIZEOF_LONG == SIZEOF_VOIDP
# define PRECOMPUTED_FAKESTR_HASH 1
#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_concurrent_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
462static bool
463fstring_concurrent_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/* Options handed from register_fstring() to the table's create callback. */
struct fstr_create_arg {
    /* copy the bytes of a fake string instead of referencing them */
    bool copy;
    /* when copying, prefer an embedded string that stores its hash */
    bool force_precompute_hash;
};
482
483static VALUE
484fstring_concurrent_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
551static struct rb_concurrent_set_funcs fstring_concurrent_set_funcs = {
552 .hash = fstring_concurrent_set_hash,
553 .cmp = fstring_concurrent_set_cmp,
554 .create = fstring_concurrent_set_create,
555};
556
557void
558Init_fstring_table(void)
559{
560 fstring_table_obj = rb_concurrent_set_new(&fstring_concurrent_set_funcs, 8192);
561 rb_gc_register_address(&fstring_table_obj);
562}
563
564static VALUE
565register_fstring(VALUE str, bool copy, bool force_precompute_hash)
566{
567 struct fstr_create_arg args = {
568 .copy = copy,
569 .force_precompute_hash = force_precompute_hash
570 };
571
572#if SIZEOF_VOIDP == SIZEOF_LONG
573 if (FL_TEST_RAW(str, STR_FAKESTR)) {
574 // if the string hasn't been interned, we'll need the hash twice, so we
575 // compute it once and store it in capa
576 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
577 }
578#endif
579
580 VALUE result = rb_concurrent_set_find_or_insert(&fstring_table_obj, str, &args);
581
582 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
584 RUBY_ASSERT(OBJ_FROZEN(result));
585 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
587
588 return result;
589}
590
591bool
592rb_obj_is_fstring_table(VALUE obj)
593{
594 ASSERT_vm_locking();
595
596 return obj == fstring_table_obj;
597}
598
599void
600rb_gc_free_fstring(VALUE obj)
601{
602 // Assume locking and barrier (which there is no assert for)
603 ASSERT_vm_locking();
604
605 rb_concurrent_set_delete_by_identity(fstring_table_obj, obj);
606
607 RB_DEBUG_COUNTER_INC(obj_str_fstr);
608
609 FL_UNSET(obj, RSTRING_FSTR);
610}
611
612void
613rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
614{
615 if (fstring_table_obj) {
616 rb_concurrent_set_foreach_with_replace(fstring_table_obj, callback, data);
617 }
618}
619
620static VALUE
621setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
622{
623 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
624 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
625
626 if (!name) {
628 name = "";
629 }
630
631 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
632
633 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
634 fake_str->len = len;
635 fake_str->as.heap.ptr = (char *)name;
636 fake_str->as.heap.aux.capa = len;
637 return (VALUE)fake_str;
638}
639
640/*
641 * set up a fake string which refers a static string literal.
642 */
643VALUE
644rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
645{
646 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
647}
648
649/*
650 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
651 * shared string which refers a static string literal. `ptr` must
652 * point a constant string.
653 */
654VALUE
655rb_fstring_new(const char *ptr, long len)
656{
657 struct RString fake_str;
658 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
659}
660
661VALUE
662rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
663{
664 struct RString fake_str;
665 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
666}
667
668VALUE
669rb_fstring_cstr(const char *ptr)
670{
671 return rb_fstring_new(ptr, strlen(ptr));
672}
673
674static inline bool
675single_byte_optimizable(VALUE str)
676{
677 int encindex = ENCODING_GET(str);
678 switch (encindex) {
679 case ENCINDEX_ASCII_8BIT:
680 case ENCINDEX_US_ASCII:
681 return true;
682 case ENCINDEX_UTF_8:
683 // For UTF-8 it's worth scanning the string coderange when unknown.
685 }
686 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
687 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
688 return true;
689 }
690
691 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
692 return true;
693 }
694
695 /* Conservative. Possibly single byte.
696 * "\xa1" in Shift_JIS for example. */
697 return false;
698}
699
701
702static inline const char *
703search_nonascii(const char *p, const char *e)
704{
705 const uintptr_t *s, *t;
706
707#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
708# if SIZEOF_UINTPTR_T == 8
709# define NONASCII_MASK UINT64_C(0x8080808080808080)
710# elif SIZEOF_UINTPTR_T == 4
711# define NONASCII_MASK UINT32_C(0x80808080)
712# else
713# error "don't know what to do."
714# endif
715#else
716# if SIZEOF_UINTPTR_T == 8
717# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
718# elif SIZEOF_UINTPTR_T == 4
719# define NONASCII_MASK 0x80808080UL /* or...? */
720# else
721# error "don't know what to do."
722# endif
723#endif
724
725 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
726#if !UNALIGNED_WORD_ACCESS
727 if ((uintptr_t)p % SIZEOF_VOIDP) {
728 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
729 p += l;
730 switch (l) {
731 default: UNREACHABLE;
732#if SIZEOF_VOIDP > 4
733 case 7: if (p[-7]&0x80) return p-7;
734 case 6: if (p[-6]&0x80) return p-6;
735 case 5: if (p[-5]&0x80) return p-5;
736 case 4: if (p[-4]&0x80) return p-4;
737#endif
738 case 3: if (p[-3]&0x80) return p-3;
739 case 2: if (p[-2]&0x80) return p-2;
740 case 1: if (p[-1]&0x80) return p-1;
741 case 0: break;
742 }
743 }
744#endif
745#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
746#define aligned_ptr(value) \
747 __builtin_assume_aligned((value), sizeof(uintptr_t))
748#else
749#define aligned_ptr(value) (uintptr_t *)(value)
750#endif
751 s = aligned_ptr(p);
752 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
753#undef aligned_ptr
754 for (;s < t; s++) {
755 if (*s & NONASCII_MASK) {
756#ifdef WORDS_BIGENDIAN
757 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
758#else
759 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
760#endif
761 }
762 }
763 p = (const char *)s;
764 }
765
766 switch (e - p) {
767 default: UNREACHABLE;
768#if SIZEOF_VOIDP > 4
769 case 7: if (e[-7]&0x80) return e-7;
770 case 6: if (e[-6]&0x80) return e-6;
771 case 5: if (e[-5]&0x80) return e-5;
772 case 4: if (e[-4]&0x80) return e-4;
773#endif
774 case 3: if (e[-3]&0x80) return e-3;
775 case 2: if (e[-2]&0x80) return e-2;
776 case 1: if (e[-1]&0x80) return e-1;
777 case 0: return NULL;
778 }
779}
780
781static int
782coderange_scan(const char *p, long len, rb_encoding *enc)
783{
784 const char *e = p + len;
785
786 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
787 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
788 p = search_nonascii(p, e);
790 }
791
792 if (rb_enc_asciicompat(enc)) {
793 p = search_nonascii(p, e);
794 if (!p) return ENC_CODERANGE_7BIT;
795 for (;;) {
796 int ret = rb_enc_precise_mbclen(p, e, enc);
798 p += MBCLEN_CHARFOUND_LEN(ret);
799 if (p == e) break;
800 p = search_nonascii(p, e);
801 if (!p) break;
802 }
803 }
804 else {
805 while (p < e) {
806 int ret = rb_enc_precise_mbclen(p, e, enc);
808 p += MBCLEN_CHARFOUND_LEN(ret);
809 }
810 }
811 return ENC_CODERANGE_VALID;
812}
813
814long
815rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
816{
817 const char *p = s;
818
819 if (*cr == ENC_CODERANGE_BROKEN)
820 return e - s;
821
822 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
823 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
824 if (*cr == ENC_CODERANGE_VALID) return e - s;
825 p = search_nonascii(p, e);
827 return e - s;
828 }
829 else if (rb_enc_asciicompat(enc)) {
830 p = search_nonascii(p, e);
831 if (!p) {
832 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
833 return e - s;
834 }
835 for (;;) {
836 int ret = rb_enc_precise_mbclen(p, e, enc);
837 if (!MBCLEN_CHARFOUND_P(ret)) {
839 return p - s;
840 }
841 p += MBCLEN_CHARFOUND_LEN(ret);
842 if (p == e) break;
843 p = search_nonascii(p, e);
844 if (!p) break;
845 }
846 }
847 else {
848 while (p < e) {
849 int ret = rb_enc_precise_mbclen(p, e, enc);
850 if (!MBCLEN_CHARFOUND_P(ret)) {
852 return p - s;
853 }
854 p += MBCLEN_CHARFOUND_LEN(ret);
855 }
856 }
858 return e - s;
859}
860
861static inline void
862str_enc_copy(VALUE str1, VALUE str2)
863{
864 rb_enc_set_index(str1, ENCODING_GET(str2));
865}
866
867/* Like str_enc_copy, but does not check frozen status of str1.
868 * You should use this only if you're certain that str1 is not frozen. */
869static inline void
870str_enc_copy_direct(VALUE str1, VALUE str2)
871{
872 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
873 if (inlined_encoding == ENCODING_INLINE_MAX) {
874 rb_enc_set_index(str1, rb_enc_get_index(str2));
875 }
876 else {
877 ENCODING_SET_INLINED(str1, inlined_encoding);
878 }
879}
880
881static void
882rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
883{
884 /* this function is designed for copying encoding and coderange
885 * from src to new string "dest" which is made from the part of src.
886 */
887 str_enc_copy(dest, src);
888 if (RSTRING_LEN(dest) == 0) {
889 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
891 else
893 return;
894 }
895 switch (ENC_CODERANGE(src)) {
898 break;
900 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
901 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
903 else
905 break;
906 default:
907 break;
908 }
909}
910
911static void
912rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
913{
914 str_enc_copy(dest, src);
916}
917
918static int
919enc_coderange_scan(VALUE str, rb_encoding *enc)
920{
921 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
922}
923
924int
925rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
926{
927 return enc_coderange_scan(str, enc);
928}
929
930int
932{
933 int cr = ENC_CODERANGE(str);
934
935 if (cr == ENC_CODERANGE_UNKNOWN) {
936 cr = enc_coderange_scan(str, get_encoding(str));
937 ENC_CODERANGE_SET(str, cr);
938 }
939 return cr;
940}
941
942static inline bool
943rb_enc_str_asciicompat(VALUE str)
944{
945 int encindex = ENCODING_GET_INLINED(str);
946 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
947}
948
949int
951{
952 switch(ENC_CODERANGE(str)) {
954 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
956 return true;
957 default:
958 return false;
959 }
960}
961
962static inline void
963str_mod_check(VALUE s, const char *p, long len)
964{
965 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
966 rb_raise(rb_eRuntimeError, "string modified");
967 }
968}
969
970static size_t
971str_capacity(VALUE str, const int termlen)
972{
973 if (STR_EMBED_P(str)) {
974 return str_embed_capa(str) - termlen;
975 }
976 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
977 return RSTRING(str)->len;
978 }
979 else {
980 return RSTRING(str)->as.heap.aux.capa;
981 }
982}
983
984size_t
986{
987 return str_capacity(str, TERM_LEN(str));
988}
989
990static inline void
991must_not_null(const char *ptr)
992{
993 if (!ptr) {
994 rb_raise(rb_eArgError, "NULL pointer given");
995 }
996}
997
998static inline VALUE
999str_alloc_embed(VALUE klass, size_t capa)
1000{
1001 size_t size = rb_str_embed_size(capa);
1002 RUBY_ASSERT(size > 0);
1003 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1004
1005 NEWOBJ_OF(str, struct RString, klass,
1007
1008 return (VALUE)str;
1009}
1010
1011static inline VALUE
1012str_alloc_heap(VALUE klass)
1013{
1014 NEWOBJ_OF(str, struct RString, klass,
1015 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1016
1017 return (VALUE)str;
1018}
1019
1020static inline VALUE
1021empty_str_alloc(VALUE klass)
1022{
1023 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1024 VALUE str = str_alloc_embed(klass, 0);
1025 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1027 return str;
1028}
1029
1030static VALUE
1031str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (enc == NULL) {
1040 enc = rb_ascii8bit_encoding();
1041 }
1042
1043 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1044
1045 int termlen = rb_enc_mbminlen(enc);
1046
1047 if (STR_EMBEDDABLE_P(len, termlen)) {
1048 str = str_alloc_embed(klass, len + termlen);
1049 if (len == 0) {
1050 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1051 }
1052 }
1053 else {
1054 str = str_alloc_heap(klass);
1055 RSTRING(str)->as.heap.aux.capa = len;
1056 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1057 * integer overflow. If we can STATIC_ASSERT that, the following
1058 * mul_add_mul can be reverted to a simple ALLOC_N. */
1059 RSTRING(str)->as.heap.ptr =
1060 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1061 }
1062
1063 rb_enc_raw_set(str, enc);
1064
1065 if (ptr) {
1066 memcpy(RSTRING_PTR(str), ptr, len);
1067 }
1068
1069 STR_SET_LEN(str, len);
1070 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1071 return str;
1072}
1073
1074static VALUE
1075str_new(VALUE klass, const char *ptr, long len)
1076{
1077 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1078}
1079
1080VALUE
1081rb_str_new(const char *ptr, long len)
1082{
1083 return str_new(rb_cString, ptr, len);
1084}
1085
1086VALUE
1087rb_usascii_str_new(const char *ptr, long len)
1088{
1089 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1090}
1091
1092VALUE
1093rb_utf8_str_new(const char *ptr, long len)
1094{
1095 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1096}
1097
1098VALUE
1099rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1100{
1101 return str_enc_new(rb_cString, ptr, len, enc);
1102}
1103
1104VALUE
1106{
1107 must_not_null(ptr);
1108 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1109 * memory regions, and that cannot be detected by the MSAN. Just
1110 * trust the programmer that the argument passed here is a sane C
1111 * string. */
1112 __msan_unpoison_string(ptr);
1113 return rb_str_new(ptr, strlen(ptr));
1114}
1115
1116VALUE
1118{
1119 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1120}
1121
1122VALUE
1124{
1125 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 if (rb_enc_mbminlen(enc) != 1) {
1133 rb_raise(rb_eArgError, "wchar encoding given");
1134 }
1135 return rb_enc_str_new(ptr, strlen(ptr), enc);
1136}
1137
1138static VALUE
1139str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1140{
1141 VALUE str;
1142
1143 if (len < 0) {
1144 rb_raise(rb_eArgError, "negative string size (or size too big)");
1145 }
1146
1147 if (!ptr) {
1148 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1149 }
1150 else {
1151 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1152 str = str_alloc_heap(klass);
1153 RSTRING(str)->len = len;
1154 RSTRING(str)->as.heap.ptr = (char *)ptr;
1155 RSTRING(str)->as.heap.aux.capa = len;
1156 RBASIC(str)->flags |= STR_NOFREE;
1157 rb_enc_associate_index(str, encindex);
1158 }
1159 return str;
1160}
1161
1162VALUE
1163rb_str_new_static(const char *ptr, long len)
1164{
1165 return str_new_static(rb_cString, ptr, len, 0);
1166}
1167
1168VALUE
1170{
1171 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1172}
1173
1174VALUE
1176{
1177 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1178}
1179
1180VALUE
1182{
1183 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1184}
1185
1186static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1187 rb_encoding *from, rb_encoding *to,
1188 int ecflags, VALUE ecopts);
1189
1190static inline bool
1191is_enc_ascii_string(VALUE str, rb_encoding *enc)
1192{
1193 int encidx = rb_enc_to_index(enc);
1194 if (rb_enc_get_index(str) == encidx)
1195 return is_ascii_string(str);
1196 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1197}
1198
1199VALUE
1200rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1201{
1202 long len;
1203 const char *ptr;
1204 VALUE newstr;
1205
1206 if (!to) return str;
1207 if (!from) from = rb_enc_get(str);
1208 if (from == to) return str;
1209 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1210 rb_is_ascii8bit_enc(to)) {
1211 if (STR_ENC_GET(str) != to) {
1212 str = rb_str_dup(str);
1213 rb_enc_associate(str, to);
1214 }
1215 return str;
1216 }
1217
1218 RSTRING_GETMEM(str, ptr, len);
1219 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1220 from, to, ecflags, ecopts);
1221 if (NIL_P(newstr)) {
1222 /* some error, return original */
1223 return str;
1224 }
1225 return newstr;
1226}
1227
1228VALUE
1229rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1230 rb_encoding *from, int ecflags, VALUE ecopts)
1231{
1232 long olen;
1233
1234 olen = RSTRING_LEN(newstr);
1235 if (ofs < -olen || olen < ofs)
1236 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1237 if (ofs < 0) ofs += olen;
1238 if (!from) {
1239 STR_SET_LEN(newstr, ofs);
1240 return rb_str_cat(newstr, ptr, len);
1241 }
1242
1243 rb_str_modify(newstr);
1244 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1245 rb_enc_get(newstr),
1246 ecflags, ecopts);
1247}
1248
1249VALUE
1250rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1251{
1252 STR_SET_LEN(str, 0);
1253 rb_enc_associate(str, enc);
1254 rb_str_cat(str, ptr, len);
1255 return str;
1256}
1257
1258static VALUE
1259str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1260 rb_encoding *from, rb_encoding *to,
1261 int ecflags, VALUE ecopts)
1262{
1263 rb_econv_t *ec;
1265 long olen;
1266 VALUE econv_wrapper;
1267 const unsigned char *start, *sp;
1268 unsigned char *dest, *dp;
1269 size_t converted_output = (size_t)ofs;
1270
1271 olen = rb_str_capacity(newstr);
1272
1273 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1274 RBASIC_CLEAR_CLASS(econv_wrapper);
1275 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1276 if (!ec) return Qnil;
1277 DATA_PTR(econv_wrapper) = ec;
1278
1279 sp = (unsigned char*)ptr;
1280 start = sp;
1281 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1282 (dp = dest + converted_output),
1283 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1285 /* destination buffer short */
1286 size_t converted_input = sp - start;
1287 size_t rest = len - converted_input;
1288 converted_output = dp - dest;
1289 rb_str_set_len(newstr, converted_output);
1290 if (converted_input && converted_output &&
1291 rest < (LONG_MAX / converted_output)) {
1292 rest = (rest * converted_output) / converted_input;
1293 }
1294 else {
1295 rest = olen;
1296 }
1297 olen += rest < 2 ? 2 : rest;
1298 rb_str_resize(newstr, olen);
1299 }
1300 DATA_PTR(econv_wrapper) = 0;
1301 RB_GC_GUARD(econv_wrapper);
1302 rb_econv_close(ec);
1303 switch (ret) {
1304 case econv_finished:
1305 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1306 rb_str_set_len(newstr, len);
1307 rb_enc_associate(newstr, to);
1308 return newstr;
1309
1310 default:
1311 return Qnil;
1312 }
1313}
1314
1315VALUE
1317{
1318 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1319}
1320
1321VALUE
1323{
1324 rb_encoding *ienc;
1325 VALUE str;
1326 const int eidx = rb_enc_to_index(eenc);
1327
1328 if (!ptr) {
1329 return rb_enc_str_new(ptr, len, eenc);
1330 }
1331
1332 /* ASCII-8BIT case, no conversion */
1333 if ((eidx == rb_ascii8bit_encindex()) ||
1334 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1335 return rb_str_new(ptr, len);
1336 }
1337 /* no default_internal or same encoding, no conversion */
1338 ienc = rb_default_internal_encoding();
1339 if (!ienc || eenc == ienc) {
1340 return rb_enc_str_new(ptr, len, eenc);
1341 }
1342 /* ASCII compatible, and ASCII only string, no conversion in
1343 * default_internal */
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex()) ||
1346 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1347 return rb_enc_str_new(ptr, len, ienc);
1348 }
1349 /* convert from the given encoding to default_internal */
1350 str = rb_enc_str_new(NULL, 0, ienc);
1351 /* when the conversion failed for some reason, just ignore the
1352 * default_internal and result in the given encoding as-is. */
1353 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1354 rb_str_initialize(str, ptr, len, eenc);
1355 }
1356 return str;
1357}
1358
1359VALUE
1360rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1361{
1362 int eidx = rb_enc_to_index(eenc);
1363 if (eidx == rb_usascii_encindex() &&
1364 !is_ascii_string(str)) {
1365 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1366 return str;
1367 }
1368 rb_enc_associate_index(str, eidx);
1369 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1370}
1371
1372VALUE
1373rb_external_str_new(const char *ptr, long len)
1374{
1375 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1376}
1377
1378VALUE
1380{
1381 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1382}
1383
1384VALUE
1385rb_locale_str_new(const char *ptr, long len)
1386{
1387 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1388}
1389
1390VALUE
1392{
1393 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1394}
1395
1396VALUE
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1406}
1407
1408VALUE
1410{
1411 return rb_str_export_to_enc(str, rb_default_external_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_str_export_to_enc(str, rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1424}
1425
1426static VALUE
1427str_replace_shared_without_enc(VALUE str2, VALUE str)
1428{
1429 const int termlen = TERM_LEN(str);
1430 char *ptr;
1431 long len;
1432
1433 RSTRING_GETMEM(str, ptr, len);
1434 if (str_embed_capa(str2) >= len + termlen) {
1435 char *ptr2 = RSTRING(str2)->as.embed.ary;
1436 STR_SET_EMBED(str2);
1437 memcpy(ptr2, RSTRING_PTR(str), len);
1438 TERM_FILL(ptr2+len, termlen);
1439 }
1440 else {
1441 VALUE root;
1442 if (STR_SHARED_P(str)) {
1443 root = RSTRING(str)->as.heap.aux.shared;
1444 RSTRING_GETMEM(str, ptr, len);
1445 }
1446 else {
1447 root = rb_str_new_frozen(str);
1448 RSTRING_GETMEM(root, ptr, len);
1449 }
1450 RUBY_ASSERT(OBJ_FROZEN(root));
1451
1452 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1453 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1454 rb_fatal("about to free a possible shared root");
1455 }
1456 char *ptr2 = STR_HEAP_PTR(str2);
1457 if (ptr2 != ptr) {
1458 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1459 }
1460 }
1461 FL_SET(str2, STR_NOEMBED);
1462 RSTRING(str2)->as.heap.ptr = ptr;
1463 STR_SET_SHARED(str2, root);
1464 }
1465
1466 STR_SET_LEN(str2, len);
1467
1468 return str2;
1469}
1470
1471static VALUE
1472str_replace_shared(VALUE str2, VALUE str)
1473{
1474 str_replace_shared_without_enc(str2, str);
1475 rb_enc_cr_str_exact_copy(str2, str);
1476 return str2;
1477}
1478
1479static VALUE
1480str_new_shared(VALUE klass, VALUE str)
1481{
1482 return str_replace_shared(str_alloc_heap(klass), str);
1483}
1484
1485VALUE
1487{
1488 return str_new_shared(rb_obj_class(str), str);
1489}
1490
1491VALUE
1493{
1494 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1495 return str_new_frozen(rb_obj_class(orig), orig);
1496}
1497
1498static VALUE
1499rb_str_new_frozen_String(VALUE orig)
1500{
1501 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1502 return str_new_frozen(rb_cString, orig);
1503}
1504
1505
1506VALUE
1507rb_str_frozen_bare_string(VALUE orig)
1508{
1509 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1510 return str_new_frozen(rb_cString, orig);
1511}
1512
1513VALUE
1514rb_str_tmp_frozen_acquire(VALUE orig)
1515{
1516 if (OBJ_FROZEN_RAW(orig)) return orig;
1517 return str_new_frozen_buffer(0, orig, FALSE);
1518}
1519
1520VALUE
1521rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1522{
1523 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1524 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1525
1526 VALUE str = str_alloc_heap(0);
1527 OBJ_FREEZE(str);
1528 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1529 FL_SET(str, STR_SHARED_ROOT);
1530
1531 size_t capa = str_capacity(orig, TERM_LEN(orig));
1532
1533 /* If the string is embedded then we want to create a copy that is heap
1534 * allocated. If the string is shared then the shared root must be
1535 * embedded, so we want to create a copy. If the string is a shared root
1536 * then it must be embedded, so we want to create a copy. */
1537 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1538 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1539 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1540 }
1541 else {
1542 /* orig must be heap allocated and not shared, so we can safely transfer
1543 * the pointer to str. */
1544 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1545 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1546 RBASIC(orig)->flags &= ~STR_NOFREE;
1547 STR_SET_SHARED(orig, str);
1548 }
1549
1550 RSTRING(str)->len = RSTRING(orig)->len;
1551 RSTRING(str)->as.heap.aux.capa = capa;
1552
1553 return str;
1554}
1555
1556void
1557rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1558{
1559 if (RBASIC_CLASS(tmp) != 0)
1560 return;
1561
1562 if (STR_EMBED_P(tmp)) {
1564 }
1565 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1566 !OBJ_FROZEN_RAW(orig)) {
1567 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1568
1569 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1570 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1571 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1572
1573 /* Unshare orig since the root (tmp) only has this one child. */
1574 FL_UNSET_RAW(orig, STR_SHARED);
1575 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1576 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1578
1579 /* Make tmp embedded and empty so it is safe for sweeping. */
1580 STR_SET_EMBED(tmp);
1581 STR_SET_LEN(tmp, 0);
1582 }
1583 }
1584}
1585
1586static VALUE
1587str_new_frozen(VALUE klass, VALUE orig)
1588{
1589 return str_new_frozen_buffer(klass, orig, TRUE);
1590}
1591
1592static VALUE
1593heap_str_make_shared(VALUE klass, VALUE orig)
1594{
1595 RUBY_ASSERT(!STR_EMBED_P(orig));
1596 RUBY_ASSERT(!STR_SHARED_P(orig));
1597
1598 VALUE str = str_alloc_heap(klass);
1599 STR_SET_LEN(str, RSTRING_LEN(orig));
1600 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1601 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1602 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1603 RBASIC(orig)->flags &= ~STR_NOFREE;
1604 STR_SET_SHARED(orig, str);
1605 if (klass == 0)
1606 FL_UNSET_RAW(str, STR_BORROWED);
1607 return str;
1608}
1609
1610static VALUE
1611str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1612{
1613 VALUE str;
1614
1615 long len = RSTRING_LEN(orig);
1616 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1617 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1618
1619 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1620 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1621 RUBY_ASSERT(STR_EMBED_P(str));
1622 }
1623 else {
1624 if (FL_TEST_RAW(orig, STR_SHARED)) {
1625 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1626 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1627 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1628 RUBY_ASSERT(ofs >= 0);
1629 RUBY_ASSERT(rest >= 0);
1630 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1632
1633 if ((ofs > 0) || (rest > 0) ||
1634 (klass != RBASIC(shared)->klass) ||
1635 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1636 str = str_new_shared(klass, shared);
1637 RUBY_ASSERT(!STR_EMBED_P(str));
1638 RSTRING(str)->as.heap.ptr += ofs;
1639 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1640 }
1641 else {
1642 if (RBASIC_CLASS(shared) == 0)
1643 FL_SET_RAW(shared, STR_BORROWED);
1644 return shared;
1645 }
1646 }
1647 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1648 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1649 STR_SET_EMBED(str);
1650 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1651 STR_SET_LEN(str, RSTRING_LEN(orig));
1652 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1653 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1654 }
1655 else {
1656 str = heap_str_make_shared(klass, orig);
1657 }
1658 }
1659
1660 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1661 OBJ_FREEZE(str);
1662 return str;
1663}
1664
1665VALUE
1666rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1667{
1668 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1669}
1670
1671static VALUE
1672str_new_empty_String(VALUE str)
1673{
1674 VALUE v = rb_str_new(0, 0);
1675 rb_enc_copy(v, str);
1676 return v;
1677}
1678
/* Lower bound rb_str_init() applies to an explicitly requested capacity. */
#define STR_BUF_MIN_SIZE 63
1680
1681VALUE
1683{
1684 if (STR_EMBEDDABLE_P(capa, 1)) {
1685 return str_alloc_embed(rb_cString, capa + 1);
1686 }
1687
1688 VALUE str = str_alloc_heap(rb_cString);
1689
1690 RSTRING(str)->as.heap.aux.capa = capa;
1691 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1692 RSTRING(str)->as.heap.ptr[0] = '\0';
1693
1694 return str;
1695}
1696
1697VALUE
1699{
1700 VALUE str;
1701 long len = strlen(ptr);
1702
1703 str = rb_str_buf_new(len);
1704 rb_str_buf_cat(str, ptr, len);
1705
1706 return str;
1707}
1708
1709VALUE
1711{
1712 return str_new(0, 0, len);
1713}
1714
1715void
1717{
1718 if (STR_EMBED_P(str)) {
1719 RB_DEBUG_COUNTER_INC(obj_str_embed);
1720 }
1721 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1722 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1724 }
1725 else {
1726 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1727 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1728 }
1729}
1730
1731size_t
1732rb_str_memsize(VALUE str)
1733{
1734 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1735 return STR_HEAP_SIZE(str);
1736 }
1737 else {
1738 return 0;
1739 }
1740}
1741
1742VALUE
1744{
1745 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1746}
1747
1748static inline void str_discard(VALUE str);
1749static void str_shared_replace(VALUE str, VALUE str2);
1750
1751void
1753{
1754 if (str != str2) str_shared_replace(str, str2);
1755}
1756
1757static void
1758str_shared_replace(VALUE str, VALUE str2)
1759{
1760 rb_encoding *enc;
1761 int cr;
1762 int termlen;
1763
1764 RUBY_ASSERT(str2 != str);
1765 enc = STR_ENC_GET(str2);
1766 cr = ENC_CODERANGE(str2);
1767 str_discard(str);
1768 termlen = rb_enc_mbminlen(enc);
1769
1770 STR_SET_LEN(str, RSTRING_LEN(str2));
1771
1772 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1773 STR_SET_EMBED(str);
1774 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1775 rb_enc_associate(str, enc);
1776 ENC_CODERANGE_SET(str, cr);
1777 }
1778 else {
1779 if (STR_EMBED_P(str2)) {
1780 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1781 long len = RSTRING_LEN(str2);
1782 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1783
1784 char *new_ptr = ALLOC_N(char, len + termlen);
1785 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1786 RSTRING(str2)->as.heap.ptr = new_ptr;
1787 STR_SET_LEN(str2, len);
1788 RSTRING(str2)->as.heap.aux.capa = len;
1789 STR_SET_NOEMBED(str2);
1790 }
1791
1792 STR_SET_NOEMBED(str);
1793 FL_UNSET(str, STR_SHARED);
1794 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1795
1796 if (FL_TEST(str2, STR_SHARED)) {
1797 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1798 STR_SET_SHARED(str, shared);
1799 }
1800 else {
1801 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1802 }
1803
1804 /* abandon str2 */
1805 STR_SET_EMBED(str2);
1806 RSTRING_PTR(str2)[0] = 0;
1807 STR_SET_LEN(str2, 0);
1808 rb_enc_associate(str, enc);
1809 ENC_CODERANGE_SET(str, cr);
1810 }
1811}
1812
1813VALUE
1815{
1816 VALUE str;
1817
1818 if (RB_TYPE_P(obj, T_STRING)) {
1819 return obj;
1820 }
1821 str = rb_funcall(obj, idTo_s, 0);
1822 return rb_obj_as_string_result(str, obj);
1823}
1824
1825VALUE
1826rb_obj_as_string_result(VALUE str, VALUE obj)
1827{
1828 if (!RB_TYPE_P(str, T_STRING))
1829 return rb_any_to_s(obj);
1830 return str;
1831}
1832
1833static VALUE
1834str_replace(VALUE str, VALUE str2)
1835{
1836 long len;
1837
1838 len = RSTRING_LEN(str2);
1839 if (STR_SHARED_P(str2)) {
1840 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1842 STR_SET_NOEMBED(str);
1843 STR_SET_LEN(str, len);
1844 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1845 STR_SET_SHARED(str, shared);
1846 rb_enc_cr_str_exact_copy(str, str2);
1847 }
1848 else {
1849 str_replace_shared(str, str2);
1850 }
1851
1852 return str;
1853}
1854
1855static inline VALUE
1856ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1857{
1858 size_t size = rb_str_embed_size(capa);
1859 RUBY_ASSERT(size > 0);
1860 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1861
1862 NEWOBJ_OF(str, struct RString, klass,
1864
1865 return (VALUE)str;
1866}
1867
1868static inline VALUE
1869ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1870{
1871 NEWOBJ_OF(str, struct RString, klass,
1872 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1873
1874 return (VALUE)str;
1875}
1876
1877static inline VALUE
1878str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1879{
1880 int encidx = 0;
1881 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1882 encidx = rb_enc_get_index(str);
1883 flags &= ~ENCODING_MASK;
1884 }
1885 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1886 if (encidx) rb_enc_associate_index(dup, encidx);
1887 return dup;
1888}
1889
1890static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1891
1892static inline VALUE
1893str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1894{
1895 VALUE flags = FL_TEST_RAW(str, flag_mask);
1896 long len = RSTRING_LEN(str);
1897
1898 RUBY_ASSERT(STR_EMBED_P(dup));
1899 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1900 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1901 STR_SET_LEN(dup, RSTRING_LEN(str));
1902 return str_duplicate_setup_encoding(str, dup, flags);
1903}
1904
1905static inline VALUE
1906str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1907{
1908 VALUE flags = FL_TEST_RAW(str, flag_mask);
1909 VALUE root = str;
1910 if (FL_TEST_RAW(str, STR_SHARED)) {
1911 root = RSTRING(str)->as.heap.aux.shared;
1912 }
1913 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1914 root = str = str_new_frozen(klass, str);
1915 flags = FL_TEST_RAW(str, flag_mask);
1916 }
1917 RUBY_ASSERT(!STR_SHARED_P(root));
1919
1920 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1921 FL_SET(root, STR_SHARED_ROOT);
1922 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1923 flags |= RSTRING_NOEMBED | STR_SHARED;
1924
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1927}
1928
1929static inline VALUE
1930str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1931{
1932 if (STR_EMBED_P(str)) {
1933 return str_duplicate_setup_embed(klass, str, dup);
1934 }
1935 else {
1936 return str_duplicate_setup_heap(klass, str, dup);
1937 }
1938}
1939
1940static inline VALUE
1941str_duplicate(VALUE klass, VALUE str)
1942{
1943 VALUE dup;
1944 if (STR_EMBED_P(str)) {
1945 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1946 }
1947 else {
1948 dup = str_alloc_heap(klass);
1949 }
1950
1951 return str_duplicate_setup(klass, str, dup);
1952}
1953
1954VALUE
1956{
1957 return str_duplicate(rb_obj_class(str), str);
1958}
1959
1960/* :nodoc: */
1961VALUE
1962rb_str_dup_m(VALUE str)
1963{
1964 if (LIKELY(BARE_STRING_P(str))) {
1965 return str_duplicate(rb_cString, str);
1966 }
1967 else {
1968 return rb_obj_dup(str);
1969 }
1970}
1971
1972VALUE
1974{
1975 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1976 return str_duplicate(rb_cString, str);
1977}
1978
1979VALUE
1980rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1981{
1982 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1983 VALUE new_str, klass = rb_cString;
1984
1985 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1986 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 str_duplicate_setup_embed(klass, str, new_str);
1988 }
1989 else {
1990 new_str = ec_str_alloc_heap(ec, klass);
1991 str_duplicate_setup_heap(klass, str, new_str);
1992 }
1993 if (chilled) {
1994 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1995 }
1996 return new_str;
1997}
1998
1999VALUE
2000rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2001{
2002 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2003 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2004 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2005 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2006 return rb_str_freeze(str);
2007}
2008
2009/*
2010 * The documentation block below uses an include (instead of inline text)
2011 * because the included text has non-ASCII characters (which are not allowed in a C file).
2012 */
2013
2014/*
2015 *
2016 * call-seq:
2017 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2018 *
2019 * :include: doc/string/new.rdoc
2020 *
2021 */
2022
2023static VALUE
2024rb_str_init(int argc, VALUE *argv, VALUE str)
2025{
2026 static ID keyword_ids[2];
2027 VALUE orig, opt, venc, vcapa;
2028 VALUE kwargs[2];
2029 rb_encoding *enc = 0;
2030 int n;
2031
2032 if (!keyword_ids[0]) {
2033 keyword_ids[0] = rb_id_encoding();
2034 CONST_ID(keyword_ids[1], "capacity");
2035 }
2036
2037 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2038 if (!NIL_P(opt)) {
2039 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2040 venc = kwargs[0];
2041 vcapa = kwargs[1];
2042 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2043 enc = rb_to_encoding(venc);
2044 }
2045 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2046 long capa = NUM2LONG(vcapa);
2047 long len = 0;
2048 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2049
2050 if (capa < STR_BUF_MIN_SIZE) {
2051 capa = STR_BUF_MIN_SIZE;
2052 }
2053 if (n == 1) {
2054 StringValue(orig);
2055 len = RSTRING_LEN(orig);
2056 if (capa < len) {
2057 capa = len;
2058 }
2059 if (orig == str) n = 0;
2060 }
2061 str_modifiable(str);
2062 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2063 /* make noembed always */
2064 const size_t size = (size_t)capa + termlen;
2065 const char *const old_ptr = RSTRING_PTR(str);
2066 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2067 char *new_ptr = ALLOC_N(char, size);
2068 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2069 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2070 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2071 RSTRING(str)->as.heap.ptr = new_ptr;
2072 }
2073 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2074 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2075 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2076 }
2077 STR_SET_LEN(str, len);
2078 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2079 if (n == 1) {
2080 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2081 rb_enc_cr_str_exact_copy(str, orig);
2082 }
2083 FL_SET(str, STR_NOEMBED);
2084 RSTRING(str)->as.heap.aux.capa = capa;
2085 }
2086 else if (n == 1) {
2087 rb_str_replace(str, orig);
2088 }
2089 if (enc) {
2090 rb_enc_associate(str, enc);
2092 }
2093 }
2094 else if (n == 1) {
2095 rb_str_replace(str, orig);
2096 }
2097 return str;
2098}
2099
2100/* :nodoc: */
2101static VALUE
2102rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2103{
2104 if (klass != rb_cString) {
2105 return rb_class_new_instance_pass_kw(argc, argv, klass);
2106 }
2107
2108 static ID keyword_ids[2];
2109 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2110 VALUE kwargs[2];
2111 rb_encoding *enc = NULL;
2112
2113 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2114 if (NIL_P(opt)) {
2115 return rb_class_new_instance_pass_kw(argc, argv, klass);
2116 }
2117
2118 keyword_ids[0] = rb_id_encoding();
2119 CONST_ID(keyword_ids[1], "capacity");
2120 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2121 encoding = kwargs[0];
2122 capacity = kwargs[1];
2123
2124 if (n == 1) {
2125 orig = StringValue(orig);
2126 }
2127 else {
2128 orig = Qnil;
2129 }
2130
2131 if (UNDEF_P(encoding)) {
2132 if (!NIL_P(orig)) {
2133 encoding = rb_obj_encoding(orig);
2134 }
2135 }
2136
2137 if (!UNDEF_P(encoding)) {
2138 enc = rb_to_encoding(encoding);
2139 }
2140
2141 // If capacity is nil, we're basically just duping `orig`.
2142 if (UNDEF_P(capacity)) {
2143 if (NIL_P(orig)) {
2144 VALUE empty_str = str_new(klass, "", 0);
2145 if (enc) {
2146 rb_enc_associate(empty_str, enc);
2147 }
2148 return empty_str;
2149 }
2150 VALUE copy = str_duplicate(klass, orig);
2151 rb_enc_associate(copy, enc);
2152 ENC_CODERANGE_CLEAR(copy);
2153 return copy;
2154 }
2155
2156 long capa = 0;
2157 capa = NUM2LONG(capacity);
2158 if (capa < 0) {
2159 capa = 0;
2160 }
2161
2162 if (!NIL_P(orig)) {
2163 long orig_capa = rb_str_capacity(orig);
2164 if (orig_capa > capa) {
2165 capa = orig_capa;
2166 }
2167 }
2168
2169 VALUE str = str_enc_new(klass, NULL, capa, enc);
2170 STR_SET_LEN(str, 0);
2171 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2172
2173 if (!NIL_P(orig)) {
2174 rb_str_buf_append(str, orig);
2175 }
2176
2177 return str;
2178}
2179
2180#ifdef NONASCII_MASK
2181#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2182
2183/*
2184 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2185 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2186 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2187 *
2188 * if (!(byte & 0x80))
2189 * byte |= 0x40; // turn on bit6
2190 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2191 *
2192 * This function calculates whether a byte is leading or not for all bytes
2193 * in the argument word by concurrently using the above logic, and then
2194 * adds up the number of leading bytes in the word.
2195 */
2196static inline uintptr_t
2197count_utf8_lead_bytes_with_word(const uintptr_t *s)
2198{
2199 uintptr_t d = *s;
2200
2201 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2202 d = (d>>6) | (~d>>7);
2203 d &= NONASCII_MASK >> 7;
2204
2205 /* Gather all bytes. */
2206#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2207 /* use only if it can use POPCNT */
2208 return rb_popcount_intptr(d);
2209#else
2210 d += (d>>8);
2211 d += (d>>16);
2212# if SIZEOF_VOIDP == 8
2213 d += (d>>32);
2214# endif
2215 return (d&0xF);
2216#endif
2217}
2218#endif
2219
2220static inline long
2221enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2222{
2223 long c;
2224 const char *q;
2225
2226 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2227 long diff = (long)(e - p);
2228 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2229 }
2230#ifdef NONASCII_MASK
2231 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2232 uintptr_t len = 0;
2233 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2234 const uintptr_t *s, *t;
2235 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2236 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2237 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2238 while (p < (const char *)s) {
2239 if (is_utf8_lead_byte(*p)) len++;
2240 p++;
2241 }
2242 while (s < t) {
2243 len += count_utf8_lead_bytes_with_word(s);
2244 s++;
2245 }
2246 p = (const char *)s;
2247 }
2248 while (p < e) {
2249 if (is_utf8_lead_byte(*p)) len++;
2250 p++;
2251 }
2252 return (long)len;
2253 }
2254#endif
2255 else if (rb_enc_asciicompat(enc)) {
2256 c = 0;
2257 if (ENC_CODERANGE_CLEAN_P(cr)) {
2258 while (p < e) {
2259 if (ISASCII(*p)) {
2260 q = search_nonascii(p, e);
2261 if (!q)
2262 return c + (e - p);
2263 c += q - p;
2264 p = q;
2265 }
2266 p += rb_enc_fast_mbclen(p, e, enc);
2267 c++;
2268 }
2269 }
2270 else {
2271 while (p < e) {
2272 if (ISASCII(*p)) {
2273 q = search_nonascii(p, e);
2274 if (!q)
2275 return c + (e - p);
2276 c += q - p;
2277 p = q;
2278 }
2279 p += rb_enc_mbclen(p, e, enc);
2280 c++;
2281 }
2282 }
2283 return c;
2284 }
2285
2286 for (c=0; p<e; c++) {
2287 p += rb_enc_mbclen(p, e, enc);
2288 }
2289 return c;
2290}
2291
2292long
2293rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2294{
2295 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2296}
2297
2298/* To get strlen with cr
2299 * Note that given cr is not used.
2300 */
2301long
2302rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2303{
2304 long c;
2305 const char *q;
2306 int ret;
2307
2308 *cr = 0;
2309 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2310 long diff = (long)(e - p);
2311 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2312 }
2313 else if (rb_enc_asciicompat(enc)) {
2314 c = 0;
2315 while (p < e) {
2316 if (ISASCII(*p)) {
2317 q = search_nonascii(p, e);
2318 if (!q) {
2319 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2320 return c + (e - p);
2321 }
2322 c += q - p;
2323 p = q;
2324 }
2325 ret = rb_enc_precise_mbclen(p, e, enc);
2326 if (MBCLEN_CHARFOUND_P(ret)) {
2327 *cr |= ENC_CODERANGE_VALID;
2328 p += MBCLEN_CHARFOUND_LEN(ret);
2329 }
2330 else {
2332 p++;
2333 }
2334 c++;
2335 }
2336 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2337 return c;
2338 }
2339
2340 for (c=0; p<e; c++) {
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2342 if (MBCLEN_CHARFOUND_P(ret)) {
2343 *cr |= ENC_CODERANGE_VALID;
2344 p += MBCLEN_CHARFOUND_LEN(ret);
2345 }
2346 else {
2348 if (p + rb_enc_mbminlen(enc) <= e)
2349 p += rb_enc_mbminlen(enc);
2350 else
2351 p = e;
2352 }
2353 }
2354 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2355 return c;
2356}
2357
2358/* enc must be str's enc or rb_enc_check(str, str2) */
2359static long
2360str_strlen(VALUE str, rb_encoding *enc)
2361{
2362 const char *p, *e;
2363 int cr;
2364
2365 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2366 if (!enc) enc = STR_ENC_GET(str);
2367 p = RSTRING_PTR(str);
2368 e = RSTRING_END(str);
2369 cr = ENC_CODERANGE(str);
2370
2371 if (cr == ENC_CODERANGE_UNKNOWN) {
2372 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2373 if (cr) ENC_CODERANGE_SET(str, cr);
2374 return n;
2375 }
2376 else {
2377 return enc_strlen(p, e, enc, cr);
2378 }
2379}
2380
2381long
2383{
2384 return str_strlen(str, NULL);
2385}
2386
2387/*
2388 * call-seq:
2389 * length -> integer
2390 *
2391 * :include: doc/string/length.rdoc
2392 *
2393 */
2394
2395VALUE
2397{
2398 return LONG2NUM(str_strlen(str, NULL));
2399}
2400
2401/*
2402 * call-seq:
2403 * bytesize -> integer
2404 *
2405 * :include: doc/string/bytesize.rdoc
2406 *
2407 */
2408
2409VALUE
2410rb_str_bytesize(VALUE str)
2411{
2412 return LONG2NUM(RSTRING_LEN(str));
2413}
2414
2415/*
2416 * call-seq:
2417 * empty? -> true or false
2418 *
2419 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2420 *
2421 * "hello".empty? # => false
2422 * " ".empty? # => false
2423 * "".empty? # => true
2424 *
2425 */
2426
2427static VALUE
2428rb_str_empty(VALUE str)
2429{
2430 return RBOOL(RSTRING_LEN(str) == 0);
2431}
2432
2433/*
2434 * call-seq:
2435 * self + other_string -> new_string
2436 *
2437 * Returns a new string containing +other_string+ concatenated to +self+:
2438 *
2439 * 'Hello from ' + self.to_s # => "Hello from main"
2440 *
2441 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2442 */
2443
2444VALUE
2446{
2447 VALUE str3;
2448 rb_encoding *enc;
2449 char *ptr1, *ptr2, *ptr3;
2450 long len1, len2;
2451 int termlen;
2452
2453 StringValue(str2);
2454 enc = rb_enc_check_str(str1, str2);
2455 RSTRING_GETMEM(str1, ptr1, len1);
2456 RSTRING_GETMEM(str2, ptr2, len2);
2457 termlen = rb_enc_mbminlen(enc);
2458 if (len1 > LONG_MAX - len2) {
2459 rb_raise(rb_eArgError, "string size too big");
2460 }
2461 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2462 ptr3 = RSTRING_PTR(str3);
2463 memcpy(ptr3, ptr1, len1);
2464 memcpy(ptr3+len1, ptr2, len2);
2465 TERM_FILL(&ptr3[len1+len2], termlen);
2466
2467 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2469 RB_GC_GUARD(str1);
2470 RB_GC_GUARD(str2);
2471 return str3;
2472}
2473
2474/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2475VALUE
2476rb_str_opt_plus(VALUE str1, VALUE str2)
2477{
2480 long len1, len2;
2481 MAYBE_UNUSED(char) *ptr1, *ptr2;
2482 RSTRING_GETMEM(str1, ptr1, len1);
2483 RSTRING_GETMEM(str2, ptr2, len2);
2484 int enc1 = rb_enc_get_index(str1);
2485 int enc2 = rb_enc_get_index(str2);
2486
2487 if (enc1 < 0) {
2488 return Qundef;
2489 }
2490 else if (enc2 < 0) {
2491 return Qundef;
2492 }
2493 else if (enc1 != enc2) {
2494 return Qundef;
2495 }
2496 else if (len1 > LONG_MAX - len2) {
2497 return Qundef;
2498 }
2499 else {
2500 return rb_str_plus(str1, str2);
2501 }
2502
2503}
2504
2505/*
2506 * call-seq:
2507 * self * n -> new_string
2508 *
2509 * Returns a new string containing +n+ copies of +self+:
2510 *
2511 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2512 * 'No!' * 0 # => ""
2513 *
2514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2515 */
2516
2517VALUE
2519{
2520 VALUE str2;
2521 long n, len;
2522 char *ptr2;
2523 int termlen;
2524
2525 if (times == INT2FIX(1)) {
2526 return str_duplicate(rb_cString, str);
2527 }
2528 if (times == INT2FIX(0)) {
2529 str2 = str_alloc_embed(rb_cString, 0);
2530 rb_enc_copy(str2, str);
2531 return str2;
2532 }
2533 len = NUM2LONG(times);
2534 if (len < 0) {
2535 rb_raise(rb_eArgError, "negative argument");
2536 }
2537 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2538 if (STR_EMBEDDABLE_P(len, 1)) {
2539 str2 = str_alloc_embed(rb_cString, len + 1);
2540 memset(RSTRING_PTR(str2), 0, len + 1);
2541 }
2542 else {
2543 str2 = str_alloc_heap(rb_cString);
2544 RSTRING(str2)->as.heap.aux.capa = len;
2545 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2546 }
2547 STR_SET_LEN(str2, len);
2548 rb_enc_copy(str2, str);
2549 return str2;
2550 }
2551 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2552 rb_raise(rb_eArgError, "argument too big");
2553 }
2554
2555 len *= RSTRING_LEN(str);
2556 termlen = TERM_LEN(str);
2557 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2558 ptr2 = RSTRING_PTR(str2);
2559 if (len) {
2560 n = RSTRING_LEN(str);
2561 memcpy(ptr2, RSTRING_PTR(str), n);
2562 while (n <= len/2) {
2563 memcpy(ptr2 + n, ptr2, n);
2564 n *= 2;
2565 }
2566 memcpy(ptr2 + n, ptr2, len-n);
2567 }
2568 STR_SET_LEN(str2, len);
2569 TERM_FILL(&ptr2[len], termlen);
2570 rb_enc_cr_str_copy_for_substr(str2, str);
2571
2572 return str2;
2573}
2574
2575/*
2576 * call-seq:
2577 * self % object -> new_string
2578 *
2579 * Returns the result of formatting +object+ into the format specifications
2580 * contained in +self+
2581 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2582 *
2583 * '%05d' % 123 # => "00123"
2584 *
2585 * If +self+ contains multiple format specifications,
2586 * +object+ must be an array or hash containing the objects to be formatted:
2587 *
2588 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2589 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2590 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2591 *
2592 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2593 */
2594
2595static VALUE
2596rb_str_format_m(VALUE str, VALUE arg)
2597{
2598 VALUE tmp = rb_check_array_type(arg);
2599
2600 if (!NIL_P(tmp)) {
2601 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2602 }
2603 return rb_str_format(1, &arg, str);
2604}
2605
2606static inline void
2607rb_check_lockedtmp(VALUE str)
2608{
2609 if (FL_TEST(str, STR_TMPLOCK)) {
2610 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2611 }
2612}
2613
2614// If none of these flags are set, we know we have an modifiable string.
2615// If any is set, we need to do more detailed checks.
2616#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2617static inline void
2618str_modifiable(VALUE str)
2619{
2620 RUBY_ASSERT(ruby_thread_has_gvl_p());
2621
2622 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2623 if (CHILLED_STRING_P(str)) {
2624 CHILLED_STRING_MUTATED(str);
2625 }
2626 rb_check_lockedtmp(str);
2627 rb_check_frozen(str);
2628 }
2629}
2630
2631static inline int
2632str_dependent_p(VALUE str)
2633{
2634 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2635 return FALSE;
2636 }
2637 else {
2638 return TRUE;
2639 }
2640}
2641
2642// If none of these flags are set, we know we have an independent string.
2643// If any is set, we need to do more detailed checks.
2644#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2645static inline int
2646str_independent(VALUE str)
2647{
2648 RUBY_ASSERT(ruby_thread_has_gvl_p());
2649
2650 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2651 str_modifiable(str);
2652 return !str_dependent_p(str);
2653 }
2654 return TRUE;
2655}
2656
2657static void
2658str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2659{
2660 RUBY_ASSERT(ruby_thread_has_gvl_p());
2661
2662 char *ptr;
2663 char *oldptr;
2664 long capa = len + expand;
2665
2666 if (len > capa) len = capa;
2667
2668 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2669 ptr = RSTRING(str)->as.heap.ptr;
2670 STR_SET_EMBED(str);
2671 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2672 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2673 STR_SET_LEN(str, len);
2674 return;
2675 }
2676
2677 ptr = ALLOC_N(char, (size_t)capa + termlen);
2678 oldptr = RSTRING_PTR(str);
2679 if (oldptr) {
2680 memcpy(ptr, oldptr, len);
2681 }
2682 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2683 xfree(oldptr);
2684 }
2685 STR_SET_NOEMBED(str);
2686 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2687 TERM_FILL(ptr + len, termlen);
2688 RSTRING(str)->as.heap.ptr = ptr;
2689 STR_SET_LEN(str, len);
2690 RSTRING(str)->as.heap.aux.capa = capa;
2691}
2692
2693void
2694rb_str_modify(VALUE str)
2695{
2696 if (!str_independent(str))
2697 str_make_independent(str);
2699}
2700
2701void
2703{
2704 RUBY_ASSERT(ruby_thread_has_gvl_p());
2705
2706 int termlen = TERM_LEN(str);
2707 long len = RSTRING_LEN(str);
2708
2709 if (expand < 0) {
2710 rb_raise(rb_eArgError, "negative expanding string size");
2711 }
2712 if (expand >= LONG_MAX - len) {
2713 rb_raise(rb_eArgError, "string size too big");
2714 }
2715
2716 if (!str_independent(str)) {
2717 str_make_independent_expand(str, len, expand, termlen);
2718 }
2719 else if (expand > 0) {
2720 RESIZE_CAPA_TERM(str, len + expand, termlen);
2721 }
2723}
2724
2725/* As rb_str_modify(), but don't clear coderange */
2726static void
2727str_modify_keep_cr(VALUE str)
2728{
2729 if (!str_independent(str))
2730 str_make_independent(str);
2732 /* Force re-scan later */
2734}
2735
2736static inline void
2737str_discard(VALUE str)
2738{
2739 str_modifiable(str);
2740 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2741 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2742 RSTRING(str)->as.heap.ptr = 0;
2743 STR_SET_LEN(str, 0);
2744 }
2745}
2746
2747void
2749{
2750 int encindex = rb_enc_get_index(str);
2751
2752 if (RB_UNLIKELY(encindex == -1)) {
2753 rb_raise(rb_eTypeError, "not encoding capable object");
2754 }
2755
2756 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2757 return;
2758 }
2759
2760 rb_encoding *enc = rb_enc_from_index(encindex);
2761 if (!rb_enc_asciicompat(enc)) {
2762 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2763 }
2764}
2765
2766VALUE
2768{
2769 RUBY_ASSERT(ruby_thread_has_gvl_p());
2770
2771 VALUE s = *ptr;
2772 if (!RB_TYPE_P(s, T_STRING)) {
2773 s = rb_str_to_str(s);
2774 *ptr = s;
2775 }
2776 return s;
2777}
2778
2779char *
2781{
2782 VALUE str = rb_string_value(ptr);
2783 return RSTRING_PTR(str);
2784}
2785
/*
 * Returns 1 when the first n bytes at s are all zero, 0 as soon as a
 * non-zero byte is found.  For n <= 0 nothing is read and 1 is returned.
 */
static int
zero_filled(const char *s, int n)
{
    int i = 0;

    while (i < n) {
        if (s[i] != '\0') {
            return 0;
        }
        i++;
    }
    return 1;
}
2794
2795static const char *
2796str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2797{
2798 const char *e = s + len;
2799
2800 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2801 if (zero_filled(s, minlen)) return s;
2802 }
2803 return 0;
2804}
2805
2806static char *
2807str_fill_term(VALUE str, char *s, long len, int termlen)
2808{
2809 /* This function assumes that (capa + termlen) bytes of memory
2810 * is allocated, like many other functions in this file.
2811 */
2812 if (str_dependent_p(str)) {
2813 if (!zero_filled(s + len, termlen))
2814 str_make_independent_expand(str, len, 0L, termlen);
2815 }
2816 else {
2817 TERM_FILL(s + len, termlen);
2818 return s;
2819 }
2820 return RSTRING_PTR(str);
2821}
2822
2823void
2824rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2825{
2826 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2827 long len = RSTRING_LEN(str);
2828
2829 RUBY_ASSERT(capa >= len);
2830 if (capa - len < termlen) {
2831 rb_check_lockedtmp(str);
2832 str_make_independent_expand(str, len, 0L, termlen);
2833 }
2834 else if (str_dependent_p(str)) {
2835 if (termlen > oldtermlen)
2836 str_make_independent_expand(str, len, 0L, termlen);
2837 }
2838 else {
2839 if (!STR_EMBED_P(str)) {
2840 /* modify capa instead of realloc */
2841 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2842 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2843 }
2844 if (termlen > oldtermlen) {
2845 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2846 }
2847 }
2848
2849 return;
2850}
2851
2852static char *
2853str_null_check(VALUE str, int *w)
2854{
2855 char *s = RSTRING_PTR(str);
2856 long len = RSTRING_LEN(str);
2857 rb_encoding *enc = rb_enc_get(str);
2858 const int minlen = rb_enc_mbminlen(enc);
2859
2860 if (minlen > 1) {
2861 *w = 1;
2862 if (str_null_char(s, len, minlen, enc)) {
2863 return NULL;
2864 }
2865 return str_fill_term(str, s, len, minlen);
2866 }
2867 *w = 0;
2868 if (!s || memchr(s, 0, len)) {
2869 return NULL;
2870 }
2871 if (s[len]) {
2872 s = str_fill_term(str, s, len, minlen);
2873 }
2874 return s;
2875}
2876
2877char *
2878rb_str_to_cstr(VALUE str)
2879{
2880 int w;
2881 return str_null_check(str, &w);
2882}
2883
2884char *
2886{
2887 VALUE str = rb_string_value(ptr);
2888 int w;
2889 char *s = str_null_check(str, &w);
2890 if (!s) {
2891 if (w) {
2892 rb_raise(rb_eArgError, "string contains null char");
2893 }
2894 rb_raise(rb_eArgError, "string contains null byte");
2895 }
2896 return s;
2897}
2898
2899char *
2900rb_str_fill_terminator(VALUE str, const int newminlen)
2901{
2902 char *s = RSTRING_PTR(str);
2903 long len = RSTRING_LEN(str);
2904 return str_fill_term(str, s, len, newminlen);
2905}
2906
2907VALUE
2909{
2910 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2911 return str;
2912}
2913
2914/*
2915 * call-seq:
2916 * String.try_convert(object) -> object, new_string, or nil
2917 *
2918 * Attempts to convert the given +object+ to a string.
2919 *
2920 * If +object+ is already a string, returns +object+, unmodified.
2921 *
2922 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2923 * calls <tt>object.to_str</tt> and returns the result.
2924 *
2925 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2926 *
2927 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2928 */
2929static VALUE
2930rb_str_s_try_convert(VALUE dummy, VALUE str)
2931{
2932 return rb_check_string_type(str);
2933}
2934
2935static char*
2936str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2937{
2938 long nth = *nthp;
2939 if (rb_enc_mbmaxlen(enc) == 1) {
2940 p += nth;
2941 }
2942 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2943 p += nth * rb_enc_mbmaxlen(enc);
2944 }
2945 else if (rb_enc_asciicompat(enc)) {
2946 const char *p2, *e2;
2947 int n;
2948
2949 while (p < e && 0 < nth) {
2950 e2 = p + nth;
2951 if (e < e2) {
2952 *nthp = nth;
2953 return (char *)e;
2954 }
2955 if (ISASCII(*p)) {
2956 p2 = search_nonascii(p, e2);
2957 if (!p2) {
2958 nth -= e2 - p;
2959 *nthp = nth;
2960 return (char *)e2;
2961 }
2962 nth -= p2 - p;
2963 p = p2;
2964 }
2965 n = rb_enc_mbclen(p, e, enc);
2966 p += n;
2967 nth--;
2968 }
2969 *nthp = nth;
2970 if (nth != 0) {
2971 return (char *)e;
2972 }
2973 return (char *)p;
2974 }
2975 else {
2976 while (p < e && nth--) {
2977 p += rb_enc_mbclen(p, e, enc);
2978 }
2979 }
2980 if (p > e) p = e;
2981 *nthp = nth;
2982 return (char*)p;
2983}
2984
2985char*
2986rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2987{
2988 return str_nth_len(p, e, &nth, enc);
2989}
2990
2991static char*
2992str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2993{
2994 if (singlebyte)
2995 p += nth;
2996 else {
2997 p = str_nth_len(p, e, &nth, enc);
2998 }
2999 if (!p) return 0;
3000 if (p > e) p = e;
3001 return (char *)p;
3002}
3003
3004/* char offset to byte offset */
3005static long
3006str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3007{
3008 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3009 if (!pp) return e - p;
3010 return pp - p;
3011}
3012
3013long
3014rb_str_offset(VALUE str, long pos)
3015{
3016 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3017 STR_ENC_GET(str), single_byte_optimizable(str));
3018}
3019
#ifdef NONASCII_MASK
/*
 * Advances p towards e, counting characters by their lead bytes
 * (is_utf8_lead_byte), until *nthp characters have been passed or e is
 * reached.  The remaining count is written back to *nthp and the reached
 * position is returned.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long left = *nthp;

    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < left) {
        /* Long enough: count lead bytes one pointer-sized word at a time. */
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* p rounded up, and e rounded down, to a multiple of SIZEOF_VOIDP */
        const uintptr_t *word = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        const uintptr_t *const word_end = (const uintptr_t*)(~lowbits & (uintptr_t)e);

        /* byte by byte up to the first word */
        for (; p < (const char *)word; p++) {
            if (is_utf8_lead_byte(*p)) left--;
        }
        do {
            left -= count_utf8_lead_bytes_with_word(word);
            word++;
        } while (word < word_end && (int)SIZEOF_VOIDP <= left);
        p = (char *)word;
    }

    /* byte by byte for the rest; stop on a lead byte once the count is used up */
    for (; p < e; p++) {
        if (is_utf8_lead_byte(*p)) {
            if (left == 0) break;
            left--;
        }
    }
    *nthp = left;
    return (char *)p;
}

/* Byte distance from p to the position str_utf8_nth() reaches for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *reached = str_utf8_nth(p, e, &nth);

    return reached - p;
}
#endif
3058
3059/* byte offset to char offset */
3060long
3061rb_str_sublen(VALUE str, long pos)
3062{
3063 if (single_byte_optimizable(str) || pos < 0)
3064 return pos;
3065 else {
3066 char *p = RSTRING_PTR(str);
3067 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3068 }
3069}
3070
3071static VALUE
3072str_subseq(VALUE str, long beg, long len)
3073{
3074 VALUE str2;
3075
3076 RUBY_ASSERT(beg >= 0);
3077 RUBY_ASSERT(len >= 0);
3078 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3079
3080 const int termlen = TERM_LEN(str);
3081 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3082 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3083 RB_GC_GUARD(str);
3084 return str2;
3085 }
3086
3087 str2 = str_alloc_heap(rb_cString);
3088 if (str_embed_capa(str2) >= len + termlen) {
3089 char *ptr2 = RSTRING(str2)->as.embed.ary;
3090 STR_SET_EMBED(str2);
3091 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3092 TERM_FILL(ptr2+len, termlen);
3093
3094 STR_SET_LEN(str2, len);
3095 RB_GC_GUARD(str);
3096 }
3097 else {
3098 str_replace_shared(str2, str);
3099 RUBY_ASSERT(!STR_EMBED_P(str2));
3100 ENC_CODERANGE_CLEAR(str2);
3101 RSTRING(str2)->as.heap.ptr += beg;
3102 if (RSTRING_LEN(str2) > len) {
3103 STR_SET_LEN(str2, len);
3104 }
3105 }
3106
3107 return str2;
3108}
3109
3110VALUE
3111rb_str_subseq(VALUE str, long beg, long len)
3112{
3113 VALUE str2 = str_subseq(str, beg, len);
3114 rb_enc_cr_str_copy_for_substr(str2, str);
3115 return str2;
3116}
3117
3118char *
3119rb_str_subpos(VALUE str, long beg, long *lenp)
3120{
3121 long len = *lenp;
3122 long slen = -1L;
3123 const long blen = RSTRING_LEN(str);
3124 rb_encoding *enc = STR_ENC_GET(str);
3125 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3126
3127 if (len < 0) return 0;
3128 if (beg < 0 && -beg < 0) return 0;
3129 if (!blen) {
3130 len = 0;
3131 }
3132 if (single_byte_optimizable(str)) {
3133 if (beg > blen) return 0;
3134 if (beg < 0) {
3135 beg += blen;
3136 if (beg < 0) return 0;
3137 }
3138 if (len > blen - beg)
3139 len = blen - beg;
3140 if (len < 0) return 0;
3141 p = s + beg;
3142 goto end;
3143 }
3144 if (beg < 0) {
3145 if (len > -beg) len = -beg;
3146 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3147 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3148 beg = -beg;
3149 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3150 p = e;
3151 if (!p) return 0;
3152 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3153 if (!p) return 0;
3154 len = e - p;
3155 goto end;
3156 }
3157 else {
3158 slen = str_strlen(str, enc);
3159 beg += slen;
3160 if (beg < 0) return 0;
3161 p = s + beg;
3162 if (len == 0) goto end;
3163 }
3164 }
3165 else if (beg > 0 && beg > blen) {
3166 return 0;
3167 }
3168 if (len == 0) {
3169 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3170 p = s + beg;
3171 }
3172#ifdef NONASCII_MASK
3173 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3174 enc == rb_utf8_encoding()) {
3175 p = str_utf8_nth(s, e, &beg);
3176 if (beg > 0) return 0;
3177 len = str_utf8_offset(p, e, len);
3178 }
3179#endif
3180 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3181 int char_sz = rb_enc_mbmaxlen(enc);
3182
3183 p = s + beg * char_sz;
3184 if (p > e) {
3185 return 0;
3186 }
3187 else if (len * char_sz > e - p)
3188 len = e - p;
3189 else
3190 len *= char_sz;
3191 }
3192 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3193 if (beg > 0) return 0;
3194 len = 0;
3195 }
3196 else {
3197 len = str_offset(p, e, len, enc, 0);
3198 }
3199 end:
3200 *lenp = len;
3201 RB_GC_GUARD(str);
3202 return p;
3203}
3204
3205static VALUE str_substr(VALUE str, long beg, long len, int empty);
3206
3207VALUE
3208rb_str_substr(VALUE str, long beg, long len)
3209{
3210 return str_substr(str, beg, len, TRUE);
3211}
3212
3213VALUE
3214rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3215{
3216 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3217}
3218
3219static VALUE
3220str_substr(VALUE str, long beg, long len, int empty)
3221{
3222 char *p = rb_str_subpos(str, beg, &len);
3223
3224 if (!p) return Qnil;
3225 if (!len && !empty) return Qnil;
3226
3227 beg = p - RSTRING_PTR(str);
3228
3229 VALUE str2 = str_subseq(str, beg, len);
3230 rb_enc_cr_str_copy_for_substr(str2, str);
3231 return str2;
3232}
3233
3234/* :nodoc: */
3235VALUE
3237{
3238 if (CHILLED_STRING_P(str)) {
3239 FL_UNSET_RAW(str, STR_CHILLED);
3240 }
3241
3242 if (OBJ_FROZEN(str)) return str;
3243 rb_str_resize(str, RSTRING_LEN(str));
3244 return rb_obj_freeze(str);
3245}
3246
3247/*
3248 * call-seq:
3249 * +string -> new_string or self
3250 *
3251 * Returns +self+ if +self+ is not frozen and can be mutated
3252 * without warning issuance.
3253 *
3254 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3255 *
3256 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3257 */
3258static VALUE
3259str_uplus(VALUE str)
3260{
3261 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3262 return rb_str_dup(str);
3263 }
3264 else {
3265 return str;
3266 }
3267}
3268
3269/*
3270 * call-seq:
3271 * -self -> frozen_string
3272 *
3273 * Returns a frozen string equal to +self+.
3274 *
3275 * The returned string is +self+ if and only if all of the following are true:
3276 *
3277 * - +self+ is already frozen.
3278 * - +self+ is an instance of \String (rather than of a subclass of \String)
3279 * - +self+ has no instance variables set on it.
3280 *
3281 * Otherwise, the returned string is a frozen copy of +self+.
3282 *
3283 * Returning +self+, when possible, saves duplicating +self+;
3284 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3285 *
3286 * It may also save duplicating other, already-existing, strings:
3287 *
3288 * s0 = 'foo'
3289 * s1 = 'foo'
3290 * s0.object_id == s1.object_id # => false
3291 * (-s0).object_id == (-s1).object_id # => true
3292 *
3293 * Note that method #-@ is convenient for defining a constant:
3294 *
3295 * FileName = -'config/database.yml'
3296 *
3297 * While its alias #dedup is better suited for chaining:
3298 *
3299 * 'foo'.dedup.gsub!('o')
3300 *
3301 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3302 */
3303static VALUE
3304str_uminus(VALUE str)
3305{
3306 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3307 str = rb_str_dup(str);
3308 }
3309 return rb_fstring(str);
3310}
3311
3312RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3313#define rb_str_dup_frozen rb_str_new_frozen
3314
3315VALUE
3317{
3318 rb_check_frozen(str);
3319 if (FL_TEST(str, STR_TMPLOCK)) {
3320 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3321 }
3322 FL_SET(str, STR_TMPLOCK);
3323 return str;
3324}
3325
3326VALUE
3328{
3329 rb_check_frozen(str);
3330 if (!FL_TEST(str, STR_TMPLOCK)) {
3331 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3332 }
3333 FL_UNSET(str, STR_TMPLOCK);
3334 return str;
3335}
3336
3337VALUE
3338rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3339{
3340 rb_str_locktmp(str);
3341 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3342}
3343
3344void
3346{
3347 RUBY_ASSERT(ruby_thread_has_gvl_p());
3348
3349 long capa;
3350 const int termlen = TERM_LEN(str);
3351
3352 str_modifiable(str);
3353 if (STR_SHARED_P(str)) {
3354 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3355 }
3356 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3357 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3358 }
3359
3360 int cr = ENC_CODERANGE(str);
3361 if (len == 0) {
3362 /* Empty string does not contain non-ASCII */
3364 }
3365 else if (cr == ENC_CODERANGE_UNKNOWN) {
3366 /* Leave unknown. */
3367 }
3368 else if (len > RSTRING_LEN(str)) {
3369 if (ENC_CODERANGE_CLEAN_P(cr)) {
3370 /* Update the coderange regarding the extended part. */
3371 const char *const prev_end = RSTRING_END(str);
3372 const char *const new_end = RSTRING_PTR(str) + len;
3373 rb_encoding *enc = rb_enc_get(str);
3374 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3375 ENC_CODERANGE_SET(str, cr);
3376 }
3377 else if (cr == ENC_CODERANGE_BROKEN) {
3378 /* May be valid now, by appended part. */
3380 }
3381 }
3382 else if (len < RSTRING_LEN(str)) {
3383 if (cr != ENC_CODERANGE_7BIT) {
3384 /* ASCII-only string is keeping after truncated. Valid
3385 * and broken may be invalid or valid, leave unknown. */
3387 }
3388 }
3389
3390 STR_SET_LEN(str, len);
3391 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3392}
3393
3394VALUE
3395rb_str_resize(VALUE str, long len)
3396{
3397 if (len < 0) {
3398 rb_raise(rb_eArgError, "negative string size (or size too big)");
3399 }
3400
3401 int independent = str_independent(str);
3402 long slen = RSTRING_LEN(str);
3403 const int termlen = TERM_LEN(str);
3404
3405 if (slen > len || (termlen != 1 && slen < len)) {
3407 }
3408
3409 {
3410 long capa;
3411 if (STR_EMBED_P(str)) {
3412 if (len == slen) return str;
3413 if (str_embed_capa(str) >= len + termlen) {
3414 STR_SET_LEN(str, len);
3415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3416 return str;
3417 }
3418 str_make_independent_expand(str, slen, len - slen, termlen);
3419 }
3420 else if (str_embed_capa(str) >= len + termlen) {
3421 char *ptr = STR_HEAP_PTR(str);
3422 STR_SET_EMBED(str);
3423 if (slen > len) slen = len;
3424 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3425 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3426 STR_SET_LEN(str, len);
3427 if (independent) ruby_xfree(ptr);
3428 return str;
3429 }
3430 else if (!independent) {
3431 if (len == slen) return str;
3432 str_make_independent_expand(str, slen, len - slen, termlen);
3433 }
3434 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3435 (capa - len) > (len < 1024 ? len : 1024)) {
3436 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3437 (size_t)len + termlen, STR_HEAP_SIZE(str));
3438 RSTRING(str)->as.heap.aux.capa = len;
3439 }
3440 else if (len == slen) return str;
3441 STR_SET_LEN(str, len);
3442 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3443 }
3444 return str;
3445}
3446
3447static void
3448str_ensure_available_capa(VALUE str, long len)
3449{
3450 str_modify_keep_cr(str);
3451
3452 const int termlen = TERM_LEN(str);
3453 long olen = RSTRING_LEN(str);
3454
3455 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3456 rb_raise(rb_eArgError, "string sizes too big");
3457 }
3458
3459 long total = olen + len;
3460 long capa = str_capacity(str, termlen);
3461
3462 if (capa < total) {
3463 if (total >= LONG_MAX / 2) {
3464 capa = total;
3465 }
3466 while (total > capa) {
3467 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3468 }
3469 RESIZE_CAPA_TERM(str, capa, termlen);
3470 }
3471}
3472
3473static VALUE
3474str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3475{
3476 if (keep_cr) {
3477 str_modify_keep_cr(str);
3478 }
3479 else {
3480 rb_str_modify(str);
3481 }
3482 if (len == 0) return 0;
3483
3484 long total, olen, off = -1;
3485 char *sptr;
3486 const int termlen = TERM_LEN(str);
3487
3488 RSTRING_GETMEM(str, sptr, olen);
3489 if (ptr >= sptr && ptr <= sptr + olen) {
3490 off = ptr - sptr;
3491 }
3492
3493 long capa = str_capacity(str, termlen);
3494
3495 if (olen > LONG_MAX - len) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498 total = olen + len;
3499 if (capa < total) {
3500 if (total >= LONG_MAX / 2) {
3501 capa = total;
3502 }
3503 while (total > capa) {
3504 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3505 }
3506 RESIZE_CAPA_TERM(str, capa, termlen);
3507 sptr = RSTRING_PTR(str);
3508 }
3509 if (off != -1) {
3510 ptr = sptr + off;
3511 }
3512 memcpy(sptr + olen, ptr, len);
3513 STR_SET_LEN(str, total);
3514 TERM_FILL(sptr + total, termlen); /* sentinel */
3515
3516 return str;
3517}
3518
3519#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3520#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3521
3522VALUE
3523rb_str_cat(VALUE str, const char *ptr, long len)
3524{
3525 if (len == 0) return str;
3526 if (len < 0) {
3527 rb_raise(rb_eArgError, "negative string size (or size too big)");
3528 }
3529 return str_buf_cat(str, ptr, len);
3530}
3531
3532VALUE
3533rb_str_cat_cstr(VALUE str, const char *ptr)
3534{
3535 must_not_null(ptr);
3536 return rb_str_buf_cat(str, ptr, strlen(ptr));
3537}
3538
3539static void
3540rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3541{
3542 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3543
3544 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3545 if (UNLIKELY(!str_independent(str))) {
3546 str_make_independent(str);
3547 }
3548
3549 long string_length = -1;
3550 const int null_terminator_length = 1;
3551 char *sptr;
3552 RSTRING_GETMEM(str, sptr, string_length);
3553
3554 // Ensure the resulting string wouldn't be too long.
3555 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3556 rb_raise(rb_eArgError, "string sizes too big");
3557 }
3558
3559 long string_capacity = str_capacity(str, null_terminator_length);
3560
3561 // Get the code range before any modifications since those might clear the code range.
3562 int cr = ENC_CODERANGE(str);
3563
3564 // Check if the string has spare string_capacity to write the new byte.
3565 if (LIKELY(string_capacity >= string_length + 1)) {
3566 // In fast path we can write the new byte and note the string's new length.
3567 sptr[string_length] = byte;
3568 STR_SET_LEN(str, string_length + 1);
3569 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3570 }
3571 else {
3572 // If there's not enough string_capacity, make a call into the general string concatenation function.
3573 str_buf_cat(str, (char *)&byte, 1);
3574 }
3575
3576 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3577 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3578 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3579 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3580 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3581 if (ISASCII(byte)) {
3583 }
3584 else {
3586
3587 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3588 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3589 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3590 }
3591 }
3592 }
3593}
3594
3595RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3596RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3597RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3598
3599static VALUE
3600rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3601 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3602{
3603 int str_encindex = ENCODING_GET(str);
3604 int res_encindex;
3605 int str_cr, res_cr;
3606 rb_encoding *str_enc, *ptr_enc;
3607
3608 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3609
3610 if (str_encindex == ptr_encindex) {
3611 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3612 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3613 }
3614 }
3615 else {
3616 str_enc = rb_enc_from_index(str_encindex);
3617 ptr_enc = rb_enc_from_index(ptr_encindex);
3618 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3619 if (len == 0)
3620 return str;
3621 if (RSTRING_LEN(str) == 0) {
3622 rb_str_buf_cat(str, ptr, len);
3623 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3624 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3625 return str;
3626 }
3627 goto incompatible;
3628 }
3629 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3630 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3631 }
3632 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3633 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3634 str_cr = rb_enc_str_coderange(str);
3635 }
3636 }
3637 }
3638 if (ptr_cr_ret)
3639 *ptr_cr_ret = ptr_cr;
3640
3641 if (str_encindex != ptr_encindex &&
3642 str_cr != ENC_CODERANGE_7BIT &&
3643 ptr_cr != ENC_CODERANGE_7BIT) {
3644 str_enc = rb_enc_from_index(str_encindex);
3645 ptr_enc = rb_enc_from_index(ptr_encindex);
3646 goto incompatible;
3647 }
3648
3649 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3650 res_encindex = str_encindex;
3651 res_cr = ENC_CODERANGE_UNKNOWN;
3652 }
3653 else if (str_cr == ENC_CODERANGE_7BIT) {
3654 if (ptr_cr == ENC_CODERANGE_7BIT) {
3655 res_encindex = str_encindex;
3656 res_cr = ENC_CODERANGE_7BIT;
3657 }
3658 else {
3659 res_encindex = ptr_encindex;
3660 res_cr = ptr_cr;
3661 }
3662 }
3663 else if (str_cr == ENC_CODERANGE_VALID) {
3664 res_encindex = str_encindex;
3665 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3666 res_cr = str_cr;
3667 else
3668 res_cr = ptr_cr;
3669 }
3670 else { /* str_cr == ENC_CODERANGE_BROKEN */
3671 res_encindex = str_encindex;
3672 res_cr = str_cr;
3673 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3674 }
3675
3676 if (len < 0) {
3677 rb_raise(rb_eArgError, "negative string size (or size too big)");
3678 }
3679 str_buf_cat(str, ptr, len);
3680 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3681 return str;
3682
3683 incompatible:
3684 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3685 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3687}
3688
3689VALUE
3690rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3691{
3692 return rb_enc_cr_str_buf_cat(str, ptr, len,
3693 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3694}
3695
3696VALUE
3698{
3699 /* ptr must reference NUL terminated ASCII string. */
3700 int encindex = ENCODING_GET(str);
3701 rb_encoding *enc = rb_enc_from_index(encindex);
3702 if (rb_enc_asciicompat(enc)) {
3703 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3704 encindex, ENC_CODERANGE_7BIT, 0);
3705 }
3706 else {
3707 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3708 while (*ptr) {
3709 unsigned int c = (unsigned char)*ptr;
3710 int len = rb_enc_codelen(c, enc);
3711 rb_enc_mbcput(c, buf, enc);
3712 rb_enc_cr_str_buf_cat(str, buf, len,
3713 encindex, ENC_CODERANGE_VALID, 0);
3714 ptr++;
3715 }
3716 return str;
3717 }
3718}
3719
3720VALUE
3722{
3723 int str2_cr = rb_enc_str_coderange(str2);
3724
3725 if (str_enc_fastpath(str)) {
3726 switch (str2_cr) {
3727 case ENC_CODERANGE_7BIT:
3728 // If RHS is 7bit we can do simple concatenation
3729 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3730 RB_GC_GUARD(str2);
3731 return str;
3733 // If RHS is valid, we can do simple concatenation if encodings are the same
3734 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3735 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3736 int str_cr = ENC_CODERANGE(str);
3737 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3738 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3739 }
3740 RB_GC_GUARD(str2);
3741 return str;
3742 }
3743 }
3744 }
3745
3746 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3747 ENCODING_GET(str2), str2_cr, &str2_cr);
3748
3749 ENC_CODERANGE_SET(str2, str2_cr);
3750
3751 return str;
3752}
3753
3754VALUE
3756{
3757 StringValue(str2);
3758 return rb_str_buf_append(str, str2);
3759}
3760
3761VALUE
3762rb_str_concat_literals(size_t num, const VALUE *strary)
3763{
3764 VALUE str;
3765 size_t i, s = 0;
3766 unsigned long len = 1;
3767
3768 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3769 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3770
3771 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3772 str = rb_str_buf_new(len);
3773 str_enc_copy_direct(str, strary[0]);
3774
3775 for (i = s; i < num; ++i) {
3776 const VALUE v = strary[i];
3777 int encidx = ENCODING_GET(v);
3778
3779 rb_str_buf_append(str, v);
3780 if (encidx != ENCINDEX_US_ASCII) {
3781 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3782 rb_enc_set_index(str, encidx);
3783 }
3784 }
3785 return str;
3786}
3787
3788/*
3789 * call-seq:
3790 * concat(*objects) -> string
3791 *
3792 * Concatenates each object in +objects+ to +self+ and returns +self+:
3793 *
3794 * s = 'foo'
3795 * s.concat('bar', 'baz') # => "foobarbaz"
3796 * s # => "foobarbaz"
3797 *
3798 * For each given object +object+ that is an Integer,
3799 * the value is considered a codepoint and converted to a character before concatenation:
3800 *
3801 * s = 'foo'
3802 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3803 *
3804 * Related: String#<<, which takes a single argument.
3805 */
3806static VALUE
3807rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3808{
3809 str_modifiable(str);
3810
3811 if (argc == 1) {
3812 return rb_str_concat(str, argv[0]);
3813 }
3814 else if (argc > 1) {
3815 int i;
3816 VALUE arg_str = rb_str_tmp_new(0);
3817 rb_enc_copy(arg_str, str);
3818 for (i = 0; i < argc; i++) {
3819 rb_str_concat(arg_str, argv[i]);
3820 }
3821 rb_str_buf_append(str, arg_str);
3822 }
3823
3824 return str;
3825}
3826
3827/*
3828 * call-seq:
3829 * append_as_bytes(*objects) -> self
3830 *
3831 * Concatenates each object in +objects+ into +self+; returns +self+;
3832 * performs no encoding validation or conversion:
3833 *
3834 * s = 'foo'
3835 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3836 * s.valid_encoding? # => false
3837 * s.append_as_bytes("\xAC 12")
3838 * s.valid_encoding? # => true
3839 *
3840 * When a given object is an integer,
3841 * the value is considered an 8-bit byte;
 *  if the integer occupies more than one byte (i.e., is greater than 255),
3843 * appends only the low-order byte (similar to String#setbyte):
3844 *
3845 * s = ""
3846 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3847 * s.bytesize # => 2
3848 *
3849 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3850 */
3851
3852VALUE
3853rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3854{
3855 long needed_capacity = 0;
3856 volatile VALUE t0;
3857 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3858
3859 for (int index = 0; index < argc; index++) {
3860 VALUE obj = argv[index];
3861 enum ruby_value_type type = types[index] = rb_type(obj);
3862 switch (type) {
3863 case T_FIXNUM:
3864 case T_BIGNUM:
3865 needed_capacity++;
3866 break;
3867 case T_STRING:
3868 needed_capacity += RSTRING_LEN(obj);
3869 break;
3870 default:
3871 rb_raise(
3873 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3874 rb_obj_class(obj)
3875 );
3876 break;
3877 }
3878 }
3879
3880 str_ensure_available_capa(str, needed_capacity);
3881 char *sptr = RSTRING_END(str);
3882
3883 for (int index = 0; index < argc; index++) {
3884 VALUE obj = argv[index];
3885 enum ruby_value_type type = types[index];
3886 switch (type) {
3887 case T_FIXNUM:
3888 case T_BIGNUM: {
3889 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3890 char byte = (char)(NUM2INT(obj) & 0xFF);
3891 *sptr = byte;
3892 sptr++;
3893 break;
3894 }
3895 case T_STRING: {
3896 const char *ptr;
3897 long len;
3898 RSTRING_GETMEM(obj, ptr, len);
3899 memcpy(sptr, ptr, len);
3900 sptr += len;
3901 break;
3902 }
3903 default:
3904 rb_bug("append_as_bytes arguments should have been validated");
3905 }
3906 }
3907
3908 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3909 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3910
3911 int cr = ENC_CODERANGE(str);
3912 switch (cr) {
3913 case ENC_CODERANGE_7BIT: {
3914 for (int index = 0; index < argc; index++) {
3915 VALUE obj = argv[index];
3916 enum ruby_value_type type = types[index];
3917 switch (type) {
3918 case T_FIXNUM:
3919 case T_BIGNUM: {
3920 if (!ISASCII(NUM2INT(obj))) {
3921 goto clear_cr;
3922 }
3923 break;
3924 }
3925 case T_STRING: {
3926 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3927 goto clear_cr;
3928 }
3929 break;
3930 }
3931 default:
3932 rb_bug("append_as_bytes arguments should have been validated");
3933 }
3934 }
3935 break;
3936 }
3938 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3939 goto keep_cr;
3940 }
3941 else {
3942 goto clear_cr;
3943 }
3944 break;
3945 default:
3946 goto clear_cr;
3947 break;
3948 }
3949
3950 RB_GC_GUARD(t0);
3951
3952 clear_cr:
3953 // If no fast path was hit, we clear the coderange.
3954 // append_as_bytes is predominently meant to be used in
3955 // buffering situation, hence it's likely the coderange
3956 // will never be scanned, so it's not worth spending time
3957 // precomputing the coderange except for simple and common
3958 // situations.
3960 keep_cr:
3961 return str;
3962}
3963
3964/*
3965 * call-seq:
3966 * self << object -> self
3967 *
3968 * Appends a string representation of +object+ to +self+;
3969 * returns +self+.
3970 *
3971 * If +object+ is a string, appends it to +self+:
3972 *
3973 * s = 'foo'
3974 * s << 'bar' # => "foobar"
3975 * s # => "foobar"
3976 *
3977 * If +object+ is an integer,
3978 * its value is considered a codepoint;
3979 * converts the value to a character before concatenating:
3980 *
3981 * s = 'foo'
3982 * s << 33 # => "foo!"
3983 *
3984 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3985 * and the encoding of +self+ is Encoding::US_ASCII,
3986 * changes the encoding to Encoding::ASCII_8BIT:
3987 *
3988 * s = 'foo'.encode(Encoding::US_ASCII)
3989 * s.encoding # => #<Encoding:US-ASCII>
3990 * s << 0xff # => "foo\xFF"
3991 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3992 *
3993 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3994 *
3995 * s = 'foo'
3996 * s.encoding # => <Encoding:UTF-8>
3997 * s << 0x00110000 # 1114112 out of char range (RangeError)
3998 * s = 'foo'.encode(Encoding::EUC_JP)
3999 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4000 *
4001 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4002 */
4003VALUE
4005{
4006 unsigned int code;
4007 rb_encoding *enc = STR_ENC_GET(str1);
4008 int encidx;
4009
4010 if (RB_INTEGER_TYPE_P(str2)) {
4011 if (rb_num_to_uint(str2, &code) == 0) {
4012 }
4013 else if (FIXNUM_P(str2)) {
4014 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4015 }
4016 else {
4017 rb_raise(rb_eRangeError, "bignum out of char range");
4018 }
4019 }
4020 else {
4021 return rb_str_append(str1, str2);
4022 }
4023
4024 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4025
4026 if (encidx >= 0) {
4027 rb_str_buf_cat_byte(str1, (unsigned char)code);
4028 }
4029 else {
4030 long pos = RSTRING_LEN(str1);
4031 int cr = ENC_CODERANGE(str1);
4032 int len;
4033 char *buf;
4034
4035 switch (len = rb_enc_codelen(code, enc)) {
4036 case ONIGERR_INVALID_CODE_POINT_VALUE:
4037 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4038 break;
4039 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4040 case 0:
4041 rb_raise(rb_eRangeError, "%u out of char range", code);
4042 break;
4043 }
4044 buf = ALLOCA_N(char, len + 1);
4045 rb_enc_mbcput(code, buf, enc);
4046 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4047 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4048 }
4049 rb_str_resize(str1, pos+len);
4050 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4051 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4053 }
4054 else if (cr == ENC_CODERANGE_BROKEN) {
4056 }
4057 ENC_CODERANGE_SET(str1, cr);
4058 }
4059 return str1;
4060}
4061
4062int
4063rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4064{
4065 int encidx = rb_enc_to_index(enc);
4066
4067 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4068 /* US-ASCII automatically extended to ASCII-8BIT */
4069 if (code > 0xFF) {
4070 rb_raise(rb_eRangeError, "%u out of char range", code);
4071 }
4072 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4073 return ENCINDEX_ASCII_8BIT;
4074 }
4075 return encidx;
4076 }
4077 else {
4078 return -1;
4079 }
4080}
4081
4082/*
4083 * call-seq:
4084 * prepend(*other_strings) -> string
4085 *
4086 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4087 *
4088 * s = 'foo'
4089 * s.prepend('bar', 'baz') # => "barbazfoo"
4090 * s # => "barbazfoo"
4091 *
4092 * Related: String#concat.
4093 */
4094
4095static VALUE
4096rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4097{
4098 str_modifiable(str);
4099
4100 if (argc == 1) {
4101 rb_str_update(str, 0L, 0L, argv[0]);
4102 }
4103 else if (argc > 1) {
4104 int i;
4105 VALUE arg_str = rb_str_tmp_new(0);
4106 rb_enc_copy(arg_str, str);
4107 for (i = 0; i < argc; i++) {
4108 rb_str_append(arg_str, argv[i]);
4109 }
4110 rb_str_update(str, 0L, 0L, arg_str);
4111 }
4112
4113 return str;
4114}
4115
4116st_index_t
4118{
4119 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4120 st_index_t precomputed_hash;
4121 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4122
4123 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4124 return precomputed_hash;
4125 }
4126
4127 return str_do_hash(str);
4128}
4129
4130int
4132{
4133 long len1, len2;
4134 const char *ptr1, *ptr2;
4135 RSTRING_GETMEM(str1, ptr1, len1);
4136 RSTRING_GETMEM(str2, ptr2, len2);
4137 return (len1 != len2 ||
4138 !rb_str_comparable(str1, str2) ||
4139 memcmp(ptr1, ptr2, len1) != 0);
4140}
4141
4142/*
4143 * call-seq:
4144 * hash -> integer
4145 *
4146 * Returns the integer hash value for +self+.
4147 * The value is based on the length, content and encoding of +self+.
4148 *
4149 * Related: Object#hash.
4150 */
4151
4152static VALUE
4153rb_str_hash_m(VALUE str)
4154{
4155 st_index_t hval = rb_str_hash(str);
4156 return ST2FIX(hval);
4157}
4158
/* Yields the smaller of a and b; yields a when neither is smaller. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
4160
4161int
4163{
4164 int idx1, idx2;
4165 int rc1, rc2;
4166
4167 if (RSTRING_LEN(str1) == 0) return TRUE;
4168 if (RSTRING_LEN(str2) == 0) return TRUE;
4169 idx1 = ENCODING_GET(str1);
4170 idx2 = ENCODING_GET(str2);
4171 if (idx1 == idx2) return TRUE;
4172 rc1 = rb_enc_str_coderange(str1);
4173 rc2 = rb_enc_str_coderange(str2);
4174 if (rc1 == ENC_CODERANGE_7BIT) {
4175 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4176 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4177 return TRUE;
4178 }
4179 if (rc2 == ENC_CODERANGE_7BIT) {
4180 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4181 return TRUE;
4182 }
4183 return FALSE;
4184}
4185
4186int
4188{
4189 long len1, len2;
4190 const char *ptr1, *ptr2;
4191 int retval;
4192
4193 if (str1 == str2) return 0;
4194 RSTRING_GETMEM(str1, ptr1, len1);
4195 RSTRING_GETMEM(str2, ptr2, len2);
4196 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4197 if (len1 == len2) {
4198 if (!rb_str_comparable(str1, str2)) {
4199 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4200 return 1;
4201 return -1;
4202 }
4203 return 0;
4204 }
4205 if (len1 > len2) return 1;
4206 return -1;
4207 }
4208 if (retval > 0) return 1;
4209 return -1;
4210}
4211
4212/*
4213 * call-seq:
4214 * self == object -> true or false
4215 *
4216 * Returns whether +object+ is equal to +self+.
4217 *
4218 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4219 *
4220 * s = 'foo'
4221 * s == 'foo' # => true
4222 * s == 'food' # => false
4223 * s == 'FOO' # => false
4224 *
4225 * Returns +false+ if the two strings' encodings are not compatible:
4226 *
4227 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4228 *
4229 * When +object+ is not a string:
4230 *
4231 * - If +object+ responds to method <tt>to_str</tt>,
4232 * <tt>object == self</tt> is called and its return value is returned.
4233 * - If +object+ does not respond to <tt>to_str</tt>,
4234 * +false+ is returned.
4235 *
4236 * Related: {Comparing}[rdoc-ref:String@Comparing].
4237 */
4238
4239VALUE
4241{
4242 if (str1 == str2) return Qtrue;
4243 if (!RB_TYPE_P(str2, T_STRING)) {
4244 if (!rb_respond_to(str2, idTo_str)) {
4245 return Qfalse;
4246 }
4247 return rb_equal(str2, str1);
4248 }
4249 return rb_str_eql_internal(str1, str2);
4250}
4251
4252/*
4253 * call-seq:
4254 * eql?(object) -> true or false
4255 *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
4258 *
4259 * s = 'foo'
4260 * s.eql?('foo') # => true
4261 * s.eql?('food') # => false
4262 * s.eql?('FOO') # => false
4263 *
4264 * Returns +false+ if the two strings' encodings are not compatible:
4265 *
4266 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4267 *
4268 */
4269
4270VALUE
4271rb_str_eql(VALUE str1, VALUE str2)
4272{
4273 if (str1 == str2) return Qtrue;
4274 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4275 return rb_str_eql_internal(str1, str2);
4276}
4277
4278/*
4279 * call-seq:
4280 * self <=> other_string -> -1, 0, 1, or nil
4281 *
4282 * Compares +self+ and +other_string+, returning:
4283 *
4284 * - -1 if +other_string+ is larger.
4285 * - 0 if the two are equal.
4286 * - 1 if +other_string+ is smaller.
4287 * - +nil+ if the two are incomparable.
4288 *
4289 * Examples:
4290 *
4291 * 'foo' <=> 'foo' # => 0
4292 * 'foo' <=> 'food' # => -1
4293 * 'food' <=> 'foo' # => 1
4294 * 'FOO' <=> 'foo' # => -1
4295 * 'foo' <=> 'FOO' # => 1
4296 * 'foo' <=> 1 # => nil
4297 *
4298 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4299 */
4300
4301static VALUE
4302rb_str_cmp_m(VALUE str1, VALUE str2)
4303{
4304 int result;
4305 VALUE s = rb_check_string_type(str2);
4306 if (NIL_P(s)) {
4307 return rb_invcmp(str1, str2);
4308 }
4309 result = rb_str_cmp(str1, s);
4310 return INT2FIX(result);
4311}
4312
4313static VALUE str_casecmp(VALUE str1, VALUE str2);
4314static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4315
4316/*
4317 * call-seq:
4318 * casecmp(other_string) -> -1, 0, 1, or nil
4319 *
4320 * Ignoring case, compares +self+ and +other_string+; returns:
4321 *
4322 * - -1 if <tt>self.downcase</tt> is smaller than <tt>other_string.downcase</tt>.
4323 * - 0 if the two are equal.
4324 * - 1 if <tt>self.downcase</tt> is larger than <tt>other_string.downcase</tt>.
4325 * - +nil+ if the two are incomparable.
4326 *
4327 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4328 *
4329 * Examples:
4330 *
4331 * 'foo'.casecmp('goo') # => -1
4332 * 'goo'.casecmp('foo') # => 1
4333 * 'foo'.casecmp('food') # => -1
4334 * 'food'.casecmp('foo') # => 1
4335 * 'FOO'.casecmp('foo') # => 0
4336 * 'foo'.casecmp('FOO') # => 0
4337 * 'foo'.casecmp(1) # => nil
4338 *
4339 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4340 */
4341
4342static VALUE
4343rb_str_casecmp(VALUE str1, VALUE str2)
4344{
4345 VALUE s = rb_check_string_type(str2);
4346 if (NIL_P(s)) {
4347 return Qnil;
4348 }
4349 return str_casecmp(str1, s);
4350}
4351
4352static VALUE
4353str_casecmp(VALUE str1, VALUE str2)
4354{
4355 long len;
4356 rb_encoding *enc;
4357 const char *p1, *p1end, *p2, *p2end;
4358
4359 enc = rb_enc_compatible(str1, str2);
4360 if (!enc) {
4361 return Qnil;
4362 }
4363
4364 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4365 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4366 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4367 while (p1 < p1end && p2 < p2end) {
4368 if (*p1 != *p2) {
4369 unsigned int c1 = TOLOWER(*p1 & 0xff);
4370 unsigned int c2 = TOLOWER(*p2 & 0xff);
4371 if (c1 != c2)
4372 return INT2FIX(c1 < c2 ? -1 : 1);
4373 }
4374 p1++;
4375 p2++;
4376 }
4377 }
4378 else {
4379 while (p1 < p1end && p2 < p2end) {
4380 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4381 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4382
4383 if (0 <= c1 && 0 <= c2) {
4384 c1 = TOLOWER(c1);
4385 c2 = TOLOWER(c2);
4386 if (c1 != c2)
4387 return INT2FIX(c1 < c2 ? -1 : 1);
4388 }
4389 else {
4390 int r;
4391 l1 = rb_enc_mbclen(p1, p1end, enc);
4392 l2 = rb_enc_mbclen(p2, p2end, enc);
4393 len = l1 < l2 ? l1 : l2;
4394 r = memcmp(p1, p2, len);
4395 if (r != 0)
4396 return INT2FIX(r < 0 ? -1 : 1);
4397 if (l1 != l2)
4398 return INT2FIX(l1 < l2 ? -1 : 1);
4399 }
4400 p1 += l1;
4401 p2 += l2;
4402 }
4403 }
4404 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4405 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4406 return INT2FIX(-1);
4407}
4408
4409/*
4410 * call-seq:
4411 * casecmp?(other_string) -> true, false, or nil
4412 *
4413 * Returns +true+ if +self+ and +other_string+ are equal after
4414 * Unicode case folding, +false+ if unequal, +nil+ if incomparable.
4415 *
4416 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4417 *
4418 * Examples:
4419 *
4420 * 'foo'.casecmp?('goo') # => false
4421 * 'goo'.casecmp?('foo') # => false
4422 * 'foo'.casecmp?('food') # => false
4423 * 'food'.casecmp?('foo') # => false
4424 * 'FOO'.casecmp?('foo') # => true
4425 * 'foo'.casecmp?('FOO') # => true
4426 * 'foo'.casecmp?(1) # => nil
4427 *
4428 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4429 */
4430
4431static VALUE
4432rb_str_casecmp_p(VALUE str1, VALUE str2)
4433{
4434 VALUE s = rb_check_string_type(str2);
4435 if (NIL_P(s)) {
4436 return Qnil;
4437 }
4438 return str_casecmp_p(str1, s);
4439}
4440
4441static VALUE
4442str_casecmp_p(VALUE str1, VALUE str2)
4443{
4444 rb_encoding *enc;
4445 VALUE folded_str1, folded_str2;
4446 VALUE fold_opt = sym_fold;
4447
4448 enc = rb_enc_compatible(str1, str2);
4449 if (!enc) {
4450 return Qnil;
4451 }
4452
4453 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4454 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4455
4456 return rb_str_eql(folded_str1, folded_str2);
4457}
4458
4459static long
4460strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4461 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4462{
4463 const char *search_start = str_ptr;
4464 long pos, search_len = str_len - offset;
4465
4466 for (;;) {
4467 const char *t;
4468 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4469 if (pos < 0) return pos;
4470 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4471 if (t == search_start + pos) break;
4472 search_len -= t - search_start;
4473 if (search_len <= 0) return -1;
4474 offset += t - search_start;
4475 search_start = t;
4476 }
4477 return pos + offset;
4478}
4479
/* Both yield the index of the match in bytes (negative when not found);
 * offset is in characters for rb_str_index and in bytes for rb_str_byteindex. */
#define rb_str_index(str, sub, offset)     rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4483
4484static long
4485rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4486{
4487 const char *str_ptr, *str_ptr_end, *sub_ptr;
4488 long str_len, sub_len;
4489 rb_encoding *enc;
4490
4491 enc = rb_enc_check(str, sub);
4492 if (is_broken_string(sub)) return -1;
4493
4494 str_ptr = RSTRING_PTR(str);
4495 str_ptr_end = RSTRING_END(str);
4496 str_len = RSTRING_LEN(str);
4497 sub_ptr = RSTRING_PTR(sub);
4498 sub_len = RSTRING_LEN(sub);
4499
4500 if (str_len < sub_len) return -1;
4501
4502 if (offset != 0) {
4503 long str_len_char, sub_len_char;
4504 int single_byte = single_byte_optimizable(str);
4505 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4506 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4507 if (offset < 0) {
4508 offset += str_len_char;
4509 if (offset < 0) return -1;
4510 }
4511 if (str_len_char - offset < sub_len_char) return -1;
4512 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4513 str_ptr += offset;
4514 }
4515 if (sub_len == 0) return offset;
4516
4517 /* need proceed one character at a time */
4518 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4519}
4520
4521
4522/*
4523 * call-seq:
4524 * index(substring, offset = 0) -> integer or nil
4525 * index(regexp, offset = 0) -> integer or nil
4526 *
4527 * :include: doc/string/index.rdoc
4528 *
4529 */
4530
4531static VALUE
4532rb_str_index_m(int argc, VALUE *argv, VALUE str)
4533{
4534 VALUE sub;
4535 VALUE initpos;
4536 rb_encoding *enc = STR_ENC_GET(str);
4537 long pos;
4538
4539 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4540 long slen = str_strlen(str, enc); /* str's enc */
4541 pos = NUM2LONG(initpos);
4542 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4543 if (RB_TYPE_P(sub, T_REGEXP)) {
4545 }
4546 return Qnil;
4547 }
4548 }
4549 else {
4550 pos = 0;
4551 }
4552
4553 if (RB_TYPE_P(sub, T_REGEXP)) {
4554 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4555 enc, single_byte_optimizable(str));
4556
4557 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4558 VALUE match = rb_backref_get();
4559 struct re_registers *regs = RMATCH_REGS(match);
4560 pos = rb_str_sublen(str, BEG(0));
4561 return LONG2NUM(pos);
4562 }
4563 }
4564 else {
4565 StringValue(sub);
4566 pos = rb_str_index(str, sub, pos);
4567 if (pos >= 0) {
4568 pos = rb_str_sublen(str, pos);
4569 return LONG2NUM(pos);
4570 }
4571 }
4572 return Qnil;
4573}
4574
4575/* Ensure that the given pos is a valid character boundary.
4576 * Note that in this function, "character" means a code point
4577 * (Unicode scalar value), not a grapheme cluster.
4578 */
4579static void
4580str_ensure_byte_pos(VALUE str, long pos)
4581{
4582 if (!single_byte_optimizable(str)) {
4583 const char *s = RSTRING_PTR(str);
4584 const char *e = RSTRING_END(str);
4585 const char *p = s + pos;
4586 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4587 rb_raise(rb_eIndexError,
4588 "offset %ld does not land on character boundary", pos);
4589 }
4590 }
4591}
4592
4593/*
4594 * call-seq:
4595 * byteindex(object, offset = 0) -> integer or nil
4596 *
4597 * Returns the 0-based integer index of a substring of +self+
4598 * specified by +object+ (a string or Regexp) and +offset+,
4599 * or +nil+ if there is no such substring;
4600 * the returned index is the count of _bytes_ (not characters).
4601 *
4602 * When +object+ is a string,
4603 * returns the index of the first found substring equal to +object+:
4604 *
4605 * s = 'foo' # => "foo"
4606 * s.size # => 3 # Three 1-byte characters.
4607 * s.bytesize # => 3 # Three bytes.
4608 * s.byteindex('f') # => 0
4609 * s.byteindex('o') # => 1
4610 * s.byteindex('oo') # => 1
4611 * s.byteindex('ooo') # => nil
4612 *
4613 * When +object+ is a Regexp,
4614 * returns the index of the first found substring matching +object+;
4615 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4616 *
4617 * s = 'foo'
4618 * s.byteindex(/f/) # => 0
4619 * $~ # => #<MatchData "f">
4620 * s.byteindex(/o/) # => 1
4621 * s.byteindex(/oo/) # => 1
4622 * s.byteindex(/ooo/) # => nil
4623 * $~ # => nil
4624 *
4625 * \Integer argument +offset+, if given, specifies the 0-based index
4626 * of the byte where searching is to begin.
4627 *
4628 * When +offset+ is non-negative,
4629 * searching begins at byte position +offset+:
4630 *
4631 * s = 'foo'
4632 * s.byteindex('o', 1) # => 1
4633 * s.byteindex('o', 2) # => 2
4634 * s.byteindex('o', 3) # => nil
4635 *
4636 * When +offset+ is negative, counts backward from the end of +self+:
4637 *
4638 * s = 'foo'
4639 * s.byteindex('o', -1) # => 2
4640 * s.byteindex('o', -2) # => 1
4641 * s.byteindex('o', -3) # => 1
4642 * s.byteindex('o', -4) # => nil
4643 *
4644 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4645 *
4646 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4647 * s.size # => 2 # Two 3-byte characters.
4648 * s.bytesize # => 6 # Six bytes.
4649 * s.byteindex("\uFFFF") # => 0
4650 * s.byteindex("\uFFFF", 1) # Raises IndexError
4651 * s.byteindex("\uFFFF", 2) # Raises IndexError
4652 * s.byteindex("\uFFFF", 3) # => 3
4653 * s.byteindex("\uFFFF", 4) # Raises IndexError
4654 * s.byteindex("\uFFFF", 5) # Raises IndexError
4655 * s.byteindex("\uFFFF", 6) # => nil
4656 *
4657 * Related: see {Querying}[rdoc-ref:String@Querying].
4658 */
4659
4660static VALUE
4661rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4662{
4663 VALUE sub;
4664 VALUE initpos;
4665 long pos;
4666
4667 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4668 long slen = RSTRING_LEN(str);
4669 pos = NUM2LONG(initpos);
4670 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4671 if (RB_TYPE_P(sub, T_REGEXP)) {
4673 }
4674 return Qnil;
4675 }
4676 }
4677 else {
4678 pos = 0;
4679 }
4680
4681 str_ensure_byte_pos(str, pos);
4682
4683 if (RB_TYPE_P(sub, T_REGEXP)) {
4684 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4685 VALUE match = rb_backref_get();
4686 struct re_registers *regs = RMATCH_REGS(match);
4687 pos = BEG(0);
4688 return LONG2NUM(pos);
4689 }
4690 }
4691 else {
4692 StringValue(sub);
4693 pos = rb_str_byteindex(str, sub, pos);
4694 if (pos >= 0) return LONG2NUM(pos);
4695 }
4696 return Qnil;
4697}
4698
#ifndef HAVE_MEMRCHR
/* Fallback for platforms without memrchr(): returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or a
 * null pointer when there is none. */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    long remaining;

    for (remaining = search_len; remaining > 0; remaining--) {
        const char *candidate = search_str + (remaining - 1);
        if ((unsigned char)*candidate == chr) return (void *)candidate;
    }

    return ((void *)0);
}
#endif
4711
4712static long
4713str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4714{
4715 char *hit, *adjusted;
4716 int c;
4717 long slen, searchlen;
4718 char *sbeg, *e, *t;
4719
4720 sbeg = RSTRING_PTR(str);
4721 slen = RSTRING_LEN(sub);
4722 if (slen == 0) return s - sbeg;
4723 e = RSTRING_END(str);
4724 t = RSTRING_PTR(sub);
4725 c = *t & 0xff;
4726 searchlen = s - sbeg + 1;
4727
4728 if (memcmp(s, t, slen) == 0) {
4729 return s - sbeg;
4730 }
4731
4732 do {
4733 hit = memrchr(sbeg, c, searchlen);
4734 if (!hit) break;
4735 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4736 if (hit != adjusted) {
4737 searchlen = adjusted - sbeg;
4738 continue;
4739 }
4740 if (memcmp(hit, t, slen) == 0)
4741 return hit - sbeg;
4742 searchlen = adjusted - sbeg;
4743 } while (searchlen > 0);
4744
4745 return -1;
4746}
4747
4748/* found index in byte */
4749static long
4750rb_str_rindex(VALUE str, VALUE sub, long pos)
4751{
4752 long len, slen;
4753 char *sbeg, *s;
4754 rb_encoding *enc;
4755 int singlebyte;
4756
4757 enc = rb_enc_check(str, sub);
4758 if (is_broken_string(sub)) return -1;
4759 singlebyte = single_byte_optimizable(str);
4760 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4761 slen = str_strlen(sub, enc); /* rb_enc_check */
4762
4763 /* substring longer than string */
4764 if (len < slen) return -1;
4765 if (len - pos < slen) pos = len - slen;
4766 if (len == 0) return pos;
4767
4768 sbeg = RSTRING_PTR(str);
4769
4770 if (pos == 0) {
4771 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4772 return 0;
4773 else
4774 return -1;
4775 }
4776
4777 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4778 return str_rindex(str, sub, s, enc);
4779}
4780
4781/*
4782 * call-seq:
4783 * rindex(substring, offset = self.length) -> integer or nil
4784 * rindex(regexp, offset = self.length) -> integer or nil
4785 *
4786 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4787 * or +nil+ if none found:
4788 *
4789 * 'foo'.rindex('f') # => 0
4790 * 'foo'.rindex('o') # => 2
4791 * 'foo'.rindex('oo') # => 1
4792 * 'foo'.rindex('ooo') # => nil
4793 *
4794 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4795 * or +nil+ if none found:
4796 *
4797 * 'foo'.rindex(/f/) # => 0
4798 * 'foo'.rindex(/o/) # => 2
4799 * 'foo'.rindex(/oo/) # => 1
4800 * 'foo'.rindex(/ooo/) # => nil
4801 *
4802 * The _last_ match means starting at the possible last position, not
4803 * the last of longest matches.
4804 *
4805 * 'foo'.rindex(/o+/) # => 2
4806 * $~ #=> #<MatchData "o">
4807 *
 *  To get the last longest match, combine the regexp with a negative
 *  lookbehind:
4810 *
4811 * 'foo'.rindex(/(?<!o)o+/) # => 1
4812 * $~ #=> #<MatchData "oo">
4813 *
 *  Or use String#index with a negative lookahead:
4815 *
4816 * 'foo'.index(/o+(?!.*o)/) # => 1
4817 * $~ #=> #<MatchData "oo">
4818 *
4819 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4820 * string to _end_ the search:
4821 *
4822 * 'foo'.rindex('o', 0) # => nil
4823 * 'foo'.rindex('o', 1) # => 1
4824 * 'foo'.rindex('o', 2) # => 2
4825 * 'foo'.rindex('o', 3) # => 2
4826 *
4827 * If +offset+ is a negative Integer, the maximum starting position in the
4828 * string to _end_ the search is the sum of the string's length and +offset+:
4829 *
4830 * 'foo'.rindex('o', -1) # => 2
4831 * 'foo'.rindex('o', -2) # => 1
4832 * 'foo'.rindex('o', -3) # => nil
4833 * 'foo'.rindex('o', -4) # => nil
4834 *
4835 * Related: String#index.
4836 */
4837
4838static VALUE
4839rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4840{
4841 VALUE sub;
4842 VALUE initpos;
4843 rb_encoding *enc = STR_ENC_GET(str);
4844 long pos, len = str_strlen(str, enc); /* str's enc */
4845
4846 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4847 pos = NUM2LONG(initpos);
4848 if (pos < 0 && (pos += len) < 0) {
4849 if (RB_TYPE_P(sub, T_REGEXP)) {
4851 }
4852 return Qnil;
4853 }
4854 if (pos > len) pos = len;
4855 }
4856 else {
4857 pos = len;
4858 }
4859
4860 if (RB_TYPE_P(sub, T_REGEXP)) {
4861 /* enc = rb_enc_check(str, sub); */
4862 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4863 enc, single_byte_optimizable(str));
4864
4865 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4866 VALUE match = rb_backref_get();
4867 struct re_registers *regs = RMATCH_REGS(match);
4868 pos = rb_str_sublen(str, BEG(0));
4869 return LONG2NUM(pos);
4870 }
4871 }
4872 else {
4873 StringValue(sub);
4874 pos = rb_str_rindex(str, sub, pos);
4875 if (pos >= 0) {
4876 pos = rb_str_sublen(str, pos);
4877 return LONG2NUM(pos);
4878 }
4879 }
4880 return Qnil;
4881}
4882
4883static long
4884rb_str_byterindex(VALUE str, VALUE sub, long pos)
4885{
4886 long len, slen;
4887 char *sbeg, *s;
4888 rb_encoding *enc;
4889
4890 enc = rb_enc_check(str, sub);
4891 if (is_broken_string(sub)) return -1;
4892 len = RSTRING_LEN(str);
4893 slen = RSTRING_LEN(sub);
4894
4895 /* substring longer than string */
4896 if (len < slen) return -1;
4897 if (len - pos < slen) pos = len - slen;
4898 if (len == 0) return pos;
4899
4900 sbeg = RSTRING_PTR(str);
4901
4902 if (pos == 0) {
4903 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4904 return 0;
4905 else
4906 return -1;
4907 }
4908
4909 s = sbeg + pos;
4910 return str_rindex(str, sub, s, enc);
4911}
4912
4913/*
4914 * call-seq:
4915 * byterindex(object, offset = self.bytesize) -> integer or nil
4916 *
4917 * Returns the 0-based integer index of a substring of +self+
4918 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4919 * or +nil+ if there is no such substring;
4920 * the returned index is the count of _bytes_ (not characters).
4921 *
4922 * When +object+ is a string,
4923 * returns the index of the _last_ found substring equal to +object+:
4924 *
4925 * s = 'foo' # => "foo"
4926 * s.size # => 3 # Three 1-byte characters.
4927 * s.bytesize # => 3 # Three bytes.
4928 * s.byterindex('f') # => 0
 *   s.byterindex('o') # => 2
 *   s.byterindex('oo') # => 1
 *   s.byterindex('ooo') # => nil
4932 *
4933 * When +object+ is a Regexp,
4934 * returns the index of the last found substring matching +object+;
4935 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4936 *
4937 * s = 'foo'
4938 * s.byterindex(/f/) # => 0
4939 * $~ # => #<MatchData "f">
4940 * s.byterindex(/o/) # => 2
4941 * s.byterindex(/oo/) # => 1
4942 * s.byterindex(/ooo/) # => nil
4943 * $~ # => nil
4944 *
4945 * The last match means starting at the possible last position,
4946 * not the last of the longest matches:
4947 *
4948 * s = 'foo'
4949 * s.byterindex(/o+/) # => 2
4950 * $~ #=> #<MatchData "o">
4951 *
4952 * To get the last longest match, use a negative lookbehind:
4953 *
4954 * s = 'foo'
4955 * s.byterindex(/(?<!o)o+/) # => 1
4956 * $~ # => #<MatchData "oo">
4957 *
4958 * Or use method #byteindex with negative lookahead:
4959 *
4960 * s = 'foo'
4961 * s.byteindex(/o+(?!.*o)/) # => 1
4962 * $~ #=> #<MatchData "oo">
4963 *
4964 * \Integer argument +offset+, if given, specifies the 0-based index
4965 * of the byte where searching is to end.
4966 *
4967 * When +offset+ is non-negative,
4968 * searching ends at byte position +offset+:
4969 *
4970 * s = 'foo'
4971 * s.byterindex('o', 0) # => nil
4972 * s.byterindex('o', 1) # => 1
4973 * s.byterindex('o', 2) # => 2
4974 * s.byterindex('o', 3) # => 2
4975 *
4976 * When +offset+ is negative, counts backward from the end of +self+:
4977 *
4978 * s = 'foo'
4979 * s.byterindex('o', -1) # => 2
4980 * s.byterindex('o', -2) # => 1
4981 * s.byterindex('o', -3) # => nil
4982 *
4983 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4984 *
4985 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4986 * s.size # => 2 # Two 3-byte characters.
4987 * s.bytesize # => 6 # Six bytes.
4988 * s.byterindex("\uFFFF") # => 3
4989 * s.byterindex("\uFFFF", 1) # Raises IndexError
4990 * s.byterindex("\uFFFF", 2) # Raises IndexError
4991 * s.byterindex("\uFFFF", 3) # => 3
4992 * s.byterindex("\uFFFF", 4) # Raises IndexError
4993 * s.byterindex("\uFFFF", 5) # Raises IndexError
4994 * s.byterindex("\uFFFF", 6) # => nil
4995 *
4996 * Related: see {Querying}[rdoc-ref:String@Querying].
4997 */
4998
4999static VALUE
5000rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5001{
5002 VALUE sub;
5003 VALUE initpos;
5004 long pos, len = RSTRING_LEN(str);
5005
5006 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5007 pos = NUM2LONG(initpos);
5008 if (pos < 0 && (pos += len) < 0) {
5009 if (RB_TYPE_P(sub, T_REGEXP)) {
5011 }
5012 return Qnil;
5013 }
5014 if (pos > len) pos = len;
5015 }
5016 else {
5017 pos = len;
5018 }
5019
5020 str_ensure_byte_pos(str, pos);
5021
5022 if (RB_TYPE_P(sub, T_REGEXP)) {
5023 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5024 VALUE match = rb_backref_get();
5025 struct re_registers *regs = RMATCH_REGS(match);
5026 pos = BEG(0);
5027 return LONG2NUM(pos);
5028 }
5029 }
5030 else {
5031 StringValue(sub);
5032 pos = rb_str_byterindex(str, sub, pos);
5033 if (pos >= 0) return LONG2NUM(pos);
5034 }
5035 return Qnil;
5036}
5037
5038/*
5039 * call-seq:
5040 * self =~ object -> integer or nil
5041 *
5042 * When +object+ is a Regexp, returns the index of the first substring in +self+
5043 * matched by +object+,
5044 * or +nil+ if no match is found;
5045 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5046 *
5047 * 'foo' =~ /f/ # => 0
5048 * $~ # => #<MatchData "f">
5049 * 'foo' =~ /o/ # => 1
5050 * $~ # => #<MatchData "o">
5051 * 'foo' =~ /x/ # => nil
5052 * $~ # => nil
5053 *
5054 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5055 * (see Regexp#=~):
5056 *
5057 * number = nil
5058 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5059 * number # => nil # Not assigned.
5060 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5061 * number # => "9" # Assigned.
5062 *
5063 * If +object+ is not a Regexp, returns the value
5064 * returned by <tt>object =~ self</tt>.
5065 *
5066 * Related: see {Querying}[rdoc-ref:String@Querying].
5067 */
5068
5069static VALUE
5070rb_str_match(VALUE x, VALUE y)
5071{
5072 switch (OBJ_BUILTIN_TYPE(y)) {
5073 case T_STRING:
5074 rb_raise(rb_eTypeError, "type mismatch: String given");
5075
5076 case T_REGEXP:
5077 return rb_reg_match(y, x);
5078
5079 default:
5080 return rb_funcall(y, idEqTilde, 1, x);
5081 }
5082}
5083
5084
5085static VALUE get_pat(VALUE);
5086
5087
5088/*
5089 * call-seq:
5090 * match(pattern, offset = 0) -> matchdata or nil
5091 * match(pattern, offset = 0) {|matchdata| ... } -> object
5092 *
5093 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5094 *
5095 * Note: also updates Regexp@Global+Variables.
5096 *
5097 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5098 * regexp = Regexp.new(pattern)
5099 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5100 * (see Regexp#match):
5101 * matchdata = regexp.match(self)
5102 *
5103 * With no block given, returns the computed +matchdata+:
5104 *
5105 * 'foo'.match('f') # => #<MatchData "f">
5106 * 'foo'.match('o') # => #<MatchData "o">
5107 * 'foo'.match('x') # => nil
5108 *
5109 * If Integer argument +offset+ is given, the search begins at index +offset+:
5110 *
5111 * 'foo'.match('f', 1) # => nil
5112 * 'foo'.match('o', 1) # => #<MatchData "o">
5113 *
5114 * With a block given, calls the block with the computed +matchdata+
5115 * and returns the block's return value:
5116 *
5117 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5118 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5119 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5120 *
5121 */
5122
5123static VALUE
5124rb_str_match_m(int argc, VALUE *argv, VALUE str)
5125{
5126 VALUE re, result;
5127 if (argc < 1)
5128 rb_check_arity(argc, 1, 2);
5129 re = argv[0];
5130 argv[0] = str;
5131 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5132 if (!NIL_P(result) && rb_block_given_p()) {
5133 return rb_yield(result);
5134 }
5135 return result;
5136}
5137
5138/*
5139 * call-seq:
5140 * match?(pattern, offset = 0) -> true or false
5141 *
5142 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5143 *
5144 * Note: does not update Regexp@Global+Variables.
5145 *
5146 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5147 * regexp = Regexp.new(pattern)
5148 *
5149 * Returns +true+ if <tt>self+.match(regexp)</tt> returns a MatchData object,
5150 * +false+ otherwise:
5151 *
5152 * 'foo'.match?(/o/) # => true
5153 * 'foo'.match?('o') # => true
5154 * 'foo'.match?(/x/) # => false
5155 *
5156 * If Integer argument +offset+ is given, the search begins at index +offset+:
5157 * 'foo'.match?('f', 1) # => false
5158 * 'foo'.match?('o', 1) # => true
5159 *
5160 */
5161
5162static VALUE
5163rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5164{
5165 VALUE re;
5166 rb_check_arity(argc, 1, 2);
5167 re = get_pat(argv[0]);
5168 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5169}
5170
/* Outcome of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* input (or result) is not a valid character */
    NEIGHBOR_FOUND = 1,    /* a neighboring character was written in place */
    NEIGHBOR_WRAPPED = 2   /* stepping ran off the end of the range */
};
5176
5177static enum neighbor_char
5178enc_succ_char(char *p, long len, rb_encoding *enc)
5179{
5180 long i;
5181 int l;
5182
5183 if (rb_enc_mbminlen(enc) > 1) {
5184 /* wchar, trivial case */
5185 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5186 if (!MBCLEN_CHARFOUND_P(r)) {
5187 return NEIGHBOR_NOT_CHAR;
5188 }
5189 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5190 l = rb_enc_code_to_mbclen(c, enc);
5191 if (!l) return NEIGHBOR_NOT_CHAR;
5192 if (l != len) return NEIGHBOR_WRAPPED;
5193 rb_enc_mbcput(c, p, enc);
5194 r = rb_enc_precise_mbclen(p, p + len, enc);
5195 if (!MBCLEN_CHARFOUND_P(r)) {
5196 return NEIGHBOR_NOT_CHAR;
5197 }
5198 return NEIGHBOR_FOUND;
5199 }
5200 while (1) {
5201 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5202 p[i] = '\0';
5203 if (i < 0)
5204 return NEIGHBOR_WRAPPED;
5205 ++((unsigned char*)p)[i];
5206 l = rb_enc_precise_mbclen(p, p+len, enc);
5207 if (MBCLEN_CHARFOUND_P(l)) {
5208 l = MBCLEN_CHARFOUND_LEN(l);
5209 if (l == len) {
5210 return NEIGHBOR_FOUND;
5211 }
5212 else {
5213 memset(p+l, 0xff, len-l);
5214 }
5215 }
5216 if (MBCLEN_INVALID_P(l) && i < len-1) {
5217 long len2;
5218 int l2;
5219 for (len2 = len-1; 0 < len2; len2--) {
5220 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5221 if (!MBCLEN_INVALID_P(l2))
5222 break;
5223 }
5224 memset(p+len2+1, 0xff, len-(len2+1));
5225 }
5226 }
5227}
5228
5229static enum neighbor_char
5230enc_pred_char(char *p, long len, rb_encoding *enc)
5231{
5232 long i;
5233 int l;
5234 if (rb_enc_mbminlen(enc) > 1) {
5235 /* wchar, trivial case */
5236 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5237 if (!MBCLEN_CHARFOUND_P(r)) {
5238 return NEIGHBOR_NOT_CHAR;
5239 }
5240 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5241 if (!c) return NEIGHBOR_NOT_CHAR;
5242 --c;
5243 l = rb_enc_code_to_mbclen(c, enc);
5244 if (!l) return NEIGHBOR_NOT_CHAR;
5245 if (l != len) return NEIGHBOR_WRAPPED;
5246 rb_enc_mbcput(c, p, enc);
5247 r = rb_enc_precise_mbclen(p, p + len, enc);
5248 if (!MBCLEN_CHARFOUND_P(r)) {
5249 return NEIGHBOR_NOT_CHAR;
5250 }
5251 return NEIGHBOR_FOUND;
5252 }
5253 while (1) {
5254 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5255 p[i] = '\xff';
5256 if (i < 0)
5257 return NEIGHBOR_WRAPPED;
5258 --((unsigned char*)p)[i];
5259 l = rb_enc_precise_mbclen(p, p+len, enc);
5260 if (MBCLEN_CHARFOUND_P(l)) {
5261 l = MBCLEN_CHARFOUND_LEN(l);
5262 if (l == len) {
5263 return NEIGHBOR_FOUND;
5264 }
5265 else {
5266 memset(p+l, 0, len-l);
5267 }
5268 }
5269 if (MBCLEN_INVALID_P(l) && i < len-1) {
5270 long len2;
5271 int l2;
5272 for (len2 = len-1; 0 < len2; len2--) {
5273 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5274 if (!MBCLEN_INVALID_P(l2))
5275 break;
5276 }
5277 memset(p+len2+1, 0, len-(len2+1));
5278 }
5279 }
5280}
5281
5282/*
5283 overwrite +p+ by succeeding letter in +enc+ and returns
5284 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5285 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5286 assuming each ranges are successive, and mbclen
5287 never change in each ranges.
5288 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5289 character.
5290 */
5291static enum neighbor_char
5292enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5293{
5294 enum neighbor_char ret;
5295 unsigned int c;
5296 int ctype;
5297 int range;
5298 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5299
5300 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5301 int try;
5302 const int max_gaps = 1;
5303
5304 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5305 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5306 ctype = ONIGENC_CTYPE_DIGIT;
5307 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5308 ctype = ONIGENC_CTYPE_ALPHA;
5309 else
5310 return NEIGHBOR_NOT_CHAR;
5311
5312 MEMCPY(save, p, char, len);
5313 for (try = 0; try <= max_gaps; ++try) {
5314 ret = enc_succ_char(p, len, enc);
5315 if (ret == NEIGHBOR_FOUND) {
5316 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5317 if (rb_enc_isctype(c, ctype, enc))
5318 return NEIGHBOR_FOUND;
5319 }
5320 }
5321 MEMCPY(p, save, char, len);
5322 range = 1;
5323 while (1) {
5324 MEMCPY(save, p, char, len);
5325 ret = enc_pred_char(p, len, enc);
5326 if (ret == NEIGHBOR_FOUND) {
5327 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5328 if (!rb_enc_isctype(c, ctype, enc)) {
5329 MEMCPY(p, save, char, len);
5330 break;
5331 }
5332 }
5333 else {
5334 MEMCPY(p, save, char, len);
5335 break;
5336 }
5337 range++;
5338 }
5339 if (range == 1) {
5340 return NEIGHBOR_NOT_CHAR;
5341 }
5342
5343 if (ctype != ONIGENC_CTYPE_DIGIT) {
5344 MEMCPY(carry, p, char, len);
5345 return NEIGHBOR_WRAPPED;
5346 }
5347
5348 MEMCPY(carry, p, char, len);
5349 enc_succ_char(carry, len, enc);
5350 return NEIGHBOR_WRAPPED;
5351}
5352
5353
5354static VALUE str_succ(VALUE str);
5355
5356/*
5357 * call-seq:
5358 * succ -> new_str
5359 *
5360 * Returns the successor to +self+. The successor is calculated by
5361 * incrementing characters.
5362 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
5365 *
5366 * 'THX1138'.succ # => "THX1139"
5367 * '<<koala>>'.succ # => "<<koalb>>"
5368 * '***'.succ # => '**+'
5369 *
5370 * The successor to a digit is another digit, "carrying" to the next-left
5371 * character for a "rollover" from 9 to 0, and prepending another digit
5372 * if necessary:
5373 *
5374 * '00'.succ # => "01"
5375 * '09'.succ # => "10"
5376 * '99'.succ # => "100"
5377 *
5378 * The successor to a letter is another letter of the same case,
5379 * carrying to the next-left character for a rollover,
5380 * and prepending another same-case letter if necessary:
5381 *
5382 * 'aa'.succ # => "ab"
5383 * 'az'.succ # => "ba"
5384 * 'zz'.succ # => "aaa"
5385 * 'AA'.succ # => "AB"
5386 * 'AZ'.succ # => "BA"
5387 * 'ZZ'.succ # => "AAA"
5388 *
5389 * The successor to a non-alphanumeric character is the next character
5390 * in the underlying character set's collating sequence,
5391 * carrying to the next-left character for a rollover,
5392 * and prepending another character if necessary:
5393 *
5394 * s = 0.chr * 3
5395 * s # => "\x00\x00\x00"
5396 * s.succ # => "\x00\x00\x01"
5397 * s = 255.chr * 3
5398 * s # => "\xFF\xFF\xFF"
5399 * s.succ # => "\x01\x00\x00\x00"
5400 *
5401 * Carrying can occur between and among mixtures of alphanumeric characters:
5402 *
5403 * s = 'zz99zz99'
5404 * s.succ # => "aaa00aa00"
5405 * s = '99zz99zz'
5406 * s.succ # => "100aa00aa"
5407 *
5408 * The successor to an empty +String+ is a new empty +String+:
5409 *
5410 * ''.succ # => ""
5411 *
5412 */
5413
5414VALUE
5416{
5417 VALUE str;
5418 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5419 rb_enc_cr_str_copy_for_substr(str, orig);
5420 return str_succ(str);
5421}
5422
5423static VALUE
5424str_succ(VALUE str)
5425{
5426 rb_encoding *enc;
5427 char *sbeg, *s, *e, *last_alnum = 0;
5428 int found_alnum = 0;
5429 long l, slen;
5430 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5431 long carry_pos = 0, carry_len = 1;
5432 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5433
5434 slen = RSTRING_LEN(str);
5435 if (slen == 0) return str;
5436
5437 enc = STR_ENC_GET(str);
5438 sbeg = RSTRING_PTR(str);
5439 s = e = sbeg + slen;
5440
5441 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5442 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5443 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5444 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5445 break;
5446 }
5447 }
5448 l = rb_enc_precise_mbclen(s, e, enc);
5449 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5450 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5451 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5452 switch (neighbor) {
5453 case NEIGHBOR_NOT_CHAR:
5454 continue;
5455 case NEIGHBOR_FOUND:
5456 return str;
5457 case NEIGHBOR_WRAPPED:
5458 last_alnum = s;
5459 break;
5460 }
5461 found_alnum = 1;
5462 carry_pos = s - sbeg;
5463 carry_len = l;
5464 }
5465 if (!found_alnum) { /* str contains no alnum */
5466 s = e;
5467 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5468 enum neighbor_char neighbor;
5469 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5470 l = rb_enc_precise_mbclen(s, e, enc);
5471 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5472 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5473 MEMCPY(tmp, s, char, l);
5474 neighbor = enc_succ_char(tmp, l, enc);
5475 switch (neighbor) {
5476 case NEIGHBOR_FOUND:
5477 MEMCPY(s, tmp, char, l);
5478 return str;
5479 break;
5480 case NEIGHBOR_WRAPPED:
5481 MEMCPY(s, tmp, char, l);
5482 break;
5483 case NEIGHBOR_NOT_CHAR:
5484 break;
5485 }
5486 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5487 /* wrapped to \0...\0. search next valid char. */
5488 enc_succ_char(s, l, enc);
5489 }
5490 if (!rb_enc_asciicompat(enc)) {
5491 MEMCPY(carry, s, char, l);
5492 carry_len = l;
5493 }
5494 carry_pos = s - sbeg;
5495 }
5497 }
5498 RESIZE_CAPA(str, slen + carry_len);
5499 sbeg = RSTRING_PTR(str);
5500 s = sbeg + carry_pos;
5501 memmove(s + carry_len, s, slen - carry_pos);
5502 memmove(s, carry, carry_len);
5503 slen += carry_len;
5504 STR_SET_LEN(str, slen);
5505 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5507 return str;
5508}
5509
5510
5511/*
5512 * call-seq:
5513 * succ! -> self
5514 *
5515 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5516 */
5517
5518static VALUE
5519rb_str_succ_bang(VALUE str)
5520{
5521 rb_str_modify(str);
5522 str_succ(str);
5523 return str;
5524}
5525
/* Returns 1 if each of the first +len+ bytes of +s+ is a digit, else 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5535
5536static int
5537str_upto_i(VALUE str, VALUE arg)
5538{
5539 rb_yield(str);
5540 return 0;
5541}
5542
5543/*
5544 * call-seq:
5545 * upto(other_string, exclusive = false) {|string| ... } -> self
5546 * upto(other_string, exclusive = false) -> new_enumerator
5547 *
5548 * With a block given, calls the block with each +String+ value
5549 * returned by successive calls to String#succ;
5550 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5551 * the sequence terminates when value +other_string+ is reached;
5552 * returns +self+:
5553 *
5554 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5555 * Output:
5556 *
5557 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5558 *
5559 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5560 *
5561 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5562 *
5563 * Output:
5564 *
5565 * a8 a9 b0 b1 b2 b3 b4 b5
5566 *
5567 * If +other_string+ would not be reached, does not call the block:
5568 *
5569 * '25'.upto('5') {|s| fail s }
5570 * 'aa'.upto('a') {|s| fail s }
5571 *
5572 * With no block given, returns a new Enumerator:
5573 *
5574 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5575 *
5576 */
5577
5578static VALUE
5579rb_str_upto(int argc, VALUE *argv, VALUE beg)
5580{
5581 VALUE end, exclusive;
5582
5583 rb_scan_args(argc, argv, "11", &end, &exclusive);
5584 RETURN_ENUMERATOR(beg, argc, argv);
5585 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5586}
5587
5588VALUE
5589rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5590{
5591 VALUE current, after_end;
5592 ID succ;
5593 int n, ascii;
5594 rb_encoding *enc;
5595
5596 CONST_ID(succ, "succ");
5597 StringValue(end);
5598 enc = rb_enc_check(beg, end);
5599 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5600 /* single character */
5601 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5602 char c = RSTRING_PTR(beg)[0];
5603 char e = RSTRING_PTR(end)[0];
5604
5605 if (c > e || (excl && c == e)) return beg;
5606 for (;;) {
5607 VALUE str = rb_enc_str_new(&c, 1, enc);
5609 if ((*each)(str, arg)) break;
5610 if (!excl && c == e) break;
5611 c++;
5612 if (excl && c == e) break;
5613 }
5614 return beg;
5615 }
5616 /* both edges are all digits */
5617 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5618 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5619 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5620 VALUE b, e;
5621 int width;
5622
5623 width = RSTRING_LENINT(beg);
5624 b = rb_str_to_inum(beg, 10, FALSE);
5625 e = rb_str_to_inum(end, 10, FALSE);
5626 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5627 long bi = FIX2LONG(b);
5628 long ei = FIX2LONG(e);
5629 rb_encoding *usascii = rb_usascii_encoding();
5630
5631 while (bi <= ei) {
5632 if (excl && bi == ei) break;
5633 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5634 bi++;
5635 }
5636 }
5637 else {
5638 ID op = excl ? '<' : idLE;
5639 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5640
5641 args[0] = INT2FIX(width);
5642 while (rb_funcall(b, op, 1, e)) {
5643 args[1] = b;
5644 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5645 b = rb_funcallv(b, succ, 0, 0);
5646 }
5647 }
5648 return beg;
5649 }
5650 /* normal case */
5651 n = rb_str_cmp(beg, end);
5652 if (n > 0 || (excl && n == 0)) return beg;
5653
5654 after_end = rb_funcallv(end, succ, 0, 0);
5655 current = str_duplicate(rb_cString, beg);
5656 while (!rb_str_equal(current, after_end)) {
5657 VALUE next = Qnil;
5658 if (excl || !rb_str_equal(current, end))
5659 next = rb_funcallv(current, succ, 0, 0);
5660 if ((*each)(current, arg)) break;
5661 if (NIL_P(next)) break;
5662 current = next;
5663 StringValue(current);
5664 if (excl && rb_str_equal(current, end)) break;
5665 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5666 break;
5667 }
5668
5669 return beg;
5670}
5671
5672VALUE
5673rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5674{
5675 VALUE current;
5676 ID succ;
5677
5678 CONST_ID(succ, "succ");
5679 /* both edges are all digits */
5680 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5681 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5682 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5683 int width = RSTRING_LENINT(beg);
5684 b = rb_str_to_inum(beg, 10, FALSE);
5685 if (FIXNUM_P(b)) {
5686 long bi = FIX2LONG(b);
5687 rb_encoding *usascii = rb_usascii_encoding();
5688
5689 while (FIXABLE(bi)) {
5690 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5691 bi++;
5692 }
5693 b = LONG2NUM(bi);
5694 }
5695 args[0] = INT2FIX(width);
5696 while (1) {
5697 args[1] = b;
5698 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5699 b = rb_funcallv(b, succ, 0, 0);
5700 }
5701 }
5702 /* normal case */
5703 current = str_duplicate(rb_cString, beg);
5704 while (1) {
5705 VALUE next = rb_funcallv(current, succ, 0, 0);
5706 if ((*each)(current, arg)) break;
5707 current = next;
5708 StringValue(current);
5709 if (RSTRING_LEN(current) == 0)
5710 break;
5711 }
5712
5713 return beg;
5714}
5715
5716static int
5717include_range_i(VALUE str, VALUE arg)
5718{
5719 VALUE *argp = (VALUE *)arg;
5720 if (!rb_equal(str, *argp)) return 0;
5721 *argp = Qnil;
5722 return 1;
5723}
5724
5725VALUE
5726rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5727{
5728 beg = rb_str_new_frozen(beg);
5729 StringValue(end);
5730 end = rb_str_new_frozen(end);
5731 if (NIL_P(val)) return Qfalse;
5732 val = rb_check_string_type(val);
5733 if (NIL_P(val)) return Qfalse;
5734 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5735 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5736 rb_enc_asciicompat(STR_ENC_GET(val))) {
5737 const char *bp = RSTRING_PTR(beg);
5738 const char *ep = RSTRING_PTR(end);
5739 const char *vp = RSTRING_PTR(val);
5740 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5741 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5742 return Qfalse;
5743 else {
5744 char b = *bp;
5745 char e = *ep;
5746 char v = *vp;
5747
5748 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5749 if (b <= v && v < e) return Qtrue;
5750 return RBOOL(!RTEST(exclusive) && v == e);
5751 }
5752 }
5753 }
5754#if 0
5755 /* both edges are all digits */
5756 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5757 all_digits_p(bp, RSTRING_LEN(beg)) &&
5758 all_digits_p(ep, RSTRING_LEN(end))) {
5759 /* TODO */
5760 }
5761#endif
5762 }
5763 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5764
5765 return RBOOL(NIL_P(val));
5766}
5767
5768static VALUE
5769rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5770{
5771 if (rb_reg_search(re, str, 0, 0) >= 0) {
5772 VALUE match = rb_backref_get();
5773 int nth = rb_reg_backref_number(match, backref);
5774 return rb_reg_nth_match(nth, match);
5775 }
5776 return Qnil;
5777}
5778
5779static VALUE
5780rb_str_aref(VALUE str, VALUE indx)
5781{
5782 long idx;
5783
5784 if (FIXNUM_P(indx)) {
5785 idx = FIX2LONG(indx);
5786 }
5787 else if (RB_TYPE_P(indx, T_REGEXP)) {
5788 return rb_str_subpat(str, indx, INT2FIX(0));
5789 }
5790 else if (RB_TYPE_P(indx, T_STRING)) {
5791 if (rb_str_index(str, indx, 0) != -1)
5792 return str_duplicate(rb_cString, indx);
5793 return Qnil;
5794 }
5795 else {
5796 /* check if indx is Range */
5797 long beg, len = str_strlen(str, NULL);
5798 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5799 case Qfalse:
5800 break;
5801 case Qnil:
5802 return Qnil;
5803 default:
5804 return rb_str_substr(str, beg, len);
5805 }
5806 idx = NUM2LONG(indx);
5807 }
5808
5809 return str_substr(str, idx, 1, FALSE);
5810}
5811
5812
5813/*
5814 * call-seq:
5815 * self[index] -> new_string or nil
5816 * self[start, length] -> new_string or nil
5817 * self[range] -> new_string or nil
5818 * self[regexp, capture = 0] -> new_string or nil
5819 * self[substring] -> new_string or nil
5820 *
5821 * Returns the substring of +self+ specified by the arguments.
5822 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5823 *
5824 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5825 */
5826
5827static VALUE
5828rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5829{
5830 if (argc == 2) {
5831 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5832 return rb_str_subpat(str, argv[0], argv[1]);
5833 }
5834 else {
5835 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5836 }
5837 }
5838 rb_check_arity(argc, 1, 2);
5839 return rb_str_aref(str, argv[0]);
5840}
5841
5842VALUE
5844{
5845 char *ptr = RSTRING_PTR(str);
5846 long olen = RSTRING_LEN(str), nlen;
5847
5848 str_modifiable(str);
5849 if (len > olen) len = olen;
5850 nlen = olen - len;
5851 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5852 char *oldptr = ptr;
5853 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5854 STR_SET_EMBED(str);
5855 ptr = RSTRING(str)->as.embed.ary;
5856 memmove(ptr, oldptr + len, nlen);
5857 if (fl == STR_NOEMBED) xfree(oldptr);
5858 }
5859 else {
5860 if (!STR_SHARED_P(str)) {
5861 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5862 rb_enc_cr_str_exact_copy(shared, str);
5863 OBJ_FREEZE(shared);
5864 }
5865 ptr = RSTRING(str)->as.heap.ptr += len;
5866 }
5867 STR_SET_LEN(str, nlen);
5868
5869 if (!SHARABLE_MIDDLE_SUBSTRING) {
5870 TERM_FILL(ptr + nlen, TERM_LEN(str));
5871 }
5873 return str;
5874}
5875
5876static void
5877rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5878{
5879 char *sptr;
5880 long slen;
5881 int cr;
5882
5883 if (beg == 0 && vlen == 0) {
5884 rb_str_drop_bytes(str, len);
5885 return;
5886 }
5887
5888 str_modify_keep_cr(str);
5889 RSTRING_GETMEM(str, sptr, slen);
5890 if (len < vlen) {
5891 /* expand string */
5892 RESIZE_CAPA(str, slen + vlen - len);
5893 sptr = RSTRING_PTR(str);
5894 }
5895
5897 cr = rb_enc_str_coderange(val);
5898 else
5900
5901 if (vlen != len) {
5902 memmove(sptr + beg + vlen,
5903 sptr + beg + len,
5904 slen - (beg + len));
5905 }
5906 if (vlen < beg && len < 0) {
5907 MEMZERO(sptr + slen, char, -len);
5908 }
5909 if (vlen > 0) {
5910 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5911 }
5912 slen += vlen - len;
5913 STR_SET_LEN(str, slen);
5914 TERM_FILL(&sptr[slen], TERM_LEN(str));
5915 ENC_CODERANGE_SET(str, cr);
5916}
5917
5918static inline void
5919rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5920{
5921 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5922}
5923
5924void
5925rb_str_update(VALUE str, long beg, long len, VALUE val)
5926{
5927 long slen;
5928 char *p, *e;
5929 rb_encoding *enc;
5930 int singlebyte = single_byte_optimizable(str);
5931 int cr;
5932
5933 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5934
5935 StringValue(val);
5936 enc = rb_enc_check(str, val);
5937 slen = str_strlen(str, enc); /* rb_enc_check */
5938
5939 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5940 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5941 }
5942 if (beg < 0) {
5943 beg += slen;
5944 }
5945 RUBY_ASSERT(beg >= 0);
5946 RUBY_ASSERT(beg <= slen);
5947
5948 if (len > slen - beg) {
5949 len = slen - beg;
5950 }
5951 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5952 if (!p) p = RSTRING_END(str);
5953 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5954 if (!e) e = RSTRING_END(str);
5955 /* error check */
5956 beg = p - RSTRING_PTR(str); /* physical position */
5957 len = e - p; /* physical length */
5958 rb_str_update_0(str, beg, len, val);
5959 rb_enc_associate(str, enc);
5961 if (cr != ENC_CODERANGE_BROKEN)
5962 ENC_CODERANGE_SET(str, cr);
5963}
5964
5965static void
5966rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5967{
5968 int nth;
5969 VALUE match;
5970 long start, end, len;
5971 rb_encoding *enc;
5972 struct re_registers *regs;
5973
5974 if (rb_reg_search(re, str, 0, 0) < 0) {
5975 rb_raise(rb_eIndexError, "regexp not matched");
5976 }
5977 match = rb_backref_get();
5978 nth = rb_reg_backref_number(match, backref);
5979 regs = RMATCH_REGS(match);
5980 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5981 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5982 }
5983 if (nth < 0) {
5984 nth += regs->num_regs;
5985 }
5986
5987 start = BEG(nth);
5988 if (start == -1) {
5989 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5990 }
5991 end = END(nth);
5992 len = end - start;
5993 StringValue(val);
5994 enc = rb_enc_check_str(str, val);
5995 rb_str_update_0(str, start, len, val);
5996 rb_enc_associate(str, enc);
5997}
5998
5999static VALUE
6000rb_str_aset(VALUE str, VALUE indx, VALUE val)
6001{
6002 long idx, beg;
6003
6004 switch (TYPE(indx)) {
6005 case T_REGEXP:
6006 rb_str_subpat_set(str, indx, INT2FIX(0), val);
6007 return val;
6008
6009 case T_STRING:
6010 beg = rb_str_index(str, indx, 0);
6011 if (beg < 0) {
6012 rb_raise(rb_eIndexError, "string not matched");
6013 }
6014 beg = rb_str_sublen(str, beg);
6015 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6016 return val;
6017
6018 default:
6019 /* check if indx is Range */
6020 {
6021 long beg, len;
6022 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6023 rb_str_update(str, beg, len, val);
6024 return val;
6025 }
6026 }
6027 /* FALLTHROUGH */
6028
6029 case T_FIXNUM:
6030 idx = NUM2LONG(indx);
6031 rb_str_update(str, idx, 1, val);
6032 return val;
6033 }
6034}
6035
6036/*
6037 * call-seq:
6038 * self[index] = new_string
6039 * self[start, length] = new_string
6040 * self[range] = new_string
6041 * self[regexp, capture = 0] = new_string
6042 * self[substring] = new_string
6043 *
6044 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6045 * See {String Slices}[rdoc-ref:String@String+Slices].
6046 *
6047 * A few examples:
6048 *
6049 * s = 'foo'
6050 * s[2] = 'rtune' # => "rtune"
6051 * s # => "fortune"
6052 * s[1, 5] = 'init' # => "init"
6053 * s # => "finite"
6054 * s[3..4] = 'al' # => "al"
6055 * s # => "finale"
6056 * s[/e$/] = 'ly' # => "ly"
6057 * s # => "finally"
6058 * s['lly'] = 'ncial' # => "ncial"
6059 * s # => "financial"
6060 *
6061 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6062 */
6063
6064static VALUE
6065rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6066{
6067 if (argc == 3) {
6068 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6069 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6070 }
6071 else {
6072 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6073 }
6074 return argv[2];
6075 }
6076 rb_check_arity(argc, 2, 3);
6077 return rb_str_aset(str, argv[0], argv[1]);
6078}
6079
6080/*
6081 * call-seq:
6082 * insert(index, other_string) -> self
6083 *
6084 * Inserts the given +other_string+ into +self+; returns +self+.
6085 *
 *  If the Integer +index+ is non-negative, inserts +other_string+ at offset +index+:
6087 *
6088 * 'foo'.insert(1, 'bar') # => "fbaroo"
6089 *
6090 * If the Integer +index+ is negative, counts backward from the end of +self+
6091 * and inserts +other_string+ at offset <tt>index+1</tt>
6092 * (that is, _after_ <tt>self[index]</tt>):
6093 *
6094 * 'foo'.insert(-2, 'bar') # => "fobaro"
6095 *
6096 */
6097
6098static VALUE
6099rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6100{
6101 long pos = NUM2LONG(idx);
6102
6103 if (pos == -1) {
6104 return rb_str_append(str, str2);
6105 }
6106 else if (pos < 0) {
6107 pos++;
6108 }
6109 rb_str_update(str, pos, 0, str2);
6110 return str;
6111}
6112
6113
6114/*
6115 * call-seq:
6116 * slice!(index) -> new_string or nil
6117 * slice!(start, length) -> new_string or nil
6118 * slice!(range) -> new_string or nil
6119 * slice!(regexp, capture = 0) -> new_string or nil
6120 * slice!(substring) -> new_string or nil
6121 *
6122 * Removes and returns the substring of +self+ specified by the arguments.
6123 * See {String Slices}[rdoc-ref:String@String+Slices].
6124 *
6125 * A few examples:
6126 *
6127 * string = "This is a string"
6128 * string.slice!(2) #=> "i"
6129 * string.slice!(3..6) #=> " is "
6130 * string.slice!(/s.*t/) #=> "sa st"
6131 * string.slice!("r") #=> "r"
6132 * string #=> "Thing"
6133 *
6134 */
6135
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;  /* len stays 1 for the single-index forms */
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];

    /*
     * Each branch below computes the span to remove (beg/len) and then
     * jumps to one of three labels:
     *   num_index - beg/len still have to go through rb_str_subpos
     *   subseq    - beg/len are final; result must still be built
     *   squash    - result is already built; only remove the span
     */
    if (RB_TYPE_P(indx, T_REGEXP)) {
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        /* optional second argument selects the capture group */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* negative group number: count back from num_regs */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        /*
         * NOTE(review): rb_str_subpat_set treats BEG(nth) == -1 as
         * "group not matched"; no such check is made here -- confirm
         * the intended result for a group that did not participate.
         */
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index): one character, nil when nothing is addressed */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the result is a copy of the argument itself */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* Range, or some other object that NUM2LONG can convert */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: same handling as the Fixnum branch above */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    /* from here on beg is an offset from RSTRING_PTR(str) */
    beg = p - RSTRING_PTR(str);

  subseq:
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            /* removing a prefix */
            rb_str_drop_bytes(str, len);
        }
        else {
            /* close the gap by moving the tail down over the removed span */
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6222
6223static VALUE
6224get_pat(VALUE pat)
6225{
6226 VALUE val;
6227
6228 switch (OBJ_BUILTIN_TYPE(pat)) {
6229 case T_REGEXP:
6230 return pat;
6231
6232 case T_STRING:
6233 break;
6234
6235 default:
6236 val = rb_check_string_type(pat);
6237 if (NIL_P(val)) {
6238 Check_Type(pat, T_REGEXP);
6239 }
6240 pat = val;
6241 }
6242
6243 return rb_reg_regcomp(pat);
6244}
6245
6246static VALUE
6247get_pat_quoted(VALUE pat, int check)
6248{
6249 VALUE val;
6250
6251 switch (OBJ_BUILTIN_TYPE(pat)) {
6252 case T_REGEXP:
6253 return pat;
6254
6255 case T_STRING:
6256 break;
6257
6258 default:
6259 val = rb_check_string_type(pat);
6260 if (NIL_P(val)) {
6261 Check_Type(pat, T_REGEXP);
6262 }
6263 pat = val;
6264 }
6265 if (check && is_broken_string(pat)) {
6266 rb_exc_raise(rb_reg_check_preprocess(pat));
6267 }
6268 return pat;
6269}
6270
6271static long
6272rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6273{
6274 if (BUILTIN_TYPE(pat) == T_STRING) {
6275 pos = rb_str_byteindex(str, pat, pos);
6276 if (set_backref_str) {
6277 if (pos >= 0) {
6278 str = rb_str_new_frozen_String(str);
6279 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6280 if (match) {
6281 *match = match_data;
6282 }
6283 }
6284 else {
6286 }
6287 }
6288 return pos;
6289 }
6290 else {
6291 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6292 }
6293}
6294
6295static long
6296rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6297{
6298 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6299}
6300
6301
6302/*
6303 * call-seq:
6304 * sub!(pattern, replacement) -> self or nil
6305 * sub!(pattern) {|match| ... } -> self or nil
6306 *
6307 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6308 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6309 *
6310 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6311 *
6312 * Related: String#sub, String#gsub, String#gsub!.
6313 *
6314 */
6315
6316static VALUE
6317rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6318{
6319 VALUE pat, repl, hash = Qnil;
6320 int iter = 0;
6321 long plen;
6322 int min_arity = rb_block_given_p() ? 1 : 2;
6323 long beg;
6324
6325 rb_check_arity(argc, min_arity, 2);
6326 if (argc == 1) {
6327 iter = 1;
6328 }
6329 else {
6330 repl = argv[1];
6331 hash = rb_check_hash_type(argv[1]);
6332 if (NIL_P(hash)) {
6333 StringValue(repl);
6334 }
6335 }
6336
6337 pat = get_pat_quoted(argv[0], 1);
6338
6339 str_modifiable(str);
6340 beg = rb_pat_search(pat, str, 0, 1);
6341 if (beg >= 0) {
6342 rb_encoding *enc;
6343 int cr = ENC_CODERANGE(str);
6344 long beg0, end0;
6345 VALUE match, match0 = Qnil;
6346 struct re_registers *regs;
6347 char *p, *rp;
6348 long len, rlen;
6349
6350 match = rb_backref_get();
6351 regs = RMATCH_REGS(match);
6352 if (RB_TYPE_P(pat, T_STRING)) {
6353 beg0 = beg;
6354 end0 = beg0 + RSTRING_LEN(pat);
6355 match0 = pat;
6356 }
6357 else {
6358 beg0 = BEG(0);
6359 end0 = END(0);
6360 if (iter) match0 = rb_reg_nth_match(0, match);
6361 }
6362
6363 if (iter || !NIL_P(hash)) {
6364 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6365
6366 if (iter) {
6367 repl = rb_obj_as_string(rb_yield(match0));
6368 }
6369 else {
6370 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6371 repl = rb_obj_as_string(repl);
6372 }
6373 str_mod_check(str, p, len);
6374 rb_check_frozen(str);
6375 }
6376 else {
6377 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6378 }
6379
6380 enc = rb_enc_compatible(str, repl);
6381 if (!enc) {
6382 rb_encoding *str_enc = STR_ENC_GET(str);
6383 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6384 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6385 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6386 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6387 rb_enc_inspect_name(str_enc),
6388 rb_enc_inspect_name(STR_ENC_GET(repl)));
6389 }
6390 enc = STR_ENC_GET(repl);
6391 }
6392 rb_str_modify(str);
6393 rb_enc_associate(str, enc);
6395 int cr2 = ENC_CODERANGE(repl);
6396 if (cr2 == ENC_CODERANGE_BROKEN ||
6397 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6399 else
6400 cr = cr2;
6401 }
6402 plen = end0 - beg0;
6403 rlen = RSTRING_LEN(repl);
6404 len = RSTRING_LEN(str);
6405 if (rlen > plen) {
6406 RESIZE_CAPA(str, len + rlen - plen);
6407 }
6408 p = RSTRING_PTR(str);
6409 if (rlen != plen) {
6410 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6411 }
6412 rp = RSTRING_PTR(repl);
6413 memmove(p + beg0, rp, rlen);
6414 len += rlen - plen;
6415 STR_SET_LEN(str, len);
6416 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6417 ENC_CODERANGE_SET(str, cr);
6418
6419 RB_GC_GUARD(match);
6420
6421 return str;
6422 }
6423 return Qnil;
6424}
6425
6426
6427/*
6428 * call-seq:
6429 * sub(pattern, replacement) -> new_string
6430 * sub(pattern) {|match| ... } -> new_string
6431 *
6432 * Returns a copy of +self+ with only the first occurrence
6433 * (not all occurrences) of the given +pattern+ replaced.
6434 *
6435 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6436 *
6437 * Related: String#sub!, String#gsub, String#gsub!.
6438 *
6439 */
6440
6441static VALUE
6442rb_str_sub(int argc, VALUE *argv, VALUE str)
6443{
6444 str = str_duplicate(rb_cString, str);
6445 rb_str_sub_bang(argc, argv, str);
6446 return str;
6447}
6448
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * Builds the substituted text in a fresh buffer (+dest+).  When +bang+
 * is non-zero the buffer replaces the contents of +str+ and +str+ is
 * returned (or nil when nothing matched); otherwise the buffer itself
 * is returned (or a duplicate of +str+ when nothing matched).
 *
 * The replacement for each match comes from one of four sources,
 * selected by +mode+:
 *   STR      - the replacement string argument
 *   ITER     - the block, called with the matched text
 *   FAST_MAP - a Hash whose default is unredefined and not a proc
 *   MAP      - any other Hash
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, FAST_MAP, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not decided yet; settled after the first rb_reg_regsub call below */
    int need_backref_str = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
            mode = FAST_MAP;
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);

    if (beg < 0) {
        if (bang) return Qnil; /* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen snapshot the source buffer for str_mod_check inside the loop */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        struct re_registers *regs = RMATCH_REGS(match);
        if (RB_TYPE_P(pat, T_STRING)) {
            /* String pattern: the matched span is the pattern itself */
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode != STR) {
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                struct RString fake_str;
                VALUE key;
                if (mode == FAST_MAP) {
                    // It is safe to use a fake_str here because we established that it won't escape,
                    // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
                    // default proc.
                    key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
                }
                else {
                    key = rb_str_subseq(str, beg0, end0 - beg0);
                }
                val = rb_hash_aref(hash, key);
                val = rb_obj_as_string(val);
            }
            /* user code ran above: verify the source buffer is unchanged */
            str_mod_check(str, sp, slen);
            if (val == dest) { /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref_str) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref_str < 0) {
                /* first pass: if regsub returned repl itself, later passes use repl directly */
                need_backref_str = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset; /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* last keeps the start offset of this search for the final backref update */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;

        // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
        if (mode != FAST_MAP && mode != STR) {
            match = Qnil;
        }
        beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);

        RB_GC_GUARD(match);
    } while (beg >= 0);

    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* search once more from `last` with backref setting forced on */
    rb_pat_search0(pat, str, last, 1, &match);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6591
6592
6593/*
6594 * call-seq:
6595 * gsub!(pattern, replacement) -> self or nil
6596 * gsub!(pattern) {|match| ... } -> self or nil
6597 * gsub!(pattern) -> an_enumerator
6598 *
6599 * Performs the specified substring replacement(s) on +self+;
6600 * returns +self+ if any replacement occurred, +nil+ otherwise.
6601 *
6602 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6603 *
6604 * Returns an Enumerator if no +replacement+ and no block given.
6605 *
6606 * Related: String#sub, String#gsub, String#sub!.
6607 *
6608 */
6609
6610static VALUE
6611rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6612{
6613 str_modify_keep_cr(str);
6614 return str_gsub(argc, argv, str, 1);
6615}
6616
6617
6618/*
6619 * call-seq:
6620 * gsub(pattern, replacement) -> new_string
6621 * gsub(pattern) {|match| ... } -> new_string
6622 * gsub(pattern) -> enumerator
6623 *
6624 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6625 *
6626 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6627 *
6628 * Returns an Enumerator if no +replacement+ and no block given.
6629 *
6630 * Related: String#sub, String#sub!, String#gsub!.
6631 *
6632 */
6633
6634static VALUE
6635rb_str_gsub(int argc, VALUE *argv, VALUE str)
6636{
6637 return str_gsub(argc, argv, str, 0);
6638}
6639
6640
6641/*
6642 * call-seq:
6643 * replace(other_string) -> self
6644 *
6645 * Replaces the contents of +self+ with the contents of +other_string+:
6646 *
6647 * s = 'foo' # => "foo"
6648 * s.replace('bar') # => "bar"
6649 *
6650 */
6651
6652VALUE
6654{
6655 str_modifiable(str);
6656 if (str == str2) return str;
6657
6658 StringValue(str2);
6659 str_discard(str);
6660 return str_replace(str, str2);
6661}
6662
6663/*
6664 * call-seq:
6665 * clear -> self
6666 *
6667 * Removes the contents of +self+:
6668 *
6669 * s = 'foo' # => "foo"
6670 * s.clear # => ""
6671 *
6672 */
6673
6674static VALUE
6675rb_str_clear(VALUE str)
6676{
6677 str_discard(str);
6678 STR_SET_EMBED(str);
6679 STR_SET_LEN(str, 0);
6680 RSTRING_PTR(str)[0] = 0;
6681 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6683 else
6685 return str;
6686}
6687
6688/*
6689 * call-seq:
6690 * chr -> string
6691 *
6692 * Returns a string containing the first character of +self+:
6693 *
6694 * s = 'foo' # => "foo"
6695 * s.chr # => "f"
6696 *
6697 */
6698
6699static VALUE
6700rb_str_chr(VALUE str)
6701{
6702 return rb_str_substr(str, 0, 1);
6703}
6704
6705/*
6706 * call-seq:
6707 * getbyte(index) -> integer or nil
6708 *
6709 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6710 *
6711 * s = 'abcde' # => "abcde"
6712 * s.getbyte(0) # => 97
6713 * s.getbyte(-1) # => 101
6714 * s.getbyte(5) # => nil
6715 *
6716 * Related: String#setbyte.
6717 */
6718VALUE
6719rb_str_getbyte(VALUE str, VALUE index)
6720{
6721 long pos = NUM2LONG(index);
6722
6723 if (pos < 0)
6724 pos += RSTRING_LEN(str);
6725 if (pos < 0 || RSTRING_LEN(str) <= pos)
6726 return Qnil;
6727
6728 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6729}
6730
6731/*
6732 * call-seq:
6733 * setbyte(index, integer) -> integer
6734 *
6735 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6736 *
6737 * s = 'abcde' # => "abcde"
6738 * s.setbyte(0, 98) # => 98
6739 * s # => "bbcde"
6740 *
6741 * Related: String#getbyte.
6742 */
6743VALUE
6744rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6745{
6746 long pos = NUM2LONG(index);
6747 long len = RSTRING_LEN(str);
6748 char *ptr, *head, *left = 0;
6749 rb_encoding *enc;
6750 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6751
6752 if (pos < -len || len <= pos)
6753 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6754 if (pos < 0)
6755 pos += len;
6756
6757 VALUE v = rb_to_int(value);
6758 VALUE w = rb_int_and(v, INT2FIX(0xff));
6759 char byte = (char)(NUM2INT(w) & 0xFF);
6760
6761 if (!str_independent(str))
6762 str_make_independent(str);
6763 enc = STR_ENC_GET(str);
6764 head = RSTRING_PTR(str);
6765 ptr = &head[pos];
6766 if (!STR_EMBED_P(str)) {
6767 cr = ENC_CODERANGE(str);
6768 switch (cr) {
6769 case ENC_CODERANGE_7BIT:
6770 left = ptr;
6771 *ptr = byte;
6772 if (ISASCII(byte)) goto end;
6773 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6774 if (!MBCLEN_CHARFOUND_P(nlen))
6776 else
6778 goto end;
6780 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6781 width = rb_enc_precise_mbclen(left, head+len, enc);
6782 *ptr = byte;
6783 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6784 if (!MBCLEN_CHARFOUND_P(nlen))
6786 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6788 goto end;
6789 }
6790 }
6792 *ptr = byte;
6793
6794 end:
6795 return value;
6796}
6797
6798static VALUE
6799str_byte_substr(VALUE str, long beg, long len, int empty)
6800{
6801 long n = RSTRING_LEN(str);
6802
6803 if (beg > n || len < 0) return Qnil;
6804 if (beg < 0) {
6805 beg += n;
6806 if (beg < 0) return Qnil;
6807 }
6808 if (len > n - beg)
6809 len = n - beg;
6810 if (len <= 0) {
6811 if (!empty) return Qnil;
6812 len = 0;
6813 }
6814
6815 VALUE str2 = str_subseq(str, beg, len);
6816
6817 str_enc_copy_direct(str2, str);
6818
6819 if (RSTRING_LEN(str2) == 0) {
6820 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6822 else
6824 }
6825 else {
6826 switch (ENC_CODERANGE(str)) {
6827 case ENC_CODERANGE_7BIT:
6829 break;
6830 default:
6832 break;
6833 }
6834 }
6835
6836 return str2;
6837}
6838
6839VALUE
6840rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6841{
6842 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6843}
6844
6845static VALUE
6846str_byte_aref(VALUE str, VALUE indx)
6847{
6848 long idx;
6849 if (FIXNUM_P(indx)) {
6850 idx = FIX2LONG(indx);
6851 }
6852 else {
6853 /* check if indx is Range */
6854 long beg, len = RSTRING_LEN(str);
6855
6856 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6857 case Qfalse:
6858 break;
6859 case Qnil:
6860 return Qnil;
6861 default:
6862 return str_byte_substr(str, beg, len, TRUE);
6863 }
6864
6865 idx = NUM2LONG(indx);
6866 }
6867 return str_byte_substr(str, idx, 1, FALSE);
6868}
6869
6870/*
6871 * call-seq:
6872 * byteslice(offset, length = 1) -> string or nil
6873 * byteslice(range) -> string or nil
6874 *
6875 * :include: doc/string/byteslice.rdoc
6876 */
6877
6878static VALUE
6879rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6880{
6881 if (argc == 2) {
6882 long beg = NUM2LONG(argv[0]);
6883 long len = NUM2LONG(argv[1]);
6884 return str_byte_substr(str, beg, len, TRUE);
6885 }
6886 rb_check_arity(argc, 1, 2);
6887 return str_byte_aref(str, argv[0]);
6888}
6889
6890static void
6891str_check_beg_len(VALUE str, long *beg, long *len)
6892{
6893 long end, slen = RSTRING_LEN(str);
6894
6895 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6896 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6897 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6898 }
6899 if (*beg < 0) {
6900 *beg += slen;
6901 }
6902 RUBY_ASSERT(*beg >= 0);
6903 RUBY_ASSERT(*beg <= slen);
6904
6905 if (*len > slen - *beg) {
6906 *len = slen - *beg;
6907 }
6908 end = *beg + *len;
6909 str_ensure_byte_pos(str, *beg);
6910 str_ensure_byte_pos(str, end);
6911}
6912
6913/*
6914 * call-seq:
6915 * bytesplice(offset, length, str) -> self
6916 * bytesplice(offset, length, str, str_offset, str_length) -> self
6917 * bytesplice(range, str) -> self
6918 * bytesplice(range, str, str_range) -> self
6919 *
6920 * :include: doc/string/bytesplice.rdoc
6921 */
6922
6923static VALUE
6924rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6925{
6926 long beg, len, vbeg, vlen;
6927 VALUE val;
6928 int cr;
6929
6930 rb_check_arity(argc, 2, 5);
6931 if (!(argc == 2 || argc == 3 || argc == 5)) {
6932 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6933 }
6934 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6935 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6936 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6937 rb_builtin_class_name(argv[0]));
6938 }
6939 val = argv[1];
6940 StringValue(val);
6941 if (argc == 2) {
6942 /* bytesplice(range, str) */
6943 vbeg = 0;
6944 vlen = RSTRING_LEN(val);
6945 }
6946 else {
6947 /* bytesplice(range, str, str_range) */
6948 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6949 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6950 rb_builtin_class_name(argv[2]));
6951 }
6952 }
6953 }
6954 else {
6955 beg = NUM2LONG(argv[0]);
6956 len = NUM2LONG(argv[1]);
6957 val = argv[2];
6958 StringValue(val);
6959 if (argc == 3) {
6960 /* bytesplice(index, length, str) */
6961 vbeg = 0;
6962 vlen = RSTRING_LEN(val);
6963 }
6964 else {
6965 /* bytesplice(index, length, str, str_index, str_length) */
6966 vbeg = NUM2LONG(argv[3]);
6967 vlen = NUM2LONG(argv[4]);
6968 }
6969 }
6970 str_check_beg_len(str, &beg, &len);
6971 str_check_beg_len(val, &vbeg, &vlen);
6972 str_modify_keep_cr(str);
6973
6974 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6975 rb_enc_associate(str, rb_enc_check(str, val));
6976 }
6977
6978 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6980 if (cr != ENC_CODERANGE_BROKEN)
6981 ENC_CODERANGE_SET(str, cr);
6982 return str;
6983}
6984
6985/*
6986 * call-seq:
6987 * reverse -> string
6988 *
6989 * Returns a new string with the characters from +self+ in reverse order.
6990 *
6991 * 'stressed'.reverse # => "desserts"
6992 *
6993 */
6994
6995static VALUE
6996rb_str_reverse(VALUE str)
6997{
6998 rb_encoding *enc;
6999 VALUE rev;
7000 char *s, *e, *p;
7001 int cr;
7002
7003 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7004 enc = STR_ENC_GET(str);
7005 rev = rb_str_new(0, RSTRING_LEN(str));
7006 s = RSTRING_PTR(str); e = RSTRING_END(str);
7007 p = RSTRING_END(rev);
7008 cr = ENC_CODERANGE(str);
7009
7010 if (RSTRING_LEN(str) > 1) {
7011 if (single_byte_optimizable(str)) {
7012 while (s < e) {
7013 *--p = *s++;
7014 }
7015 }
7016 else if (cr == ENC_CODERANGE_VALID) {
7017 while (s < e) {
7018 int clen = rb_enc_fast_mbclen(s, e, enc);
7019
7020 p -= clen;
7021 memcpy(p, s, clen);
7022 s += clen;
7023 }
7024 }
7025 else {
7026 cr = rb_enc_asciicompat(enc) ?
7028 while (s < e) {
7029 int clen = rb_enc_mbclen(s, e, enc);
7030
7031 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7032 p -= clen;
7033 memcpy(p, s, clen);
7034 s += clen;
7035 }
7036 }
7037 }
7038 STR_SET_LEN(rev, RSTRING_LEN(str));
7039 str_enc_copy_direct(rev, str);
7040 ENC_CODERANGE_SET(rev, cr);
7041
7042 return rev;
7043}
7044
7045
7046/*
7047 * call-seq:
7048 * reverse! -> self
7049 *
7050 * Returns +self+ with its characters reversed:
7051 *
7052 * s = 'stressed'
7053 * s.reverse! # => "desserts"
7054 * s # => "desserts"
7055 *
7056 */
7057
7058static VALUE
7059rb_str_reverse_bang(VALUE str)
7060{
7061 if (RSTRING_LEN(str) > 1) {
7062 if (single_byte_optimizable(str)) {
7063 char *s, *e, c;
7064
7065 str_modify_keep_cr(str);
7066 s = RSTRING_PTR(str);
7067 e = RSTRING_END(str) - 1;
7068 while (s < e) {
7069 c = *s;
7070 *s++ = *e;
7071 *e-- = c;
7072 }
7073 }
7074 else {
7075 str_shared_replace(str, rb_str_reverse(str));
7076 }
7077 }
7078 else {
7079 str_modify_keep_cr(str);
7080 }
7081 return str;
7082}
7083
7084
7085/*
7086 * call-seq:
7087 * include?(other_string) -> true or false
7088 *
7089 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7090 *
7091 * s = 'foo'
7092 * s.include?('f') # => true
7093 * s.include?('fo') # => true
7094 * s.include?('food') # => false
7095 *
7096 */
7097
7098VALUE
7099rb_str_include(VALUE str, VALUE arg)
7100{
7101 long i;
7102
7103 StringValue(arg);
7104 i = rb_str_index(str, arg, 0);
7105
7106 return RBOOL(i != -1);
7107}
7108
7109
7110/*
7111 * call-seq:
7112 * to_i(base = 10) -> integer
7113 *
7114 * Returns the result of interpreting leading characters in +self+
7115 * as an integer in the given +base+ (which must be in (0, 2..36)):
7116 *
7117 * '123456'.to_i # => 123456
7118 * '123def'.to_i(16) # => 1195503
7119 *
 *  With +base+ zero, +self+ may contain leading characters
7121 * to specify the actual base:
7122 *
7123 * '123def'.to_i(0) # => 123
7124 * '0123def'.to_i(0) # => 83
7125 * '0b123def'.to_i(0) # => 1
7126 * '0o123def'.to_i(0) # => 83
7127 * '0d123def'.to_i(0) # => 123
7128 * '0x123def'.to_i(0) # => 1195503
7129 *
7130 * Characters past a leading valid number (in the given +base+) are ignored:
7131 *
7132 * '12.345'.to_i # => 12
7133 * '12345'.to_i(2) # => 1
7134 *
7135 * Returns zero if there is no leading valid number:
7136 *
7137 * 'abcdef'.to_i # => 0
7138 * '2'.to_i(2) # => 0
7139 *
7140 */
7141
7142static VALUE
7143rb_str_to_i(int argc, VALUE *argv, VALUE str)
7144{
7145 int base = 10;
7146
7147 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7148 rb_raise(rb_eArgError, "invalid radix %d", base);
7149 }
7150 return rb_str_to_inum(str, base, FALSE);
7151}
7152
7153
7154/*
7155 * call-seq:
7156 * to_f -> float
7157 *
7158 * Returns the result of interpreting leading characters in +self+ as a Float:
7159 *
7160 * '3.14159'.to_f # => 3.14159
7161 * '1.234e-2'.to_f # => 0.01234
7162 *
 *  Characters past a leading valid number are ignored:
7164 *
7165 * '3.14 (pi to two places)'.to_f # => 3.14
7166 *
7167 * Returns zero if there is no leading valid number:
7168 *
7169 * 'abcdef'.to_f # => 0.0
7170 *
7171 */
7172
7173static VALUE
7174rb_str_to_f(VALUE str)
7175{
7176 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7177}
7178
7179
7180/*
7181 * call-seq:
7182 * to_s -> self or string
7183 *
7184 * Returns +self+ if +self+ is a +String+,
7185 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7186 */
7187
7188static VALUE
7189rb_str_to_s(VALUE str)
7190{
7191 if (rb_obj_class(str) != rb_cString) {
7192 return str_duplicate(rb_cString, str);
7193 }
7194 return str;
7195}
7196
#if 0
/* Compiled out.  Encodes code point +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int width = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, width, enc);
}
#endif
7208
7209#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7210
7211int
7212rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7213{
7214 char buf[CHAR_ESC_LEN + 1];
7215 int l;
7216
7217#if SIZEOF_INT > 4
7218 c &= 0xffffffff;
7219#endif
7220 if (unicode_p) {
7221 if (c < 0x7F && ISPRINT(c)) {
7222 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7223 }
7224 else if (c < 0x10000) {
7225 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7226 }
7227 else {
7228 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7229 }
7230 }
7231 else {
7232 if (c < 0x100) {
7233 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7234 }
7235 else {
7236 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7237 }
7238 }
7239 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7240 rb_str_buf_cat(result, buf, l);
7241 return l;
7242}
7243
/*
 * Returns the backslash escape sequence used for character code +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no dedicated
 * escape.
 */
const char *
ruby_escaped_char(int c)
{
    /* indexed by character code; unlisted entries stay NULL */
    static const char *const escapes[0x80] = {
        [0]    = "\\0",
        [007]  = "\\a",
        [010]  = "\\b",
        [011]  = "\\t",
        [012]  = "\\n",
        [013]  = "\\v",
        [014]  = "\\f",
        [015]  = "\\r",
        [033]  = "\\e",
        [0x7f] = "\\c?",
    };

    if (c < 0 || c >= 0x80) return NULL;
    return escapes[c];
}
7261
7262VALUE
7263rb_str_escape(VALUE str)
7264{
7265 int encidx = ENCODING_GET(str);
7266 rb_encoding *enc = rb_enc_from_index(encidx);
7267 const char *p = RSTRING_PTR(str);
7268 const char *pend = RSTRING_END(str);
7269 const char *prev = p;
7270 char buf[CHAR_ESC_LEN + 1];
7271 VALUE result = rb_str_buf_new(0);
7272 int unicode_p = rb_enc_unicode_p(enc);
7273 int asciicompat = rb_enc_asciicompat(enc);
7274
7275 while (p < pend) {
7276 unsigned int c;
7277 const char *cc;
7278 int n = rb_enc_precise_mbclen(p, pend, enc);
7279 if (!MBCLEN_CHARFOUND_P(n)) {
7280 if (p > prev) str_buf_cat(result, prev, p - prev);
7281 n = rb_enc_mbminlen(enc);
7282 if (pend < p + n)
7283 n = (int)(pend - p);
7284 while (n--) {
7285 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7286 str_buf_cat(result, buf, strlen(buf));
7287 prev = ++p;
7288 }
7289 continue;
7290 }
7291 n = MBCLEN_CHARFOUND_LEN(n);
7292 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7293 p += n;
7294 cc = ruby_escaped_char(c);
7295 if (cc) {
7296 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7297 str_buf_cat(result, cc, strlen(cc));
7298 prev = p;
7299 }
7300 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7301 }
7302 else {
7303 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7304 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7305 prev = p;
7306 }
7307 }
7308 if (p > prev) str_buf_cat(result, prev, p - prev);
7309 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7310
7311 return result;
7312}
7313
7314/*
7315 * call-seq:
7316 * inspect -> string
7317 *
7318 * Returns a printable version of +self+, enclosed in double-quotes,
7319 * and with special characters escaped:
7320 *
7321 * s = "foo\tbar\tbaz\n"
7322 * s.inspect
7323 * # => "\"foo\\tbar\\tbaz\\n\""
7324 *
7325 */
7326
7327VALUE
7329{
7330 int encidx = ENCODING_GET(str);
7331 rb_encoding *enc = rb_enc_from_index(encidx);
7332 const char *p, *pend, *prev;
7333 char buf[CHAR_ESC_LEN + 1];
7334 VALUE result = rb_str_buf_new(0);
7335 rb_encoding *resenc = rb_default_internal_encoding();
7336 int unicode_p = rb_enc_unicode_p(enc);
7337 int asciicompat = rb_enc_asciicompat(enc);
7338
7339 if (resenc == NULL) resenc = rb_default_external_encoding();
7340 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7341 rb_enc_associate(result, resenc);
7342 str_buf_cat2(result, "\"");
7343
7344 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7345 prev = p;
7346 while (p < pend) {
7347 unsigned int c, cc;
7348 int n;
7349
7350 n = rb_enc_precise_mbclen(p, pend, enc);
7351 if (!MBCLEN_CHARFOUND_P(n)) {
7352 if (p > prev) str_buf_cat(result, prev, p - prev);
7353 n = rb_enc_mbminlen(enc);
7354 if (pend < p + n)
7355 n = (int)(pend - p);
7356 while (n--) {
7357 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7358 str_buf_cat(result, buf, strlen(buf));
7359 prev = ++p;
7360 }
7361 continue;
7362 }
7363 n = MBCLEN_CHARFOUND_LEN(n);
7364 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7365 p += n;
7366 if ((asciicompat || unicode_p) &&
7367 (c == '"'|| c == '\\' ||
7368 (c == '#' &&
7369 p < pend &&
7370 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7371 (cc = rb_enc_codepoint(p,pend,enc),
7372 (cc == '$' || cc == '@' || cc == '{'))))) {
7373 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7374 str_buf_cat2(result, "\\");
7375 if (asciicompat || enc == resenc) {
7376 prev = p - n;
7377 continue;
7378 }
7379 }
7380 switch (c) {
7381 case '\n': cc = 'n'; break;
7382 case '\r': cc = 'r'; break;
7383 case '\t': cc = 't'; break;
7384 case '\f': cc = 'f'; break;
7385 case '\013': cc = 'v'; break;
7386 case '\010': cc = 'b'; break;
7387 case '\007': cc = 'a'; break;
7388 case 033: cc = 'e'; break;
7389 default: cc = 0; break;
7390 }
7391 if (cc) {
7392 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7393 buf[0] = '\\';
7394 buf[1] = (char)cc;
7395 str_buf_cat(result, buf, 2);
7396 prev = p;
7397 continue;
7398 }
7399 /* The special casing of 0x85 (NEXT_LINE) here is because
7400 * Oniguruma historically treats it as printable, but it
7401 * doesn't match the print POSIX bracket class or character
7402 * property in regexps.
7403 *
7404 * See Ruby Bug #16842 for details:
7405 * https://bugs.ruby-lang.org/issues/16842
7406 */
7407 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7408 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7409 continue;
7410 }
7411 else {
7412 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7413 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7414 prev = p;
7415 continue;
7416 }
7417 }
7418 if (p > prev) str_buf_cat(result, prev, p - prev);
7419 str_buf_cat2(result, "\"");
7420
7421 return result;
7422}
7423
/* True when ptr is before end and points at '$', '@' or '{', i.e. at a
 * character that would start an interpolation after a '#'.
 * Both arguments are evaluated more than once. */
#define IS_EVSTR(ptr, end) \
    ((ptr) < (end) && (*(ptr) == '$' || *(ptr) == '@' || *(ptr) == '{'))
7425
7426/*
7427 * call-seq:
7428 * dump -> string
7429 *
7430 * Returns a printable version of +self+, enclosed in double-quotes,
7431 * with special characters escaped, and with non-printing characters
7432 * replaced by hexadecimal notation:
7433 *
7434 * "hello \n ''".dump # => "\"hello \\n ''\""
7435 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7436 *
7437 * Related: String#undump (inverse of String#dump).
7438 *
7439 */
7440
7441VALUE
7443{
7444 int encidx = rb_enc_get_index(str);
7445 rb_encoding *enc = rb_enc_from_index(encidx);
7446 long len;
7447 const char *p, *pend;
7448 char *q, *qend;
7449 VALUE result;
7450 int u8 = (encidx == rb_utf8_encindex());
7451 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7452
7453 len = 2; /* "" */
7454 if (!rb_enc_asciicompat(enc)) {
7455 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7456 len += strlen(enc->name);
7457 }
7458
7459 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7460 while (p < pend) {
7461 int clen;
7462 unsigned char c = *p++;
7463
7464 switch (c) {
7465 case '"': case '\\':
7466 case '\n': case '\r':
7467 case '\t': case '\f':
7468 case '\013': case '\010': case '\007': case '\033':
7469 clen = 2;
7470 break;
7471
7472 case '#':
7473 clen = IS_EVSTR(p, pend) ? 2 : 1;
7474 break;
7475
7476 default:
7477 if (ISPRINT(c)) {
7478 clen = 1;
7479 }
7480 else {
7481 if (u8 && c > 0x7F) { /* \u notation */
7482 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7483 if (MBCLEN_CHARFOUND_P(n)) {
7484 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7485 if (cc <= 0xFFFF)
7486 clen = 6; /* \uXXXX */
7487 else if (cc <= 0xFFFFF)
7488 clen = 9; /* \u{XXXXX} */
7489 else
7490 clen = 10; /* \u{XXXXXX} */
7491 p += MBCLEN_CHARFOUND_LEN(n)-1;
7492 break;
7493 }
7494 }
7495 clen = 4; /* \xNN */
7496 }
7497 break;
7498 }
7499
7500 if (clen > LONG_MAX - len) {
7501 rb_raise(rb_eRuntimeError, "string size too big");
7502 }
7503 len += clen;
7504 }
7505
7506 result = rb_str_new(0, len);
7507 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7508 q = RSTRING_PTR(result); qend = q + len + 1;
7509
7510 *q++ = '"';
7511 while (p < pend) {
7512 unsigned char c = *p++;
7513
7514 if (c == '"' || c == '\\') {
7515 *q++ = '\\';
7516 *q++ = c;
7517 }
7518 else if (c == '#') {
7519 if (IS_EVSTR(p, pend)) *q++ = '\\';
7520 *q++ = '#';
7521 }
7522 else if (c == '\n') {
7523 *q++ = '\\';
7524 *q++ = 'n';
7525 }
7526 else if (c == '\r') {
7527 *q++ = '\\';
7528 *q++ = 'r';
7529 }
7530 else if (c == '\t') {
7531 *q++ = '\\';
7532 *q++ = 't';
7533 }
7534 else if (c == '\f') {
7535 *q++ = '\\';
7536 *q++ = 'f';
7537 }
7538 else if (c == '\013') {
7539 *q++ = '\\';
7540 *q++ = 'v';
7541 }
7542 else if (c == '\010') {
7543 *q++ = '\\';
7544 *q++ = 'b';
7545 }
7546 else if (c == '\007') {
7547 *q++ = '\\';
7548 *q++ = 'a';
7549 }
7550 else if (c == '\033') {
7551 *q++ = '\\';
7552 *q++ = 'e';
7553 }
7554 else if (ISPRINT(c)) {
7555 *q++ = c;
7556 }
7557 else {
7558 *q++ = '\\';
7559 if (u8) {
7560 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7561 if (MBCLEN_CHARFOUND_P(n)) {
7562 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7563 p += n;
7564 if (cc <= 0xFFFF)
7565 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7566 else
7567 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7568 q += strlen(q);
7569 continue;
7570 }
7571 }
7572 snprintf(q, qend-q, "x%02X", c);
7573 q += 3;
7574 }
7575 }
7576 *q++ = '"';
7577 *q = '\0';
7578 if (!rb_enc_asciicompat(enc)) {
7579 snprintf(q, qend-q, nonascii_suffix, enc->name);
7580 encidx = rb_ascii8bit_encindex();
7581 }
7582 /* result from dump is ASCII */
7583 rb_enc_associate_index(result, encidx);
7585 return result;
7586}
7587
/*
 * Maps the letter that follows a backslash in a dumped string to the
 * control character it stands for ('n' -> newline, 'e' -> ESC, ...).
 *
 * Returns -1 for any other value, so that the function never falls off
 * its end without returning.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1;
}
7611
7612static void
7613undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7614{
7615 const char *s = *ss;
7616 unsigned int c;
7617 int codelen;
7618 size_t hexlen;
7619 unsigned char buf[6];
7620 static rb_encoding *enc_utf8 = NULL;
7621
7622 switch (*s) {
7623 case '\\':
7624 case '"':
7625 case '#':
7626 rb_str_cat(undumped, s, 1); /* cat itself */
7627 s++;
7628 break;
7629 case 'n':
7630 case 'r':
7631 case 't':
7632 case 'f':
7633 case 'v':
7634 case 'b':
7635 case 'a':
7636 case 'e':
7637 *buf = unescape_ascii(*s);
7638 rb_str_cat(undumped, (char *)buf, 1);
7639 s++;
7640 break;
7641 case 'u':
7642 if (*binary) {
7643 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7644 }
7645 *utf8 = true;
7646 if (++s >= s_end) {
7647 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7648 }
7649 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7650 if (*penc != enc_utf8) {
7651 *penc = enc_utf8;
7652 rb_enc_associate(undumped, enc_utf8);
7653 }
7654 if (*s == '{') { /* handle \u{...} form */
7655 s++;
7656 for (;;) {
7657 if (s >= s_end) {
7658 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7659 }
7660 if (*s == '}') {
7661 s++;
7662 break;
7663 }
7664 if (ISSPACE(*s)) {
7665 s++;
7666 continue;
7667 }
7668 c = scan_hex(s, s_end-s, &hexlen);
7669 if (hexlen == 0 || hexlen > 6) {
7670 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7671 }
7672 if (c > 0x10ffff) {
7673 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7674 }
7675 if (0xd800 <= c && c <= 0xdfff) {
7676 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7677 }
7678 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7679 rb_str_cat(undumped, (char *)buf, codelen);
7680 s += hexlen;
7681 }
7682 }
7683 else { /* handle \uXXXX form */
7684 c = scan_hex(s, 4, &hexlen);
7685 if (hexlen != 4) {
7686 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7687 }
7688 if (0xd800 <= c && c <= 0xdfff) {
7689 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7690 }
7691 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7692 rb_str_cat(undumped, (char *)buf, codelen);
7693 s += hexlen;
7694 }
7695 break;
7696 case 'x':
7697 if (*utf8) {
7698 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7699 }
7700 *binary = true;
7701 if (++s >= s_end) {
7702 rb_raise(rb_eRuntimeError, "invalid hex escape");
7703 }
7704 *buf = scan_hex(s, 2, &hexlen);
7705 if (hexlen != 2) {
7706 rb_raise(rb_eRuntimeError, "invalid hex escape");
7707 }
7708 rb_str_cat(undumped, (char *)buf, 1);
7709 s += hexlen;
7710 break;
7711 default:
7712 rb_str_cat(undumped, s-1, 2);
7713 s++;
7714 }
7715
7716 *ss = s;
7717}
7718
7719static VALUE rb_str_is_ascii_only_p(VALUE str);
7720
7721/*
7722 * call-seq:
7723 * undump -> string
7724 *
7725 * Returns an unescaped version of +self+:
7726 *
7727 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7728 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7729 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7730 * s_undumped == s_orig # => true
7731 *
7732 * Related: String#dump (inverse of String#undump).
7733 *
7734 */
7735
7736static VALUE
7737str_undump(VALUE str)
7738{
7739 const char *s = RSTRING_PTR(str);
7740 const char *s_end = RSTRING_END(str);
7741 rb_encoding *enc = rb_enc_get(str);
7742 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7743 bool utf8 = false;
7744 bool binary = false;
7745 int w;
7746
7748 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7749 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7750 }
7751 if (!str_null_check(str, &w)) {
7752 rb_raise(rb_eRuntimeError, "string contains null byte");
7753 }
7754 if (RSTRING_LEN(str) < 2) goto invalid_format;
7755 if (*s != '"') goto invalid_format;
7756
7757 /* strip '"' at the start */
7758 s++;
7759
7760 for (;;) {
7761 if (s >= s_end) {
7762 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7763 }
7764
7765 if (*s == '"') {
7766 /* epilogue */
7767 s++;
7768 if (s == s_end) {
7769 /* ascii compatible dumped string */
7770 break;
7771 }
7772 else {
7773 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7774 static const char dup_suffix[] = ".dup";
7775 const char *encname;
7776 int encidx;
7777 ptrdiff_t size;
7778
7779 /* check separately for strings dumped by older versions */
7780 size = sizeof(dup_suffix) - 1;
7781 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7782
7783 size = sizeof(force_encoding_suffix) - 1;
7784 if (s_end - s <= size) goto invalid_format;
7785 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7786 s += size;
7787
7788 if (utf8) {
7789 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7790 }
7791
7792 encname = s;
7793 s = memchr(s, '"', s_end-s);
7794 size = s - encname;
7795 if (!s) goto invalid_format;
7796 if (s_end - s != 2) goto invalid_format;
7797 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7798
7799 encidx = rb_enc_find_index2(encname, (long)size);
7800 if (encidx < 0) {
7801 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7802 }
7803 rb_enc_associate_index(undumped, encidx);
7804 }
7805 break;
7806 }
7807
7808 if (*s == '\\') {
7809 s++;
7810 if (s >= s_end) {
7811 rb_raise(rb_eRuntimeError, "invalid escape");
7812 }
7813 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7814 }
7815 else {
7816 rb_str_cat(undumped, s++, 1);
7817 }
7818 }
7819
7820 RB_GC_GUARD(str);
7821
7822 return undumped;
7823invalid_format:
7824 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7825}
7826
7827static void
7828rb_str_check_dummy_enc(rb_encoding *enc)
7829{
7830 if (rb_enc_dummy_p(enc)) {
7831 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7832 rb_enc_name(enc));
7833 }
7834}
7835
7836static rb_encoding *
7837str_true_enc(VALUE str)
7838{
7839 rb_encoding *enc = STR_ENC_GET(str);
7840 rb_str_check_dummy_enc(enc);
7841 return enc;
7842}
7843
7844static OnigCaseFoldType
7845check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7846{
7847 if (argc==0)
7848 return flags;
7849 if (argc>2)
7850 rb_raise(rb_eArgError, "too many options");
7851 if (argv[0]==sym_turkic) {
7852 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7853 if (argc==2) {
7854 if (argv[1]==sym_lithuanian)
7855 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7856 else
7857 rb_raise(rb_eArgError, "invalid second option");
7858 }
7859 }
7860 else if (argv[0]==sym_lithuanian) {
7861 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7862 if (argc==2) {
7863 if (argv[1]==sym_turkic)
7864 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7865 else
7866 rb_raise(rb_eArgError, "invalid second option");
7867 }
7868 }
7869 else if (argc>1)
7870 rb_raise(rb_eArgError, "too many options");
7871 else if (argv[0]==sym_ascii)
7872 flags |= ONIGENC_CASE_ASCII_ONLY;
7873 else if (argv[0]==sym_fold) {
7874 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7875 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7876 else
7877 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7878 }
7879 else
7880 rb_raise(rb_eArgError, "invalid option");
7881 return flags;
7882}
7883
7884static inline bool
7885case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7886{
7887 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7888 return true;
7889 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7890}
7891
/* 16 should be long enough to absorb any kind of single character length increase; the value used below (20) leaves some margin on top of that */
7893#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7894#ifndef CASEMAP_DEBUG
7895# define CASEMAP_DEBUG 0
7896#endif
7897
7898struct mapping_buffer;
7899typedef struct mapping_buffer {
7900 size_t capa;
7901 size_t used;
7902 struct mapping_buffer *next;
7903 OnigUChar space[FLEX_ARY_LEN];
7905
7906static void
7907mapping_buffer_free(void *p)
7908{
7909 mapping_buffer *previous_buffer;
7910 mapping_buffer *current_buffer = p;
7911 while (current_buffer) {
7912 previous_buffer = current_buffer;
7913 current_buffer = current_buffer->next;
7914 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7915 }
7916}
7917
7918static const rb_data_type_t mapping_buffer_type = {
7919 "mapping_buffer",
7920 {0, mapping_buffer_free,},
7921 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7922};
7923
7924static VALUE
7925rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7926{
7927 VALUE target;
7928
7929 const OnigUChar *source_current, *source_end;
7930 int target_length = 0;
7931 VALUE buffer_anchor;
7932 mapping_buffer *current_buffer = 0;
7933 mapping_buffer **pre_buffer;
7934 size_t buffer_count = 0;
7935 int buffer_length_or_invalid;
7936
7937 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7938
7939 source_current = (OnigUChar*)RSTRING_PTR(source);
7940 source_end = (OnigUChar*)RSTRING_END(source);
7941
7942 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7943 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7944 while (source_current < source_end) {
7945 /* increase multiplier using buffer count to converge quickly */
7946 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7947 if (CASEMAP_DEBUG) {
7948 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7949 }
7950 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7951 *pre_buffer = current_buffer;
7952 pre_buffer = &current_buffer->next;
7953 current_buffer->next = NULL;
7954 current_buffer->capa = capa;
7955 buffer_length_or_invalid = enc->case_map(flags,
7956 &source_current, source_end,
7957 current_buffer->space,
7958 current_buffer->space+current_buffer->capa,
7959 enc);
7960 if (buffer_length_or_invalid < 0) {
7961 current_buffer = DATA_PTR(buffer_anchor);
7962 DATA_PTR(buffer_anchor) = 0;
7963 mapping_buffer_free(current_buffer);
7964 rb_raise(rb_eArgError, "input string invalid");
7965 }
7966 target_length += current_buffer->used = buffer_length_or_invalid;
7967 }
7968 if (CASEMAP_DEBUG) {
7969 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7970 }
7971
7972 if (buffer_count==1) {
7973 target = rb_str_new((const char*)current_buffer->space, target_length);
7974 }
7975 else {
7976 char *target_current;
7977
7978 target = rb_str_new(0, target_length);
7979 target_current = RSTRING_PTR(target);
7980 current_buffer = DATA_PTR(buffer_anchor);
7981 while (current_buffer) {
7982 memcpy(target_current, current_buffer->space, current_buffer->used);
7983 target_current += current_buffer->used;
7984 current_buffer = current_buffer->next;
7985 }
7986 }
7987 current_buffer = DATA_PTR(buffer_anchor);
7988 DATA_PTR(buffer_anchor) = 0;
7989 mapping_buffer_free(current_buffer);
7990
7991 RB_GC_GUARD(buffer_anchor);
7992
7993 /* TODO: check about string terminator character */
7994 str_enc_copy_direct(target, source);
7995 /*ENC_CODERANGE_SET(mapped, cr);*/
7996
7997 return target;
7998}
7999
8000static VALUE
8001rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
8002{
8003 const OnigUChar *source_current, *source_end;
8004 OnigUChar *target_current, *target_end;
8005 long old_length = RSTRING_LEN(source);
8006 int length_or_invalid;
8007
8008 if (old_length == 0) return Qnil;
8009
8010 source_current = (OnigUChar*)RSTRING_PTR(source);
8011 source_end = (OnigUChar*)RSTRING_END(source);
8012 if (source == target) {
8013 target_current = (OnigUChar*)source_current;
8014 target_end = (OnigUChar*)source_end;
8015 }
8016 else {
8017 target_current = (OnigUChar*)RSTRING_PTR(target);
8018 target_end = (OnigUChar*)RSTRING_END(target);
8019 }
8020
8021 length_or_invalid = onigenc_ascii_only_case_map(flags,
8022 &source_current, source_end,
8023 target_current, target_end, enc);
8024 if (length_or_invalid < 0)
8025 rb_raise(rb_eArgError, "input string invalid");
8026 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8027 fprintf(stderr, "problem with rb_str_ascii_casemap"
8028 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8029 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8030 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8031 }
8032
8033 str_enc_copy(target, source);
8034
8035 return target;
8036}
8037
8038static bool
8039upcase_single(VALUE str)
8040{
8041 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8042 bool modified = false;
8043
8044 while (s < send) {
8045 unsigned int c = *(unsigned char*)s;
8046
8047 if ('a' <= c && c <= 'z') {
8048 *s = 'A' + (c - 'a');
8049 modified = true;
8050 }
8051 s++;
8052 }
8053 return modified;
8054}
8055
8056/*
8057 * call-seq:
8058 * upcase!(mapping) -> self or nil
8059 *
8060 * Upcases the characters in +self+;
8061 * returns +self+ if any changes were made, +nil+ otherwise:
8062 *
8063 * s = 'Hello World!' # => "Hello World!"
8064 * s.upcase! # => "HELLO WORLD!"
8065 * s # => "HELLO WORLD!"
8066 * s.upcase! # => nil
8067 *
8068 * The casing may be affected by the given +mapping+;
8069 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8070 *
8071 * Related: String#upcase, String#downcase, String#downcase!.
8072 *
8073 */
8074
8075static VALUE
8076rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8077{
8078 rb_encoding *enc;
8079 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8080
8081 flags = check_case_options(argc, argv, flags);
8082 str_modify_keep_cr(str);
8083 enc = str_true_enc(str);
8084 if (case_option_single_p(flags, enc, str)) {
8085 if (upcase_single(str))
8086 flags |= ONIGENC_CASE_MODIFIED;
8087 }
8088 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8089 rb_str_ascii_casemap(str, str, &flags, enc);
8090 else
8091 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8092
8093 if (ONIGENC_CASE_MODIFIED&flags) return str;
8094 return Qnil;
8095}
8096
8097
8098/*
8099 * call-seq:
8100 * upcase(mapping) -> string
8101 *
8102 * Returns a string containing the upcased characters in +self+:
8103 *
8104 * s = 'Hello World!' # => "Hello World!"
8105 * s.upcase # => "HELLO WORLD!"
8106 *
8107 * The casing may be affected by the given +mapping+;
8108 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8109 *
8110 * Related: String#upcase!, String#downcase, String#downcase!.
8111 *
8112 */
8113
8114static VALUE
8115rb_str_upcase(int argc, VALUE *argv, VALUE str)
8116{
8117 rb_encoding *enc;
8118 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8119 VALUE ret;
8120
8121 flags = check_case_options(argc, argv, flags);
8122 enc = str_true_enc(str);
8123 if (case_option_single_p(flags, enc, str)) {
8124 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8125 str_enc_copy_direct(ret, str);
8126 upcase_single(ret);
8127 }
8128 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8129 ret = rb_str_new(0, RSTRING_LEN(str));
8130 rb_str_ascii_casemap(str, ret, &flags, enc);
8131 }
8132 else {
8133 ret = rb_str_casemap(str, &flags, enc);
8134 }
8135
8136 return ret;
8137}
8138
8139static bool
8140downcase_single(VALUE str)
8141{
8142 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8143 bool modified = false;
8144
8145 while (s < send) {
8146 unsigned int c = *(unsigned char*)s;
8147
8148 if ('A' <= c && c <= 'Z') {
8149 *s = 'a' + (c - 'A');
8150 modified = true;
8151 }
8152 s++;
8153 }
8154
8155 return modified;
8156}
8157
8158/*
8159 * call-seq:
8160 * downcase!(mapping) -> self or nil
8161 *
8162 * Downcases the characters in +self+;
8163 * returns +self+ if any changes were made, +nil+ otherwise:
8164 *
8165 * s = 'Hello World!' # => "Hello World!"
8166 * s.downcase! # => "hello world!"
8167 * s # => "hello world!"
8168 * s.downcase! # => nil
8169 *
8170 * The casing may be affected by the given +mapping+;
8171 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8172 *
8173 * Related: String#downcase, String#upcase, String#upcase!.
8174 *
8175 */
8176
8177static VALUE
8178rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8179{
8180 rb_encoding *enc;
8181 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8182
8183 flags = check_case_options(argc, argv, flags);
8184 str_modify_keep_cr(str);
8185 enc = str_true_enc(str);
8186 if (case_option_single_p(flags, enc, str)) {
8187 if (downcase_single(str))
8188 flags |= ONIGENC_CASE_MODIFIED;
8189 }
8190 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8191 rb_str_ascii_casemap(str, str, &flags, enc);
8192 else
8193 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8194
8195 if (ONIGENC_CASE_MODIFIED&flags) return str;
8196 return Qnil;
8197}
8198
8199
8200/*
8201 * call-seq:
8202 * downcase(mapping) -> string
8203 *
8204 * Returns a string containing the downcased characters in +self+:
8205 *
8206 * s = 'Hello World!' # => "Hello World!"
8207 * s.downcase # => "hello world!"
8208 *
8209 * The casing may be affected by the given +mapping+;
8210 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8211 *
8212 * Related: String#downcase!, String#upcase, String#upcase!.
8213 *
8214 */
8215
8216static VALUE
8217rb_str_downcase(int argc, VALUE *argv, VALUE str)
8218{
8219 rb_encoding *enc;
8220 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8221 VALUE ret;
8222
8223 flags = check_case_options(argc, argv, flags);
8224 enc = str_true_enc(str);
8225 if (case_option_single_p(flags, enc, str)) {
8226 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8227 str_enc_copy_direct(ret, str);
8228 downcase_single(ret);
8229 }
8230 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8231 ret = rb_str_new(0, RSTRING_LEN(str));
8232 rb_str_ascii_casemap(str, ret, &flags, enc);
8233 }
8234 else {
8235 ret = rb_str_casemap(str, &flags, enc);
8236 }
8237
8238 return ret;
8239}
8240
8241
8242/*
8243 * call-seq:
8244 * capitalize!(mapping = :ascii) -> self or nil
8245 *
8246 * Like String#capitalize, except that:
8247 *
8248 * - Changes character casings in +self+ (not in a copy of +self+).
8249 * - Returns +self+ if any changes are made, +nil+ otherwise.
8250 *
8251 * Related: See {Modifying}[rdoc-ref:String@Modifying].
8252 */
8253
8254static VALUE
8255rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8256{
8257 rb_encoding *enc;
8258 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8259
8260 flags = check_case_options(argc, argv, flags);
8261 str_modify_keep_cr(str);
8262 enc = str_true_enc(str);
8263 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8264 if (flags&ONIGENC_CASE_ASCII_ONLY)
8265 rb_str_ascii_casemap(str, str, &flags, enc);
8266 else
8267 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8268
8269 if (ONIGENC_CASE_MODIFIED&flags) return str;
8270 return Qnil;
8271}
8272
8273
8274/*
8275 * call-seq:
8276 * capitalize(mapping = :ascii) -> string
8277 *
8278 * Returns a string containing the characters in +self+,
8279 * each with possibly changed case:
8280 *
8281 * - The first character is upcased.
8282 * - All other characters are downcased.
8283 *
8284 * Examples:
8285 *
8286 * 'hello world'.capitalize # => "Hello world"
8287 * 'HELLO WORLD'.capitalize # => "Hello world"
8288 *
8289 * Some characters do not have upcase and downcase, and so are not changed;
8290 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc]:
8291 *
8292 * '1, 2, 3, ...'.capitalize # => "1, 2, 3, ..."
8293 *
8294 * The casing is affected by the given +mapping+,
8295 * which may be +:ascii+, +:fold+, or +:turkic+;
8296 * see {Case Mappings}[rdoc-ref:case_mapping.rdoc@Case+Mappings].
8297 *
8298 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
8299 */
8300
8301static VALUE
8302rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8303{
8304 rb_encoding *enc;
8305 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8306 VALUE ret;
8307
8308 flags = check_case_options(argc, argv, flags);
8309 enc = str_true_enc(str);
8310 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8311 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8312 ret = rb_str_new(0, RSTRING_LEN(str));
8313 rb_str_ascii_casemap(str, ret, &flags, enc);
8314 }
8315 else {
8316 ret = rb_str_casemap(str, &flags, enc);
8317 }
8318 return ret;
8319}
8320
8321
8322/*
8323 * call-seq:
8324 * swapcase!(mapping) -> self or nil
8325 *
8326 * Upcases each lowercase character in +self+;
8327 * downcases uppercase character;
8328 * returns +self+ if any changes were made, +nil+ otherwise:
8329 *
8330 * s = 'Hello World!' # => "Hello World!"
8331 * s.swapcase! # => "hELLO wORLD!"
8332 * s # => "hELLO wORLD!"
8333 * ''.swapcase! # => nil
8334 *
8335 * The casing may be affected by the given +mapping+;
8336 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8337 *
8338 * Related: String#swapcase.
8339 *
8340 */
8341
8342static VALUE
8343rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8344{
8345 rb_encoding *enc;
8346 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8347
8348 flags = check_case_options(argc, argv, flags);
8349 str_modify_keep_cr(str);
8350 enc = str_true_enc(str);
8351 if (flags&ONIGENC_CASE_ASCII_ONLY)
8352 rb_str_ascii_casemap(str, str, &flags, enc);
8353 else
8354 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8355
8356 if (ONIGENC_CASE_MODIFIED&flags) return str;
8357 return Qnil;
8358}
8359
8360
8361/*
8362 * call-seq:
8363 * swapcase(mapping) -> string
8364 *
8365 * Returns a string containing the characters in +self+, with cases reversed;
8366 * each uppercase character is downcased;
8367 * each lowercase character is upcased:
8368 *
8369 * s = 'Hello World!' # => "Hello World!"
8370 * s.swapcase # => "hELLO wORLD!"
8371 *
8372 * The casing may be affected by the given +mapping+;
8373 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8374 *
8375 * Related: String#swapcase!.
8376 *
8377 */
8378
8379static VALUE
8380rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8381{
8382 rb_encoding *enc;
8383 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8384 VALUE ret;
8385
8386 flags = check_case_options(argc, argv, flags);
8387 enc = str_true_enc(str);
8388 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8389 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8390 ret = rb_str_new(0, RSTRING_LEN(str));
8391 rb_str_ascii_casemap(str, ret, &flags, enc);
8392 }
8393 else {
8394 ret = rb_str_casemap(str, &flags, enc);
8395 }
8396 return ret;
8397}
8398
typedef unsigned char *USTR;

/* Cursor over a character specification, as consumed by trnext(). */
struct tr {
    int gen;            /* non-zero while a range is being expanded */
    unsigned int now;   /* code point returned most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* end of the specification */
};
8406
8407static unsigned int
8408trnext(struct tr *t, rb_encoding *enc)
8409{
8410 int n;
8411
8412 for (;;) {
8413 nextpart:
8414 if (!t->gen) {
8415 if (t->p == t->pend) return -1;
8416 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8417 t->p += n;
8418 }
8419 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8420 t->p += n;
8421 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8422 t->p += n;
8423 if (t->p < t->pend) {
8424 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8425 t->p += n;
8426 if (t->now > c) {
8427 if (t->now < 0x80 && c < 0x80) {
8428 rb_raise(rb_eArgError,
8429 "invalid range \"%c-%c\" in string transliteration",
8430 t->now, c);
8431 }
8432 else {
8433 rb_raise(rb_eArgError, "invalid range in string transliteration");
8434 }
8435 continue; /* not reached */
8436 }
8437 else if (t->now < c) {
8438 t->gen = 1;
8439 t->max = c;
8440 }
8441 }
8442 }
8443 return t->now;
8444 }
8445 else {
8446 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8447 if (t->now == t->max) {
8448 t->gen = 0;
8449 goto nextpart;
8450 }
8451 }
8452 if (t->now < t->max) {
8453 return t->now;
8454 }
8455 else {
8456 t->gen = 0;
8457 return t->max;
8458 }
8459 }
8460 }
8461}
8462
8463static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8464
8465static VALUE
8466tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8467{
8468 const unsigned int errc = -1;
8469 unsigned int trans[256];
8470 rb_encoding *enc, *e1, *e2;
8471 struct tr trsrc, trrepl;
8472 int cflag = 0;
8473 unsigned int c, c0, last = 0;
8474 int modify = 0, i, l;
8475 unsigned char *s, *send;
8476 VALUE hash = 0;
8477 int singlebyte = single_byte_optimizable(str);
8478 int termlen;
8479 int cr;
8480
8481#define CHECK_IF_ASCII(c) \
8482 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8483 (cr = ENC_CODERANGE_VALID) : 0)
8484
8485 StringValue(src);
8486 StringValue(repl);
8487 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8488 if (RSTRING_LEN(repl) == 0) {
8489 return rb_str_delete_bang(1, &src, str);
8490 }
8491
8492 cr = ENC_CODERANGE(str);
8493 e1 = rb_enc_check(str, src);
8494 e2 = rb_enc_check(str, repl);
8495 if (e1 == e2) {
8496 enc = e1;
8497 }
8498 else {
8499 enc = rb_enc_check(src, repl);
8500 }
8501 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8502 if (RSTRING_LEN(src) > 1 &&
8503 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8504 trsrc.p + l < trsrc.pend) {
8505 cflag = 1;
8506 trsrc.p += l;
8507 }
8508 trrepl.p = RSTRING_PTR(repl);
8509 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8510 trsrc.gen = trrepl.gen = 0;
8511 trsrc.now = trrepl.now = 0;
8512 trsrc.max = trrepl.max = 0;
8513
8514 if (cflag) {
8515 for (i=0; i<256; i++) {
8516 trans[i] = 1;
8517 }
8518 while ((c = trnext(&trsrc, enc)) != errc) {
8519 if (c < 256) {
8520 trans[c] = errc;
8521 }
8522 else {
8523 if (!hash) hash = rb_hash_new();
8524 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8525 }
8526 }
8527 while ((c = trnext(&trrepl, enc)) != errc)
8528 /* retrieve last replacer */;
8529 last = trrepl.now;
8530 for (i=0; i<256; i++) {
8531 if (trans[i] != errc) {
8532 trans[i] = last;
8533 }
8534 }
8535 }
8536 else {
8537 unsigned int r;
8538
8539 for (i=0; i<256; i++) {
8540 trans[i] = errc;
8541 }
8542 while ((c = trnext(&trsrc, enc)) != errc) {
8543 r = trnext(&trrepl, enc);
8544 if (r == errc) r = trrepl.now;
8545 if (c < 256) {
8546 trans[c] = r;
8547 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8548 }
8549 else {
8550 if (!hash) hash = rb_hash_new();
8551 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8552 }
8553 }
8554 }
8555
8556 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8557 cr = ENC_CODERANGE_7BIT;
8558 str_modify_keep_cr(str);
8559 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8560 termlen = rb_enc_mbminlen(enc);
8561 if (sflag) {
8562 int clen, tlen;
8563 long offset, max = RSTRING_LEN(str);
8564 unsigned int save = -1;
8565 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8566
8567 while (s < send) {
8568 int may_modify = 0;
8569
8570 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8571 if (!MBCLEN_CHARFOUND_P(r)) {
8572 xfree(buf);
8573 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8574 }
8575 clen = MBCLEN_CHARFOUND_LEN(r);
8576 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8577
8578 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8579
8580 s += clen;
8581 if (c < 256) {
8582 c = trans[c];
8583 }
8584 else if (hash) {
8585 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8586 if (NIL_P(tmp)) {
8587 if (cflag) c = last;
8588 else c = errc;
8589 }
8590 else if (cflag) c = errc;
8591 else c = NUM2INT(tmp);
8592 }
8593 else {
8594 c = errc;
8595 }
8596 if (c != (unsigned int)-1) {
8597 if (save == c) {
8598 CHECK_IF_ASCII(c);
8599 continue;
8600 }
8601 save = c;
8602 tlen = rb_enc_codelen(c, enc);
8603 modify = 1;
8604 }
8605 else {
8606 save = -1;
8607 c = c0;
8608 if (enc != e1) may_modify = 1;
8609 }
8610 if ((offset = t - buf) + tlen > max) {
8611 size_t MAYBE_UNUSED(old) = max + termlen;
8612 max = offset + tlen + (send - s);
8613 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8614 t = buf + offset;
8615 }
8616 rb_enc_mbcput(c, t, enc);
8617 if (may_modify && memcmp(s, t, tlen) != 0) {
8618 modify = 1;
8619 }
8620 CHECK_IF_ASCII(c);
8621 t += tlen;
8622 }
8623 if (!STR_EMBED_P(str)) {
8624 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8625 }
8626 TERM_FILL((char *)t, termlen);
8627 RSTRING(str)->as.heap.ptr = (char *)buf;
8628 STR_SET_LEN(str, t - buf);
8629 STR_SET_NOEMBED(str);
8630 RSTRING(str)->as.heap.aux.capa = max;
8631 }
8632 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8633 while (s < send) {
8634 c = (unsigned char)*s;
8635 if (trans[c] != errc) {
8636 if (!cflag) {
8637 c = trans[c];
8638 *s = c;
8639 modify = 1;
8640 }
8641 else {
8642 *s = last;
8643 modify = 1;
8644 }
8645 }
8646 CHECK_IF_ASCII(c);
8647 s++;
8648 }
8649 }
8650 else {
8651 int clen, tlen;
8652 long offset, max = (long)((send - s) * 1.2);
8653 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8654
8655 while (s < send) {
8656 int may_modify = 0;
8657
8658 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8659 if (!MBCLEN_CHARFOUND_P(r)) {
8660 xfree(buf);
8661 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8662 }
8663 clen = MBCLEN_CHARFOUND_LEN(r);
8664 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8665
8666 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8667
8668 if (c < 256) {
8669 c = trans[c];
8670 }
8671 else if (hash) {
8672 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8673 if (NIL_P(tmp)) {
8674 if (cflag) c = last;
8675 else c = errc;
8676 }
8677 else if (cflag) c = errc;
8678 else c = NUM2INT(tmp);
8679 }
8680 else {
8681 c = cflag ? last : errc;
8682 }
8683 if (c != errc) {
8684 tlen = rb_enc_codelen(c, enc);
8685 modify = 1;
8686 }
8687 else {
8688 c = c0;
8689 if (enc != e1) may_modify = 1;
8690 }
8691 if ((offset = t - buf) + tlen > max) {
8692 size_t MAYBE_UNUSED(old) = max + termlen;
8693 max = offset + tlen + (long)((send - s) * 1.2);
8694 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8695 t = buf + offset;
8696 }
8697 if (s != t) {
8698 rb_enc_mbcput(c, t, enc);
8699 if (may_modify && memcmp(s, t, tlen) != 0) {
8700 modify = 1;
8701 }
8702 }
8703 CHECK_IF_ASCII(c);
8704 s += clen;
8705 t += tlen;
8706 }
8707 if (!STR_EMBED_P(str)) {
8708 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8709 }
8710 TERM_FILL((char *)t, termlen);
8711 RSTRING(str)->as.heap.ptr = (char *)buf;
8712 STR_SET_LEN(str, t - buf);
8713 STR_SET_NOEMBED(str);
8714 RSTRING(str)->as.heap.aux.capa = max;
8715 }
8716
8717 if (modify) {
8718 if (cr != ENC_CODERANGE_BROKEN)
8719 ENC_CODERANGE_SET(str, cr);
8720 rb_enc_associate(str, enc);
8721 return str;
8722 }
8723 return Qnil;
8724}
8725
8726
8727/*
8728 * call-seq:
8729 * tr!(selector, replacements) -> self or nil
8730 *
8731 * Like String#tr, but modifies +self+ in place.
8732 * Returns +self+ if any changes were made, +nil+ otherwise.
8733 *
8734 */
8735
8736static VALUE
8737rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8738{
8739 return tr_trans(str, src, repl, 0);
8740}
8741
8742
8743/*
8744 * call-seq:
8745 * tr(selector, replacements) -> new_string
8746 *
8747 * Returns a copy of +self+ with each character specified by string +selector+
8748 * translated to the corresponding character in string +replacements+.
8749 * The correspondence is _positional_:
8750 *
8751 * - Each occurrence of the first character specified by +selector+
8752 * is translated to the first character in +replacements+.
8753 * - Each occurrence of the second character specified by +selector+
8754 * is translated to the second character in +replacements+.
8755 * - And so on.
8756 *
8757 * Example:
8758 *
8759 * 'hello'.tr('el', 'ip') #=> "hippo"
8760 *
8761 * If +replacements+ is shorter than +selector+,
8762 * it is implicitly padded with its own last character:
8763 *
8764 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8765 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8766 *
8767 * Arguments +selector+ and +replacements+ must be valid character selectors
8768 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8769 * and may use any of its valid forms, including negation, ranges, and escaping:
8770 *
8771 * # Negation.
8772 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8773 * # Ranges.
8774 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8775 * # Escapes.
8776 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8777 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8778 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8779 *
8780 */
8781
8782static VALUE
8783rb_str_tr(VALUE str, VALUE src, VALUE repl)
8784{
8785 str = str_duplicate(rb_cString, str);
8786 tr_trans(str, src, repl, 0);
8787 return str;
8788}
8789
8790#define TR_TABLE_MAX (UCHAR_MAX+1)
8791#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8792static void
8793tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8794 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8795{
8796 const unsigned int errc = -1;
8797 char buf[TR_TABLE_MAX];
8798 struct tr tr;
8799 unsigned int c;
8800 VALUE table = 0, ptable = 0;
8801 int i, l, cflag = 0;
8802
8803 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8804 tr.gen = tr.now = tr.max = 0;
8805
8806 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8807 cflag = 1;
8808 tr.p += l;
8809 }
8810 if (first) {
8811 for (i=0; i<TR_TABLE_MAX; i++) {
8812 stable[i] = 1;
8813 }
8814 stable[TR_TABLE_MAX] = cflag;
8815 }
8816 else if (stable[TR_TABLE_MAX] && !cflag) {
8817 stable[TR_TABLE_MAX] = 0;
8818 }
8819 for (i=0; i<TR_TABLE_MAX; i++) {
8820 buf[i] = cflag;
8821 }
8822
8823 while ((c = trnext(&tr, enc)) != errc) {
8824 if (c < TR_TABLE_MAX) {
8825 buf[(unsigned char)c] = !cflag;
8826 }
8827 else {
8828 VALUE key = UINT2NUM(c);
8829
8830 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8831 if (cflag) {
8832 ptable = *ctablep;
8833 table = ptable ? ptable : rb_hash_new();
8834 *ctablep = table;
8835 }
8836 else {
8837 table = rb_hash_new();
8838 ptable = *tablep;
8839 *tablep = table;
8840 }
8841 }
8842 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8843 rb_hash_aset(table, key, Qtrue);
8844 }
8845 }
8846 }
8847 for (i=0; i<TR_TABLE_MAX; i++) {
8848 stable[i] = stable[i] && buf[i];
8849 }
8850 if (!table && !cflag) {
8851 *tablep = 0;
8852 }
8853}
8854
8855
8856static int
8857tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8858{
8859 if (c < TR_TABLE_MAX) {
8860 return table[c] != 0;
8861 }
8862 else {
8863 VALUE v = UINT2NUM(c);
8864
8865 if (del) {
8866 if (!NIL_P(rb_hash_lookup(del, v)) &&
8867 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8868 return TRUE;
8869 }
8870 }
8871 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8872 return FALSE;
8873 }
8874 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8875 }
8876}
8877
8878/*
8879 * call-seq:
8880 * delete!(*selectors) -> self or nil
8881 *
8882 * Like String#delete, but modifies +self+ in place.
8883 * Returns +self+ if any changes were made, +nil+ otherwise.
8884 *
8885 */
8886
8887static VALUE
8888rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8889{
8890 char squeez[TR_TABLE_SIZE];
8891 rb_encoding *enc = 0;
8892 char *s, *send, *t;
8893 VALUE del = 0, nodel = 0;
8894 int modify = 0;
8895 int i, ascompat, cr;
8896
8897 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8899 for (i=0; i<argc; i++) {
8900 VALUE s = argv[i];
8901
8902 StringValue(s);
8903 enc = rb_enc_check(str, s);
8904 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8905 }
8906
8907 str_modify_keep_cr(str);
8908 ascompat = rb_enc_asciicompat(enc);
8909 s = t = RSTRING_PTR(str);
8910 send = RSTRING_END(str);
8911 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8912 while (s < send) {
8913 unsigned int c;
8914 int clen;
8915
8916 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8917 if (squeez[c]) {
8918 modify = 1;
8919 }
8920 else {
8921 if (t != s) *t = c;
8922 t++;
8923 }
8924 s++;
8925 }
8926 else {
8927 c = rb_enc_codepoint_len(s, send, &clen, enc);
8928
8929 if (tr_find(c, squeez, del, nodel)) {
8930 modify = 1;
8931 }
8932 else {
8933 if (t != s) rb_enc_mbcput(c, t, enc);
8934 t += clen;
8936 }
8937 s += clen;
8938 }
8939 }
8940 TERM_FILL(t, TERM_LEN(str));
8941 STR_SET_LEN(str, t - RSTRING_PTR(str));
8942 ENC_CODERANGE_SET(str, cr);
8943
8944 if (modify) return str;
8945 return Qnil;
8946}
8947
8948
8949/*
8950 * call-seq:
8951 * delete(*selectors) -> new_string
8952 *
8953 * Returns a copy of +self+ with characters specified by +selectors+ removed
8954 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8955 *
8956 * "hello".delete "l","lo" #=> "heo"
8957 * "hello".delete "lo" #=> "he"
8958 * "hello".delete "aeiou", "^e" #=> "hell"
8959 * "hello".delete "ej-m" #=> "ho"
8960 *
8961 */
8962
8963static VALUE
8964rb_str_delete(int argc, VALUE *argv, VALUE str)
8965{
8966 str = str_duplicate(rb_cString, str);
8967 rb_str_delete_bang(argc, argv, str);
8968 return str;
8969}
8970
8971
8972/*
8973 * call-seq:
8974 * squeeze!(*selectors) -> self or nil
8975 *
8976 * Like String#squeeze, but modifies +self+ in place.
8977 * Returns +self+ if any changes were made, +nil+ otherwise.
8978 */
8979
8980static VALUE
8981rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8982{
8983 char squeez[TR_TABLE_SIZE];
8984 rb_encoding *enc = 0;
8985 VALUE del = 0, nodel = 0;
8986 unsigned char *s, *send, *t;
8987 int i, modify = 0;
8988 int ascompat, singlebyte = single_byte_optimizable(str);
8989 unsigned int save;
8990
8991 if (argc == 0) {
8992 enc = STR_ENC_GET(str);
8993 }
8994 else {
8995 for (i=0; i<argc; i++) {
8996 VALUE s = argv[i];
8997
8998 StringValue(s);
8999 enc = rb_enc_check(str, s);
9000 if (singlebyte && !single_byte_optimizable(s))
9001 singlebyte = 0;
9002 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9003 }
9004 }
9005
9006 str_modify_keep_cr(str);
9007 s = t = (unsigned char *)RSTRING_PTR(str);
9008 if (!s || RSTRING_LEN(str) == 0) return Qnil;
9009 send = (unsigned char *)RSTRING_END(str);
9010 save = -1;
9011 ascompat = rb_enc_asciicompat(enc);
9012
9013 if (singlebyte) {
9014 while (s < send) {
9015 unsigned int c = *s++;
9016 if (c != save || (argc > 0 && !squeez[c])) {
9017 *t++ = save = c;
9018 }
9019 }
9020 }
9021 else {
9022 while (s < send) {
9023 unsigned int c;
9024 int clen;
9025
9026 if (ascompat && (c = *s) < 0x80) {
9027 if (c != save || (argc > 0 && !squeez[c])) {
9028 *t++ = save = c;
9029 }
9030 s++;
9031 }
9032 else {
9033 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9034
9035 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9036 if (t != s) rb_enc_mbcput(c, t, enc);
9037 save = c;
9038 t += clen;
9039 }
9040 s += clen;
9041 }
9042 }
9043 }
9044
9045 TERM_FILL((char *)t, TERM_LEN(str));
9046 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9047 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9048 modify = 1;
9049 }
9050
9051 if (modify) return str;
9052 return Qnil;
9053}
9054
9055
9056/*
9057 * call-seq:
9058 * squeeze(*selectors) -> new_string
9059 *
9060 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9061 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9062 *
9063 * "Squeezed" means that each multiple-character run of a selected character
9064 * is squeezed down to a single character;
9065 * with no arguments given, squeezes all characters:
9066 *
9067 * "yellow moon".squeeze #=> "yelow mon"
9068 * " now is the".squeeze(" ") #=> " now is the"
9069 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9070 *
9071 */
9072
9073static VALUE
9074rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9075{
9076 str = str_duplicate(rb_cString, str);
9077 rb_str_squeeze_bang(argc, argv, str);
9078 return str;
9079}
9080
9081
9082/*
9083 * call-seq:
9084 * tr_s!(selector, replacements) -> self or nil
9085 *
9086 * Like String#tr_s, but modifies +self+ in place.
9087 * Returns +self+ if any changes were made, +nil+ otherwise.
9088 *
9089 * Related: String#squeeze!.
9090 */
9091
9092static VALUE
9093rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9094{
9095 return tr_trans(str, src, repl, 1);
9096}
9097
9098
9099/*
9100 * call-seq:
9101 * tr_s(selector, replacements) -> string
9102 *
9103 * Like String#tr, but also squeezes the modified portions of the translated string;
9104 * returns a new string (translated and squeezed).
9105 *
9106 * 'hello'.tr_s('l', 'r') #=> "hero"
9107 * 'hello'.tr_s('el', '-') #=> "h-o"
9108 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9109 *
9110 * Related: String#squeeze.
9111 *
9112 */
9113
9114static VALUE
9115rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9116{
9117 str = str_duplicate(rb_cString, str);
9118 tr_trans(str, src, repl, 1);
9119 return str;
9120}
9121
9122
9123/*
9124 * call-seq:
9125 * count(*selectors) -> integer
9126 *
9127 * Returns the total number of characters in +self+
9128 * that are specified by the given +selectors+
9129 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9130 *
9131 * a = "hello world"
9132 * a.count "lo" #=> 5
9133 * a.count "lo", "o" #=> 2
9134 * a.count "hello", "^l" #=> 4
9135 * a.count "ej-m" #=> 4
9136 *
9137 * "hello^world".count "\\^aeiou" #=> 4
9138 * "hello-world".count "a\\-eo" #=> 4
9139 *
9140 * c = "hello world\\r\\n"
9141 * c.count "\\" #=> 2
9142 * c.count "\\A" #=> 0
9143 * c.count "X-\\w" #=> 3
9144 */
9145
9146static VALUE
9147rb_str_count(int argc, VALUE *argv, VALUE str)
9148{
9149 char table[TR_TABLE_SIZE];
9150 rb_encoding *enc = 0;
9151 VALUE del = 0, nodel = 0, tstr;
9152 char *s, *send;
9153 int i;
9154 int ascompat;
9155 size_t n = 0;
9156
9158
9159 tstr = argv[0];
9160 StringValue(tstr);
9161 enc = rb_enc_check(str, tstr);
9162 if (argc == 1) {
9163 const char *ptstr;
9164 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9165 (ptstr = RSTRING_PTR(tstr),
9166 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9167 !is_broken_string(str)) {
9168 int clen;
9169 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9170
9171 s = RSTRING_PTR(str);
9172 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9173 send = RSTRING_END(str);
9174 while (s < send) {
9175 if (*(unsigned char*)s++ == c) n++;
9176 }
9177 return SIZET2NUM(n);
9178 }
9179 }
9180
9181 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9182 for (i=1; i<argc; i++) {
9183 tstr = argv[i];
9184 StringValue(tstr);
9185 enc = rb_enc_check(str, tstr);
9186 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9187 }
9188
9189 s = RSTRING_PTR(str);
9190 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9191 send = RSTRING_END(str);
9192 ascompat = rb_enc_asciicompat(enc);
9193 while (s < send) {
9194 unsigned int c;
9195
9196 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9197 if (table[c]) {
9198 n++;
9199 }
9200 s++;
9201 }
9202 else {
9203 int clen;
9204 c = rb_enc_codepoint_len(s, send, &clen, enc);
9205 if (tr_find(c, table, del, nodel)) {
9206 n++;
9207 }
9208 s += clen;
9209 }
9210 }
9211
9212 return SIZET2NUM(n);
9213}
9214
9215static VALUE
9216rb_fs_check(VALUE val)
9217{
9218 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9219 val = rb_check_string_type(val);
9220 if (NIL_P(val)) return 0;
9221 }
9222 return val;
9223}
9224
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace
 * bytes, 0 for everything else (unlisted entries are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1, /* \t */
    [0x0a] = 1, /* \n */
    [0x0b] = 1, /* \v */
    [0x0c] = 1, /* \f */
    [0x0d] = 1, /* \r */
    [0x20] = 1, /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9245
9246static long
9247split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9248{
9249 if (empty_count >= 0 && len == 0) {
9250 return empty_count + 1;
9251 }
9252 if (empty_count > 0) {
9253 /* make different substrings */
9254 if (result) {
9255 do {
9256 rb_ary_push(result, str_new_empty_String(str));
9257 } while (--empty_count > 0);
9258 }
9259 else {
9260 do {
9261 rb_yield(str_new_empty_String(str));
9262 } while (--empty_count > 0);
9263 }
9264 }
9265 str = rb_str_subseq(str, beg, len);
9266 if (result) {
9267 rb_ary_push(result, str);
9268 }
9269 else {
9270 rb_yield(str);
9271 }
9272 return empty_count;
9273}
9274
/* Splitting strategy selected by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
9278
9279static split_type_t
9280literal_split_pattern(VALUE spat, split_type_t default_type)
9281{
9282 rb_encoding *enc = STR_ENC_GET(spat);
9283 const char *ptr;
9284 long len;
9285 RSTRING_GETMEM(spat, ptr, len);
9286 if (len == 0) {
9287 /* Special case - split into chars */
9288 return SPLIT_TYPE_CHARS;
9289 }
9290 else if (rb_enc_asciicompat(enc)) {
9291 if (len == 1 && ptr[0] == ' ') {
9292 return SPLIT_TYPE_AWK;
9293 }
9294 }
9295 else {
9296 int l;
9297 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9298 return SPLIT_TYPE_AWK;
9299 }
9300 }
9301 return default_type;
9302}
9303
9304/*
9305 * call-seq:
9306 * split(field_sep = $;, limit = 0) -> array
9307 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9308 *
9309 * :include: doc/string/split.rdoc
9310 *
9311 */
9312
9313static VALUE
9314rb_str_split_m(int argc, VALUE *argv, VALUE str)
9315{
9316 rb_encoding *enc;
9317 VALUE spat;
9318 VALUE limit;
9319 split_type_t split_type;
9320 long beg, end, i = 0, empty_count = -1;
9321 int lim = 0;
9322 VALUE result, tmp;
9323
9324 result = rb_block_given_p() ? Qfalse : Qnil;
9325 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9326 lim = NUM2INT(limit);
9327 if (lim <= 0) limit = Qnil;
9328 else if (lim == 1) {
9329 if (RSTRING_LEN(str) == 0)
9330 return result ? rb_ary_new2(0) : str;
9331 tmp = str_duplicate(rb_cString, str);
9332 if (!result) {
9333 rb_yield(tmp);
9334 return str;
9335 }
9336 return rb_ary_new3(1, tmp);
9337 }
9338 i = 1;
9339 }
9340 if (NIL_P(limit) && !lim) empty_count = 0;
9341
9342 enc = STR_ENC_GET(str);
9343 split_type = SPLIT_TYPE_REGEXP;
9344 if (!NIL_P(spat)) {
9345 spat = get_pat_quoted(spat, 0);
9346 }
9347 else if (NIL_P(spat = rb_fs)) {
9348 split_type = SPLIT_TYPE_AWK;
9349 }
9350 else if (!(spat = rb_fs_check(spat))) {
9351 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9352 }
9353 else {
9354 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9355 }
9356 if (split_type != SPLIT_TYPE_AWK) {
9357 switch (BUILTIN_TYPE(spat)) {
9358 case T_REGEXP:
9359 rb_reg_options(spat); /* check if uninitialized */
9360 tmp = RREGEXP_SRC(spat);
9361 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9362 if (split_type == SPLIT_TYPE_AWK) {
9363 spat = tmp;
9364 split_type = SPLIT_TYPE_STRING;
9365 }
9366 break;
9367
9368 case T_STRING:
9369 mustnot_broken(spat);
9370 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9371 break;
9372
9373 default:
9375 }
9376 }
9377
9378#define SPLIT_STR(beg, len) ( \
9379 empty_count = split_string(result, str, beg, len, empty_count), \
9380 str_mod_check(str, str_start, str_len))
9381
9382 beg = 0;
9383 char *ptr = RSTRING_PTR(str);
9384 char *const str_start = ptr;
9385 const long str_len = RSTRING_LEN(str);
9386 char *const eptr = str_start + str_len;
9387 if (split_type == SPLIT_TYPE_AWK) {
9388 char *bptr = ptr;
9389 int skip = 1;
9390 unsigned int c;
9391
9392 if (result) result = rb_ary_new();
9393 end = beg;
9394 if (is_ascii_string(str)) {
9395 while (ptr < eptr) {
9396 c = (unsigned char)*ptr++;
9397 if (skip) {
9398 if (ascii_isspace(c)) {
9399 beg = ptr - bptr;
9400 }
9401 else {
9402 end = ptr - bptr;
9403 skip = 0;
9404 if (!NIL_P(limit) && lim <= i) break;
9405 }
9406 }
9407 else if (ascii_isspace(c)) {
9408 SPLIT_STR(beg, end-beg);
9409 skip = 1;
9410 beg = ptr - bptr;
9411 if (!NIL_P(limit)) ++i;
9412 }
9413 else {
9414 end = ptr - bptr;
9415 }
9416 }
9417 }
9418 else {
9419 while (ptr < eptr) {
9420 int n;
9421
9422 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9423 ptr += n;
9424 if (skip) {
9425 if (rb_isspace(c)) {
9426 beg = ptr - bptr;
9427 }
9428 else {
9429 end = ptr - bptr;
9430 skip = 0;
9431 if (!NIL_P(limit) && lim <= i) break;
9432 }
9433 }
9434 else if (rb_isspace(c)) {
9435 SPLIT_STR(beg, end-beg);
9436 skip = 1;
9437 beg = ptr - bptr;
9438 if (!NIL_P(limit)) ++i;
9439 }
9440 else {
9441 end = ptr - bptr;
9442 }
9443 }
9444 }
9445 }
9446 else if (split_type == SPLIT_TYPE_STRING) {
9447 char *substr_start = ptr;
9448 char *sptr = RSTRING_PTR(spat);
9449 long slen = RSTRING_LEN(spat);
9450
9451 if (result) result = rb_ary_new();
9452 mustnot_broken(str);
9453 enc = rb_enc_check(str, spat);
9454 while (ptr < eptr &&
9455 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9456 /* Check we are at the start of a char */
9457 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9458 if (t != ptr + end) {
9459 ptr = t;
9460 continue;
9461 }
9462 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9463 str_mod_check(spat, sptr, slen);
9464 ptr += end + slen;
9465 substr_start = ptr;
9466 if (!NIL_P(limit) && lim <= ++i) break;
9467 }
9468 beg = ptr - str_start;
9469 }
9470 else if (split_type == SPLIT_TYPE_CHARS) {
9471 int n;
9472
9473 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9474 mustnot_broken(str);
9475 enc = rb_enc_get(str);
9476 while (ptr < eptr &&
9477 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9478 SPLIT_STR(ptr - str_start, n);
9479 ptr += n;
9480 if (!NIL_P(limit) && lim <= ++i) break;
9481 }
9482 beg = ptr - str_start;
9483 }
9484 else {
9485 if (result) result = rb_ary_new();
9486 long len = RSTRING_LEN(str);
9487 long start = beg;
9488 long idx;
9489 int last_null = 0;
9490 struct re_registers *regs;
9491 VALUE match = 0;
9492
9493 for (; rb_reg_search(spat, str, start, 0) >= 0;
9494 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9495 match = rb_backref_get();
9496 if (!result) rb_match_busy(match);
9497 regs = RMATCH_REGS(match);
9498 end = BEG(0);
9499 if (start == end && BEG(0) == END(0)) {
9500 if (!ptr) {
9501 SPLIT_STR(0, 0);
9502 break;
9503 }
9504 else if (last_null == 1) {
9505 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9506 beg = start;
9507 }
9508 else {
9509 if (start == len)
9510 start++;
9511 else
9512 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9513 last_null = 1;
9514 continue;
9515 }
9516 }
9517 else {
9518 SPLIT_STR(beg, end-beg);
9519 beg = start = END(0);
9520 }
9521 last_null = 0;
9522
9523 for (idx=1; idx < regs->num_regs; idx++) {
9524 if (BEG(idx) == -1) continue;
9525 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9526 }
9527 if (!NIL_P(limit) && lim <= ++i) break;
9528 }
9529 if (match) rb_match_unbusy(match);
9530 }
9531 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9532 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9533 }
9534
9535 return result ? result : str;
9536}
9537
9538VALUE
9539rb_str_split(VALUE str, const char *sep0)
9540{
9541 VALUE sep;
9542
9543 StringValue(str);
9544 sep = rb_str_new_cstr(sep0);
9545 return rb_str_split_m(1, &sep, str);
9546}
9547
9548#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9549
9550static inline int
9551enumerator_element(VALUE ary, VALUE e)
9552{
9553 if (ary) {
9554 rb_ary_push(ary, e);
9555 return 0;
9556 }
9557 else {
9558 rb_yield(e);
9559 return 1;
9560 }
9561}
9562
9563#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9564
9565static const char *
9566chomp_newline(const char *p, const char *e, rb_encoding *enc)
9567{
9568 const char *prev = rb_enc_prev_char(p, e, e, enc);
9569 if (rb_enc_is_newline(prev, e, enc)) {
9570 e = prev;
9571 prev = rb_enc_prev_char(p, e, e, enc);
9572 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9573 e = prev;
9574 }
9575 return e;
9576}
9577
9578static VALUE
9579get_rs(void)
9580{
9581 VALUE rs = rb_rs;
9582 if (!NIL_P(rs) &&
9583 (!RB_TYPE_P(rs, T_STRING) ||
9584 RSTRING_LEN(rs) != 1 ||
9585 RSTRING_PTR(rs)[0] != '\n')) {
9586 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9587 }
9588 return rs;
9589}
9590
9591#define rb_rs get_rs()
9592
9593static VALUE
9594rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9595{
9596 rb_encoding *enc;
9597 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9598 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9599 long pos, len, rslen;
9600 int rsnewline = 0;
9601
9602 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9603 rs = rb_rs;
9604 if (!NIL_P(opts)) {
9605 static ID keywords[1];
9606 if (!keywords[0]) {
9607 keywords[0] = rb_intern_const("chomp");
9608 }
9609 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9610 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9611 }
9612
9613 if (NIL_P(rs)) {
9614 if (!ENUM_ELEM(ary, str)) {
9615 return ary;
9616 }
9617 else {
9618 return orig;
9619 }
9620 }
9621
9622 if (!RSTRING_LEN(str)) goto end;
9623 str = rb_str_new_frozen(str);
9624 ptr = subptr = RSTRING_PTR(str);
9625 pend = RSTRING_END(str);
9626 len = RSTRING_LEN(str);
9627 StringValue(rs);
9628 rslen = RSTRING_LEN(rs);
9629
9630 if (rs == rb_default_rs)
9631 enc = rb_enc_get(str);
9632 else
9633 enc = rb_enc_check(str, rs);
9634
9635 if (rslen == 0) {
9636 /* paragraph mode */
9637 int n;
9638 const char *eol = NULL;
9639 subend = subptr;
9640 while (subend < pend) {
9641 long chomp_rslen = 0;
9642 do {
9643 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9644 n = 0;
9645 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9646 if (rb_enc_is_newline(subend + n, pend, enc)) {
9647 if (eol == subend) break;
9648 subend += rslen;
9649 if (subptr) {
9650 eol = subend;
9651 chomp_rslen = -rslen;
9652 }
9653 }
9654 else {
9655 if (!subptr) subptr = subend;
9656 subend += rslen;
9657 }
9658 rslen = 0;
9659 } while (subend < pend);
9660 if (!subptr) break;
9661 if (rslen == 0) chomp_rslen = 0;
9662 line = rb_str_subseq(str, subptr - ptr,
9663 subend - subptr + (chomp ? chomp_rslen : rslen));
9664 if (ENUM_ELEM(ary, line)) {
9665 str_mod_check(str, ptr, len);
9666 }
9667 subptr = eol = NULL;
9668 }
9669 goto end;
9670 }
9671 else {
9672 rsptr = RSTRING_PTR(rs);
9673 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9674 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9675 rsnewline = 1;
9676 }
9677 }
9678
9679 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9680 rs = rb_str_new(rsptr, rslen);
9681 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9682 rsptr = RSTRING_PTR(rs);
9683 rslen = RSTRING_LEN(rs);
9684 }
9685
9686 while (subptr < pend) {
9687 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9688 if (pos < 0) break;
9689 hit = subptr + pos;
9690 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9691 if (hit != adjusted) {
9692 subptr = adjusted;
9693 continue;
9694 }
9695 subend = hit += rslen;
9696 if (chomp) {
9697 if (rsnewline) {
9698 subend = chomp_newline(subptr, subend, enc);
9699 }
9700 else {
9701 subend -= rslen;
9702 }
9703 }
9704 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9705 if (ENUM_ELEM(ary, line)) {
9706 str_mod_check(str, ptr, len);
9707 }
9708 subptr = hit;
9709 }
9710
9711 if (subptr != pend) {
9712 if (chomp) {
9713 if (rsnewline) {
9714 pend = chomp_newline(subptr, pend, enc);
9715 }
9716 else if (pend - subptr >= rslen &&
9717 memcmp(pend - rslen, rsptr, rslen) == 0) {
9718 pend -= rslen;
9719 }
9720 }
9721 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9722 ENUM_ELEM(ary, line);
9723 RB_GC_GUARD(str);
9724 }
9725
9726 end:
9727 if (ary)
9728 return ary;
9729 else
9730 return orig;
9731}
9732
9733/*
9734 * call-seq:
9735 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9736 * each_line(line_sep = $/, chomp: false) -> enumerator
9737 *
9738 * :include: doc/string/each_line.rdoc
9739 *
9740 */
9741
static VALUE
rb_str_each_line(int argc, VALUE *argv, VALUE str)
{
    /* Blockless call: hand back an enumerator.  No size function is given
     * (last argument is 0).  NOTE(review): the macro body is not visible
     * here; it is expected to return early when no block is given. */
    RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
    /* ary == 0: nothing is collected and the receiver is returned. */
    return rb_str_enumerate_lines(argc, argv, str, 0);
}
9748
9749/*
9750 * call-seq:
 *    lines(line_sep = $/, chomp: false) -> array_of_strings
9752 *
9753 * Forms substrings ("lines") of +self+ according to the given arguments
9754 * (see String#each_line for details); returns the lines in an array.
9755 *
9756 */
9757
9758static VALUE
9759rb_str_lines(int argc, VALUE *argv, VALUE str)
9760{
9761 VALUE ary = WANTARRAY("lines", 0);
9762 return rb_str_enumerate_lines(argc, argv, str, ary);
9763}
9764
9765static VALUE
9766rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9767{
9768 return LONG2FIX(RSTRING_LEN(str));
9769}
9770
9771static VALUE
9772rb_str_enumerate_bytes(VALUE str, VALUE ary)
9773{
9774 long i;
9775
9776 for (i=0; i<RSTRING_LEN(str); i++) {
9777 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9778 }
9779 if (ary)
9780 return ary;
9781 else
9782 return str;
9783}
9784
9785/*
9786 * call-seq:
9787 * each_byte {|byte| ... } -> self
9788 * each_byte -> enumerator
9789 *
9790 * :include: doc/string/each_byte.rdoc
9791 *
9792 */
9793
static VALUE
rb_str_each_byte(VALUE str)
{
    /* Blockless call: enumerator sized by rb_str_each_byte_size.
     * NOTE(review): macro body not visible here; expected to return early. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
    /* ary == 0: nothing is collected and the receiver is returned. */
    return rb_str_enumerate_bytes(str, 0);
}
9800
9801/*
9802 * call-seq:
9803 * bytes -> array_of_bytes
9804 *
9805 * :include: doc/string/bytes.rdoc
9806 *
9807 */
9808
9809static VALUE
9810rb_str_bytes(VALUE str)
9811{
9812 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9813 return rb_str_enumerate_bytes(str, ary);
9814}
9815
/* Size function used by the each_char and each_codepoint enumerators:
 * delegates to rb_str_length.  +args+ and +eobj+ are unused. */
static VALUE
rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
{
    return rb_str_length(str);
}
9821
9822static VALUE
9823rb_str_enumerate_chars(VALUE str, VALUE ary)
9824{
9825 VALUE orig = str;
9826 long i, len, n;
9827 const char *ptr;
9828 rb_encoding *enc;
9829
9830 str = rb_str_new_frozen(str);
9831 ptr = RSTRING_PTR(str);
9832 len = RSTRING_LEN(str);
9833 enc = rb_enc_get(str);
9834
9836 for (i = 0; i < len; i += n) {
9837 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9838 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9839 }
9840 }
9841 else {
9842 for (i = 0; i < len; i += n) {
9843 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9844 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9845 }
9846 }
9847 RB_GC_GUARD(str);
9848 if (ary)
9849 return ary;
9850 else
9851 return orig;
9852}
9853
9854/*
9855 * call-seq:
9856 * each_char {|c| ... } -> self
9857 * each_char -> enumerator
9858 *
9859 * :include: doc/string/each_char.rdoc
9860 *
9861 */
9862
static VALUE
rb_str_each_char(VALUE str)
{
    /* Blockless call: enumerator sized by rb_str_each_char_size.
     * NOTE(review): macro body not visible here; expected to return early. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    /* ary == 0: nothing is collected and the receiver is returned. */
    return rb_str_enumerate_chars(str, 0);
}
9869
9870/*
9871 * call-seq:
9872 * chars -> array_of_characters
9873 *
9874 * :include: doc/string/chars.rdoc
9875 *
9876 */
9877
9878static VALUE
9879rb_str_chars(VALUE str)
9880{
9881 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9882 return rb_str_enumerate_chars(str, ary);
9883}
9884
9885static VALUE
9886rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9887{
9888 VALUE orig = str;
9889 int n;
9890 unsigned int c;
9891 const char *ptr, *end;
9892 rb_encoding *enc;
9893
9894 if (single_byte_optimizable(str))
9895 return rb_str_enumerate_bytes(str, ary);
9896
9897 str = rb_str_new_frozen(str);
9898 ptr = RSTRING_PTR(str);
9899 end = RSTRING_END(str);
9900 enc = STR_ENC_GET(str);
9901
9902 while (ptr < end) {
9903 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9904 ENUM_ELEM(ary, UINT2NUM(c));
9905 ptr += n;
9906 }
9907 RB_GC_GUARD(str);
9908 if (ary)
9909 return ary;
9910 else
9911 return orig;
9912}
9913
9914/*
9915 * call-seq:
9916 * each_codepoint {|integer| ... } -> self
9917 * each_codepoint -> enumerator
9918 *
9919 * :include: doc/string/each_codepoint.rdoc
9920 *
9921 */
9922
static VALUE
rb_str_each_codepoint(VALUE str)
{
    /* Blockless call: enumerator sized by rb_str_each_char_size, the same
     * size function each_char uses.
     * NOTE(review): macro body not visible here; expected to return early. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
    /* ary == 0: nothing is collected and the receiver is returned. */
    return rb_str_enumerate_codepoints(str, 0);
}
9929
9930/*
9931 * call-seq:
9932 * codepoints -> array_of_integers
9933 *
9934 * :include: doc/string/codepoints.rdoc
9935 *
9936 */
9937
9938static VALUE
9939rb_str_codepoints(VALUE str)
9940{
9941 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9942 return rb_str_enumerate_codepoints(str, ary);
9943}
9944
/*
 * Compiles a fresh regex for the pattern \X in encoding +enc+ and returns it.
 *
 * The pattern source is the two characters '\' and 'X'.  For the UTF-16/32
 * encodings handled in the switch below the source bytes are spelled out in
 * that encoding's code-unit width and byte order; every other encoding uses
 * the plain single-byte spelling.
 *
 * Calls rb_fatal if onig_new reports an error.  This function does not
 * cache the result; see the callers for who frees it.
 */
static regex_t *
get_reg_grapheme_cluster(rb_encoding *enc)
{
    int encidx = rb_enc_to_index(enc);

    const OnigUChar source_ascii[] = "\\X";
    const OnigUChar *source = source_ascii;
    /* sizeof - 1: do not count the terminating NUL of the literal. */
    size_t source_len = sizeof(source_ascii) - 1;

    switch (encidx) {
        /* CHARS_* expand one character into its bytes for the given
         * width/endianness; CASE_UTF emits one case per encoding with a
         * static byte array holding the encoded "\X". */
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
#define CASE_UTF(e) \
      case ENCINDEX_UTF_##e: { \
        static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
        source = source_UTF_##e; \
        source_len = sizeof(source_UTF_##e); \
        break; \
      }
        CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
#undef CASE_UTF
#undef CHARS_16BE
#undef CHARS_16LE
#undef CHARS_32BE
#undef CHARS_32LE
    }

    regex_t *reg_grapheme_cluster;
    OnigErrorInfo einfo;
    int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
                     ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
    if (r) {
        /* Non-zero return is treated as a compile failure. */
        UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
        onig_error_code_to_str(message, r, &einfo);
        rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
    }

    return reg_grapheme_cluster;
}
9986
9987static regex_t *
9988get_cached_reg_grapheme_cluster(rb_encoding *enc)
9989{
9990 int encidx = rb_enc_to_index(enc);
9991 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9992
9993 if (encidx == rb_utf8_encindex()) {
9994 if (!reg_grapheme_cluster_utf8) {
9995 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9996 }
9997
9998 return reg_grapheme_cluster_utf8;
9999 }
10000
10001 return NULL;
10002}
10003
10004static VALUE
10005rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10006{
10007 size_t grapheme_cluster_count = 0;
10008 rb_encoding *enc = get_encoding(str);
10009 const char *ptr, *end;
10010
10011 if (!rb_enc_unicode_p(enc)) {
10012 return rb_str_length(str);
10013 }
10014
10015 bool cached_reg_grapheme_cluster = true;
10016 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10017 if (!reg_grapheme_cluster) {
10018 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10019 cached_reg_grapheme_cluster = false;
10020 }
10021
10022 ptr = RSTRING_PTR(str);
10023 end = RSTRING_END(str);
10024
10025 while (ptr < end) {
10026 OnigPosition len = onig_match(reg_grapheme_cluster,
10027 (const OnigUChar *)ptr, (const OnigUChar *)end,
10028 (const OnigUChar *)ptr, NULL, 0);
10029 if (len <= 0) break;
10030 grapheme_cluster_count++;
10031 ptr += len;
10032 }
10033
10034 if (!cached_reg_grapheme_cluster) {
10035 onig_free(reg_grapheme_cluster);
10036 }
10037
10038 return SIZET2NUM(grapheme_cluster_count);
10039}
10040
10041static VALUE
10042rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10043{
10044 VALUE orig = str;
10045 rb_encoding *enc = get_encoding(str);
10046 const char *ptr0, *ptr, *end;
10047
10048 if (!rb_enc_unicode_p(enc)) {
10049 return rb_str_enumerate_chars(str, ary);
10050 }
10051
10052 if (!ary) str = rb_str_new_frozen(str);
10053
10054 bool cached_reg_grapheme_cluster = true;
10055 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10056 if (!reg_grapheme_cluster) {
10057 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10058 cached_reg_grapheme_cluster = false;
10059 }
10060
10061 ptr0 = ptr = RSTRING_PTR(str);
10062 end = RSTRING_END(str);
10063
10064 while (ptr < end) {
10065 OnigPosition len = onig_match(reg_grapheme_cluster,
10066 (const OnigUChar *)ptr, (const OnigUChar *)end,
10067 (const OnigUChar *)ptr, NULL, 0);
10068 if (len <= 0) break;
10069 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10070 ptr += len;
10071 }
10072
10073 if (!cached_reg_grapheme_cluster) {
10074 onig_free(reg_grapheme_cluster);
10075 }
10076
10077 RB_GC_GUARD(str);
10078 if (ary)
10079 return ary;
10080 else
10081 return orig;
10082}
10083
10084/*
10085 * call-seq:
10086 * each_grapheme_cluster {|gc| ... } -> self
10087 * each_grapheme_cluster -> enumerator
10088 *
10089 * :include: doc/string/each_grapheme_cluster.rdoc
10090 *
10091 */
10092
static VALUE
rb_str_each_grapheme_cluster(VALUE str)
{
    /* Blockless call: enumerator sized by rb_str_each_grapheme_cluster_size.
     * NOTE(review): macro body not visible here; expected to return early. */
    RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
    /* ary == 0: nothing is collected and the receiver is returned. */
    return rb_str_enumerate_grapheme_clusters(str, 0);
}
10099
10100/*
10101 * call-seq:
10102 * grapheme_clusters -> array_of_grapheme_clusters
10103 *
10104 * :include: doc/string/grapheme_clusters.rdoc
10105 *
10106 */
10107
10108static VALUE
10109rb_str_grapheme_clusters(VALUE str)
10110{
10111 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10112 return rb_str_enumerate_grapheme_clusters(str, ary);
10113}
10114
10115static long
10116chopped_length(VALUE str)
10117{
10118 rb_encoding *enc = STR_ENC_GET(str);
10119 const char *p, *p2, *beg, *end;
10120
10121 beg = RSTRING_PTR(str);
10122 end = beg + RSTRING_LEN(str);
10123 if (beg >= end) return 0;
10124 p = rb_enc_prev_char(beg, end, end, enc);
10125 if (!p) return 0;
10126 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10127 p2 = rb_enc_prev_char(beg, p, end, enc);
10128 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10129 }
10130 return p - beg;
10131}
10132
10133/*
10134 * call-seq:
10135 * chop! -> self or nil
10136 *
10137 * Like String#chop, but modifies +self+ in place;
10138 * returns +nil+ if +self+ is empty, +self+ otherwise.
10139 *
10140 * Related: String#chomp!.
10141 */
10142
10143static VALUE
10144rb_str_chop_bang(VALUE str)
10145{
10146 str_modify_keep_cr(str);
10147 if (RSTRING_LEN(str) > 0) {
10148 long len;
10149 len = chopped_length(str);
10150 STR_SET_LEN(str, len);
10151 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10152 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10154 }
10155 return str;
10156 }
10157 return Qnil;
10158}
10159
10160
10161/*
10162 * call-seq:
10163 * chop -> new_string
10164 *
10165 * :include: doc/string/chop.rdoc
10166 *
10167 */
10168
10169static VALUE
10170rb_str_chop(VALUE str)
10171{
10172 return rb_str_subseq(str, 0, chopped_length(str));
10173}
10174
/*
 * Length of the byte range [p, e) of +str+ after removing one trailing line
 * terminator: a newline, a "\r" followed by a newline, or a lone "\r".
 *
 * +p+ is the start of the string and +e+ its end.  The single-byte branch
 * reads *(e-1) unconditionally, so callers must pass a non-empty range.
 */
static long
smart_chomp(VALUE str, const char *e, const char *p)
{
    rb_encoding *enc = rb_enc_get(str);
    if (rb_enc_mbminlen(enc) > 1) {
        /* Wide encodings: locate the head of the last character. */
        const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
        if (rb_enc_is_newline(pp, e, enc)) {
            e = pp;
        }
        /* Then, whether or not a newline was dropped, drop a trailing "\r". */
        pp = e - rb_enc_mbminlen(enc);
        if (pp >= p) {
            pp = rb_enc_left_char_head(p, pp, e, enc);
            if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                e = pp;
            }
        }
    }
    else {
        switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
          case '\n':
            /* "\n", and the "\r" in front of it if there is one. */
            if (--e > p && *(e-1) == '\r') {
                --e;
            }
            break;
          case '\r':
            --e;
            break;
        }
    }
    return e - p;
}
10206
/*
 * Length +str+ would have after chomping the record separator +rs+ from its
 * end.  Returns the unchanged length when nothing is to be removed.
 *
 * Cases, in order:
 *  - empty +str+: 0.
 *  - +rs+ is the default separator object: smart_chomp.
 *  - +rs+ is empty: every trailing "\n" / "\r\n" is removed.
 *  - +rs+ is longer than +str+: nothing removed.
 *  - +rs+ is exactly one newline character: smart_chomp.
 *  - otherwise: +rs+ is removed only if the tail of +str+ equals it byte
 *    for byte and the match starts on a character boundary.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* Empty separator: strip all trailing newlines. */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* Also drop a "\r" sitting right before that newline. */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    if (rslen > len) return len;

    /* From here on enc refers to the separator's encoding. */
    enc = rb_enc_get(rs);
    /* Last byte of the separator; used as a cheap pre-check below. */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* Separator is one minimum-width character: is it a newline? */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    /* Generic separator: encodings must be compatible (rb_enc_check). */
    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* Do not cut through the middle of a multibyte character. */
        if (at_char_boundary(p, pp, e, enc))
            return len - rslen;
        /* NOTE(review): this guard is only reached when the boundary check
         * above fails. */
        RB_GC_GUARD(rs);
    }
    return len;
}
10278
10284static VALUE
10285chomp_rs(int argc, const VALUE *argv)
10286{
10287 rb_check_arity(argc, 0, 1);
10288 if (argc > 0) {
10289 VALUE rs = argv[0];
10290 if (!NIL_P(rs)) StringValue(rs);
10291 return rs;
10292 }
10293 else {
10294 return rb_rs;
10295 }
10296}
10297
10298VALUE
10299rb_str_chomp_string(VALUE str, VALUE rs)
10300{
10301 long olen = RSTRING_LEN(str);
10302 long len = chompped_length(str, rs);
10303 if (len >= olen) return Qnil;
10304 str_modify_keep_cr(str);
10305 STR_SET_LEN(str, len);
10306 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10307 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10309 }
10310 return str;
10311}
10312
10313/*
10314 * call-seq:
10315 * chomp!(line_sep = $/) -> self or nil
10316 *
10317 * Like String#chomp, but modifies +self+ in place;
10318 * returns +nil+ if no modification made, +self+ otherwise.
10319 *
10320 */
10321
10322static VALUE
10323rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10324{
10325 VALUE rs;
10326 str_modifiable(str);
10327 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10328 rs = chomp_rs(argc, argv);
10329 if (NIL_P(rs)) return Qnil;
10330 return rb_str_chomp_string(str, rs);
10331}
10332
10333
10334/*
10335 * call-seq:
10336 * chomp(line_sep = $/) -> new_string
10337 *
10338 * :include: doc/string/chomp.rdoc
10339 *
10340 */
10341
10342static VALUE
10343rb_str_chomp(int argc, VALUE *argv, VALUE str)
10344{
10345 VALUE rs = chomp_rs(argc, argv);
10346 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10347 return rb_str_subseq(str, 0, chompped_length(str, rs));
10348}
10349
10350static long
10351lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10352{
10353 const char *const start = s;
10354
10355 if (!s || s >= e) return 0;
10356
10357 /* remove spaces at head */
10358 if (single_byte_optimizable(str)) {
10359 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10360 }
10361 else {
10362 while (s < e) {
10363 int n;
10364 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10365
10366 if (cc && !rb_isspace(cc)) break;
10367 s += n;
10368 }
10369 }
10370 return s - start;
10371}
10372
10373/*
10374 * call-seq:
10375 * lstrip! -> self or nil
10376 *
10377 * Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
10379 *
10380 * Related: String#rstrip!, String#strip!.
10381 */
10382
10383static VALUE
10384rb_str_lstrip_bang(VALUE str)
10385{
10386 rb_encoding *enc;
10387 char *start, *s;
10388 long olen, loffset;
10389
10390 str_modify_keep_cr(str);
10391 enc = STR_ENC_GET(str);
10392 RSTRING_GETMEM(str, start, olen);
10393 loffset = lstrip_offset(str, start, start+olen, enc);
10394 if (loffset > 0) {
10395 long len = olen-loffset;
10396 s = start + loffset;
10397 memmove(start, s, len);
10398 STR_SET_LEN(str, len);
10399 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10400 return str;
10401 }
10402 return Qnil;
10403}
10404
10405
10406/*
10407 * call-seq:
10408 * lstrip -> new_string
10409 *
10410 * Returns a copy of +self+ with leading whitespace removed;
10411 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10412 *
10413 * whitespace = "\x00\t\n\v\f\r "
10414 * s = whitespace + 'abc' + whitespace
10415 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10416 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10417 *
10418 * Related: String#rstrip, String#strip.
10419 */
10420
10421static VALUE
10422rb_str_lstrip(VALUE str)
10423{
10424 char *start;
10425 long len, loffset;
10426 RSTRING_GETMEM(str, start, len);
10427 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10428 if (loffset <= 0) return str_duplicate(rb_cString, str);
10429 return rb_str_subseq(str, loffset, len - loffset);
10430}
10431
10432static long
10433rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10434{
10435 const char *t;
10436
10437 rb_str_check_dummy_enc(enc);
10439 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10440 }
10441 if (!s || s >= e) return 0;
10442 t = e;
10443
10444 /* remove trailing spaces or '\0's */
10445 if (single_byte_optimizable(str)) {
10446 unsigned char c;
10447 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10448 }
10449 else {
10450 char *tp;
10451
10452 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10453 unsigned int c = rb_enc_codepoint(tp, e, enc);
10454 if (c && !rb_isspace(c)) break;
10455 t = tp;
10456 }
10457 }
10458 return e - t;
10459}
10460
10461/*
10462 * call-seq:
10463 * rstrip! -> self or nil
10464 *
10465 * Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
10467 *
10468 * Related: String#lstrip!, String#strip!.
10469 */
10470
10471static VALUE
10472rb_str_rstrip_bang(VALUE str)
10473{
10474 rb_encoding *enc;
10475 char *start;
10476 long olen, roffset;
10477
10478 str_modify_keep_cr(str);
10479 enc = STR_ENC_GET(str);
10480 RSTRING_GETMEM(str, start, olen);
10481 roffset = rstrip_offset(str, start, start+olen, enc);
10482 if (roffset > 0) {
10483 long len = olen - roffset;
10484
10485 STR_SET_LEN(str, len);
10486 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10487 return str;
10488 }
10489 return Qnil;
10490}
10491
10492
10493/*
10494 * call-seq:
10495 * rstrip -> new_string
10496 *
10497 * Returns a copy of the receiver with trailing whitespace removed;
10498 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10499 *
10500 * whitespace = "\x00\t\n\v\f\r "
10501 * s = whitespace + 'abc' + whitespace
10502 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10503 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10504 *
10505 * Related: String#lstrip, String#strip.
10506 */
10507
10508static VALUE
10509rb_str_rstrip(VALUE str)
10510{
10511 rb_encoding *enc;
10512 char *start;
10513 long olen, roffset;
10514
10515 enc = STR_ENC_GET(str);
10516 RSTRING_GETMEM(str, start, olen);
10517 roffset = rstrip_offset(str, start, start+olen, enc);
10518
10519 if (roffset <= 0) return str_duplicate(rb_cString, str);
10520 return rb_str_subseq(str, 0, olen-roffset);
10521}
10522
10523
10524/*
10525 * call-seq:
10526 * strip! -> self or nil
10527 *
10528 * Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
10532 */
10533
10534static VALUE
10535rb_str_strip_bang(VALUE str)
10536{
10537 char *start;
10538 long olen, loffset, roffset;
10539 rb_encoding *enc;
10540
10541 str_modify_keep_cr(str);
10542 enc = STR_ENC_GET(str);
10543 RSTRING_GETMEM(str, start, olen);
10544 loffset = lstrip_offset(str, start, start+olen, enc);
10545 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10546
10547 if (loffset > 0 || roffset > 0) {
10548 long len = olen-roffset;
10549 if (loffset > 0) {
10550 len -= loffset;
10551 memmove(start, start + loffset, len);
10552 }
10553 STR_SET_LEN(str, len);
10554 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10555 return str;
10556 }
10557 return Qnil;
10558}
10559
10560
10561/*
10562 * call-seq:
10563 * strip -> new_string
10564 *
10565 * Returns a copy of the receiver with leading and trailing whitespace removed;
10566 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10567 *
10568 * whitespace = "\x00\t\n\v\f\r "
10569 * s = whitespace + 'abc' + whitespace
10570 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10571 * s.strip # => "abc"
10572 *
10573 * Related: String#lstrip, String#rstrip.
10574 */
10575
10576static VALUE
10577rb_str_strip(VALUE str)
10578{
10579 char *start;
10580 long olen, loffset, roffset;
10581 rb_encoding *enc = STR_ENC_GET(str);
10582
10583 RSTRING_GETMEM(str, start, olen);
10584 loffset = lstrip_offset(str, start, start+olen, enc);
10585 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10586
10587 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10588 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10589}
10590
/*
 * Performs one search step for rb_str_scan.
 *
 * Searches +str+ for +pat+ starting at byte offset *start.  On a hit,
 * *start is advanced past the match (by at least one character when the
 * match is empty) and the result is returned: the matched substring when
 * +pat+ is a String or has no capture groups, otherwise an array with one
 * entry per group (nil for a group that did not participate).
 * Returns Qnil when nothing matches.
 */
static VALUE
scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
{
    VALUE result = Qnil;
    long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
    if (pos >= 0) {
        VALUE match;
        struct re_registers *regs;
        if (BUILTIN_TYPE(pat) == T_STRING) {
            /* Plain string pattern: no registers, match length is fixed. */
            regs = NULL;
            end = pos + RSTRING_LEN(pat);
        }
        else {
            /* Regexp: take the match bounds from the backref's registers. */
            match = rb_backref_get();
            regs = RMATCH_REGS(match);
            pos = BEG(0);
            end = END(0);
        }

        if (pos == end) {
            rb_encoding *enc = STR_ENC_GET(str);
            /*
             * Always consume at least one character of the input string
             */
            if (RSTRING_LEN(str) > end)
                *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
                                                  RSTRING_END(str), enc);
            else
                *start = end + 1;
        }
        else {
            *start = end;
        }

        if (!regs || regs->num_regs == 1) {
            /* No groups: the result is the whole match. */
            result = rb_str_subseq(str, pos, end - pos);
            return result;
        }
        else {
            /* Groups: collect registers 1..num_regs-1 (register 0 is the
             * whole match and is skipped). */
            result = rb_ary_new2(regs->num_regs);
            for (int i = 1; i < regs->num_regs; i++) {
                VALUE s = Qnil;
                if (BEG(i) >= 0) {
                    s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
                }

                rb_ary_push(result, s);
            }
        }

        /* Only reached on the Regexp path, where match was assigned;
         * keeps the match object alive while regs was in use. */
        RB_GC_GUARD(match);
    }

    return result;
}
10646
10647
10648/*
10649 * call-seq:
10650 * scan(string_or_regexp) -> array
10651 * scan(string_or_regexp) {|matches| ... } -> self
10652 *
10653 * Matches a pattern against +self+; the pattern is:
10654 *
10655 * - +string_or_regexp+ itself, if it is a Regexp.
10656 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10657 *
10658 * Iterates through +self+, generating a collection of matching results:
10659 *
10660 * - If the pattern contains no groups, each result is the
10661 * matched string, <code>$&</code>.
10662 * - If the pattern contains groups, each result is an array
10663 * containing one entry per group.
10664 *
10665 * With no block given, returns an array of the results:
10666 *
10667 * s = 'cruel world'
10668 * s.scan(/\w+/) # => ["cruel", "world"]
10669 * s.scan(/.../) # => ["cru", "el ", "wor"]
10670 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10671 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10672 *
10673 * With a block given, calls the block with each result; returns +self+:
10674 *
10675 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10676 * print "\n"
10677 * s.scan(/(.)(.)/) {|x,y| print y, x }
10678 * print "\n"
10679 *
10680 * Output:
10681 *
10682 * <<cruel>> <<world>>
10683 * rceu lowlr
10684 *
10685 */
10686
static VALUE
rb_str_scan(VALUE str, VALUE pat)
{
    VALUE result;
    long start = 0;
    /* prev: start offset used for the most recent scan_once call;
     * last: start offset of the call that produced the latest result
     * (-1 until something has matched). */
    long last = -1, prev = 0;
    /* Snapshot of the buffer, used to detect modification by the block. */
    char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);

    pat = get_pat_quoted(pat, 1);
    mustnot_broken(str);
    if (!rb_block_given_p()) {
        /* Array mode: collect every result. */
        VALUE ary = rb_ary_new();

        while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
            last = prev;
            prev = start;
            rb_ary_push(ary, result);
        }
        /* Re-run the search that produced the final match, this time with
         * the set_backref_str flag on -- presumably so the backref reflects
         * the last successful match rather than the final failed one. */
        if (last >= 0) rb_pat_search(pat, str, last, 1);
        else rb_backref_set(Qnil);
        return ary;
    }

    /* Block mode: yield each result, guarding against the block changing
     * the receiver underneath us. */
    while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
        last = prev;
        prev = start;
        rb_yield(result);
        str_mod_check(str, p, len);
    }
    /* Same backref restoration as above. */
    if (last >= 0) rb_pat_search(pat, str, last, 1);
    return str;
}
10719
10720
10721/*
10722 * call-seq:
10723 * hex -> integer
10724 *
10725 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10726 * (with an optional sign and an optional <code>0x</code>) and returns the
10727 * corresponding number;
10728 * returns zero if there is no such leading substring:
10729 *
10730 * '0x0a'.hex # => 10
10731 * '-1234'.hex # => -4660
10732 * '0'.hex # => 0
10733 * 'non-numeric'.hex # => 0
10734 *
10735 * Related: String#oct.
10736 *
10737 */
10738
static VALUE
rb_str_hex(VALUE str)
{
    /* Base 16.  The FALSE flag presumably selects lenient parsing (bad
     * input yields 0, as the call-seq documents) -- confirm against
     * rb_str_to_inum. */
    return rb_str_to_inum(str, 16, FALSE);
}
10744
10745
10746/*
10747 * call-seq:
10748 * oct -> integer
10749 *
10750 * Interprets the leading substring of +self+ as a string of octal digits
10751 * (with an optional sign) and returns the corresponding number;
10752 * returns zero if there is no such leading substring:
10753 *
10754 * '123'.oct # => 83
10755 * '-377'.oct # => -255
10756 * '0377non-numeric'.oct # => 255
10757 * 'non-numeric'.oct # => 0
10758 *
10759 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10760 * see Kernel#Integer.
10761 *
10762 * Related: String#hex.
10763 *
10764 */
10765
static VALUE
rb_str_oct(VALUE str)
{
    /* Base is passed as -8.  NOTE(review): the negative base presumably
     * means "default to octal but honour radix prefixes", matching the
     * call-seq text -- confirm against rb_str_to_inum. */
    return rb_str_to_inum(str, -8, FALSE);
}
10771
#ifndef HAVE_CRYPT_R
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/* Lock taken around the non-reentrant crypt() call in rb_str_crypt; only
 * compiled when crypt_r is not available. */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
#endif
10780
10781/*
10782 * call-seq:
10783 * crypt(salt_str) -> new_string
10784 *
10785 * Returns the string generated by calling <code>crypt(3)</code>
10786 * standard library function with <code>str</code> and
10787 * <code>salt_str</code>, in this order, as its arguments. Please do
10788 * not use this method any longer. It is legacy; provided only for
10789 * backward compatibility with ruby scripts in earlier days. It is
10790 * bad to use in contemporary programs for several reasons:
10791 *
 *  * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
 *    run on.  The generated string lacks data portability.
10794 *
10795 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10796 * (i.e. silently ends up in unexpected results).
10797 *
10798 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10799 * thread safe.
10800 *
10801 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10802 * very very weak. According to its manpage, Linux's traditional
10803 * <code>crypt(3)</code> output has only 2**56 variations; too
10804 * easy to brute force today. And this is the default behaviour.
10805 *
10806 * * In order to make things robust some OSes implement so-called
10807 * "modular" usage. To go through, you have to do a complex
10808 * build-up of the <code>salt_str</code> parameter, by hand.
10809 * Failure in generation of a proper salt string tends not to
10810 * yield any errors; typos in parameters are normally not
10811 * detectable.
10812 *
10813 * * For instance, in the following example, the second invocation
10814 * of String#crypt is wrong; it has a typo in "round=" (lacks
10815 * "s"). However the call does not fail and something unexpected
10816 * is generated.
10817 *
10818 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10819 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10820 *
10821 * * Even in the "modular" mode, some hash functions are considered
10822 * archaic and no longer recommended at all; for instance module
10823 * <code>$1$</code> is officially abandoned by its author: see
10824 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10825 * instance module <code>$3$</code> is considered completely
10826 * broken: see the manpage of FreeBSD.
10827 *
10828 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10829 * written above, <code>crypt(3)</code> on Mac OS never fails.
10830 * This means even if you build up a proper salt string it
10831 * generates a traditional DES hash anyways, and there is no way
10832 * for you to be aware of.
10833 *
10834 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10835 *
10836 * If for some reason you cannot migrate to other secure contemporary
10837 * password hashing algorithms, install the string-crypt gem and
10838 * <code>require 'string/crypt'</code> to continue using it.
10839 */
10840
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /* CRYPT_END() releases whatever the chosen backend acquired: the
     * ALLOCV buffer for crypt_r, or the global lock for plain crypt. */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    char *tmp_buf;
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* The first two salt bytes must exist and be non-NUL. */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* Work on a copy of the first two salt bytes with the high bit
     * cleared when either of them is not ASCII. */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* Save errno before cleanup can overwrite it. */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
#ifdef HAVE_CRYPT_R
    /* res points into data, so copy it out before freeing the buffer. */
    result = rb_str_new_cstr(res);
    CRYPT_END();
#else
    // We need to copy this buffer because it's static and we need to unlock the mutex
    // before allocating a new object (the string to be returned). If we allocate while
    // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
    // if other ractors are waiting on this lock.
    size_t res_size = strlen(res)+1;
    tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
    memcpy(tmp_buf, res, res_size);
    res = tmp_buf;
    CRYPT_END();
    result = rb_str_new_cstr(res);
#endif
    return result;
}
10909
10910
10911/*
10912 * call-seq:
10913 * ord -> integer
10914 *
10915 * :include: doc/string/ord.rdoc
10916 *
10917 */
10918
10919static VALUE
10920rb_str_ord(VALUE s)
10921{
10922 unsigned int c;
10923
10924 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10925 return UINT2NUM(c);
10926}
10927/*
10928 * call-seq:
10929 * sum(n = 16) -> integer
10930 *
10931 * :include: doc/string/sum.rdoc
10932 *
10933 */
10934
10935static VALUE
10936rb_str_sum(int argc, VALUE *argv, VALUE str)
10937{
10938 int bits = 16;
10939 char *ptr, *p, *pend;
10940 long len;
10941 VALUE sum = INT2FIX(0);
10942 unsigned long sum0 = 0;
10943
10944 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10945 bits = 0;
10946 }
10947 ptr = p = RSTRING_PTR(str);
10948 len = RSTRING_LEN(str);
10949 pend = p + len;
10950
10951 while (p < pend) {
10952 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10953 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10954 str_mod_check(str, ptr, len);
10955 sum0 = 0;
10956 }
10957 sum0 += (unsigned char)*p;
10958 p++;
10959 }
10960
10961 if (bits == 0) {
10962 if (sum0) {
10963 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10964 }
10965 }
10966 else {
10967 if (sum == INT2FIX(0)) {
10968 if (bits < (int)sizeof(long)*CHAR_BIT) {
10969 sum0 &= (((unsigned long)1)<<bits)-1;
10970 }
10971 sum = LONG2FIX(sum0);
10972 }
10973 else {
10974 VALUE mod;
10975
10976 if (sum0) {
10977 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10978 }
10979
10980 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10981 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10982 sum = rb_funcall(sum, '&', 1, mod);
10983 }
10984 }
10985 return sum;
10986}
10987
10988static VALUE
10989rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10990{
10991 rb_encoding *enc;
10992 VALUE w;
10993 long width, len, flen = 1, fclen = 1;
10994 VALUE res;
10995 char *p;
10996 const char *f = " ";
10997 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10998 VALUE pad;
10999 int singlebyte = 1, cr;
11000 int termlen;
11001
11002 rb_scan_args(argc, argv, "11", &w, &pad);
11003 enc = STR_ENC_GET(str);
11004 termlen = rb_enc_mbminlen(enc);
11005 width = NUM2LONG(w);
11006 if (argc == 2) {
11007 StringValue(pad);
11008 enc = rb_enc_check(str, pad);
11009 f = RSTRING_PTR(pad);
11010 flen = RSTRING_LEN(pad);
11011 fclen = str_strlen(pad, enc); /* rb_enc_check */
11012 singlebyte = single_byte_optimizable(pad);
11013 if (flen == 0 || fclen == 0) {
11014 rb_raise(rb_eArgError, "zero width padding");
11015 }
11016 }
11017 len = str_strlen(str, enc); /* rb_enc_check */
11018 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11019 n = width - len;
11020 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11021 rlen = n - llen;
11022 cr = ENC_CODERANGE(str);
11023 if (flen > 1) {
11024 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11025 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11026 }
11027 size = RSTRING_LEN(str);
11028 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11029 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11030 (len += llen2 + rlen2) >= LONG_MAX - size) {
11031 rb_raise(rb_eArgError, "argument too big");
11032 }
11033 len += size;
11034 res = str_enc_new(rb_cString, 0, len, enc);
11035 p = RSTRING_PTR(res);
11036 if (flen <= 1) {
11037 memset(p, *f, llen);
11038 p += llen;
11039 }
11040 else {
11041 while (llen >= fclen) {
11042 memcpy(p,f,flen);
11043 p += flen;
11044 llen -= fclen;
11045 }
11046 if (llen > 0) {
11047 memcpy(p, f, llen2);
11048 p += llen2;
11049 }
11050 }
11051 memcpy(p, RSTRING_PTR(str), size);
11052 p += size;
11053 if (flen <= 1) {
11054 memset(p, *f, rlen);
11055 p += rlen;
11056 }
11057 else {
11058 while (rlen >= fclen) {
11059 memcpy(p,f,flen);
11060 p += flen;
11061 rlen -= fclen;
11062 }
11063 if (rlen > 0) {
11064 memcpy(p, f, rlen2);
11065 p += rlen2;
11066 }
11067 }
11068 TERM_FILL(p, termlen);
11069 STR_SET_LEN(res, p-RSTRING_PTR(res));
11070
11071 if (argc == 2)
11072 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11073 if (cr != ENC_CODERANGE_BROKEN)
11074 ENC_CODERANGE_SET(res, cr);
11075
11076 RB_GC_GUARD(pad);
11077 return res;
11078}
11079
11080
11081/*
11082 * call-seq:
11083 * ljust(size, pad_string = ' ') -> new_string
11084 *
11085 * :include: doc/string/ljust.rdoc
11086 *
11087 * Related: String#rjust, String#center.
11088 *
11089 */
11090
11091static VALUE
11092rb_str_ljust(int argc, VALUE *argv, VALUE str)
11093{
11094 return rb_str_justify(argc, argv, str, 'l');
11095}
11096
11097/*
11098 * call-seq:
11099 * rjust(size, pad_string = ' ') -> new_string
11100 *
11101 * :include: doc/string/rjust.rdoc
11102 *
11103 * Related: String#ljust, String#center.
11104 *
11105 */
11106
11107static VALUE
11108rb_str_rjust(int argc, VALUE *argv, VALUE str)
11109{
11110 return rb_str_justify(argc, argv, str, 'r');
11111}
11112
11113
11114/*
11115 * call-seq:
11116 * center(size, pad_string = ' ') -> new_string
11117 *
11118 * :include: doc/string/center.rdoc
11119 *
11120 * Related: String#ljust, String#rjust.
11121 *
11122 */
11123
11124static VALUE
11125rb_str_center(int argc, VALUE *argv, VALUE str)
11126{
11127 return rb_str_justify(argc, argv, str, 'c');
11128}
11129
11130/*
11131 * call-seq:
11132 * partition(string_or_regexp) -> [head, match, tail]
11133 *
11134 * :include: doc/string/partition.rdoc
11135 *
11136 */
11137
11138static VALUE
11139rb_str_partition(VALUE str, VALUE sep)
11140{
11141 long pos;
11142
11143 sep = get_pat_quoted(sep, 0);
11144 if (RB_TYPE_P(sep, T_REGEXP)) {
11145 if (rb_reg_search(sep, str, 0, 0) < 0) {
11146 goto failed;
11147 }
11148 VALUE match = rb_backref_get();
11149 struct re_registers *regs = RMATCH_REGS(match);
11150
11151 pos = BEG(0);
11152 sep = rb_str_subseq(str, pos, END(0) - pos);
11153 }
11154 else {
11155 pos = rb_str_index(str, sep, 0);
11156 if (pos < 0) goto failed;
11157 }
11158 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11159 sep,
11160 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11161 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11162
11163 failed:
11164 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11165}
11166
11167/*
11168 * call-seq:
11169 * rpartition(sep) -> [head, match, tail]
11170 *
11171 * :include: doc/string/rpartition.rdoc
11172 *
11173 */
11174
11175static VALUE
11176rb_str_rpartition(VALUE str, VALUE sep)
11177{
11178 long pos = RSTRING_LEN(str);
11179
11180 sep = get_pat_quoted(sep, 0);
11181 if (RB_TYPE_P(sep, T_REGEXP)) {
11182 if (rb_reg_search(sep, str, pos, 1) < 0) {
11183 goto failed;
11184 }
11185 VALUE match = rb_backref_get();
11186 struct re_registers *regs = RMATCH_REGS(match);
11187
11188 pos = BEG(0);
11189 sep = rb_str_subseq(str, pos, END(0) - pos);
11190 }
11191 else {
11192 pos = rb_str_sublen(str, pos);
11193 pos = rb_str_rindex(str, sep, pos);
11194 if (pos < 0) {
11195 goto failed;
11196 }
11197 }
11198
11199 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11200 sep,
11201 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11202 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11203 failed:
11204 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11205}
11206
11207/*
11208 * call-seq:
11209 * start_with?(*string_or_regexp) -> true or false
11210 *
11211 * :include: doc/string/start_with_p.rdoc
11212 *
11213 */
11214
11215static VALUE
11216rb_str_start_with(int argc, VALUE *argv, VALUE str)
11217{
11218 int i;
11219
11220 for (i=0; i<argc; i++) {
11221 VALUE tmp = argv[i];
11222 if (RB_TYPE_P(tmp, T_REGEXP)) {
11223 if (rb_reg_start_with_p(tmp, str))
11224 return Qtrue;
11225 }
11226 else {
11227 const char *p, *s, *e;
11228 long slen, tlen;
11229 rb_encoding *enc;
11230
11231 StringValue(tmp);
11232 enc = rb_enc_check(str, tmp);
11233 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11234 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11235 p = RSTRING_PTR(str);
11236 e = p + slen;
11237 s = p + tlen;
11238 if (!at_char_right_boundary(p, s, e, enc))
11239 continue;
11240 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11241 return Qtrue;
11242 }
11243 }
11244 return Qfalse;
11245}
11246
11247/*
11248 * call-seq:
11249 * end_with?(*strings) -> true or false
11250 *
11251 * :include: doc/string/end_with_p.rdoc
11252 *
11253 */
11254
11255static VALUE
11256rb_str_end_with(int argc, VALUE *argv, VALUE str)
11257{
11258 int i;
11259
11260 for (i=0; i<argc; i++) {
11261 VALUE tmp = argv[i];
11262 const char *p, *s, *e;
11263 long slen, tlen;
11264 rb_encoding *enc;
11265
11266 StringValue(tmp);
11267 enc = rb_enc_check(str, tmp);
11268 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11269 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11270 p = RSTRING_PTR(str);
11271 e = p + slen;
11272 s = e - tlen;
11273 if (!at_char_boundary(p, s, e, enc))
11274 continue;
11275 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11276 return Qtrue;
11277 }
11278 return Qfalse;
11279}
11280
11290static long
11291deleted_prefix_length(VALUE str, VALUE prefix)
11292{
11293 const char *strptr, *prefixptr;
11294 long olen, prefixlen;
11295 rb_encoding *enc = rb_enc_get(str);
11296
11297 StringValue(prefix);
11298
11299 if (!is_broken_string(prefix) ||
11300 !rb_enc_asciicompat(enc) ||
11301 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11302 enc = rb_enc_check(str, prefix);
11303 }
11304
11305 /* return 0 if not start with prefix */
11306 prefixlen = RSTRING_LEN(prefix);
11307 if (prefixlen <= 0) return 0;
11308 olen = RSTRING_LEN(str);
11309 if (olen < prefixlen) return 0;
11310 strptr = RSTRING_PTR(str);
11311 prefixptr = RSTRING_PTR(prefix);
11312 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11313 if (is_broken_string(prefix)) {
11314 if (!is_broken_string(str)) {
11315 /* prefix in a valid string cannot be broken */
11316 return 0;
11317 }
11318 const char *strend = strptr + olen;
11319 const char *after_prefix = strptr + prefixlen;
11320 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11321 /* prefix does not end at char-boundary */
11322 return 0;
11323 }
11324 }
11325 /* prefix part in `str` also should be valid. */
11326
11327 return prefixlen;
11328}
11329
11330/*
11331 * call-seq:
11332 * delete_prefix!(prefix) -> self or nil
11333 *
11334 * Like String#delete_prefix, except that +self+ is modified in place.
11335 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11336 *
11337 */
11338
11339static VALUE
11340rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11341{
11342 long prefixlen;
11343 str_modify_keep_cr(str);
11344
11345 prefixlen = deleted_prefix_length(str, prefix);
11346 if (prefixlen <= 0) return Qnil;
11347
11348 return rb_str_drop_bytes(str, prefixlen);
11349}
11350
11351/*
11352 * call-seq:
11353 * delete_prefix(prefix) -> new_string
11354 *
11355 * :include: doc/string/delete_prefix.rdoc
11356 *
11357 */
11358
11359static VALUE
11360rb_str_delete_prefix(VALUE str, VALUE prefix)
11361{
11362 long prefixlen;
11363
11364 prefixlen = deleted_prefix_length(str, prefix);
11365 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11366
11367 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11368}
11369
11379static long
11380deleted_suffix_length(VALUE str, VALUE suffix)
11381{
11382 const char *strptr, *suffixptr;
11383 long olen, suffixlen;
11384 rb_encoding *enc;
11385
11386 StringValue(suffix);
11387 if (is_broken_string(suffix)) return 0;
11388 enc = rb_enc_check(str, suffix);
11389
11390 /* return 0 if not start with suffix */
11391 suffixlen = RSTRING_LEN(suffix);
11392 if (suffixlen <= 0) return 0;
11393 olen = RSTRING_LEN(str);
11394 if (olen < suffixlen) return 0;
11395 strptr = RSTRING_PTR(str);
11396 suffixptr = RSTRING_PTR(suffix);
11397 const char *strend = strptr + olen;
11398 const char *before_suffix = strend - suffixlen;
11399 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11400 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11401
11402 return suffixlen;
11403}
11404
11405/*
11406 * call-seq:
11407 * delete_suffix!(suffix) -> self or nil
11408 *
11409 * Like String#delete_suffix, except that +self+ is modified in place.
11410 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11411 *
11412 */
11413
11414static VALUE
11415rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11416{
11417 long olen, suffixlen, len;
11418 str_modifiable(str);
11419
11420 suffixlen = deleted_suffix_length(str, suffix);
11421 if (suffixlen <= 0) return Qnil;
11422
11423 olen = RSTRING_LEN(str);
11424 str_modify_keep_cr(str);
11425 len = olen - suffixlen;
11426 STR_SET_LEN(str, len);
11427 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11428 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11430 }
11431 return str;
11432}
11433
11434/*
11435 * call-seq:
11436 * delete_suffix(suffix) -> new_string
11437 *
11438 * :include: doc/string/delete_suffix.rdoc
11439 *
11440 */
11441
11442static VALUE
11443rb_str_delete_suffix(VALUE str, VALUE suffix)
11444{
11445 long suffixlen;
11446
11447 suffixlen = deleted_suffix_length(str, suffix);
11448 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11449
11450 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11451}
11452
11453void
11454rb_str_setter(VALUE val, ID id, VALUE *var)
11455{
11456 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11457 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11458 }
11459 *var = val;
11460}
11461
11462static void
11463rb_fs_setter(VALUE val, ID id, VALUE *var)
11464{
11465 val = rb_fs_check(val);
11466 if (!val) {
11467 rb_raise(rb_eTypeError,
11468 "value of %"PRIsVALUE" must be String or Regexp",
11469 rb_id2str(id));
11470 }
11471 if (!NIL_P(val)) {
11472 rb_warn_deprecated("'$;'", NULL);
11473 }
11474 *var = val;
11475}
11476
11477
11478/*
11479 * call-seq:
11480 * force_encoding(encoding) -> self
11481 *
11482 * :include: doc/string/force_encoding.rdoc
11483 *
11484 */
11485
11486static VALUE
11487rb_str_force_encoding(VALUE str, VALUE enc)
11488{
11489 str_modifiable(str);
11490
11491 rb_encoding *encoding = rb_to_encoding(enc);
11492 int idx = rb_enc_to_index(encoding);
11493
11494 // If the encoding is unchanged, we do nothing.
11495 if (ENCODING_GET(str) == idx) {
11496 return str;
11497 }
11498
11499 rb_enc_associate_index(str, idx);
11500
11501 // If the coderange was 7bit and the new encoding is ASCII-compatible
11502 // we can keep the coderange.
11503 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11504 return str;
11505 }
11506
11508 return str;
11509}
11510
11511/*
11512 * call-seq:
11513 * b -> new_string
11514 *
11515 * :include: doc/string/b.rdoc
11516 *
11517 */
11518
11519static VALUE
11520rb_str_b(VALUE str)
11521{
11522 VALUE str2;
11523 if (STR_EMBED_P(str)) {
11524 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11525 }
11526 else {
11527 str2 = str_alloc_heap(rb_cString);
11528 }
11529 str_replace_shared_without_enc(str2, str);
11530
11531 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11532 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11533 // If we know the receiver's code range then we know the result's code range.
11534 int cr = ENC_CODERANGE(str);
11535 switch (cr) {
11536 case ENC_CODERANGE_7BIT:
11538 break;
11542 break;
11543 default:
11544 ENC_CODERANGE_CLEAR(str2);
11545 break;
11546 }
11547 }
11548
11549 return str2;
11550}
11551
11552/*
11553 * call-seq:
11554 * valid_encoding? -> true or false
11555 *
11556 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11557 *
11558 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11559 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11560 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11561 */
11562
11563static VALUE
11564rb_str_valid_encoding_p(VALUE str)
11565{
11566 int cr = rb_enc_str_coderange(str);
11567
11568 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11569}
11570
11571/*
11572 * call-seq:
11573 * ascii_only? -> true or false
11574 *
11575 * Returns whether +self+ contains only ASCII characters:
11576 *
11577 * 'abc'.ascii_only? # => true
11578 * "abc\u{6666}".ascii_only? # => false
11579 *
11580 * Related: see {Querying}[rdoc-ref:String@Querying].
11581 */
11582
11583static VALUE
11584rb_str_is_ascii_only_p(VALUE str)
11585{
11586 int cr = rb_enc_str_coderange(str);
11587
11588 return RBOOL(cr == ENC_CODERANGE_7BIT);
11589}
11590
11591VALUE
11593{
11594 static const char ellipsis[] = "...";
11595 const long ellipsislen = sizeof(ellipsis) - 1;
11596 rb_encoding *const enc = rb_enc_get(str);
11597 const long blen = RSTRING_LEN(str);
11598 const char *const p = RSTRING_PTR(str), *e = p + blen;
11599 VALUE estr, ret = 0;
11600
11601 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11602 if (len * rb_enc_mbminlen(enc) >= blen ||
11603 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11604 ret = str;
11605 }
11606 else if (len <= ellipsislen ||
11607 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11608 if (rb_enc_asciicompat(enc)) {
11609 ret = rb_str_new(ellipsis, len);
11610 rb_enc_associate(ret, enc);
11611 }
11612 else {
11613 estr = rb_usascii_str_new(ellipsis, len);
11614 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11615 }
11616 }
11617 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11618 rb_str_cat(ret, ellipsis, ellipsislen);
11619 }
11620 else {
11621 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11622 rb_enc_from_encoding(enc), 0, Qnil);
11623 rb_str_append(ret, estr);
11624 }
11625 return ret;
11626}
11627
11628static VALUE
11629str_compat_and_valid(VALUE str, rb_encoding *enc)
11630{
11631 int cr;
11632 str = StringValue(str);
11633 cr = rb_enc_str_coderange(str);
11634 if (cr == ENC_CODERANGE_BROKEN) {
11635 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11636 }
11637 else {
11638 rb_encoding *e = STR_ENC_GET(str);
11639 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11640 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11641 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11642 }
11643 }
11644 return str;
11645}
11646
11647static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11648
11649VALUE
11651{
11652 rb_encoding *enc = STR_ENC_GET(str);
11653 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11654}
11655
11656VALUE
11657rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11658{
11659 int cr = ENC_CODERANGE_UNKNOWN;
11660 if (enc == STR_ENC_GET(str)) {
11661 /* cached coderange makes sense only when enc equals the
11662 * actual encoding of str */
11663 cr = ENC_CODERANGE(str);
11664 }
11665 return enc_str_scrub(enc, str, repl, cr);
11666}
11667
11668static VALUE
11669enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11670{
11671 int encidx;
11672 VALUE buf = Qnil;
11673 const char *rep, *p, *e, *p1, *sp;
11674 long replen = -1;
11675 long slen;
11676
11677 if (rb_block_given_p()) {
11678 if (!NIL_P(repl))
11679 rb_raise(rb_eArgError, "both of block and replacement given");
11680 replen = 0;
11681 }
11682
11683 if (ENC_CODERANGE_CLEAN_P(cr))
11684 return Qnil;
11685
11686 if (!NIL_P(repl)) {
11687 repl = str_compat_and_valid(repl, enc);
11688 }
11689
11690 if (rb_enc_dummy_p(enc)) {
11691 return Qnil;
11692 }
11693 encidx = rb_enc_to_index(enc);
11694
11695#define DEFAULT_REPLACE_CHAR(str) do { \
11696 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11697 rep = replace; replen = (int)sizeof(replace); \
11698 } while (0)
11699
11700 slen = RSTRING_LEN(str);
11701 p = RSTRING_PTR(str);
11702 e = RSTRING_END(str);
11703 p1 = p;
11704 sp = p;
11705
11706 if (rb_enc_asciicompat(enc)) {
11707 int rep7bit_p;
11708 if (!replen) {
11709 rep = NULL;
11710 rep7bit_p = FALSE;
11711 }
11712 else if (!NIL_P(repl)) {
11713 rep = RSTRING_PTR(repl);
11714 replen = RSTRING_LEN(repl);
11715 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11716 }
11717 else if (encidx == rb_utf8_encindex()) {
11718 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11719 rep7bit_p = FALSE;
11720 }
11721 else {
11722 DEFAULT_REPLACE_CHAR("?");
11723 rep7bit_p = TRUE;
11724 }
11725 cr = ENC_CODERANGE_7BIT;
11726
11727 p = search_nonascii(p, e);
11728 if (!p) {
11729 p = e;
11730 }
11731 while (p < e) {
11732 int ret = rb_enc_precise_mbclen(p, e, enc);
11733 if (MBCLEN_NEEDMORE_P(ret)) {
11734 break;
11735 }
11736 else if (MBCLEN_CHARFOUND_P(ret)) {
11738 p += MBCLEN_CHARFOUND_LEN(ret);
11739 }
11740 else if (MBCLEN_INVALID_P(ret)) {
11741 /*
11742 * p1~p: valid ascii/multibyte chars
11743 * p ~e: invalid bytes + unknown bytes
11744 */
11745 long clen = rb_enc_mbmaxlen(enc);
11746 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11747 if (p > p1) {
11748 rb_str_buf_cat(buf, p1, p - p1);
11749 }
11750
11751 if (e - p < clen) clen = e - p;
11752 if (clen <= 2) {
11753 clen = 1;
11754 }
11755 else {
11756 const char *q = p;
11757 clen--;
11758 for (; clen > 1; clen--) {
11759 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11760 if (MBCLEN_NEEDMORE_P(ret)) break;
11761 if (MBCLEN_INVALID_P(ret)) continue;
11763 }
11764 }
11765 if (rep) {
11766 rb_str_buf_cat(buf, rep, replen);
11767 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11768 }
11769 else {
11770 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11771 str_mod_check(str, sp, slen);
11772 repl = str_compat_and_valid(repl, enc);
11773 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11776 }
11777 p += clen;
11778 p1 = p;
11779 p = search_nonascii(p, e);
11780 if (!p) {
11781 p = e;
11782 break;
11783 }
11784 }
11785 else {
11787 }
11788 }
11789 if (NIL_P(buf)) {
11790 if (p == e) {
11791 ENC_CODERANGE_SET(str, cr);
11792 return Qnil;
11793 }
11794 buf = rb_str_buf_new(RSTRING_LEN(str));
11795 }
11796 if (p1 < p) {
11797 rb_str_buf_cat(buf, p1, p - p1);
11798 }
11799 if (p < e) {
11800 if (rep) {
11801 rb_str_buf_cat(buf, rep, replen);
11802 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11803 }
11804 else {
11805 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11806 str_mod_check(str, sp, slen);
11807 repl = str_compat_and_valid(repl, enc);
11808 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11811 }
11812 }
11813 }
11814 else {
11815 /* ASCII incompatible */
11816 long mbminlen = rb_enc_mbminlen(enc);
11817 if (!replen) {
11818 rep = NULL;
11819 }
11820 else if (!NIL_P(repl)) {
11821 rep = RSTRING_PTR(repl);
11822 replen = RSTRING_LEN(repl);
11823 }
11824 else if (encidx == ENCINDEX_UTF_16BE) {
11825 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11826 }
11827 else if (encidx == ENCINDEX_UTF_16LE) {
11828 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11829 }
11830 else if (encidx == ENCINDEX_UTF_32BE) {
11831 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11832 }
11833 else if (encidx == ENCINDEX_UTF_32LE) {
11834 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11835 }
11836 else {
11837 DEFAULT_REPLACE_CHAR("?");
11838 }
11839
11840 while (p < e) {
11841 int ret = rb_enc_precise_mbclen(p, e, enc);
11842 if (MBCLEN_NEEDMORE_P(ret)) {
11843 break;
11844 }
11845 else if (MBCLEN_CHARFOUND_P(ret)) {
11846 p += MBCLEN_CHARFOUND_LEN(ret);
11847 }
11848 else if (MBCLEN_INVALID_P(ret)) {
11849 const char *q = p;
11850 long clen = rb_enc_mbmaxlen(enc);
11851 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11852 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11853
11854 if (e - p < clen) clen = e - p;
11855 if (clen <= mbminlen * 2) {
11856 clen = mbminlen;
11857 }
11858 else {
11859 clen -= mbminlen;
11860 for (; clen > mbminlen; clen-=mbminlen) {
11861 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11862 if (MBCLEN_NEEDMORE_P(ret)) break;
11863 if (MBCLEN_INVALID_P(ret)) continue;
11865 }
11866 }
11867 if (rep) {
11868 rb_str_buf_cat(buf, rep, replen);
11869 }
11870 else {
11871 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11872 str_mod_check(str, sp, slen);
11873 repl = str_compat_and_valid(repl, enc);
11874 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11875 }
11876 p += clen;
11877 p1 = p;
11878 }
11879 else {
11881 }
11882 }
11883 if (NIL_P(buf)) {
11884 if (p == e) {
11886 return Qnil;
11887 }
11888 buf = rb_str_buf_new(RSTRING_LEN(str));
11889 }
11890 if (p1 < p) {
11891 rb_str_buf_cat(buf, p1, p - p1);
11892 }
11893 if (p < e) {
11894 if (rep) {
11895 rb_str_buf_cat(buf, rep, replen);
11896 }
11897 else {
11898 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11899 str_mod_check(str, sp, slen);
11900 repl = str_compat_and_valid(repl, enc);
11901 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11902 }
11903 }
11905 }
11906 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11907 return buf;
11908}
11909
11910/*
11911 * call-seq:
11912 * scrub(replacement_string = default_replacement) -> new_string
11913 * scrub{|bytes| ... } -> new_string
11914 *
11915 * :include: doc/string/scrub.rdoc
11916 *
11917 */
11918static VALUE
11919str_scrub(int argc, VALUE *argv, VALUE str)
11920{
11921 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11922 VALUE new = rb_str_scrub(str, repl);
11923 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11924}
11925
11926/*
11927 * call-seq:
11928 * scrub! -> self
11929 * scrub!(replacement_string = default_replacement) -> self
11930 * scrub!{|bytes| ... } -> self
11931 *
11932 * Like String#scrub, except that any replacements are made in +self+.
11933 *
11934 */
11935static VALUE
11936str_scrub_bang(int argc, VALUE *argv, VALUE str)
11937{
11938 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11939 VALUE new = rb_str_scrub(str, repl);
11940 if (!NIL_P(new)) rb_str_replace(str, new);
11941 return str;
11942}
11943
11944static ID id_normalize;
11945static ID id_normalized_p;
11946static VALUE mUnicodeNormalize;
11947
11948static VALUE
11949unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11950{
11951 static int UnicodeNormalizeRequired = 0;
11952 VALUE argv2[2];
11953
11954 if (!UnicodeNormalizeRequired) {
11955 rb_require("unicode_normalize/normalize.rb");
11956 UnicodeNormalizeRequired = 1;
11957 }
11958 argv2[0] = str;
11959 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11960 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11961}
11962
11963/*
11964 * call-seq:
11965 * unicode_normalize(form = :nfc) -> string
11966 *
11967 * Returns a copy of +self+ with
11968 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11969 *
11970 * Argument +form+ must be one of the following symbols
11971 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11972 *
11973 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11974 * - +:nfd+: Canonical decomposition.
11975 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11976 * - +:nfkd+: Compatibility decomposition.
11977 *
11978 * The encoding of +self+ must be one of:
11979 *
11980 * - Encoding::UTF_8
11981 * - Encoding::UTF_16BE
11982 * - Encoding::UTF_16LE
11983 * - Encoding::UTF_32BE
11984 * - Encoding::UTF_32LE
11985 * - Encoding::GB18030
11986 * - Encoding::UCS_2BE
11987 * - Encoding::UCS_4BE
11988 *
11989 * Examples:
11990 *
11991 * "a\u0300".unicode_normalize # => "\u00E0" (single precomposed character)
11992 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" ("a" followed by a combining grave accent)
11993 *
11994 * Related: String#unicode_normalize!, String#unicode_normalized?.
11995 */
11996static VALUE
11997rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11998{
11999 return unicode_normalize_common(argc, argv, str, id_normalize);
12000}
12001
12002/*
12003 * call-seq:
12004 * unicode_normalize!(form = :nfc) -> self
12005 *
12006 * Like String#unicode_normalize, except that the normalization
12007 * is performed on +self+.
12008 *
12009 * Related: String#unicode_normalized?.
12010 *
12011 */
12012static VALUE
12013rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12014{
12015 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12016}
12017
12018/* call-seq:
12019 * unicode_normalized?(form = :nfc) -> true or false
12020 *
12021 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12022 * +false+ otherwise.
12023 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12024 *
12025 * Examples:
12026 *
12027 * "a\u0300".unicode_normalized? # => false
12028 * "a\u0300".unicode_normalized?(:nfd) # => true
12029 * "\u00E0".unicode_normalized? # => true
12030 * "\u00E0".unicode_normalized?(:nfd) # => false
12031 *
12032 *
12033 * Raises an exception if +self+ is not in a Unicode encoding:
12034 *
12035 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12036 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12037 *
12038 * Related: String#unicode_normalize, String#unicode_normalize!.
12039 *
12040 */
12041static VALUE
12042rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12043{
12044 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12045}
12046
12047/**********************************************************************
12048 * Document-class: Symbol
12049 *
12050 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12051 *
12052 * You can create a +Symbol+ object explicitly with:
12053 *
12054 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12055 *
12056 * The same +Symbol+ object will be
12057 * created for a given name or string for the duration of a program's
12058 * execution, regardless of the context or meaning of that name. Thus
12059 * if <code>Fred</code> is a constant in one context, a method in
12060 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12061 * will be the same object in all three contexts.
12062 *
12063 * module One
12064 * class Fred
12065 * end
12066 * $f1 = :Fred
12067 * end
12068 * module Two
12069 * Fred = 1
12070 * $f2 = :Fred
12071 * end
12072 * def Fred()
12073 * end
12074 * $f3 = :Fred
12075 * $f1.object_id #=> 2514190
12076 * $f2.object_id #=> 2514190
12077 * $f3.object_id #=> 2514190
12078 *
12079 * Constant, method, and variable names are returned as symbols:
12080 *
12081 * module One
12082 * Two = 2
12083 * def three; 3 end
12084 * @four = 4
12085 * @@five = 5
12086 * $six = 6
12087 * end
12088 * seven = 7
12089 *
12090 * One.constants
12091 * # => [:Two]
12092 * One.instance_methods(true)
12093 * # => [:three]
12094 * One.instance_variables
12095 * # => [:@four]
12096 * One.class_variables
12097 * # => [:@@five]
12098 * global_variables.grep(/six/)
12099 * # => [:$six]
12100 * local_variables
12101 * # => [:seven]
12102 *
12103 * A +Symbol+ object differs from a String object in that
12104 * a +Symbol+ object represents an identifier, while a String object
12105 * represents text or data.
12106 *
12107 * == What's Here
12108 *
12109 * First, what's elsewhere. Class +Symbol+:
12110 *
12111 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12112 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12113 *
12114 * Here, class +Symbol+ provides methods that are useful for:
12115 *
12116 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12117 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12118 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12119 *
12120 * === Methods for Querying
12121 *
12122 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12123 * - #=~: Returns the index of the first substring in symbol that matches a
12124 * given Regexp or other object; returns +nil+ if no match is found.
12125 * - #[], #slice : Returns a substring of symbol
12126 * determined by a given index, start/length, or range, or string.
12127 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12128 * - #encoding: Returns the Encoding object that represents the encoding
12129 * of symbol.
12130 * - #end_with?: Returns +true+ if symbol ends with
12131 * any of the given strings.
12132 * - #match: Returns a MatchData object if symbol
12133 * matches a given Regexp; +nil+ otherwise.
12134 * - #match?: Returns +true+ if symbol
12135 * matches a given Regexp; +false+ otherwise.
12136 * - #length, #size: Returns the number of characters in symbol.
12137 * - #start_with?: Returns +true+ if symbol starts with
12138 * any of the given strings.
12139 *
12140 * === Methods for Comparing
12141 *
12142 * - #<=>: Returns -1, 0, or 1 as symbol is smaller than, equal to,
12143 * or larger than a given symbol.
12144 * - #==, #===: Returns +true+ if a given symbol has the same content and
12145 * encoding.
12146 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12147 * is smaller than, equal to, or larger than a given symbol.
12148 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12149 * after Unicode case folding; +false+ otherwise.
12150 *
12151 * === Methods for Converting
12152 *
12153 * - #capitalize: Returns symbol with the first character upcased
12154 * and all other characters downcased.
12155 * - #downcase: Returns symbol with all characters downcased.
12156 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12157 * - #name: Returns the frozen string corresponding to symbol.
12158 * - #succ, #next: Returns the symbol that is the successor to symbol.
12159 * - #swapcase: Returns symbol with all upcase characters downcased
12160 * and all downcase characters upcased.
12161 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12162 * - #to_s, #id2name: Returns the string corresponding to +self+.
12163 * - #to_sym, #intern: Returns +self+.
12164 * - #upcase: Returns symbol with all characters upcased.
12165 *
12166 */
12167
12168
12169/*
12170 * call-seq:
12171 * symbol == object -> true or false
12172 *
12173 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12174 */
12175
12176#define sym_equal rb_obj_equal
/* sym_equal is only an alias: Init_String registers it for both Symbol#==
 * and Symbol#===, so both methods run rb_obj_equal. */
12177
12178static int
12179sym_printable(const char *s, const char *send, rb_encoding *enc)
12180{
12181 while (s < send) {
12182 int n;
12183 int c = rb_enc_precise_mbclen(s, send, enc);
12184
12185 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12186 n = MBCLEN_CHARFOUND_LEN(c);
12187 c = rb_enc_mbc_to_codepoint(s, send, enc);
12188 if (!rb_enc_isprint(c, enc)) return FALSE;
12189 s += n;
12190 }
12191 return TRUE;
12192}
12193
12194int
12195rb_str_symname_p(VALUE sym)
12196{
12197 rb_encoding *enc;
12198 const char *ptr;
12199 long len;
12200 rb_encoding *resenc = rb_default_internal_encoding();
12201
12202 if (resenc == NULL) resenc = rb_default_external_encoding();
12203 enc = STR_ENC_GET(sym);
12204 ptr = RSTRING_PTR(sym);
12205 len = RSTRING_LEN(sym);
12206 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12207 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12208 return FALSE;
12209 }
12210 return TRUE;
12211}
12212
12213VALUE
12214rb_str_quote_unprintable(VALUE str)
12215{
12216 rb_encoding *enc;
12217 const char *ptr;
12218 long len;
12219 rb_encoding *resenc;
12220
12221 Check_Type(str, T_STRING);
12222 resenc = rb_default_internal_encoding();
12223 if (resenc == NULL) resenc = rb_default_external_encoding();
12224 enc = STR_ENC_GET(str);
12225 ptr = RSTRING_PTR(str);
12226 len = RSTRING_LEN(str);
12227 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12228 !sym_printable(ptr, ptr + len, enc)) {
12229 return rb_str_escape(str);
12230 }
12231 return str;
12232}
12233
12234VALUE
12235rb_id_quote_unprintable(ID id)
12236{
12237 VALUE str = rb_id2str(id);
12238 if (!rb_str_symname_p(str)) {
12239 return rb_str_escape(str);
12240 }
12241 return str;
12242}
12243
12244/*
12245 * call-seq:
12246 * inspect -> string
12247 *
12248 * Returns a string representation of +self+ (including the leading colon):
12249 *
12250 * :foo.inspect # => ":foo"
12251 *
12252 * Related: Symbol#to_s, Symbol#name.
12253 *
12254 */
12255
12256static VALUE
12257sym_inspect(VALUE sym)
12258{
12259 VALUE str = rb_sym2str(sym);
12260 const char *ptr;
12261 long len;
12262 char *dest;
12263
12264 if (!rb_str_symname_p(str)) {
12265 str = rb_str_inspect(str);
12266 len = RSTRING_LEN(str);
12267 rb_str_resize(str, len + 1);
12268 dest = RSTRING_PTR(str);
12269 memmove(dest + 1, dest, len);
12270 }
12271 else {
12272 rb_encoding *enc = STR_ENC_GET(str);
12273 VALUE orig_str = str;
12274
12275 len = RSTRING_LEN(orig_str);
12276 str = rb_enc_str_new(0, len + 1, enc);
12277
12278 // Get data pointer after allocation
12279 ptr = RSTRING_PTR(orig_str);
12280 dest = RSTRING_PTR(str);
12281 memcpy(dest + 1, ptr, len);
12282
12283 RB_GC_GUARD(orig_str);
12284 }
12285 dest[0] = ':';
12286
12288
12289 return str;
12290}
12291
12292VALUE
12294{
12295 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12296 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12297 return str;
12298}
12299
12300VALUE
12301rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12302{
12303 VALUE obj;
12304
12305 if (argc < 1) {
12306 rb_raise(rb_eArgError, "no receiver given");
12307 }
12308 obj = argv[0];
12309 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12310}
12311
12312/*
12313 * call-seq:
12314 * succ
12315 *
12316 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12317 *
12318 * :foo.succ # => :fop
12319 *
12320 * Related: String#succ.
12321 */
12322
12323static VALUE
12324sym_succ(VALUE sym)
12325{
12326 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12327}
12328
12329/*
12330 * call-seq:
12331 * symbol <=> object -> -1, 0, +1, or nil
12332 *
12333 * If +object+ is a symbol,
12334 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12335 *
12336 * :bar <=> :foo # => -1
12337 * :foo <=> :foo # => 0
12338 * :foo <=> :bar # => 1
12339 *
12340 * Otherwise, returns +nil+:
12341 *
12342 * :foo <=> 'bar' # => nil
12343 *
12344 * Related: String#<=>.
12345 */
12346
12347static VALUE
12348sym_cmp(VALUE sym, VALUE other)
12349{
12350 if (!SYMBOL_P(other)) {
12351 return Qnil;
12352 }
12353 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12354}
12355
12356/*
12357 * call-seq:
12358 * casecmp(object) -> -1, 0, 1, or nil
12359 *
12360 * :include: doc/symbol/casecmp.rdoc
12361 *
12362 */
12363
12364static VALUE
12365sym_casecmp(VALUE sym, VALUE other)
12366{
12367 if (!SYMBOL_P(other)) {
12368 return Qnil;
12369 }
12370 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12371}
12372
12373/*
12374 * call-seq:
12375 * casecmp?(object) -> true, false, or nil
12376 *
12377 * :include: doc/symbol/casecmp_p.rdoc
12378 *
12379 */
12380
12381static VALUE
12382sym_casecmp_p(VALUE sym, VALUE other)
12383{
12384 if (!SYMBOL_P(other)) {
12385 return Qnil;
12386 }
12387 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12388}
12389
12390/*
12391 * call-seq:
12392 * symbol =~ object -> integer or nil
12393 *
12394 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12395 * including possible updates to global variables;
12396 * see String#=~.
12397 *
12398 */
12399
12400static VALUE
12401sym_match(VALUE sym, VALUE other)
12402{
12403 return rb_str_match(rb_sym2str(sym), other);
12404}
12405
12406/*
12407 * call-seq:
12408 * match(pattern, offset = 0) -> matchdata or nil
12409 * match(pattern, offset = 0) {|matchdata| } -> object
12410 *
12411 * Equivalent to <tt>self.to_s.match</tt>,
12412 * including possible updates to global variables;
12413 * see String#match.
12414 *
12415 */
12416
12417static VALUE
12418sym_match_m(int argc, VALUE *argv, VALUE sym)
12419{
12420 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12421}
12422
12423/*
12424 * call-seq:
12425 * match?(pattern, offset) -> true or false
12426 *
12427 * Equivalent to <tt>sym.to_s.match?</tt>;
12428 * see String#match?.
12429 *
12430 */
12431
12432static VALUE
12433sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12434{
12435 return rb_str_match_m_p(argc, argv, sym);
12436}
12437
12438/*
12439 * call-seq:
12440 * symbol[index] -> string or nil
12441 * symbol[start, length] -> string or nil
12442 * symbol[range] -> string or nil
12443 * symbol[regexp, capture = 0] -> string or nil
12444 * symbol[substring] -> string or nil
12445 *
12446 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12447 *
12448 */
12449
12450static VALUE
12451sym_aref(int argc, VALUE *argv, VALUE sym)
12452{
12453 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12454}
12455
12456/*
12457 * call-seq:
12458 * length -> integer
12459 *
12460 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12461 */
12462
12463static VALUE
12464sym_length(VALUE sym)
12465{
12466 return rb_str_length(rb_sym2str(sym));
12467}
12468
12469/*
12470 * call-seq:
12471 * empty? -> true or false
12472 *
12473 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12474 *
12475 */
12476
12477static VALUE
12478sym_empty(VALUE sym)
12479{
12480 return rb_str_empty(rb_sym2str(sym));
12481}
12482
12483/*
12484 * call-seq:
12485 * upcase(mapping) -> symbol
12486 *
12487 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12488 *
12489 * See String#upcase.
12490 *
12491 */
12492
12493static VALUE
12494sym_upcase(int argc, VALUE *argv, VALUE sym)
12495{
12496 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12497}
12498
12499/*
12500 * call-seq:
12501 * downcase(mapping) -> symbol
12502 *
12503 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12504 *
12505 * See String#downcase.
12506 *
12507 * Related: Symbol#upcase.
12508 *
12509 */
12510
12511static VALUE
12512sym_downcase(int argc, VALUE *argv, VALUE sym)
12513{
12514 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12515}
12516
12517/*
12518 * call-seq:
12519 * capitalize(mapping) -> symbol
12520 *
12521 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12522 *
12523 * See String#capitalize.
12524 *
12525 */
12526
12527static VALUE
12528sym_capitalize(int argc, VALUE *argv, VALUE sym)
12529{
12530 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12531}
12532
12533/*
12534 * call-seq:
12535 * swapcase(mapping) -> symbol
12536 *
12537 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12538 *
12539 * See String#swapcase.
12540 *
12541 */
12542
12543static VALUE
12544sym_swapcase(int argc, VALUE *argv, VALUE sym)
12545{
12546 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12547}
12548
12549/*
12550 * call-seq:
12551 * start_with?(*string_or_regexp) -> true or false
12552 *
12553 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12554 *
12555 */
12556
12557static VALUE
12558sym_start_with(int argc, VALUE *argv, VALUE sym)
12559{
12560 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12561}
12562
12563/*
12564 * call-seq:
12565 * end_with?(*strings) -> true or false
12566 *
12567 *
12568 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12569 *
12570 */
12571
12572static VALUE
12573sym_end_with(int argc, VALUE *argv, VALUE sym)
12574{
12575 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12576}
12577
12578/*
12579 * call-seq:
12580 * encoding -> encoding
12581 *
12582 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12583 *
12584 */
12585
12586static VALUE
12587sym_encoding(VALUE sym)
12588{
12589 return rb_obj_encoding(rb_sym2str(sym));
12590}
12591
12592static VALUE
12593string_for_symbol(VALUE name)
12594{
12595 if (!RB_TYPE_P(name, T_STRING)) {
12596 VALUE tmp = rb_check_string_type(name);
12597 if (NIL_P(tmp)) {
12598 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12599 name);
12600 }
12601 name = tmp;
12602 }
12603 return name;
12604}
12605
12606ID
12608{
12609 if (SYMBOL_P(name)) {
12610 return SYM2ID(name);
12611 }
12612 name = string_for_symbol(name);
12613 return rb_intern_str(name);
12614}
12615
12616VALUE
12618{
12619 if (SYMBOL_P(name)) {
12620 return name;
12621 }
12622 name = string_for_symbol(name);
12623 return rb_str_intern(name);
12624}
12625
12626/*
12627 * call-seq:
12628 * Symbol.all_symbols -> array_of_symbols
12629 *
12630 * Returns an array of all symbols currently in Ruby's symbol table:
12631 *
12632 * Symbol.all_symbols.size # => 9334
12633 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12634 *
12635 */
12636
12637static VALUE
12638sym_all_symbols(VALUE _)
12639{
12640 return rb_sym_all_symbols();
12641}
12642
12643VALUE
12644rb_str_to_interned_str(VALUE str)
12645{
12646 return rb_fstring(str);
12647}
12648
12649VALUE
12650rb_interned_str(const char *ptr, long len)
12651{
12652 struct RString fake_str;
12653 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12654}
12655
12656VALUE
12658{
12659 return rb_interned_str(ptr, strlen(ptr));
12660}
12661
12662VALUE
12663rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12664{
12665 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12666 rb_enc_autoload(enc);
12667 }
12668
12669 struct RString fake_str;
12670 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12671}
12672
12673VALUE
12674rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12675{
12676 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12677 rb_enc_autoload(enc);
12678 }
12679
12680 struct RString fake_str;
12681 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12682}
12683
12684VALUE
12686{
12687 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12688}
12689
#if USE_YJIT
/*
 * Appends codepoint to str.
 *
 * When the inlined encoding index of str equals rb_ascii8bit_encindex() and
 * the codepoint value lies in 0...0xff (0xff itself excluded), the value is
 * appended as a single byte with rb_str_buf_cat_byte().  Every other case
 * goes to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    const int binary_p = ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex();

    if (RB_LIKELY(binary_p)) {
        ssize_t code = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(code >= 0 && code < 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    /* general path */
    rb_str_concat(str, codepoint);
}
#endif
12706
12707static int
12708fstring_set_class_i(VALUE *str, void *data)
12709{
12710 RBASIC_SET_CLASS(*str, rb_cString);
12711
12712 return ST_CONTINUE;
12713}
12714
12715void
12716Init_String(void)
12717{
12718 rb_cString = rb_define_class("String", rb_cObject);
12719
12720 rb_concurrent_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12721
12723 rb_define_alloc_func(rb_cString, empty_str_alloc);
12724 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12725 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12726 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12727 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12728 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12731 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12732 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12733 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12734 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12737 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12738 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12739 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12740 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12743 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12744 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12745 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12746 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12747 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12749 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12751 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12752 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12753 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12754 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12755 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12756 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12758 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12759 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12760 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12761 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12762 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12763 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12764 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12765 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12767 rb_define_method(rb_cString, "+@", str_uplus, 0);
12768 rb_define_method(rb_cString, "-@", str_uminus, 0);
12769 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12770 rb_define_alias(rb_cString, "dedup", "-@");
12771
12772 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12773 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12774 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12775 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12778 rb_define_method(rb_cString, "undump", str_undump, 0);
12779
12780 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12781 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12782 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12783 sym_fold = ID2SYM(rb_intern_const("fold"));
12784
12785 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12786 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12787 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12788 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12789
12790 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12791 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12792 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12793 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12794
12795 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12796 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12797 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12798 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12799 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12800 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12801 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12802 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12803 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12804 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12805 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12806 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12808 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12809 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12810 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12811 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12812 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12813
12814 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12815 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12816 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12817
12818 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12819
12820 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12821 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12822 rb_define_method(rb_cString, "center", rb_str_center, -1);
12823
12824 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12825 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12826 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12827 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12828 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12829 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12830 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12831 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12832 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12833
12834 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12835 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12836 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12837 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12838 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12839 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12840 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12841 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12842 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12843
12844 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12845 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12846 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12847 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12848 rb_define_method(rb_cString, "count", rb_str_count, -1);
12849
12850 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12851 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12852 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12853 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12854
12855 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12856 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12857 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12858 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12859 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12860
12861 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12862
12863 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12864 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12865
12866 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12867 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12868
12869 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12870 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12871 rb_define_method(rb_cString, "b", rb_str_b, 0);
12872 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12873 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12874
12875 /* define UnicodeNormalize module here so that we don't have to look it up */
12876 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12877 id_normalize = rb_intern_const("normalize");
12878 id_normalized_p = rb_intern_const("normalized?");
12879
12880 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12881 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12882 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12883
12884 rb_fs = Qnil;
12885 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12886 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12887 rb_gc_register_address(&rb_fs);
12888
12889 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12893 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12894
12895 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12896 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12897 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12898 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12899 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12900 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12901
12902 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12903 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12904 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12905 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12906
12907 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12908 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12909 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12910 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12911 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12912 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12913 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12914
12915 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12916 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12917 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12918 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12919
12920 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12921 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12922
12923 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12924}
12925
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1696
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1479
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1597
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2663
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:943
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2125
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2143
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1311
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3539
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:243
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:175
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1299
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3223
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1316
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:931
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1181
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2986
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1200
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12663
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2293
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3690
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1129
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1421
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1322
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:950
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12685
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:815
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:444
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1490
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2670
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2934
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1746
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:700
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1861
VALUE rb_sym_all_symbols(void)
Collects every single bits of symbols that have ever interned in the entire history of the current pr...
Definition symbol.c:1071
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1867
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1236
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4225
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3722
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1490
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1927
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1716
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1486
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2445
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3755
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1397
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12293
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2518
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1373
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1710
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3014
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5415
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4131
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3111
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11592
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1768
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1752
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1163
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:985
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1492
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1955
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4117
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3523
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2382
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1973
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6653
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3119
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12657
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1403
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:3721
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3061
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4240
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3345
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7328
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2748
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12650
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4187
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4004
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:4162
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3697
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3236
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5925
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11650
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1666
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2908
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3208
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3327
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1175
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2702
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7442
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1385
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1682
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2396
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5843
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9539
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1169
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:911
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1814
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2090
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2167
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3094
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1419
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:999
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12617
ID rb_to_id(VALUE str)
Definition string.c:12607
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1866
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3501
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4469
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1415
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2885
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2767
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1409
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2780
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1743
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1586
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:203
Definition string.c:8401
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:295
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the predicate fails.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113