Ruby 3.5.0dev (2025-07-05 revision b6817392957b8879d2f847280abd481f4cd062fe)
string.c (b6817392957b8879d2f847280abd481f4cd062fe)
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/hash.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/ractor_safe_set.h"
39#include "internal/re.h"
40#include "internal/sanitizers.h"
41#include "internal/string.h"
42#include "internal/transcode.h"
43#include "probes.h"
44#include "ruby/encoding.h"
45#include "ruby/re.h"
46#include "ruby/thread.h"
47#include "ruby/util.h"
48#include "ruby_assert.h"
49#include "shape.h"
50#include "vm_sync.h"
52
53#if defined HAVE_CRYPT_R
54# if defined HAVE_CRYPT_H
55# include <crypt.h>
56# endif
57#elif !defined HAVE_CRYPT
58# include "missing/crypt.h"
59# define HAVE_CRYPT_R 1
60#endif
61
62#define BEG(no) (regs->beg[(no)])
63#define END(no) (regs->end[(no)])
64
65#undef rb_str_new
66#undef rb_usascii_str_new
67#undef rb_utf8_str_new
68#undef rb_enc_str_new
69#undef rb_str_new_cstr
70#undef rb_usascii_str_new_cstr
71#undef rb_utf8_str_new_cstr
72#undef rb_enc_str_new_cstr
73#undef rb_external_str_new_cstr
74#undef rb_locale_str_new_cstr
75#undef rb_str_dup_frozen
76#undef rb_str_buf_new_cstr
77#undef rb_str_buf_cat
78#undef rb_str_buf_cat2
79#undef rb_str_cat2
80#undef rb_str_cat_cstr
81#undef rb_fstring_cstr
82
85
86/* Flags of RString
87 *
88 * 0: STR_SHARED (equal to ELTS_SHARED)
89 * The string is shared. The buffer this string points to is owned by
90 * another string (the shared root).
91 * 1: RSTRING_NOEMBED
92 * The string is not embedded. When a string is embedded, the contents
93 * follow the header. When a string is not embedded, the contents are
94 * on a separately allocated buffer.
95 * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
96 * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
97 * It emits a deprecation warning when mutated for the first time.
98 * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
99 * The string was allocated by the `Symbol#to_s` method.
100 * It emits a deprecation warning when mutated for the first time.
101 * 4: STR_PRECOMPUTED_HASH
102 * The string is embedded and has its precomputed hashcode stored
103 * after the terminator.
104 * 5: STR_SHARED_ROOT
105 * Other strings may point to the contents of this string. When this
106 * flag is set, STR_SHARED must not be set.
107 * 6: STR_BORROWED
108 * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
109 * to be unshared by rb_str_tmp_frozen_release.
110 * 7: STR_TMPLOCK
111 * The pointer to the buffer is passed to a system call such as
112 * read(2). Any modification and realloc is prohibited.
113 * 8-9: ENC_CODERANGE
114 * Stores the coderange of the string.
115 * 10-16: ENCODING
116 * Stores the encoding of the string.
117 * 17: RSTRING_FSTR
118 * The string is a fstring. The string is deduplicated in the fstring
119 * table.
120 * 18: STR_NOFREE
121 * Do not free this string's buffer when the string is reclaimed
122 * by the garbage collector. Used for when the string buffer is a C
123 * string literal.
124 * 19: STR_FAKESTR
125 * The string is not allocated or managed by the garbage collector.
126 * Typically, the string object header (struct RString) is temporarily
127 * allocated on C stack.
128 */
129
130#define RUBY_MAX_CHAR_LEN 16
131#define STR_PRECOMPUTED_HASH FL_USER4
132#define STR_SHARED_ROOT FL_USER5
133#define STR_BORROWED FL_USER6
134#define STR_TMPLOCK FL_USER7
135#define STR_NOFREE FL_USER18
136#define STR_FAKESTR FL_USER19
137
138#define STR_SET_NOEMBED(str) do {\
139 FL_SET((str), STR_NOEMBED);\
140 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
141} while (0)
142#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
143
144#define STR_SET_LEN(str, n) do { \
145 RSTRING(str)->len = (n); \
146} while (0)
147
148static inline bool
149str_encindex_fastpath(int encindex)
150{
151 // The overwhelming majority of strings are in one of these 3 encodings.
152 switch (encindex) {
153 case ENCINDEX_ASCII_8BIT:
154 case ENCINDEX_UTF_8:
155 case ENCINDEX_US_ASCII:
156 return true;
157 default:
158 return false;
159 }
160}
161
162static inline bool
163str_enc_fastpath(VALUE str)
164{
165 return str_encindex_fastpath(ENCODING_GET_INLINED(str));
166}
167
168#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
169#define TERM_FILL(ptr, termlen) do {\
170 char *const term_fill_ptr = (ptr);\
171 const int term_fill_len = (termlen);\
172 *term_fill_ptr = '\0';\
173 if (UNLIKELY(term_fill_len > 1))\
174 memset(term_fill_ptr, 0, term_fill_len);\
175} while (0)
176
/*
 * RESIZE_CAPA(str, capacity)
 *   Resizes the buffer of `str` to hold `capacity` bytes plus the
 *   terminator required by the encoding of `str`.
 *
 * RESIZE_CAPA_TERM(str, capacity, termlen)
 *   Same, with the terminator length supplied by the caller.
 *   - Embedded string: if the embedded area is too small, the contents
 *     are copied to a freshly allocated heap buffer and the string is
 *     switched to non-embedded; otherwise nothing happens.
 *   - Heap string: the buffer is reallocated.  The string must not be
 *     shared (asserted).
 *
 * `capacity` and `termlen` are evaluated more than once, so pass
 * side-effect free expressions.  Every use is parenthesized so that
 * low-precedence operators in an argument cannot change the comparison
 * or the size computation.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
200
201#define STR_SET_SHARED(str, shared_str) do { \
202 if (!FL_TEST(str, STR_FAKESTR)) { \
203 RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
204 RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
205 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
206 FL_SET((str), STR_SHARED); \
207 FL_SET((shared_str), STR_SHARED_ROOT); \
208 if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
209 FL_SET_RAW((shared_str), STR_BORROWED); \
210 } \
211} while (0)
212
213#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
214#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
215/* TODO: include the terminator size in capa. */
216
217#define STR_ENC_GET(str) get_encoding(str)
218
219#if !defined SHARABLE_MIDDLE_SUBSTRING
220# define SHARABLE_MIDDLE_SUBSTRING 0
221#endif
222#if !SHARABLE_MIDDLE_SUBSTRING
223#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#else
225#define SHARABLE_SUBSTRING_P(beg, len, end) 1
226#endif
227
228
229static inline long
230str_embed_capa(VALUE str)
231{
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233}
234
235bool
236rb_str_reembeddable_p(VALUE str)
237{
238 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
239}
240
241static inline size_t
242rb_str_embed_size(long capa)
243{
244 return offsetof(struct RString, as.embed.ary) + capa;
245}
246
247size_t
248rb_str_size_as_embedded(VALUE str)
249{
250 size_t real_size;
251 if (STR_EMBED_P(str)) {
252 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
253 }
254 /* if the string is not currently embedded, but it can be embedded, how
255 * much space would it require */
256 else if (rb_str_reembeddable_p(str)) {
257 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
258 }
259 else {
260 real_size = sizeof(struct RString);
261 }
262
263 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
264 real_size += sizeof(st_index_t);
265 }
266
267 return real_size;
268}
269
/*
 * True when the GC can allocate a slot large enough to embed `len` bytes
 * of contents plus a `termlen`-byte terminator.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
275
276static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
277static VALUE str_new_frozen(VALUE klass, VALUE orig);
278static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
279static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
280static VALUE str_new(VALUE klass, const char *ptr, long len);
281static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
282static inline void str_modifiable(VALUE str);
283static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
284static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
285
286static inline void
287str_make_independent(VALUE str)
288{
289 long len = RSTRING_LEN(str);
290 int termlen = TERM_LEN(str);
291 str_make_independent_expand((str), len, 0L, termlen);
292}
293
294static inline int str_dependent_p(VALUE str);
295
296void
297rb_str_make_independent(VALUE str)
298{
299 if (str_dependent_p(str)) {
300 str_make_independent(str);
301 }
302}
303
304void
305rb_str_make_embedded(VALUE str)
306{
307 RUBY_ASSERT(rb_str_reembeddable_p(str));
308 RUBY_ASSERT(!STR_EMBED_P(str));
309
310 char *buf = RSTRING(str)->as.heap.ptr;
311 long len = RSTRING(str)->len;
312
313 STR_SET_EMBED(str);
314 STR_SET_LEN(str, len);
315
316 if (len > 0) {
317 memcpy(RSTRING_PTR(str), buf, len);
318 ruby_xfree(buf);
319 }
320
321 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
322}
323
/*
 * Debugging aid: prints to stderr a warning that `func` is about to
 * return NULL.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    static const char warning_fmt[] =
        "%s is returning NULL!! "
        "SIGSEGV is highly expected to follow immediately.\n"
        "If you could reproduce, attach your debugger here, "
        "and look at the passed string.\n";

    fprintf(stderr, warning_fmt, func);
}
333
334/* symbols for [up|down|swap]case/capitalize options */
335static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
336
337static rb_encoding *
338get_encoding(VALUE str)
339{
340 return rb_enc_from_index(ENCODING_GET(str));
341}
342
343static void
344mustnot_broken(VALUE str)
345{
346 if (is_broken_string(str)) {
347 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
348 }
349}
350
351static void
352mustnot_wchar(VALUE str)
353{
354 rb_encoding *enc = STR_ENC_GET(str);
355 if (rb_enc_mbminlen(enc) > 1) {
356 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
357 }
358}
359
360static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
361
362#if SIZEOF_LONG == SIZEOF_VOIDP
363#define PRECOMPUTED_FAKESTR_HASH 1
364#else
365#endif
366
367static inline bool
368BARE_STRING_P(VALUE str)
369{
370 return RBASIC_CLASS(str) == rb_cString && !rb_shape_obj_has_ivars(str);
371}
372
373static inline st_index_t
374str_do_hash(VALUE str)
375{
376 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
377 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
378 if (e && !is_ascii_string(str)) {
379 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
380 }
381 return h;
382}
383
384static VALUE
385str_store_precomputed_hash(VALUE str, st_index_t hash)
386{
387 RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
388 RUBY_ASSERT(STR_EMBED_P(str));
389
390#if RUBY_DEBUG
391 size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
392 size_t free_bytes = str_embed_capa(str) - used_bytes;
393 RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
394#endif
395
396 memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
397
398 FL_SET(str, STR_PRECOMPUTED_HASH);
399
400 return str;
401}
402
403VALUE
404rb_fstring(VALUE str)
405{
406 VALUE fstr;
407 int bare;
408
409 Check_Type(str, T_STRING);
410
411 if (FL_TEST(str, RSTRING_FSTR))
412 return str;
413
414 bare = BARE_STRING_P(str);
415 if (!bare) {
416 if (STR_EMBED_P(str)) {
417 OBJ_FREEZE(str);
418 return str;
419 }
420
421 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
423 return str;
424 }
425 }
426
427 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
428 rb_str_resize(str, RSTRING_LEN(str));
429
430 fstr = register_fstring(str, false, false);
431
432 if (!bare) {
433 str_replace_shared_without_enc(str, fstr);
434 OBJ_FREEZE(str);
435 return str;
436 }
437 return fstr;
438}
439
440static VALUE fstring_table_obj;
441
442static VALUE
443fstring_ractor_safe_set_hash(VALUE str)
444{
445#ifdef PRECOMPUTED_FAKESTR_HASH
446 st_index_t h;
447 if (FL_TEST_RAW(str, STR_FAKESTR)) {
448 // register_fstring precomputes the hash and stores it in capa for fake strings
449 h = (st_index_t)RSTRING(str)->as.heap.aux.capa;
450 }
451 else {
452 h = rb_str_hash(str);
453 }
454 // rb_str_hash doesn't include the encoding for ascii only strings, so
455 // we add it to avoid common collisions between `:sym.name` (ASCII) and `"sym"` (UTF-8)
456 return (VALUE)rb_hash_end(rb_hash_uint32(h, (uint32_t)ENCODING_GET_INLINED(str)));
457#else
458 return (VALUE)rb_str_hash(str);
459#endif
460}
461
462static bool
463fstring_ractor_safe_set_cmp(VALUE a, VALUE b)
464{
465 long alen, blen;
466 const char *aptr, *bptr;
467
470
471 RSTRING_GETMEM(a, aptr, alen);
472 RSTRING_GETMEM(b, bptr, blen);
473 return (alen == blen &&
474 ENCODING_GET(a) == ENCODING_GET(b) &&
475 memcmp(aptr, bptr, alen) == 0);
476}
477
/*
 * Options handed to fstring_ractor_safe_set_create() through the `data`
 * pointer of the fstring table.
 */
struct fstr_create_arg {
    /* For a fake string: copy its bytes into a new string (true) or wrap
     * the existing pointer as a static string (false). */
    bool copy;
    /* With `copy`: prefer an embedded allocation with room for the hash
     * and store the precomputed hash in it. */
    bool force_precompute_hash;
};
482
483static VALUE
484fstring_ractor_safe_set_create(VALUE str, void *data)
485{
486 struct fstr_create_arg *arg = data;
487
488 // Unless the string is empty or binary, its coderange has been precomputed.
489 int coderange = ENC_CODERANGE(str);
490
491 if (FL_TEST_RAW(str, STR_FAKESTR)) {
492 if (arg->copy) {
493 VALUE new_str;
494 long len = RSTRING_LEN(str);
495 long capa = len + sizeof(st_index_t);
496 int term_len = TERM_LEN(str);
497
498 if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
499 new_str = str_alloc_embed(rb_cString, capa + term_len);
500 memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
501 STR_SET_LEN(new_str, RSTRING_LEN(str));
502 TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
503 rb_enc_copy(new_str, str);
504 str_store_precomputed_hash(new_str, str_do_hash(str));
505 }
506 else {
507 new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
508 rb_enc_copy(new_str, str);
509#ifdef PRECOMPUTED_FAKESTR_HASH
510 if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
511 str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
512 }
513#endif
514 }
515 str = new_str;
516 }
517 else {
518 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
519 RSTRING(str)->len,
520 ENCODING_GET(str));
521 }
522 OBJ_FREEZE(str);
523 }
524 else {
525 if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
526 str = str_new_frozen(rb_cString, str);
527 }
528 if (STR_SHARED_P(str)) { /* str should not be shared */
529 /* shared substring */
530 str_make_independent(str);
532 }
533 if (!BARE_STRING_P(str)) {
534 str = str_new_frozen(rb_cString, str);
535 }
536 }
537
538 ENC_CODERANGE_SET(str, coderange);
539 RBASIC(str)->flags |= RSTRING_FSTR;
540
543 RUBY_ASSERT(!FL_TEST_RAW(str, STR_FAKESTR));
544 RUBY_ASSERT(!rb_obj_exivar_p(str));
546 RUBY_ASSERT(!rb_objspace_garbage_object_p(str));
547
548 return str;
549}
550
551static struct rb_ractor_safe_set_funcs fstring_ractor_safe_set_funcs = {
552 .hash = fstring_ractor_safe_set_hash,
553 .cmp = fstring_ractor_safe_set_cmp,
554 .create = fstring_ractor_safe_set_create,
555};
556
557void
558Init_fstring_table(void)
559{
560 fstring_table_obj = rb_ractor_safe_set_new(&fstring_ractor_safe_set_funcs, 8192);
561 rb_gc_register_address(&fstring_table_obj);
562}
563
564static VALUE
565register_fstring(VALUE str, bool copy, bool force_precompute_hash)
566{
567 struct fstr_create_arg args = {
568 .copy = copy,
569 .force_precompute_hash = force_precompute_hash
570 };
571
572#if SIZEOF_VOIDP == SIZEOF_LONG
573 if (FL_TEST_RAW(str, STR_FAKESTR)) {
574 // if the string hasn't been interned, we'll need the hash twice, so we
575 // compute it once and store it in capa
576 RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
577 }
578#endif
579
580 VALUE result = rb_ractor_safe_set_find_or_insert(&fstring_table_obj, str, &args);
581
582 RUBY_ASSERT(!rb_objspace_garbage_object_p(result));
584 RUBY_ASSERT(OBJ_FROZEN(result));
585 RUBY_ASSERT(!FL_TEST_RAW(result, STR_FAKESTR));
587
588 return result;
589}
590
591bool
592rb_obj_is_fstring_table(VALUE obj)
593{
594 ASSERT_vm_locking();
595
596 return obj == fstring_table_obj;
597}
598
599void
600rb_gc_free_fstring(VALUE obj)
601{
602 // Assume locking and barrier (which there is no assert for)
603 ASSERT_vm_locking();
604
605 rb_ractor_safe_set_delete_by_identity(fstring_table_obj, obj);
606
607 RB_DEBUG_COUNTER_INC(obj_str_fstr);
608
609 FL_UNSET(obj, RSTRING_FSTR);
610}
611
612void
613rb_fstring_foreach_with_replace(int (*callback)(VALUE *str, void *data), void *data)
614{
615 if (fstring_table_obj) {
616 rb_ractor_safe_set_foreach_with_replace(fstring_table_obj, callback, data);
617 }
618}
619
620static VALUE
621setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
622{
623 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
624 RBASIC_SET_SHAPE_ID((VALUE)fake_str, ROOT_SHAPE_ID);
625
626 if (!name) {
628 name = "";
629 }
630
631 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
632
633 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
634 fake_str->len = len;
635 fake_str->as.heap.ptr = (char *)name;
636 fake_str->as.heap.aux.capa = len;
637 return (VALUE)fake_str;
638}
639
640/*
641 * set up a fake string which refers a static string literal.
642 */
643VALUE
644rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
645{
646 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
647}
648
649/*
650 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
651 * shared string which refers a static string literal. `ptr` must
652 * point a constant string.
653 */
654VALUE
655rb_fstring_new(const char *ptr, long len)
656{
657 struct RString fake_str;
658 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
659}
660
661VALUE
662rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
663{
664 struct RString fake_str;
665 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
666}
667
668VALUE
669rb_fstring_cstr(const char *ptr)
670{
671 return rb_fstring_new(ptr, strlen(ptr));
672}
673
674static inline bool
675single_byte_optimizable(VALUE str)
676{
677 int encindex = ENCODING_GET(str);
678 switch (encindex) {
679 case ENCINDEX_ASCII_8BIT:
680 case ENCINDEX_US_ASCII:
681 return true;
682 case ENCINDEX_UTF_8:
683 // For UTF-8 it's worth scanning the string coderange when unknown.
685 }
686 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
687 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
688 return true;
689 }
690
691 if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
692 return true;
693 }
694
695 /* Conservative. Possibly single byte.
696 * "\xa1" in Shift_JIS for example. */
697 return false;
698}
699
701
/*
 * Scans the byte range [p, e) for the first byte with its high bit
 * (0x80) set.
 *
 * Returns a pointer to that byte, or NULL when no byte in the range has
 * the bit set.
 *
 * The bulk of the range is examined one machine word at a time through
 * NONASCII_MASK; leading and trailing bytes that do not form a whole
 * word are examined individually by the two switch statements, whose
 * cases deliberately fall through.
 */
702static inline const char *
703search_nonascii(const char *p, const char *e)
704{
705 const uintptr_t *s, *t;
706
/* NONASCII_MASK has bit 7 set in every byte of a uintptr_t. */
707#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
708# if SIZEOF_UINTPTR_T == 8
709# define NONASCII_MASK UINT64_C(0x8080808080808080)
710# elif SIZEOF_UINTPTR_T == 4
711# define NONASCII_MASK UINT32_C(0x80808080)
712# else
713# error "don't know what to do."
714# endif
715#else
716# if SIZEOF_UINTPTR_T == 8
717# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
718# elif SIZEOF_UINTPTR_T == 4
719# define NONASCII_MASK 0x80808080UL /* or...? */
720# else
721# error "don't know what to do."
722# endif
723#endif
724
/* Word-wise scan; without unaligned access it is only entered when at
 * least SIZEOF_VOIDP bytes remain. */
725 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
726#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking each skipped byte;
 * l is the number of bytes skipped. */
727 if ((uintptr_t)p % SIZEOF_VOIDP) {
728 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
729 p += l;
730 switch (l) {
731 default: UNREACHABLE;
732#if SIZEOF_VOIDP > 4
733 case 7: if (p[-7]&0x80) return p-7;
734 case 6: if (p[-6]&0x80) return p-6;
735 case 5: if (p[-5]&0x80) return p-5;
736 case 4: if (p[-4]&0x80) return p-4;
737#endif
738 case 3: if (p[-3]&0x80) return p-3;
739 case 2: if (p[-2]&0x80) return p-2;
740 case 1: if (p[-1]&0x80) return p-1;
741 case 0: break;
742 }
743 }
744#endif
745#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
746#define aligned_ptr(value) \
747 __builtin_assume_aligned((value), sizeof(uintptr_t))
748#else
749#define aligned_ptr(value) (uintptr_t *)(value)
750#endif
751 s = aligned_ptr(p);
/* t bounds the loop so that every word read lies entirely before e. */
752 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
753#undef aligned_ptr
754 for (;s < t; s++) {
755 if (*s & NONASCII_MASK) {
/* The bit position of the first flagged bit, divided by 8 (>>3), gives
 * the byte offset inside the word; which end is "first" depends on
 * byte order. */
756#ifdef WORDS_BIGENDIAN
757 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
758#else
759 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
760#endif
761 }
762 }
763 p = (const char *)s;
764 }
765
/* Fewer than one word remains: check the last e - p bytes one by one. */
766 switch (e - p) {
767 default: UNREACHABLE;
768#if SIZEOF_VOIDP > 4
769 case 7: if (e[-7]&0x80) return e-7;
770 case 6: if (e[-6]&0x80) return e-6;
771 case 5: if (e[-5]&0x80) return e-5;
772 case 4: if (e[-4]&0x80) return e-4;
773#endif
774 case 3: if (e[-3]&0x80) return e-3;
775 case 2: if (e[-2]&0x80) return e-2;
776 case 1: if (e[-1]&0x80) return e-1;
777 case 0: return NULL;
778 }
779}
780
781static int
782coderange_scan(const char *p, long len, rb_encoding *enc)
783{
784 const char *e = p + len;
785
786 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
787 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
788 p = search_nonascii(p, e);
790 }
791
792 if (rb_enc_asciicompat(enc)) {
793 p = search_nonascii(p, e);
794 if (!p) return ENC_CODERANGE_7BIT;
795 for (;;) {
796 int ret = rb_enc_precise_mbclen(p, e, enc);
798 p += MBCLEN_CHARFOUND_LEN(ret);
799 if (p == e) break;
800 p = search_nonascii(p, e);
801 if (!p) break;
802 }
803 }
804 else {
805 while (p < e) {
806 int ret = rb_enc_precise_mbclen(p, e, enc);
808 p += MBCLEN_CHARFOUND_LEN(ret);
809 }
810 }
811 return ENC_CODERANGE_VALID;
812}
813
814long
815rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
816{
817 const char *p = s;
818
819 if (*cr == ENC_CODERANGE_BROKEN)
820 return e - s;
821
822 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
823 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
824 if (*cr == ENC_CODERANGE_VALID) return e - s;
825 p = search_nonascii(p, e);
827 return e - s;
828 }
829 else if (rb_enc_asciicompat(enc)) {
830 p = search_nonascii(p, e);
831 if (!p) {
832 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
833 return e - s;
834 }
835 for (;;) {
836 int ret = rb_enc_precise_mbclen(p, e, enc);
837 if (!MBCLEN_CHARFOUND_P(ret)) {
839 return p - s;
840 }
841 p += MBCLEN_CHARFOUND_LEN(ret);
842 if (p == e) break;
843 p = search_nonascii(p, e);
844 if (!p) break;
845 }
846 }
847 else {
848 while (p < e) {
849 int ret = rb_enc_precise_mbclen(p, e, enc);
850 if (!MBCLEN_CHARFOUND_P(ret)) {
852 return p - s;
853 }
854 p += MBCLEN_CHARFOUND_LEN(ret);
855 }
856 }
858 return e - s;
859}
860
861static inline void
862str_enc_copy(VALUE str1, VALUE str2)
863{
864 rb_enc_set_index(str1, ENCODING_GET(str2));
865}
866
867/* Like str_enc_copy, but does not check frozen status of str1.
868 * You should use this only if you're certain that str1 is not frozen. */
869static inline void
870str_enc_copy_direct(VALUE str1, VALUE str2)
871{
872 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
873 if (inlined_encoding == ENCODING_INLINE_MAX) {
874 rb_enc_set_index(str1, rb_enc_get_index(str2));
875 }
876 else {
877 ENCODING_SET_INLINED(str1, inlined_encoding);
878 }
879}
880
881static void
882rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
883{
884 /* this function is designed for copying encoding and coderange
885 * from src to new string "dest" which is made from the part of src.
886 */
887 str_enc_copy(dest, src);
888 if (RSTRING_LEN(dest) == 0) {
889 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
891 else
893 return;
894 }
895 switch (ENC_CODERANGE(src)) {
898 break;
900 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
901 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
903 else
905 break;
906 default:
907 break;
908 }
909}
910
911static void
912rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
913{
914 str_enc_copy(dest, src);
916}
917
918static int
919enc_coderange_scan(VALUE str, rb_encoding *enc)
920{
921 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
922}
923
924int
925rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
926{
927 return enc_coderange_scan(str, enc);
928}
929
930int
932{
933 int cr = ENC_CODERANGE(str);
934
935 if (cr == ENC_CODERANGE_UNKNOWN) {
936 cr = enc_coderange_scan(str, get_encoding(str));
937 ENC_CODERANGE_SET(str, cr);
938 }
939 return cr;
940}
941
942static inline bool
943rb_enc_str_asciicompat(VALUE str)
944{
945 int encindex = ENCODING_GET_INLINED(str);
946 return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
947}
948
949int
951{
952 switch(ENC_CODERANGE(str)) {
954 return rb_enc_str_asciicompat(str) && is_ascii_string(str);
956 return true;
957 default:
958 return false;
959 }
960}
961
962static inline void
963str_mod_check(VALUE s, const char *p, long len)
964{
965 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
966 rb_raise(rb_eRuntimeError, "string modified");
967 }
968}
969
970static size_t
971str_capacity(VALUE str, const int termlen)
972{
973 if (STR_EMBED_P(str)) {
974 return str_embed_capa(str) - termlen;
975 }
976 else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
977 return RSTRING(str)->len;
978 }
979 else {
980 return RSTRING(str)->as.heap.aux.capa;
981 }
982}
983
984size_t
986{
987 return str_capacity(str, TERM_LEN(str));
988}
989
990static inline void
991must_not_null(const char *ptr)
992{
993 if (!ptr) {
994 rb_raise(rb_eArgError, "NULL pointer given");
995 }
996}
997
998static inline VALUE
999str_alloc_embed(VALUE klass, size_t capa)
1000{
1001 size_t size = rb_str_embed_size(capa);
1002 RUBY_ASSERT(size > 0);
1003 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1004
1005 NEWOBJ_OF(str, struct RString, klass,
1007
1008 return (VALUE)str;
1009}
1010
1011static inline VALUE
1012str_alloc_heap(VALUE klass)
1013{
1014 NEWOBJ_OF(str, struct RString, klass,
1015 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
1016
1017 return (VALUE)str;
1018}
1019
1020static inline VALUE
1021empty_str_alloc(VALUE klass)
1022{
1023 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
1024 VALUE str = str_alloc_embed(klass, 0);
1025 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
1027 return str;
1028}
1029
1030static VALUE
1031str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (enc == NULL) {
1040 enc = rb_ascii8bit_encoding();
1041 }
1042
1043 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1044
1045 int termlen = rb_enc_mbminlen(enc);
1046
1047 if (STR_EMBEDDABLE_P(len, termlen)) {
1048 str = str_alloc_embed(klass, len + termlen);
1049 if (len == 0) {
1050 ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
1051 }
1052 }
1053 else {
1054 str = str_alloc_heap(klass);
1055 RSTRING(str)->as.heap.aux.capa = len;
1056 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1057 * integer overflow. If we can STATIC_ASSERT that, the following
1058 * mul_add_mul can be reverted to a simple ALLOC_N. */
1059 RSTRING(str)->as.heap.ptr =
1060 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1061 }
1062
1063 rb_enc_raw_set(str, enc);
1064
1065 if (ptr) {
1066 memcpy(RSTRING_PTR(str), ptr, len);
1067 }
1068
1069 STR_SET_LEN(str, len);
1070 TERM_FILL(RSTRING_PTR(str) + len, termlen);
1071 return str;
1072}
1073
1074static VALUE
1075str_new(VALUE klass, const char *ptr, long len)
1076{
1077 return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1078}
1079
1080VALUE
1081rb_str_new(const char *ptr, long len)
1082{
1083 return str_new(rb_cString, ptr, len);
1084}
1085
1086VALUE
1087rb_usascii_str_new(const char *ptr, long len)
1088{
1089 return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1090}
1091
1092VALUE
1093rb_utf8_str_new(const char *ptr, long len)
1094{
1095 return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1096}
1097
1098VALUE
1099rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1100{
1101 return str_enc_new(rb_cString, ptr, len, enc);
1102}
1103
1104VALUE
1106{
1107 must_not_null(ptr);
1108 /* rb_str_new_cstr() can take pointer from non-malloc-generated
1109 * memory regions, and that cannot be detected by the MSAN. Just
1110 * trust the programmer that the argument passed here is a sane C
1111 * string. */
1112 __msan_unpoison_string(ptr);
1113 return rb_str_new(ptr, strlen(ptr));
1114}
1115
1116VALUE
1118{
1119 return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
1120}
1121
1122VALUE
1124{
1125 return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
1126}
1127
1128VALUE
1130{
1131 must_not_null(ptr);
1132 if (rb_enc_mbminlen(enc) != 1) {
1133 rb_raise(rb_eArgError, "wchar encoding given");
1134 }
1135 return rb_enc_str_new(ptr, strlen(ptr), enc);
1136}
1137
1138static VALUE
1139str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1140{
1141 VALUE str;
1142
1143 if (len < 0) {
1144 rb_raise(rb_eArgError, "negative string size (or size too big)");
1145 }
1146
1147 if (!ptr) {
1148 str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1149 }
1150 else {
1151 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1152 str = str_alloc_heap(klass);
1153 RSTRING(str)->len = len;
1154 RSTRING(str)->as.heap.ptr = (char *)ptr;
1155 RSTRING(str)->as.heap.aux.capa = len;
1156 RBASIC(str)->flags |= STR_NOFREE;
1157 rb_enc_associate_index(str, encindex);
1158 }
1159 return str;
1160}
1161
1162VALUE
1163rb_str_new_static(const char *ptr, long len)
1164{
1165 return str_new_static(rb_cString, ptr, len, 0);
1166}
1167
1168VALUE
1170{
1171 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1172}
1173
1174VALUE
1176{
1177 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1178}
1179
1180VALUE
1182{
1183 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1184}
1185
1186static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1187 rb_encoding *from, rb_encoding *to,
1188 int ecflags, VALUE ecopts);
1189
1190static inline bool
1191is_enc_ascii_string(VALUE str, rb_encoding *enc)
1192{
1193 int encidx = rb_enc_to_index(enc);
1194 if (rb_enc_get_index(str) == encidx)
1195 return is_ascii_string(str);
1196 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1197}
1198
1199VALUE
1200rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1201{
1202 long len;
1203 const char *ptr;
1204 VALUE newstr;
1205
1206 if (!to) return str;
1207 if (!from) from = rb_enc_get(str);
1208 if (from == to) return str;
1209 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1210 rb_is_ascii8bit_enc(to)) {
1211 if (STR_ENC_GET(str) != to) {
1212 str = rb_str_dup(str);
1213 rb_enc_associate(str, to);
1214 }
1215 return str;
1216 }
1217
1218 RSTRING_GETMEM(str, ptr, len);
1219 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1220 from, to, ecflags, ecopts);
1221 if (NIL_P(newstr)) {
1222 /* some error, return original */
1223 return str;
1224 }
1225 return newstr;
1226}
1227
1228VALUE
1229rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1230 rb_encoding *from, int ecflags, VALUE ecopts)
1231{
1232 long olen;
1233
1234 olen = RSTRING_LEN(newstr);
1235 if (ofs < -olen || olen < ofs)
1236 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1237 if (ofs < 0) ofs += olen;
1238 if (!from) {
1239 STR_SET_LEN(newstr, ofs);
1240 return rb_str_cat(newstr, ptr, len);
1241 }
1242
1243 rb_str_modify(newstr);
1244 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1245 rb_enc_get(newstr),
1246 ecflags, ecopts);
1247}
1248
1249VALUE
1250rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1251{
1252 STR_SET_LEN(str, 0);
1253 rb_enc_associate(str, enc);
1254 rb_str_cat(str, ptr, len);
1255 return str;
1256}
1257
1258static VALUE
1259str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1260 rb_encoding *from, rb_encoding *to,
1261 int ecflags, VALUE ecopts)
1262{
1263 rb_econv_t *ec;
1265 long olen;
1266 VALUE econv_wrapper;
1267 const unsigned char *start, *sp;
1268 unsigned char *dest, *dp;
1269 size_t converted_output = (size_t)ofs;
1270
1271 olen = rb_str_capacity(newstr);
1272
1273 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1274 RBASIC_CLEAR_CLASS(econv_wrapper);
1275 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1276 if (!ec) return Qnil;
1277 DATA_PTR(econv_wrapper) = ec;
1278
1279 sp = (unsigned char*)ptr;
1280 start = sp;
1281 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1282 (dp = dest + converted_output),
1283 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1285 /* destination buffer short */
1286 size_t converted_input = sp - start;
1287 size_t rest = len - converted_input;
1288 converted_output = dp - dest;
1289 rb_str_set_len(newstr, converted_output);
1290 if (converted_input && converted_output &&
1291 rest < (LONG_MAX / converted_output)) {
1292 rest = (rest * converted_output) / converted_input;
1293 }
1294 else {
1295 rest = olen;
1296 }
1297 olen += rest < 2 ? 2 : rest;
1298 rb_str_resize(newstr, olen);
1299 }
1300 DATA_PTR(econv_wrapper) = 0;
1301 RB_GC_GUARD(econv_wrapper);
1302 rb_econv_close(ec);
1303 switch (ret) {
1304 case econv_finished:
1305 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1306 rb_str_set_len(newstr, len);
1307 rb_enc_associate(newstr, to);
1308 return newstr;
1309
1310 default:
1311 return Qnil;
1312 }
1313}
1314
1315VALUE
1317{
1318 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1319}
1320
1321VALUE
1323{
1324 rb_encoding *ienc;
1325 VALUE str;
1326 const int eidx = rb_enc_to_index(eenc);
1327
1328 if (!ptr) {
1329 return rb_enc_str_new(ptr, len, eenc);
1330 }
1331
1332 /* ASCII-8BIT case, no conversion */
1333 if ((eidx == rb_ascii8bit_encindex()) ||
1334 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1335 return rb_str_new(ptr, len);
1336 }
1337 /* no default_internal or same encoding, no conversion */
1338 ienc = rb_default_internal_encoding();
1339 if (!ienc || eenc == ienc) {
1340 return rb_enc_str_new(ptr, len, eenc);
1341 }
1342 /* ASCII compatible, and ASCII only string, no conversion in
1343 * default_internal */
1344 if ((eidx == rb_ascii8bit_encindex()) ||
1345 (eidx == rb_usascii_encindex()) ||
1346 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1347 return rb_enc_str_new(ptr, len, ienc);
1348 }
1349 /* convert from the given encoding to default_internal */
1350 str = rb_enc_str_new(NULL, 0, ienc);
1351 /* when the conversion failed for some reason, just ignore the
1352 * default_internal and result in the given encoding as-is. */
1353 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1354 rb_str_initialize(str, ptr, len, eenc);
1355 }
1356 return str;
1357}
1358
1359VALUE
1360rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1361{
1362 int eidx = rb_enc_to_index(eenc);
1363 if (eidx == rb_usascii_encindex() &&
1364 !is_ascii_string(str)) {
1365 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1366 return str;
1367 }
1368 rb_enc_associate_index(str, eidx);
1369 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1370}
1371
1372VALUE
1373rb_external_str_new(const char *ptr, long len)
1374{
1375 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1376}
1377
1378VALUE
1380{
1381 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1382}
1383
1384VALUE
1385rb_locale_str_new(const char *ptr, long len)
1386{
1387 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1388}
1389
1390VALUE
1392{
1393 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1394}
1395
1396VALUE
1398{
1399 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1400}
1401
1402VALUE
1404{
1405 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1406}
1407
1408VALUE
1410{
1411 return rb_str_export_to_enc(str, rb_default_external_encoding());
1412}
1413
1414VALUE
1416{
1417 return rb_str_export_to_enc(str, rb_locale_encoding());
1418}
1419
1420VALUE
1422{
1423 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1424}
1425
1426static VALUE
1427str_replace_shared_without_enc(VALUE str2, VALUE str)
1428{
1429 const int termlen = TERM_LEN(str);
1430 char *ptr;
1431 long len;
1432
1433 RSTRING_GETMEM(str, ptr, len);
1434 if (str_embed_capa(str2) >= len + termlen) {
1435 char *ptr2 = RSTRING(str2)->as.embed.ary;
1436 STR_SET_EMBED(str2);
1437 memcpy(ptr2, RSTRING_PTR(str), len);
1438 TERM_FILL(ptr2+len, termlen);
1439 }
1440 else {
1441 VALUE root;
1442 if (STR_SHARED_P(str)) {
1443 root = RSTRING(str)->as.heap.aux.shared;
1444 RSTRING_GETMEM(str, ptr, len);
1445 }
1446 else {
1447 root = rb_str_new_frozen(str);
1448 RSTRING_GETMEM(root, ptr, len);
1449 }
1450 RUBY_ASSERT(OBJ_FROZEN(root));
1451
1452 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1453 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1454 rb_fatal("about to free a possible shared root");
1455 }
1456 char *ptr2 = STR_HEAP_PTR(str2);
1457 if (ptr2 != ptr) {
1458 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1459 }
1460 }
1461 FL_SET(str2, STR_NOEMBED);
1462 RSTRING(str2)->as.heap.ptr = ptr;
1463 STR_SET_SHARED(str2, root);
1464 }
1465
1466 STR_SET_LEN(str2, len);
1467
1468 return str2;
1469}
1470
1471static VALUE
1472str_replace_shared(VALUE str2, VALUE str)
1473{
1474 str_replace_shared_without_enc(str2, str);
1475 rb_enc_cr_str_exact_copy(str2, str);
1476 return str2;
1477}
1478
1479static VALUE
1480str_new_shared(VALUE klass, VALUE str)
1481{
1482 return str_replace_shared(str_alloc_heap(klass), str);
1483}
1484
1485VALUE
1487{
1488 return str_new_shared(rb_obj_class(str), str);
1489}
1490
1491VALUE
1493{
1494 if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1495 return str_new_frozen(rb_obj_class(orig), orig);
1496}
1497
1498static VALUE
1499rb_str_new_frozen_String(VALUE orig)
1500{
1501 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1502 return str_new_frozen(rb_cString, orig);
1503}
1504
1505
1506VALUE
1507rb_str_frozen_bare_string(VALUE orig)
1508{
1509 if (RB_LIKELY(BARE_STRING_P(orig) && OBJ_FROZEN_RAW(orig))) return orig;
1510 return str_new_frozen(rb_cString, orig);
1511}
1512
1513VALUE
1514rb_str_tmp_frozen_acquire(VALUE orig)
1515{
1516 if (OBJ_FROZEN_RAW(orig)) return orig;
1517 return str_new_frozen_buffer(0, orig, FALSE);
1518}
1519
1520VALUE
1521rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1522{
1523 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1524 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1525
1526 VALUE str = str_alloc_heap(0);
1527 OBJ_FREEZE(str);
1528 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1529 FL_SET(str, STR_SHARED_ROOT);
1530
1531 size_t capa = str_capacity(orig, TERM_LEN(orig));
1532
1533 /* If the string is embedded then we want to create a copy that is heap
1534 * allocated. If the string is shared then the shared root must be
1535 * embedded, so we want to create a copy. If the string is a shared root
1536 * then it must be embedded, so we want to create a copy. */
1537 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1538 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1539 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1540 }
1541 else {
1542 /* orig must be heap allocated and not shared, so we can safely transfer
1543 * the pointer to str. */
1544 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1545 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1546 RBASIC(orig)->flags &= ~STR_NOFREE;
1547 STR_SET_SHARED(orig, str);
1548 }
1549
1550 RSTRING(str)->len = RSTRING(orig)->len;
1551 RSTRING(str)->as.heap.aux.capa = capa;
1552
1553 return str;
1554}
1555
1556void
1557rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1558{
1559 if (RBASIC_CLASS(tmp) != 0)
1560 return;
1561
1562 if (STR_EMBED_P(tmp)) {
1564 }
1565 else if (FL_TEST_RAW(orig, STR_SHARED | STR_TMPLOCK) == STR_TMPLOCK &&
1566 !OBJ_FROZEN_RAW(orig)) {
1567 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1568
1569 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1570 RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1571 RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1572
1573 /* Unshare orig since the root (tmp) only has this one child. */
1574 FL_UNSET_RAW(orig, STR_SHARED);
1575 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1576 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1578
1579 /* Make tmp embedded and empty so it is safe for sweeping. */
1580 STR_SET_EMBED(tmp);
1581 STR_SET_LEN(tmp, 0);
1582 }
1583 }
1584}
1585
1586static VALUE
1587str_new_frozen(VALUE klass, VALUE orig)
1588{
1589 return str_new_frozen_buffer(klass, orig, TRUE);
1590}
1591
1592static VALUE
1593heap_str_make_shared(VALUE klass, VALUE orig)
1594{
1595 RUBY_ASSERT(!STR_EMBED_P(orig));
1596 RUBY_ASSERT(!STR_SHARED_P(orig));
1597
1598 VALUE str = str_alloc_heap(klass);
1599 STR_SET_LEN(str, RSTRING_LEN(orig));
1600 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1601 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1602 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1603 RBASIC(orig)->flags &= ~STR_NOFREE;
1604 STR_SET_SHARED(orig, str);
1605 if (klass == 0)
1606 FL_UNSET_RAW(str, STR_BORROWED);
1607 return str;
1608}
1609
1610static VALUE
1611str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1612{
1613 VALUE str;
1614
1615 long len = RSTRING_LEN(orig);
1616 rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1617 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1618
1619 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1620 str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1621 RUBY_ASSERT(STR_EMBED_P(str));
1622 }
1623 else {
1624 if (FL_TEST_RAW(orig, STR_SHARED)) {
1625 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1626 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1627 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1628 RUBY_ASSERT(ofs >= 0);
1629 RUBY_ASSERT(rest >= 0);
1630 RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1632
1633 if ((ofs > 0) || (rest > 0) ||
1634 (klass != RBASIC(shared)->klass) ||
1635 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1636 str = str_new_shared(klass, shared);
1637 RUBY_ASSERT(!STR_EMBED_P(str));
1638 RSTRING(str)->as.heap.ptr += ofs;
1639 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1640 }
1641 else {
1642 if (RBASIC_CLASS(shared) == 0)
1643 FL_SET_RAW(shared, STR_BORROWED);
1644 return shared;
1645 }
1646 }
1647 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1648 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1649 STR_SET_EMBED(str);
1650 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1651 STR_SET_LEN(str, RSTRING_LEN(orig));
1652 ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1653 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1654 }
1655 else {
1656 str = heap_str_make_shared(klass, orig);
1657 }
1658 }
1659
1660 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1661 OBJ_FREEZE(str);
1662 return str;
1663}
1664
1665VALUE
1666rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1667{
1668 return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1669}
1670
1671static VALUE
1672str_new_empty_String(VALUE str)
1673{
1674 VALUE v = rb_str_new(0, 0);
1675 rb_enc_copy(v, str);
1676 return v;
1677}
1678
/* Smallest capacity rb_str_init() will honour for an explicit capacity: request. */
#define STR_BUF_MIN_SIZE 63
1680
1681VALUE
1683{
1684 if (STR_EMBEDDABLE_P(capa, 1)) {
1685 return str_alloc_embed(rb_cString, capa + 1);
1686 }
1687
1688 VALUE str = str_alloc_heap(rb_cString);
1689
1690 RSTRING(str)->as.heap.aux.capa = capa;
1691 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1692 RSTRING(str)->as.heap.ptr[0] = '\0';
1693
1694 return str;
1695}
1696
1697VALUE
1699{
1700 VALUE str;
1701 long len = strlen(ptr);
1702
1703 str = rb_str_buf_new(len);
1704 rb_str_buf_cat(str, ptr, len);
1705
1706 return str;
1707}
1708
1709VALUE
1711{
1712 return str_new(0, 0, len);
1713}
1714
1715void
1717{
1718 if (STR_EMBED_P(str)) {
1719 RB_DEBUG_COUNTER_INC(obj_str_embed);
1720 }
1721 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1722 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1723 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1724 }
1725 else {
1726 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1727 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1728 }
1729}
1730
1731size_t
1732rb_str_memsize(VALUE str)
1733{
1734 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1735 return STR_HEAP_SIZE(str);
1736 }
1737 else {
1738 return 0;
1739 }
1740}
1741
1742VALUE
1744{
1745 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1746}
1747
1748static inline void str_discard(VALUE str);
1749static void str_shared_replace(VALUE str, VALUE str2);
1750
1751void
1753{
1754 if (str != str2) str_shared_replace(str, str2);
1755}
1756
1757static void
1758str_shared_replace(VALUE str, VALUE str2)
1759{
1760 rb_encoding *enc;
1761 int cr;
1762 int termlen;
1763
1764 RUBY_ASSERT(str2 != str);
1765 enc = STR_ENC_GET(str2);
1766 cr = ENC_CODERANGE(str2);
1767 str_discard(str);
1768 termlen = rb_enc_mbminlen(enc);
1769
1770 STR_SET_LEN(str, RSTRING_LEN(str2));
1771
1772 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1773 STR_SET_EMBED(str);
1774 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1775 rb_enc_associate(str, enc);
1776 ENC_CODERANGE_SET(str, cr);
1777 }
1778 else {
1779 if (STR_EMBED_P(str2)) {
1780 RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1781 long len = RSTRING_LEN(str2);
1782 RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1783
1784 char *new_ptr = ALLOC_N(char, len + termlen);
1785 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1786 RSTRING(str2)->as.heap.ptr = new_ptr;
1787 STR_SET_LEN(str2, len);
1788 RSTRING(str2)->as.heap.aux.capa = len;
1789 STR_SET_NOEMBED(str2);
1790 }
1791
1792 STR_SET_NOEMBED(str);
1793 FL_UNSET(str, STR_SHARED);
1794 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1795
1796 if (FL_TEST(str2, STR_SHARED)) {
1797 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1798 STR_SET_SHARED(str, shared);
1799 }
1800 else {
1801 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1802 }
1803
1804 /* abandon str2 */
1805 STR_SET_EMBED(str2);
1806 RSTRING_PTR(str2)[0] = 0;
1807 STR_SET_LEN(str2, 0);
1808 rb_enc_associate(str, enc);
1809 ENC_CODERANGE_SET(str, cr);
1810 }
1811}
1812
1813VALUE
1815{
1816 VALUE str;
1817
1818 if (RB_TYPE_P(obj, T_STRING)) {
1819 return obj;
1820 }
1821 str = rb_funcall(obj, idTo_s, 0);
1822 return rb_obj_as_string_result(str, obj);
1823}
1824
1825VALUE
1826rb_obj_as_string_result(VALUE str, VALUE obj)
1827{
1828 if (!RB_TYPE_P(str, T_STRING))
1829 return rb_any_to_s(obj);
1830 return str;
1831}
1832
1833static VALUE
1834str_replace(VALUE str, VALUE str2)
1835{
1836 long len;
1837
1838 len = RSTRING_LEN(str2);
1839 if (STR_SHARED_P(str2)) {
1840 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1842 STR_SET_NOEMBED(str);
1843 STR_SET_LEN(str, len);
1844 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1845 STR_SET_SHARED(str, shared);
1846 rb_enc_cr_str_exact_copy(str, str2);
1847 }
1848 else {
1849 str_replace_shared(str, str2);
1850 }
1851
1852 return str;
1853}
1854
1855static inline VALUE
1856ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1857{
1858 size_t size = rb_str_embed_size(capa);
1859 RUBY_ASSERT(size > 0);
1860 RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1861
1862 NEWOBJ_OF(str, struct RString, klass,
1864
1865 return (VALUE)str;
1866}
1867
1868static inline VALUE
1869ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1870{
1871 NEWOBJ_OF(str, struct RString, klass,
1872 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1873
1874 return (VALUE)str;
1875}
1876
1877static inline VALUE
1878str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1879{
1880 int encidx = 0;
1881 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1882 encidx = rb_enc_get_index(str);
1883 flags &= ~ENCODING_MASK;
1884 }
1885 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1886 if (encidx) rb_enc_associate_index(dup, encidx);
1887 return dup;
1888}
1889
1890static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1891
1892static inline VALUE
1893str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1894{
1895 VALUE flags = FL_TEST_RAW(str, flag_mask);
1896 long len = RSTRING_LEN(str);
1897
1898 RUBY_ASSERT(STR_EMBED_P(dup));
1899 RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1900 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1901 STR_SET_LEN(dup, RSTRING_LEN(str));
1902 return str_duplicate_setup_encoding(str, dup, flags);
1903}
1904
1905static inline VALUE
1906str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1907{
1908 VALUE flags = FL_TEST_RAW(str, flag_mask);
1909 VALUE root = str;
1910 if (FL_TEST_RAW(str, STR_SHARED)) {
1911 root = RSTRING(str)->as.heap.aux.shared;
1912 }
1913 else if (UNLIKELY(!OBJ_FROZEN_RAW(str))) {
1914 root = str = str_new_frozen(klass, str);
1915 flags = FL_TEST_RAW(str, flag_mask);
1916 }
1917 RUBY_ASSERT(!STR_SHARED_P(root));
1919
1920 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1921 FL_SET(root, STR_SHARED_ROOT);
1922 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1923 flags |= RSTRING_NOEMBED | STR_SHARED;
1924
1925 STR_SET_LEN(dup, RSTRING_LEN(str));
1926 return str_duplicate_setup_encoding(str, dup, flags);
1927}
1928
1929static inline VALUE
1930str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1931{
1932 if (STR_EMBED_P(str)) {
1933 return str_duplicate_setup_embed(klass, str, dup);
1934 }
1935 else {
1936 return str_duplicate_setup_heap(klass, str, dup);
1937 }
1938}
1939
1940static inline VALUE
1941str_duplicate(VALUE klass, VALUE str)
1942{
1943 VALUE dup;
1944 if (STR_EMBED_P(str)) {
1945 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1946 }
1947 else {
1948 dup = str_alloc_heap(klass);
1949 }
1950
1951 return str_duplicate_setup(klass, str, dup);
1952}
1953
1954VALUE
1956{
1957 return str_duplicate(rb_obj_class(str), str);
1958}
1959
1960/* :nodoc: */
1961VALUE
1962rb_str_dup_m(VALUE str)
1963{
1964 if (LIKELY(BARE_STRING_P(str))) {
1965 return str_duplicate(rb_cString, str);
1966 }
1967 else {
1968 return rb_obj_dup(str);
1969 }
1970}
1971
1972VALUE
1974{
1975 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1976 return str_duplicate(rb_cString, str);
1977}
1978
1979VALUE
1980rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1981{
1982 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1983 VALUE new_str, klass = rb_cString;
1984
1985 if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1986 new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1987 str_duplicate_setup_embed(klass, str, new_str);
1988 }
1989 else {
1990 new_str = ec_str_alloc_heap(ec, klass);
1991 str_duplicate_setup_heap(klass, str, new_str);
1992 }
1993 if (chilled) {
1994 FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1995 }
1996 return new_str;
1997}
1998
1999VALUE
2000rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
2001{
2002 VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
2003 if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
2004 rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
2005 FL_SET_RAW(str, STR_CHILLED_LITERAL);
2006 return rb_str_freeze(str);
2007}
2008
2009/*
2010 * The documentation block below uses an include (instead of inline text)
2011 * because the included text has non-ASCII characters (which are not allowed in a C file).
2012 */
2013
2014/*
2015 *
2016 * call-seq:
2017 * String.new(string = ''.encode(Encoding::ASCII_8BIT) , **options) -> new_string
2018 *
2019 * :include: doc/string/new.rdoc
2020 *
2021 */
2022
2023static VALUE
2024rb_str_init(int argc, VALUE *argv, VALUE str)
2025{
2026 static ID keyword_ids[2];
2027 VALUE orig, opt, venc, vcapa;
2028 VALUE kwargs[2];
2029 rb_encoding *enc = 0;
2030 int n;
2031
2032 if (!keyword_ids[0]) {
2033 keyword_ids[0] = rb_id_encoding();
2034 CONST_ID(keyword_ids[1], "capacity");
2035 }
2036
2037 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2038 if (!NIL_P(opt)) {
2039 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2040 venc = kwargs[0];
2041 vcapa = kwargs[1];
2042 if (!UNDEF_P(venc) && !NIL_P(venc)) {
2043 enc = rb_to_encoding(venc);
2044 }
2045 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2046 long capa = NUM2LONG(vcapa);
2047 long len = 0;
2048 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2049
2050 if (capa < STR_BUF_MIN_SIZE) {
2051 capa = STR_BUF_MIN_SIZE;
2052 }
2053 if (n == 1) {
2054 StringValue(orig);
2055 len = RSTRING_LEN(orig);
2056 if (capa < len) {
2057 capa = len;
2058 }
2059 if (orig == str) n = 0;
2060 }
2061 str_modifiable(str);
2062 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2063 /* make noembed always */
2064 const size_t size = (size_t)capa + termlen;
2065 const char *const old_ptr = RSTRING_PTR(str);
2066 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2067 char *new_ptr = ALLOC_N(char, size);
2068 if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2069 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2070 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2071 RSTRING(str)->as.heap.ptr = new_ptr;
2072 }
2073 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2074 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2075 (size_t)capa + termlen, STR_HEAP_SIZE(str));
2076 }
2077 STR_SET_LEN(str, len);
2078 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2079 if (n == 1) {
2080 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2081 rb_enc_cr_str_exact_copy(str, orig);
2082 }
2083 FL_SET(str, STR_NOEMBED);
2084 RSTRING(str)->as.heap.aux.capa = capa;
2085 }
2086 else if (n == 1) {
2087 rb_str_replace(str, orig);
2088 }
2089 if (enc) {
2090 rb_enc_associate(str, enc);
2092 }
2093 }
2094 else if (n == 1) {
2095 rb_str_replace(str, orig);
2096 }
2097 return str;
2098}
2099
2100/* :nodoc: */
2101static VALUE
2102rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2103{
2104 if (klass != rb_cString) {
2105 return rb_class_new_instance_pass_kw(argc, argv, klass);
2106 }
2107
2108 static ID keyword_ids[2];
2109 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2110 VALUE kwargs[2];
2111 rb_encoding *enc = NULL;
2112
2113 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2114 if (NIL_P(opt)) {
2115 return rb_class_new_instance_pass_kw(argc, argv, klass);
2116 }
2117
2118 keyword_ids[0] = rb_id_encoding();
2119 CONST_ID(keyword_ids[1], "capacity");
2120 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2121 encoding = kwargs[0];
2122 capacity = kwargs[1];
2123
2124 if (n == 1) {
2125 orig = StringValue(orig);
2126 }
2127 else {
2128 orig = Qnil;
2129 }
2130
2131 if (UNDEF_P(encoding)) {
2132 if (!NIL_P(orig)) {
2133 encoding = rb_obj_encoding(orig);
2134 }
2135 }
2136
2137 if (!UNDEF_P(encoding)) {
2138 enc = rb_to_encoding(encoding);
2139 }
2140
2141 // If capacity is nil, we're basically just duping `orig`.
2142 if (UNDEF_P(capacity)) {
2143 if (NIL_P(orig)) {
2144 VALUE empty_str = str_new(klass, "", 0);
2145 if (enc) {
2146 rb_enc_associate(empty_str, enc);
2147 }
2148 return empty_str;
2149 }
2150 VALUE copy = str_duplicate(klass, orig);
2151 rb_enc_associate(copy, enc);
2152 ENC_CODERANGE_CLEAR(copy);
2153 return copy;
2154 }
2155
2156 long capa = 0;
2157 capa = NUM2LONG(capacity);
2158 if (capa < 0) {
2159 capa = 0;
2160 }
2161
2162 if (!NIL_P(orig)) {
2163 long orig_capa = rb_str_capacity(orig);
2164 if (orig_capa > capa) {
2165 capa = orig_capa;
2166 }
2167 }
2168
2169 VALUE str = str_enc_new(klass, NULL, capa, enc);
2170 STR_SET_LEN(str, 0);
2171 TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2172
2173 if (!NIL_P(orig)) {
2174 rb_str_buf_append(str, orig);
2175 }
2176
2177 return str;
2178}
2179
#ifdef NONASCII_MASK
/* True unless +c+ has the 10xxxxxx bit pattern. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t lanes = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    lanes = (lanes>>6) | (~lanes>>7);
    lanes &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(lanes);
#else
    /* Fold the per-byte flags down into the lowest byte. */
    lanes += (lanes>>8);
    lanes += (lanes>>16);
# if SIZEOF_VOIDP == 8
    lanes += (lanes>>32);
# endif
    return (lanes&0xF);
#endif
}
#endif
2219
2220static inline long
2221enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2222{
2223 long c;
2224 const char *q;
2225
2226 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2227 long diff = (long)(e - p);
2228 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2229 }
2230#ifdef NONASCII_MASK
2231 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2232 uintptr_t len = 0;
2233 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2234 const uintptr_t *s, *t;
2235 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2236 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2237 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2238 while (p < (const char *)s) {
2239 if (is_utf8_lead_byte(*p)) len++;
2240 p++;
2241 }
2242 while (s < t) {
2243 len += count_utf8_lead_bytes_with_word(s);
2244 s++;
2245 }
2246 p = (const char *)s;
2247 }
2248 while (p < e) {
2249 if (is_utf8_lead_byte(*p)) len++;
2250 p++;
2251 }
2252 return (long)len;
2253 }
2254#endif
2255 else if (rb_enc_asciicompat(enc)) {
2256 c = 0;
2257 if (ENC_CODERANGE_CLEAN_P(cr)) {
2258 while (p < e) {
2259 if (ISASCII(*p)) {
2260 q = search_nonascii(p, e);
2261 if (!q)
2262 return c + (e - p);
2263 c += q - p;
2264 p = q;
2265 }
2266 p += rb_enc_fast_mbclen(p, e, enc);
2267 c++;
2268 }
2269 }
2270 else {
2271 while (p < e) {
2272 if (ISASCII(*p)) {
2273 q = search_nonascii(p, e);
2274 if (!q)
2275 return c + (e - p);
2276 c += q - p;
2277 p = q;
2278 }
2279 p += rb_enc_mbclen(p, e, enc);
2280 c++;
2281 }
2282 }
2283 return c;
2284 }
2285
2286 for (c=0; p<e; c++) {
2287 p += rb_enc_mbclen(p, e, enc);
2288 }
2289 return c;
2290}
2291
2292long
2293rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2294{
2295 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2296}
2297
2298/* To get strlen with cr
2299 * Note that given cr is not used.
2300 */
2301long
2302rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2303{
2304 long c;
2305 const char *q;
2306 int ret;
2307
2308 *cr = 0;
2309 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2310 long diff = (long)(e - p);
2311 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2312 }
2313 else if (rb_enc_asciicompat(enc)) {
2314 c = 0;
2315 while (p < e) {
2316 if (ISASCII(*p)) {
2317 q = search_nonascii(p, e);
2318 if (!q) {
2319 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2320 return c + (e - p);
2321 }
2322 c += q - p;
2323 p = q;
2324 }
2325 ret = rb_enc_precise_mbclen(p, e, enc);
2326 if (MBCLEN_CHARFOUND_P(ret)) {
2327 *cr |= ENC_CODERANGE_VALID;
2328 p += MBCLEN_CHARFOUND_LEN(ret);
2329 }
2330 else {
2332 p++;
2333 }
2334 c++;
2335 }
2336 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2337 return c;
2338 }
2339
2340 for (c=0; p<e; c++) {
2341 ret = rb_enc_precise_mbclen(p, e, enc);
2342 if (MBCLEN_CHARFOUND_P(ret)) {
2343 *cr |= ENC_CODERANGE_VALID;
2344 p += MBCLEN_CHARFOUND_LEN(ret);
2345 }
2346 else {
2348 if (p + rb_enc_mbminlen(enc) <= e)
2349 p += rb_enc_mbminlen(enc);
2350 else
2351 p = e;
2352 }
2353 }
2354 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2355 return c;
2356}
2357
2358/* enc must be str's enc or rb_enc_check(str, str2) */
2359static long
2360str_strlen(VALUE str, rb_encoding *enc)
2361{
2362 const char *p, *e;
2363 int cr;
2364
2365 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2366 if (!enc) enc = STR_ENC_GET(str);
2367 p = RSTRING_PTR(str);
2368 e = RSTRING_END(str);
2369 cr = ENC_CODERANGE(str);
2370
2371 if (cr == ENC_CODERANGE_UNKNOWN) {
2372 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2373 if (cr) ENC_CODERANGE_SET(str, cr);
2374 return n;
2375 }
2376 else {
2377 return enc_strlen(p, e, enc, cr);
2378 }
2379}
2380
2381long
2383{
2384 return str_strlen(str, NULL);
2385}
2386
2387/*
2388 * call-seq:
2389 * length -> integer
2390 *
2391 * :include: doc/string/length.rdoc
2392 *
2393 */
2394
2395VALUE
2397{
2398 return LONG2NUM(str_strlen(str, NULL));
2399}
2400
2401/*
2402 * call-seq:
2403 * bytesize -> integer
2404 *
2405 * :include: doc/string/bytesize.rdoc
2406 *
2407 */
2408
2409VALUE
2410rb_str_bytesize(VALUE str)
2411{
2412 return LONG2NUM(RSTRING_LEN(str));
2413}
2414
2415/*
2416 * call-seq:
2417 * empty? -> true or false
2418 *
2419 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2420 *
2421 * "hello".empty? # => false
2422 * " ".empty? # => false
2423 * "".empty? # => true
2424 *
2425 */
2426
2427static VALUE
2428rb_str_empty(VALUE str)
2429{
2430 return RBOOL(RSTRING_LEN(str) == 0);
2431}
2432
2433/*
2434 * call-seq:
2435 * self + other_string -> new_string
2436 *
2437 * Returns a new string containing +other_string+ concatenated to +self+:
2438 *
2439 * 'Hello from ' + self.to_s # => "Hello from main"
2440 *
2441 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2442 */
2443
2444VALUE
2446{
2447 VALUE str3;
2448 rb_encoding *enc;
2449 char *ptr1, *ptr2, *ptr3;
2450 long len1, len2;
2451 int termlen;
2452
2453 StringValue(str2);
2454 enc = rb_enc_check_str(str1, str2);
2455 RSTRING_GETMEM(str1, ptr1, len1);
2456 RSTRING_GETMEM(str2, ptr2, len2);
2457 termlen = rb_enc_mbminlen(enc);
2458 if (len1 > LONG_MAX - len2) {
2459 rb_raise(rb_eArgError, "string size too big");
2460 }
2461 str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2462 ptr3 = RSTRING_PTR(str3);
2463 memcpy(ptr3, ptr1, len1);
2464 memcpy(ptr3+len1, ptr2, len2);
2465 TERM_FILL(&ptr3[len1+len2], termlen);
2466
2467 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2469 RB_GC_GUARD(str1);
2470 RB_GC_GUARD(str2);
2471 return str3;
2472}
2473
2474/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2475VALUE
2476rb_str_opt_plus(VALUE str1, VALUE str2)
2477{
2480 long len1, len2;
2481 MAYBE_UNUSED(char) *ptr1, *ptr2;
2482 RSTRING_GETMEM(str1, ptr1, len1);
2483 RSTRING_GETMEM(str2, ptr2, len2);
2484 int enc1 = rb_enc_get_index(str1);
2485 int enc2 = rb_enc_get_index(str2);
2486
2487 if (enc1 < 0) {
2488 return Qundef;
2489 }
2490 else if (enc2 < 0) {
2491 return Qundef;
2492 }
2493 else if (enc1 != enc2) {
2494 return Qundef;
2495 }
2496 else if (len1 > LONG_MAX - len2) {
2497 return Qundef;
2498 }
2499 else {
2500 return rb_str_plus(str1, str2);
2501 }
2502
2503}
2504
2505/*
2506 * call-seq:
2507 * self * n -> new_string
2508 *
2509 * Returns a new string containing +n+ copies of +self+:
2510 *
2511 * 'Ho!' * 3 # => "Ho!Ho!Ho!"
2512 * 'No!' * 0 # => ""
2513 *
2514 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2515 */
2516
2517VALUE
2519{
2520 VALUE str2;
2521 long n, len;
2522 char *ptr2;
2523 int termlen;
2524
2525 if (times == INT2FIX(1)) {
2526 return str_duplicate(rb_cString, str);
2527 }
2528 if (times == INT2FIX(0)) {
2529 str2 = str_alloc_embed(rb_cString, 0);
2530 rb_enc_copy(str2, str);
2531 return str2;
2532 }
2533 len = NUM2LONG(times);
2534 if (len < 0) {
2535 rb_raise(rb_eArgError, "negative argument");
2536 }
2537 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2538 if (STR_EMBEDDABLE_P(len, 1)) {
2539 str2 = str_alloc_embed(rb_cString, len + 1);
2540 memset(RSTRING_PTR(str2), 0, len + 1);
2541 }
2542 else {
2543 str2 = str_alloc_heap(rb_cString);
2544 RSTRING(str2)->as.heap.aux.capa = len;
2545 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2546 }
2547 STR_SET_LEN(str2, len);
2548 rb_enc_copy(str2, str);
2549 return str2;
2550 }
2551 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2552 rb_raise(rb_eArgError, "argument too big");
2553 }
2554
2555 len *= RSTRING_LEN(str);
2556 termlen = TERM_LEN(str);
2557 str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2558 ptr2 = RSTRING_PTR(str2);
2559 if (len) {
2560 n = RSTRING_LEN(str);
2561 memcpy(ptr2, RSTRING_PTR(str), n);
2562 while (n <= len/2) {
2563 memcpy(ptr2 + n, ptr2, n);
2564 n *= 2;
2565 }
2566 memcpy(ptr2 + n, ptr2, len-n);
2567 }
2568 STR_SET_LEN(str2, len);
2569 TERM_FILL(&ptr2[len], termlen);
2570 rb_enc_cr_str_copy_for_substr(str2, str);
2571
2572 return str2;
2573}
2574
2575/*
2576 * call-seq:
2577 * self % object -> new_string
2578 *
2579 * Returns the result of formatting +object+ into the format specifications
2580 * contained in +self+
2581 * (see {Format Specifications}[rdoc-ref:format_specifications.rdoc]):
2582 *
2583 * '%05d' % 123 # => "00123"
2584 *
2585 * If +self+ contains multiple format specifications,
2586 * +object+ must be an array or hash containing the objects to be formatted:
2587 *
2588 * '%-5s: %016x' % [ 'ID', self.object_id ] # => "ID : 00002b054ec93168"
2589 * 'foo = %{foo}' % {foo: 'bar'} # => "foo = bar"
2590 * 'foo = %{foo}, baz = %{baz}' % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2591 *
2592 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
2593 */
2594
2595static VALUE
2596rb_str_format_m(VALUE str, VALUE arg)
2597{
2598 VALUE tmp = rb_check_array_type(arg);
2599
2600 if (!NIL_P(tmp)) {
2601 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2602 }
2603 return rb_str_format(1, &arg, str);
2604}
2605
2606static inline void
2607rb_check_lockedtmp(VALUE str)
2608{
2609 if (FL_TEST(str, STR_TMPLOCK)) {
2610 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2611 }
2612}
2613
2614// If none of these flags are set, we know we have an modifiable string.
2615// If any is set, we need to do more detailed checks.
2616#define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2617static inline void
2618str_modifiable(VALUE str)
2619{
2620 RUBY_ASSERT(ruby_thread_has_gvl_p());
2621
2622 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2623 if (CHILLED_STRING_P(str)) {
2624 CHILLED_STRING_MUTATED(str);
2625 }
2626 rb_check_lockedtmp(str);
2627 rb_check_frozen(str);
2628 }
2629}
2630
2631static inline int
2632str_dependent_p(VALUE str)
2633{
2634 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2635 return FALSE;
2636 }
2637 else {
2638 return TRUE;
2639 }
2640}
2641
2642// If none of these flags are set, we know we have an independent string.
2643// If any is set, we need to do more detailed checks.
2644#define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2645static inline int
2646str_independent(VALUE str)
2647{
2648 RUBY_ASSERT(ruby_thread_has_gvl_p());
2649
2650 if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2651 str_modifiable(str);
2652 return !str_dependent_p(str);
2653 }
2654 return TRUE;
2655}
2656
2657static void
2658str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2659{
2660 RUBY_ASSERT(ruby_thread_has_gvl_p());
2661
2662 char *ptr;
2663 char *oldptr;
2664 long capa = len + expand;
2665
2666 if (len > capa) len = capa;
2667
2668 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2669 ptr = RSTRING(str)->as.heap.ptr;
2670 STR_SET_EMBED(str);
2671 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2672 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2673 STR_SET_LEN(str, len);
2674 return;
2675 }
2676
2677 ptr = ALLOC_N(char, (size_t)capa + termlen);
2678 oldptr = RSTRING_PTR(str);
2679 if (oldptr) {
2680 memcpy(ptr, oldptr, len);
2681 }
2682 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2683 xfree(oldptr);
2684 }
2685 STR_SET_NOEMBED(str);
2686 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2687 TERM_FILL(ptr + len, termlen);
2688 RSTRING(str)->as.heap.ptr = ptr;
2689 STR_SET_LEN(str, len);
2690 RSTRING(str)->as.heap.aux.capa = capa;
2691}
2692
2693void
2694rb_str_modify(VALUE str)
2695{
2696 if (!str_independent(str))
2697 str_make_independent(str);
2699}
2700
2701void
2703{
2704 RUBY_ASSERT(ruby_thread_has_gvl_p());
2705
2706 int termlen = TERM_LEN(str);
2707 long len = RSTRING_LEN(str);
2708
2709 if (expand < 0) {
2710 rb_raise(rb_eArgError, "negative expanding string size");
2711 }
2712 if (expand >= LONG_MAX - len) {
2713 rb_raise(rb_eArgError, "string size too big");
2714 }
2715
2716 if (!str_independent(str)) {
2717 str_make_independent_expand(str, len, expand, termlen);
2718 }
2719 else if (expand > 0) {
2720 RESIZE_CAPA_TERM(str, len + expand, termlen);
2721 }
2723}
2724
2725/* As rb_str_modify(), but don't clear coderange */
2726static void
2727str_modify_keep_cr(VALUE str)
2728{
2729 if (!str_independent(str))
2730 str_make_independent(str);
2732 /* Force re-scan later */
2734}
2735
2736static inline void
2737str_discard(VALUE str)
2738{
2739 str_modifiable(str);
2740 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2741 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2742 RSTRING(str)->as.heap.ptr = 0;
2743 STR_SET_LEN(str, 0);
2744 }
2745}
2746
2747void
2749{
2750 int encindex = rb_enc_get_index(str);
2751
2752 if (RB_UNLIKELY(encindex == -1)) {
2753 rb_raise(rb_eTypeError, "not encoding capable object");
2754 }
2755
2756 if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2757 return;
2758 }
2759
2760 rb_encoding *enc = rb_enc_from_index(encindex);
2761 if (!rb_enc_asciicompat(enc)) {
2762 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2763 }
2764}
2765
2766VALUE
2768{
2769 RUBY_ASSERT(ruby_thread_has_gvl_p());
2770
2771 VALUE s = *ptr;
2772 if (!RB_TYPE_P(s, T_STRING)) {
2773 s = rb_str_to_str(s);
2774 *ptr = s;
2775 }
2776 return s;
2777}
2778
2779char *
2781{
2782 VALUE str = rb_string_value(ptr);
2783 return RSTRING_PTR(str);
2784}
2785
/* Returns 1 when the first +n+ bytes at +s+ are all zero (also for n <= 0), else 0. */
static int
zero_filled(const char *s, int n)
{
    for (int i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2794
2795static const char *
2796str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2797{
2798 const char *e = s + len;
2799
2800 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2801 if (zero_filled(s, minlen)) return s;
2802 }
2803 return 0;
2804}
2805
2806static char *
2807str_fill_term(VALUE str, char *s, long len, int termlen)
2808{
2809 /* This function assumes that (capa + termlen) bytes of memory
2810 * is allocated, like many other functions in this file.
2811 */
2812 if (str_dependent_p(str)) {
2813 if (!zero_filled(s + len, termlen))
2814 str_make_independent_expand(str, len, 0L, termlen);
2815 }
2816 else {
2817 TERM_FILL(s + len, termlen);
2818 return s;
2819 }
2820 return RSTRING_PTR(str);
2821}
2822
2823void
2824rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2825{
2826 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2827 long len = RSTRING_LEN(str);
2828
2829 RUBY_ASSERT(capa >= len);
2830 if (capa - len < termlen) {
2831 rb_check_lockedtmp(str);
2832 str_make_independent_expand(str, len, 0L, termlen);
2833 }
2834 else if (str_dependent_p(str)) {
2835 if (termlen > oldtermlen)
2836 str_make_independent_expand(str, len, 0L, termlen);
2837 }
2838 else {
2839 if (!STR_EMBED_P(str)) {
2840 /* modify capa instead of realloc */
2841 RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2842 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2843 }
2844 if (termlen > oldtermlen) {
2845 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2846 }
2847 }
2848
2849 return;
2850}
2851
2852static char *
2853str_null_check(VALUE str, int *w)
2854{
2855 char *s = RSTRING_PTR(str);
2856 long len = RSTRING_LEN(str);
2857 rb_encoding *enc = rb_enc_get(str);
2858 const int minlen = rb_enc_mbminlen(enc);
2859
2860 if (minlen > 1) {
2861 *w = 1;
2862 if (str_null_char(s, len, minlen, enc)) {
2863 return NULL;
2864 }
2865 return str_fill_term(str, s, len, minlen);
2866 }
2867 *w = 0;
2868 if (!s || memchr(s, 0, len)) {
2869 return NULL;
2870 }
2871 if (s[len]) {
2872 s = str_fill_term(str, s, len, minlen);
2873 }
2874 return s;
2875}
2876
2877char *
2878rb_str_to_cstr(VALUE str)
2879{
2880 int w;
2881 return str_null_check(str, &w);
2882}
2883
2884char *
2886{
2887 VALUE str = rb_string_value(ptr);
2888 int w;
2889 char *s = str_null_check(str, &w);
2890 if (!s) {
2891 if (w) {
2892 rb_raise(rb_eArgError, "string contains null char");
2893 }
2894 rb_raise(rb_eArgError, "string contains null byte");
2895 }
2896 return s;
2897}
2898
2899char *
2900rb_str_fill_terminator(VALUE str, const int newminlen)
2901{
2902 char *s = RSTRING_PTR(str);
2903 long len = RSTRING_LEN(str);
2904 return str_fill_term(str, s, len, newminlen);
2905}
2906
2907VALUE
2909{
2910 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2911 return str;
2912}
2913
2914/*
2915 * call-seq:
2916 * String.try_convert(object) -> object, new_string, or nil
2917 *
2918 * Attempts to convert the given +object+ to a string.
2919 *
2920 * If +object+ is already a string, returns +object+, unmodified.
2921 *
2922 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2923 * calls <tt>object.to_str</tt> and returns the result.
2924 *
2925 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2926 *
2927 * Raises an exception unless <tt>object.to_str</tt> returns a string.
2928 */
2929static VALUE
2930rb_str_s_try_convert(VALUE dummy, VALUE str)
2931{
2932 return rb_check_string_type(str);
2933}
2934
2935static char*
2936str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2937{
2938 long nth = *nthp;
2939 if (rb_enc_mbmaxlen(enc) == 1) {
2940 p += nth;
2941 }
2942 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2943 p += nth * rb_enc_mbmaxlen(enc);
2944 }
2945 else if (rb_enc_asciicompat(enc)) {
2946 const char *p2, *e2;
2947 int n;
2948
2949 while (p < e && 0 < nth) {
2950 e2 = p + nth;
2951 if (e < e2) {
2952 *nthp = nth;
2953 return (char *)e;
2954 }
2955 if (ISASCII(*p)) {
2956 p2 = search_nonascii(p, e2);
2957 if (!p2) {
2958 nth -= e2 - p;
2959 *nthp = nth;
2960 return (char *)e2;
2961 }
2962 nth -= p2 - p;
2963 p = p2;
2964 }
2965 n = rb_enc_mbclen(p, e, enc);
2966 p += n;
2967 nth--;
2968 }
2969 *nthp = nth;
2970 if (nth != 0) {
2971 return (char *)e;
2972 }
2973 return (char *)p;
2974 }
2975 else {
2976 while (p < e && nth--) {
2977 p += rb_enc_mbclen(p, e, enc);
2978 }
2979 }
2980 if (p > e) p = e;
2981 *nthp = nth;
2982 return (char*)p;
2983}
2984
2985char*
2986rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2987{
2988 return str_nth_len(p, e, &nth, enc);
2989}
2990
2991static char*
2992str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2993{
2994 if (singlebyte)
2995 p += nth;
2996 else {
2997 p = str_nth_len(p, e, &nth, enc);
2998 }
2999 if (!p) return 0;
3000 if (p > e) p = e;
3001 return (char *)p;
3002}
3003
3004/* char offset to byte offset */
3005static long
3006str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
3007{
3008 const char *pp = str_nth(p, e, nth, enc, singlebyte);
3009 if (!pp) return e - p;
3010 return pp - p;
3011}
3012
3013long
3014rb_str_offset(VALUE str, long pos)
3015{
3016 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3017 STR_ENC_GET(str), single_byte_optimizable(str));
3018}
3019
3020#ifdef NONASCII_MASK
3021static char *
3022str_utf8_nth(const char *p, const char *e, long *nthp)
3023{
3024 long nth = *nthp;
3025 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
3026 const uintptr_t *s, *t;
3027 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
3028 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
3029 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
3030 while (p < (const char *)s) {
3031 if (is_utf8_lead_byte(*p)) nth--;
3032 p++;
3033 }
3034 do {
3035 nth -= count_utf8_lead_bytes_with_word(s);
3036 s++;
3037 } while (s < t && (int)SIZEOF_VOIDP <= nth);
3038 p = (char *)s;
3039 }
3040 while (p < e) {
3041 if (is_utf8_lead_byte(*p)) {
3042 if (nth == 0) break;
3043 nth--;
3044 }
3045 p++;
3046 }
3047 *nthp = nth;
3048 return (char *)p;
3049}
3050
/* Byte distance from +p+ to the position str_utf8_nth() reaches for +nth+. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *const reached = str_utf8_nth(p, e, &nth);

    return reached - p;
}
3057#endif
3058
3059/* byte offset to char offset */
3060long
3061rb_str_sublen(VALUE str, long pos)
3062{
3063 if (single_byte_optimizable(str) || pos < 0)
3064 return pos;
3065 else {
3066 char *p = RSTRING_PTR(str);
3067 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3068 }
3069}
3070
3071static VALUE
3072str_subseq(VALUE str, long beg, long len)
3073{
3074 VALUE str2;
3075
3076 RUBY_ASSERT(beg >= 0);
3077 RUBY_ASSERT(len >= 0);
3078 RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3079
3080 const int termlen = TERM_LEN(str);
3081 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3082 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3083 RB_GC_GUARD(str);
3084 return str2;
3085 }
3086
3087 str2 = str_alloc_heap(rb_cString);
3088 if (str_embed_capa(str2) >= len + termlen) {
3089 char *ptr2 = RSTRING(str2)->as.embed.ary;
3090 STR_SET_EMBED(str2);
3091 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3092 TERM_FILL(ptr2+len, termlen);
3093
3094 STR_SET_LEN(str2, len);
3095 RB_GC_GUARD(str);
3096 }
3097 else {
3098 str_replace_shared(str2, str);
3099 RUBY_ASSERT(!STR_EMBED_P(str2));
3100 ENC_CODERANGE_CLEAR(str2);
3101 RSTRING(str2)->as.heap.ptr += beg;
3102 if (RSTRING_LEN(str2) > len) {
3103 STR_SET_LEN(str2, len);
3104 }
3105 }
3106
3107 return str2;
3108}
3109
3110VALUE
3111rb_str_subseq(VALUE str, long beg, long len)
3112{
3113 VALUE str2 = str_subseq(str, beg, len);
3114 rb_enc_cr_str_copy_for_substr(str2, str);
3115 return str2;
3116}
3117
3118char *
3119rb_str_subpos(VALUE str, long beg, long *lenp)
3120{
3121 long len = *lenp;
3122 long slen = -1L;
3123 const long blen = RSTRING_LEN(str);
3124 rb_encoding *enc = STR_ENC_GET(str);
3125 char *p, *s = RSTRING_PTR(str), *e = s + blen;
3126
3127 if (len < 0) return 0;
3128 if (beg < 0 && -beg < 0) return 0;
3129 if (!blen) {
3130 len = 0;
3131 }
3132 if (single_byte_optimizable(str)) {
3133 if (beg > blen) return 0;
3134 if (beg < 0) {
3135 beg += blen;
3136 if (beg < 0) return 0;
3137 }
3138 if (len > blen - beg)
3139 len = blen - beg;
3140 if (len < 0) return 0;
3141 p = s + beg;
3142 goto end;
3143 }
3144 if (beg < 0) {
3145 if (len > -beg) len = -beg;
3146 if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3147 (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3148 beg = -beg;
3149 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3150 p = e;
3151 if (!p) return 0;
3152 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3153 if (!p) return 0;
3154 len = e - p;
3155 goto end;
3156 }
3157 else {
3158 slen = str_strlen(str, enc);
3159 beg += slen;
3160 if (beg < 0) return 0;
3161 p = s + beg;
3162 if (len == 0) goto end;
3163 }
3164 }
3165 else if (beg > 0 && beg > blen) {
3166 return 0;
3167 }
3168 if (len == 0) {
3169 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3170 p = s + beg;
3171 }
3172#ifdef NONASCII_MASK
3173 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3174 enc == rb_utf8_encoding()) {
3175 p = str_utf8_nth(s, e, &beg);
3176 if (beg > 0) return 0;
3177 len = str_utf8_offset(p, e, len);
3178 }
3179#endif
3180 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3181 int char_sz = rb_enc_mbmaxlen(enc);
3182
3183 p = s + beg * char_sz;
3184 if (p > e) {
3185 return 0;
3186 }
3187 else if (len * char_sz > e - p)
3188 len = e - p;
3189 else
3190 len *= char_sz;
3191 }
3192 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3193 if (beg > 0) return 0;
3194 len = 0;
3195 }
3196 else {
3197 len = str_offset(p, e, len, enc, 0);
3198 }
3199 end:
3200 *lenp = len;
3201 RB_GC_GUARD(str);
3202 return p;
3203}
3204
3205static VALUE str_substr(VALUE str, long beg, long len, int empty);
3206
3207VALUE
3208rb_str_substr(VALUE str, long beg, long len)
3209{
3210 return str_substr(str, beg, len, TRUE);
3211}
3212
3213VALUE
3214rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3215{
3216 return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3217}
3218
3219static VALUE
3220str_substr(VALUE str, long beg, long len, int empty)
3221{
3222 char *p = rb_str_subpos(str, beg, &len);
3223
3224 if (!p) return Qnil;
3225 if (!len && !empty) return Qnil;
3226
3227 beg = p - RSTRING_PTR(str);
3228
3229 VALUE str2 = str_subseq(str, beg, len);
3230 rb_enc_cr_str_copy_for_substr(str2, str);
3231 return str2;
3232}
3233
3234/* :nodoc: */
3235VALUE
3237{
3238 if (CHILLED_STRING_P(str)) {
3239 FL_UNSET_RAW(str, STR_CHILLED);
3240 }
3241
3242 if (OBJ_FROZEN(str)) return str;
3243 rb_str_resize(str, RSTRING_LEN(str));
3244 return rb_obj_freeze(str);
3245}
3246
3247/*
3248 * call-seq:
3249 * +string -> new_string or self
3250 *
3251 * Returns +self+ if +self+ is not frozen and can be mutated
3252 * without warning issuance.
3253 *
3254 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3255 *
3256 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3257 */
3258static VALUE
3259str_uplus(VALUE str)
3260{
3261 if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3262 return rb_str_dup(str);
3263 }
3264 else {
3265 return str;
3266 }
3267}
3268
3269/*
3270 * call-seq:
3271 * -self -> frozen_string
3272 *
3273 * Returns a frozen string equal to +self+.
3274 *
3275 * The returned string is +self+ if and only if all of the following are true:
3276 *
3277 * - +self+ is already frozen.
3278 * - +self+ is an instance of \String (rather than of a subclass of \String)
3279 * - +self+ has no instance variables set on it.
3280 *
3281 * Otherwise, the returned string is a frozen copy of +self+.
3282 *
3283 * Returning +self+, when possible, saves duplicating +self+;
3284 * see {Data deduplication}[https://en.wikipedia.org/wiki/Data_deduplication].
3285 *
3286 * It may also save duplicating other, already-existing, strings:
3287 *
3288 * s0 = 'foo'
3289 * s1 = 'foo'
3290 * s0.object_id == s1.object_id # => false
3291 * (-s0).object_id == (-s1).object_id # => true
3292 *
3293 * Note that method #-@ is convenient for defining a constant:
3294 *
3295 * FileName = -'config/database.yml'
3296 *
3297 * While its alias #dedup is better suited for chaining:
3298 *
3299 * 'foo'.dedup.gsub!('o')
3300 *
3301 * Related: see {Freezing/Unfreezing}[rdoc-ref:String@Freezing-2FUnfreezing].
3302 */
3303static VALUE
3304str_uminus(VALUE str)
3305{
3306 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3307 str = rb_str_dup(str);
3308 }
3309 return rb_fstring(str);
3310}
3311
3312RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3313#define rb_str_dup_frozen rb_str_new_frozen
3314
3315VALUE
3317{
3318 rb_check_frozen(str);
3319 if (FL_TEST(str, STR_TMPLOCK)) {
3320 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3321 }
3322 FL_SET(str, STR_TMPLOCK);
3323 return str;
3324}
3325
3326VALUE
3328{
3329 rb_check_frozen(str);
3330 if (!FL_TEST(str, STR_TMPLOCK)) {
3331 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3332 }
3333 FL_UNSET(str, STR_TMPLOCK);
3334 return str;
3335}
3336
3337VALUE
3338rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3339{
3340 rb_str_locktmp(str);
3341 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3342}
3343
3344void
3346{
3347 RUBY_ASSERT(ruby_thread_has_gvl_p());
3348
3349 long capa;
3350 const int termlen = TERM_LEN(str);
3351
3352 str_modifiable(str);
3353 if (STR_SHARED_P(str)) {
3354 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3355 }
3356 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3357 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3358 }
3359
3360 int cr = ENC_CODERANGE(str);
3361 if (len == 0) {
3362 /* Empty string does not contain non-ASCII */
3364 }
3365 else if (cr == ENC_CODERANGE_UNKNOWN) {
3366 /* Leave unknown. */
3367 }
3368 else if (len > RSTRING_LEN(str)) {
3369 if (ENC_CODERANGE_CLEAN_P(cr)) {
3370 /* Update the coderange regarding the extended part. */
3371 const char *const prev_end = RSTRING_END(str);
3372 const char *const new_end = RSTRING_PTR(str) + len;
3373 rb_encoding *enc = rb_enc_get(str);
3374 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3375 ENC_CODERANGE_SET(str, cr);
3376 }
3377 else if (cr == ENC_CODERANGE_BROKEN) {
3378 /* May be valid now, by appended part. */
3380 }
3381 }
3382 else if (len < RSTRING_LEN(str)) {
3383 if (cr != ENC_CODERANGE_7BIT) {
3384 /* ASCII-only string is keeping after truncated. Valid
3385 * and broken may be invalid or valid, leave unknown. */
3387 }
3388 }
3389
3390 STR_SET_LEN(str, len);
3391 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3392}
3393
3394VALUE
3395rb_str_resize(VALUE str, long len)
3396{
3397 if (len < 0) {
3398 rb_raise(rb_eArgError, "negative string size (or size too big)");
3399 }
3400
3401 int independent = str_independent(str);
3402 long slen = RSTRING_LEN(str);
3403 const int termlen = TERM_LEN(str);
3404
3405 if (slen > len || (termlen != 1 && slen < len)) {
3407 }
3408
3409 {
3410 long capa;
3411 if (STR_EMBED_P(str)) {
3412 if (len == slen) return str;
3413 if (str_embed_capa(str) >= len + termlen) {
3414 STR_SET_LEN(str, len);
3415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3416 return str;
3417 }
3418 str_make_independent_expand(str, slen, len - slen, termlen);
3419 }
3420 else if (str_embed_capa(str) >= len + termlen) {
3421 char *ptr = STR_HEAP_PTR(str);
3422 STR_SET_EMBED(str);
3423 if (slen > len) slen = len;
3424 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3425 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3426 STR_SET_LEN(str, len);
3427 if (independent) ruby_xfree(ptr);
3428 return str;
3429 }
3430 else if (!independent) {
3431 if (len == slen) return str;
3432 str_make_independent_expand(str, slen, len - slen, termlen);
3433 }
3434 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3435 (capa - len) > (len < 1024 ? len : 1024)) {
3436 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3437 (size_t)len + termlen, STR_HEAP_SIZE(str));
3438 RSTRING(str)->as.heap.aux.capa = len;
3439 }
3440 else if (len == slen) return str;
3441 STR_SET_LEN(str, len);
3442 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3443 }
3444 return str;
3445}
3446
3447static void
3448str_ensure_available_capa(VALUE str, long len)
3449{
3450 str_modify_keep_cr(str);
3451
3452 const int termlen = TERM_LEN(str);
3453 long olen = RSTRING_LEN(str);
3454
3455 if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3456 rb_raise(rb_eArgError, "string sizes too big");
3457 }
3458
3459 long total = olen + len;
3460 long capa = str_capacity(str, termlen);
3461
3462 if (capa < total) {
3463 if (total >= LONG_MAX / 2) {
3464 capa = total;
3465 }
3466 while (total > capa) {
3467 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3468 }
3469 RESIZE_CAPA_TERM(str, capa, termlen);
3470 }
3471}
3472
3473static VALUE
3474str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3475{
3476 if (keep_cr) {
3477 str_modify_keep_cr(str);
3478 }
3479 else {
3480 rb_str_modify(str);
3481 }
3482 if (len == 0) return 0;
3483
3484 long total, olen, off = -1;
3485 char *sptr;
3486 const int termlen = TERM_LEN(str);
3487
3488 RSTRING_GETMEM(str, sptr, olen);
3489 if (ptr >= sptr && ptr <= sptr + olen) {
3490 off = ptr - sptr;
3491 }
3492
3493 long capa = str_capacity(str, termlen);
3494
3495 if (olen > LONG_MAX - len) {
3496 rb_raise(rb_eArgError, "string sizes too big");
3497 }
3498 total = olen + len;
3499 if (capa < total) {
3500 if (total >= LONG_MAX / 2) {
3501 capa = total;
3502 }
3503 while (total > capa) {
3504 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3505 }
3506 RESIZE_CAPA_TERM(str, capa, termlen);
3507 sptr = RSTRING_PTR(str);
3508 }
3509 if (off != -1) {
3510 ptr = sptr + off;
3511 }
3512 memcpy(sptr + olen, ptr, len);
3513 STR_SET_LEN(str, total);
3514 TERM_FILL(sptr + total, termlen); /* sentinel */
3515
3516 return str;
3517}
3518
3519#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3520#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3521
3522VALUE
3523rb_str_cat(VALUE str, const char *ptr, long len)
3524{
3525 if (len == 0) return str;
3526 if (len < 0) {
3527 rb_raise(rb_eArgError, "negative string size (or size too big)");
3528 }
3529 return str_buf_cat(str, ptr, len);
3530}
3531
3532VALUE
3533rb_str_cat_cstr(VALUE str, const char *ptr)
3534{
3535 must_not_null(ptr);
3536 return rb_str_buf_cat(str, ptr, strlen(ptr));
3537}
3538
3539static void
3540rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3541{
3542 RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3543
3544 // We can't write directly to shared strings without impacting others, so we must make the string independent.
3545 if (UNLIKELY(!str_independent(str))) {
3546 str_make_independent(str);
3547 }
3548
3549 long string_length = -1;
3550 const int null_terminator_length = 1;
3551 char *sptr;
3552 RSTRING_GETMEM(str, sptr, string_length);
3553
3554 // Ensure the resulting string wouldn't be too long.
3555 if (UNLIKELY(string_length > LONG_MAX - 1)) {
3556 rb_raise(rb_eArgError, "string sizes too big");
3557 }
3558
3559 long string_capacity = str_capacity(str, null_terminator_length);
3560
3561 // Get the code range before any modifications since those might clear the code range.
3562 int cr = ENC_CODERANGE(str);
3563
3564 // Check if the string has spare string_capacity to write the new byte.
3565 if (LIKELY(string_capacity >= string_length + 1)) {
3566 // In fast path we can write the new byte and note the string's new length.
3567 sptr[string_length] = byte;
3568 STR_SET_LEN(str, string_length + 1);
3569 TERM_FILL(sptr + string_length + 1, null_terminator_length);
3570 }
3571 else {
3572 // If there's not enough string_capacity, make a call into the general string concatenation function.
3573 str_buf_cat(str, (char *)&byte, 1);
3574 }
3575
3576 // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3577 // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3578 // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3579 // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3580 if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3581 if (ISASCII(byte)) {
3583 }
3584 else {
3586
3587 // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3588 if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3589 rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3590 }
3591 }
3592 }
3593}
3594
3595RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3596RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3597RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3598
3599static VALUE
3600rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3601 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3602{
3603 int str_encindex = ENCODING_GET(str);
3604 int res_encindex;
3605 int str_cr, res_cr;
3606 rb_encoding *str_enc, *ptr_enc;
3607
3608 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3609
3610 if (str_encindex == ptr_encindex) {
3611 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3612 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3613 }
3614 }
3615 else {
3616 str_enc = rb_enc_from_index(str_encindex);
3617 ptr_enc = rb_enc_from_index(ptr_encindex);
3618 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3619 if (len == 0)
3620 return str;
3621 if (RSTRING_LEN(str) == 0) {
3622 rb_str_buf_cat(str, ptr, len);
3623 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3624 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3625 return str;
3626 }
3627 goto incompatible;
3628 }
3629 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3630 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3631 }
3632 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3633 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3634 str_cr = rb_enc_str_coderange(str);
3635 }
3636 }
3637 }
3638 if (ptr_cr_ret)
3639 *ptr_cr_ret = ptr_cr;
3640
3641 if (str_encindex != ptr_encindex &&
3642 str_cr != ENC_CODERANGE_7BIT &&
3643 ptr_cr != ENC_CODERANGE_7BIT) {
3644 str_enc = rb_enc_from_index(str_encindex);
3645 ptr_enc = rb_enc_from_index(ptr_encindex);
3646 goto incompatible;
3647 }
3648
3649 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3650 res_encindex = str_encindex;
3651 res_cr = ENC_CODERANGE_UNKNOWN;
3652 }
3653 else if (str_cr == ENC_CODERANGE_7BIT) {
3654 if (ptr_cr == ENC_CODERANGE_7BIT) {
3655 res_encindex = str_encindex;
3656 res_cr = ENC_CODERANGE_7BIT;
3657 }
3658 else {
3659 res_encindex = ptr_encindex;
3660 res_cr = ptr_cr;
3661 }
3662 }
3663 else if (str_cr == ENC_CODERANGE_VALID) {
3664 res_encindex = str_encindex;
3665 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3666 res_cr = str_cr;
3667 else
3668 res_cr = ptr_cr;
3669 }
3670 else { /* str_cr == ENC_CODERANGE_BROKEN */
3671 res_encindex = str_encindex;
3672 res_cr = str_cr;
3673 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3674 }
3675
3676 if (len < 0) {
3677 rb_raise(rb_eArgError, "negative string size (or size too big)");
3678 }
3679 str_buf_cat(str, ptr, len);
3680 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3681 return str;
3682
3683 incompatible:
3684 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3685 rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3687}
3688
3689VALUE
3690rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3691{
3692 return rb_enc_cr_str_buf_cat(str, ptr, len,
3693 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3694}
3695
3696VALUE
3698{
3699 /* ptr must reference NUL terminated ASCII string. */
3700 int encindex = ENCODING_GET(str);
3701 rb_encoding *enc = rb_enc_from_index(encindex);
3702 if (rb_enc_asciicompat(enc)) {
3703 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3704 encindex, ENC_CODERANGE_7BIT, 0);
3705 }
3706 else {
3707 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3708 while (*ptr) {
3709 unsigned int c = (unsigned char)*ptr;
3710 int len = rb_enc_codelen(c, enc);
3711 rb_enc_mbcput(c, buf, enc);
3712 rb_enc_cr_str_buf_cat(str, buf, len,
3713 encindex, ENC_CODERANGE_VALID, 0);
3714 ptr++;
3715 }
3716 return str;
3717 }
3718}
3719
3720VALUE
3722{
3723 int str2_cr = rb_enc_str_coderange(str2);
3724
3725 if (str_enc_fastpath(str)) {
3726 switch (str2_cr) {
3727 case ENC_CODERANGE_7BIT:
3728 // If RHS is 7bit we can do simple concatenation
3729 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3730 RB_GC_GUARD(str2);
3731 return str;
3733 // If RHS is valid, we can do simple concatenation if encodings are the same
3734 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3735 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3736 int str_cr = ENC_CODERANGE(str);
3737 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3738 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3739 }
3740 RB_GC_GUARD(str2);
3741 return str;
3742 }
3743 }
3744 }
3745
3746 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3747 ENCODING_GET(str2), str2_cr, &str2_cr);
3748
3749 ENC_CODERANGE_SET(str2, str2_cr);
3750
3751 return str;
3752}
3753
3754VALUE
3756{
3757 StringValue(str2);
3758 return rb_str_buf_append(str, str2);
3759}
3760
3761VALUE
3762rb_str_concat_literals(size_t num, const VALUE *strary)
3763{
3764 VALUE str;
3765 size_t i, s = 0;
3766 unsigned long len = 1;
3767
3768 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3769 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3770
3771 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3772 str = rb_str_buf_new(len);
3773 str_enc_copy_direct(str, strary[0]);
3774
3775 for (i = s; i < num; ++i) {
3776 const VALUE v = strary[i];
3777 int encidx = ENCODING_GET(v);
3778
3779 rb_str_buf_append(str, v);
3780 if (encidx != ENCINDEX_US_ASCII) {
3781 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3782 rb_enc_set_index(str, encidx);
3783 }
3784 }
3785 return str;
3786}
3787
3788/*
3789 * call-seq:
3790 * concat(*objects) -> string
3791 *
3792 * Concatenates each object in +objects+ to +self+ and returns +self+:
3793 *
3794 * s = 'foo'
3795 * s.concat('bar', 'baz') # => "foobarbaz"
3796 * s # => "foobarbaz"
3797 *
3798 * For each given object +object+ that is an Integer,
3799 * the value is considered a codepoint and converted to a character before concatenation:
3800 *
3801 * s = 'foo'
3802 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3803 *
3804 * Related: String#<<, which takes a single argument.
3805 */
3806static VALUE
3807rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3808{
3809 str_modifiable(str);
3810
3811 if (argc == 1) {
3812 return rb_str_concat(str, argv[0]);
3813 }
3814 else if (argc > 1) {
3815 int i;
3816 VALUE arg_str = rb_str_tmp_new(0);
3817 rb_enc_copy(arg_str, str);
3818 for (i = 0; i < argc; i++) {
3819 rb_str_concat(arg_str, argv[i]);
3820 }
3821 rb_str_buf_append(str, arg_str);
3822 }
3823
3824 return str;
3825}
3826
3827/*
3828 * call-seq:
3829 * append_as_bytes(*objects) -> self
3830 *
3831 * Concatenates each object in +objects+ into +self+; returns +self+;
3832 * performs no encoding validation or conversion:
3833 *
3834 * s = 'foo'
3835 * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3836 * s.valid_encoding? # => false
3837 * s.append_as_bytes("\xAC 12")
3838 * s.valid_encoding? # => true
3839 *
3840 * When a given object is an integer,
3841 * the value is considered an 8-bit byte;
3842 * if the integer occupies more than one byte (i.e., is greater than 255),
3843 * appends only the low-order byte (similar to String#setbyte):
3844 *
3845 * s = ""
3846 * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3847 * s.bytesize # => 2
3848 *
3849 * Related: see {Modifying}[rdoc-ref:String@Modifying].
3850 */
3851
3852VALUE
3853rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3854{
3855 long needed_capacity = 0;
3856 volatile VALUE t0;
3857 enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3858
3859 for (int index = 0; index < argc; index++) {
3860 VALUE obj = argv[index];
3861 enum ruby_value_type type = types[index] = rb_type(obj);
3862 switch (type) {
3863 case T_FIXNUM:
3864 case T_BIGNUM:
3865 needed_capacity++;
3866 break;
3867 case T_STRING:
3868 needed_capacity += RSTRING_LEN(obj);
3869 break;
3870 default:
3871 rb_raise(
3873 "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3874 rb_obj_class(obj)
3875 );
3876 break;
3877 }
3878 }
3879
3880 str_ensure_available_capa(str, needed_capacity);
3881 char *sptr = RSTRING_END(str);
3882
3883 for (int index = 0; index < argc; index++) {
3884 VALUE obj = argv[index];
3885 enum ruby_value_type type = types[index];
3886 switch (type) {
3887 case T_FIXNUM:
3888 case T_BIGNUM: {
3889 argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3890 char byte = (char)(NUM2INT(obj) & 0xFF);
3891 *sptr = byte;
3892 sptr++;
3893 break;
3894 }
3895 case T_STRING: {
3896 const char *ptr;
3897 long len;
3898 RSTRING_GETMEM(obj, ptr, len);
3899 memcpy(sptr, ptr, len);
3900 sptr += len;
3901 break;
3902 }
3903 default:
3904 rb_bug("append_as_bytes arguments should have been validated");
3905 }
3906 }
3907
3908 STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3909 TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3910
3911 int cr = ENC_CODERANGE(str);
3912 switch (cr) {
3913 case ENC_CODERANGE_7BIT: {
3914 for (int index = 0; index < argc; index++) {
3915 VALUE obj = argv[index];
3916 enum ruby_value_type type = types[index];
3917 switch (type) {
3918 case T_FIXNUM:
3919 case T_BIGNUM: {
3920 if (!ISASCII(NUM2INT(obj))) {
3921 goto clear_cr;
3922 }
3923 break;
3924 }
3925 case T_STRING: {
3926 if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3927 goto clear_cr;
3928 }
3929 break;
3930 }
3931 default:
3932 rb_bug("append_as_bytes arguments should have been validated");
3933 }
3934 }
3935 break;
3936 }
3938 if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3939 goto keep_cr;
3940 }
3941 else {
3942 goto clear_cr;
3943 }
3944 break;
3945 default:
3946 goto clear_cr;
3947 break;
3948 }
3949
3950 RB_GC_GUARD(t0);
3951
3952 clear_cr:
3953 // If no fast path was hit, we clear the coderange.
3954 // append_as_bytes is predominently meant to be used in
3955 // buffering situation, hence it's likely the coderange
3956 // will never be scanned, so it's not worth spending time
3957 // precomputing the coderange except for simple and common
3958 // situations.
3960 keep_cr:
3961 return str;
3962}
3963
3964/*
3965 * call-seq:
3966 * self << object -> self
3967 *
3968 * Appends a string representation of +object+ to +self+;
3969 * returns +self+.
3970 *
3971 * If +object+ is a string, appends it to +self+:
3972 *
3973 * s = 'foo'
3974 * s << 'bar' # => "foobar"
3975 * s # => "foobar"
3976 *
3977 * If +object+ is an integer,
3978 * its value is considered a codepoint;
3979 * converts the value to a character before concatenating:
3980 *
3981 * s = 'foo'
3982 * s << 33 # => "foo!"
3983 *
3984 * Additionally, if the codepoint is in range <tt>0..0xff</tt>
3985 * and the encoding of +self+ is Encoding::US_ASCII,
3986 * changes the encoding to Encoding::ASCII_8BIT:
3987 *
3988 * s = 'foo'.encode(Encoding::US_ASCII)
3989 * s.encoding # => #<Encoding:US-ASCII>
3990 * s << 0xff # => "foo\xFF"
3991 * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3992 *
3993 * Raises RangeError if that codepoint is not representable in the encoding of +self+:
3994 *
3995 * s = 'foo'
3996 * s.encoding # => <Encoding:UTF-8>
3997 * s << 0x00110000 # 1114112 out of char range (RangeError)
3998 * s = 'foo'.encode(Encoding::EUC_JP)
3999 * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
4000 *
4001 * Related: see {Modifying}[rdoc-ref:String@Modifying].
4002 */
4003VALUE
4005{
4006 unsigned int code;
4007 rb_encoding *enc = STR_ENC_GET(str1);
4008 int encidx;
4009
4010 if (RB_INTEGER_TYPE_P(str2)) {
4011 if (rb_num_to_uint(str2, &code) == 0) {
4012 }
4013 else if (FIXNUM_P(str2)) {
4014 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
4015 }
4016 else {
4017 rb_raise(rb_eRangeError, "bignum out of char range");
4018 }
4019 }
4020 else {
4021 return rb_str_append(str1, str2);
4022 }
4023
4024 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
4025
4026 if (encidx >= 0) {
4027 rb_str_buf_cat_byte(str1, (unsigned char)code);
4028 }
4029 else {
4030 long pos = RSTRING_LEN(str1);
4031 int cr = ENC_CODERANGE(str1);
4032 int len;
4033 char *buf;
4034
4035 switch (len = rb_enc_codelen(code, enc)) {
4036 case ONIGERR_INVALID_CODE_POINT_VALUE:
4037 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4038 break;
4039 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
4040 case 0:
4041 rb_raise(rb_eRangeError, "%u out of char range", code);
4042 break;
4043 }
4044 buf = ALLOCA_N(char, len + 1);
4045 rb_enc_mbcput(code, buf, enc);
4046 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
4047 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
4048 }
4049 rb_str_resize(str1, pos+len);
4050 memcpy(RSTRING_PTR(str1) + pos, buf, len);
4051 if (cr == ENC_CODERANGE_7BIT && code > 127) {
4053 }
4054 else if (cr == ENC_CODERANGE_BROKEN) {
4056 }
4057 ENC_CODERANGE_SET(str1, cr);
4058 }
4059 return str1;
4060}
4061
4062int
4063rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
4064{
4065 int encidx = rb_enc_to_index(enc);
4066
4067 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
4068 /* US-ASCII automatically extended to ASCII-8BIT */
4069 if (code > 0xFF) {
4070 rb_raise(rb_eRangeError, "%u out of char range", code);
4071 }
4072 if (encidx == ENCINDEX_US_ASCII && code > 127) {
4073 return ENCINDEX_ASCII_8BIT;
4074 }
4075 return encidx;
4076 }
4077 else {
4078 return -1;
4079 }
4080}
4081
4082/*
4083 * call-seq:
4084 * prepend(*other_strings) -> string
4085 *
4086 * Prepends each string in +other_strings+ to +self+ and returns +self+:
4087 *
4088 * s = 'foo'
4089 * s.prepend('bar', 'baz') # => "barbazfoo"
4090 * s # => "barbazfoo"
4091 *
4092 * Related: String#concat.
4093 */
4094
4095static VALUE
4096rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4097{
4098 str_modifiable(str);
4099
4100 if (argc == 1) {
4101 rb_str_update(str, 0L, 0L, argv[0]);
4102 }
4103 else if (argc > 1) {
4104 int i;
4105 VALUE arg_str = rb_str_tmp_new(0);
4106 rb_enc_copy(arg_str, str);
4107 for (i = 0; i < argc; i++) {
4108 rb_str_append(arg_str, argv[i]);
4109 }
4110 rb_str_update(str, 0L, 0L, arg_str);
4111 }
4112
4113 return str;
4114}
4115
4116st_index_t
4118{
4119 if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4120 st_index_t precomputed_hash;
4121 memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4122
4123 RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4124 return precomputed_hash;
4125 }
4126
4127 return str_do_hash(str);
4128}
4129
4130int
4132{
4133 long len1, len2;
4134 const char *ptr1, *ptr2;
4135 RSTRING_GETMEM(str1, ptr1, len1);
4136 RSTRING_GETMEM(str2, ptr2, len2);
4137 return (len1 != len2 ||
4138 !rb_str_comparable(str1, str2) ||
4139 memcmp(ptr1, ptr2, len1) != 0);
4140}
4141
4142/*
4143 * call-seq:
4144 * hash -> integer
4145 *
4146 * Returns the integer hash value for +self+.
4147 * The value is based on the length, content and encoding of +self+.
4148 *
4149 * Related: Object#hash.
4150 */
4151
4152static VALUE
4153rb_str_hash_m(VALUE str)
4154{
4155 st_index_t hval = rb_str_hash(str);
4156 return ST2FIX(hval);
4157}
4158
/* Evaluates to the smaller of a and b (to a when neither is greater).
 * Each argument can be evaluated twice, so pass expressions without side
 * effects. */
#define lesser(a,b) (((a)>(b))?(b):(a))
4160
4161int
4163{
4164 int idx1, idx2;
4165 int rc1, rc2;
4166
4167 if (RSTRING_LEN(str1) == 0) return TRUE;
4168 if (RSTRING_LEN(str2) == 0) return TRUE;
4169 idx1 = ENCODING_GET(str1);
4170 idx2 = ENCODING_GET(str2);
4171 if (idx1 == idx2) return TRUE;
4172 rc1 = rb_enc_str_coderange(str1);
4173 rc2 = rb_enc_str_coderange(str2);
4174 if (rc1 == ENC_CODERANGE_7BIT) {
4175 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4176 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
4177 return TRUE;
4178 }
4179 if (rc2 == ENC_CODERANGE_7BIT) {
4180 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
4181 return TRUE;
4182 }
4183 return FALSE;
4184}
4185
4186int
4188{
4189 long len1, len2;
4190 const char *ptr1, *ptr2;
4191 int retval;
4192
4193 if (str1 == str2) return 0;
4194 RSTRING_GETMEM(str1, ptr1, len1);
4195 RSTRING_GETMEM(str2, ptr2, len2);
4196 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4197 if (len1 == len2) {
4198 if (!rb_str_comparable(str1, str2)) {
4199 if (ENCODING_GET(str1) > ENCODING_GET(str2))
4200 return 1;
4201 return -1;
4202 }
4203 return 0;
4204 }
4205 if (len1 > len2) return 1;
4206 return -1;
4207 }
4208 if (retval > 0) return 1;
4209 return -1;
4210}
4211
4212/*
4213 * call-seq:
4214 * self == object -> true or false
4215 *
4216 * Returns whether +object+ is equal to +self+.
4217 *
4218 * When +object+ is a string, returns whether +object+ has the same length and content as +self+:
4219 *
4220 * s = 'foo'
4221 * s == 'foo' # => true
4222 * s == 'food' # => false
4223 * s == 'FOO' # => false
4224 *
4225 * Returns +false+ if the two strings' encodings are not compatible:
4226 *
4227 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1) == ("\u{c4 d6 dc}") # => false
4228 *
4229 * When +object+ is not a string:
4230 *
4231 * - If +object+ responds to method <tt>to_str</tt>,
4232 * <tt>object == self</tt> is called and its return value is returned.
4233 * - If +object+ does not respond to <tt>to_str</tt>,
4234 * +false+ is returned.
4235 *
4236 * Related: {Comparing}[rdoc-ref:String@Comparing].
4237 */
4238
4239VALUE
4241{
4242 if (str1 == str2) return Qtrue;
4243 if (!RB_TYPE_P(str2, T_STRING)) {
4244 if (!rb_respond_to(str2, idTo_str)) {
4245 return Qfalse;
4246 }
4247 return rb_equal(str2, str1);
4248 }
4249 return rb_str_eql_internal(str1, str2);
4250}
4251
4252/*
4253 * call-seq:
4254 * eql?(object) -> true or false
4255 *
4256 * Returns +true+ if +object+ has the same length and content
4257 * as +self+; +false+ otherwise:
4258 *
4259 * s = 'foo'
4260 * s.eql?('foo') # => true
4261 * s.eql?('food') # => false
4262 * s.eql?('FOO') # => false
4263 *
4264 * Returns +false+ if the two strings' encodings are not compatible:
4265 *
4266 * "\u{e4 f6 fc}".encode(Encoding::ISO_8859_1).eql?("\u{c4 d6 dc}") # => false
4267 *
4268 */
4269
4270VALUE
4271rb_str_eql(VALUE str1, VALUE str2)
4272{
4273 if (str1 == str2) return Qtrue;
4274 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4275 return rb_str_eql_internal(str1, str2);
4276}
4277
4278/*
4279 * call-seq:
4280 * self <=> other_string -> -1, 0, 1, or nil
4281 *
4282 * Compares +self+ and +other_string+, returning:
4283 *
4284 * - -1 if +other_string+ is larger.
4285 * - 0 if the two are equal.
4286 * - 1 if +other_string+ is smaller.
4287 * - +nil+ if the two are incomparable.
4288 *
4289 * Examples:
4290 *
4291 * 'foo' <=> 'foo' # => 0
4292 * 'foo' <=> 'food' # => -1
4293 * 'food' <=> 'foo' # => 1
4294 * 'FOO' <=> 'foo' # => -1
4295 * 'foo' <=> 'FOO' # => 1
4296 * 'foo' <=> 1 # => nil
4297 *
4298 * Related: see {Comparing}[rdoc-ref:String@Comparing].
4299 */
4300
4301static VALUE
4302rb_str_cmp_m(VALUE str1, VALUE str2)
4303{
4304 int result;
4305 VALUE s = rb_check_string_type(str2);
4306 if (NIL_P(s)) {
4307 return rb_invcmp(str1, str2);
4308 }
4309 result = rb_str_cmp(str1, s);
4310 return INT2FIX(result);
4311}
4312
/* Forward declarations of the helpers behind String#casecmp and
 * String#casecmp?; both are defined after the entry points that call them. */
static VALUE str_casecmp(VALUE str1, VALUE str2);
static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4315
4316/*
4317 * call-seq:
4318 * casecmp(other_string) -> -1, 0, 1, or nil
4319 *
4320 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4321 *
4322 * - -1 if <tt>other_string.downcase</tt> is larger.
4323 * - 0 if the two are equal.
4324 * - 1 if <tt>other_string.downcase</tt> is smaller.
4325 * - +nil+ if the two are incomparable.
4326 *
4327 * Examples:
4328 *
4329 * 'foo'.casecmp('foo') # => 0
4330 * 'foo'.casecmp('food') # => -1
4331 * 'food'.casecmp('foo') # => 1
4332 * 'FOO'.casecmp('foo') # => 0
4333 * 'foo'.casecmp('FOO') # => 0
4334 * 'foo'.casecmp(1) # => nil
4335 *
4336 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4337 *
4338 * Related: String#casecmp?.
4339 *
4340 */
4341
4342static VALUE
4343rb_str_casecmp(VALUE str1, VALUE str2)
4344{
4345 VALUE s = rb_check_string_type(str2);
4346 if (NIL_P(s)) {
4347 return Qnil;
4348 }
4349 return str_casecmp(str1, s);
4350}
4351
4352static VALUE
4353str_casecmp(VALUE str1, VALUE str2)
4354{
4355 long len;
4356 rb_encoding *enc;
4357 const char *p1, *p1end, *p2, *p2end;
4358
4359 enc = rb_enc_compatible(str1, str2);
4360 if (!enc) {
4361 return Qnil;
4362 }
4363
4364 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4365 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4366 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4367 while (p1 < p1end && p2 < p2end) {
4368 if (*p1 != *p2) {
4369 unsigned int c1 = TOLOWER(*p1 & 0xff);
4370 unsigned int c2 = TOLOWER(*p2 & 0xff);
4371 if (c1 != c2)
4372 return INT2FIX(c1 < c2 ? -1 : 1);
4373 }
4374 p1++;
4375 p2++;
4376 }
4377 }
4378 else {
4379 while (p1 < p1end && p2 < p2end) {
4380 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4381 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4382
4383 if (0 <= c1 && 0 <= c2) {
4384 c1 = TOLOWER(c1);
4385 c2 = TOLOWER(c2);
4386 if (c1 != c2)
4387 return INT2FIX(c1 < c2 ? -1 : 1);
4388 }
4389 else {
4390 int r;
4391 l1 = rb_enc_mbclen(p1, p1end, enc);
4392 l2 = rb_enc_mbclen(p2, p2end, enc);
4393 len = l1 < l2 ? l1 : l2;
4394 r = memcmp(p1, p2, len);
4395 if (r != 0)
4396 return INT2FIX(r < 0 ? -1 : 1);
4397 if (l1 != l2)
4398 return INT2FIX(l1 < l2 ? -1 : 1);
4399 }
4400 p1 += l1;
4401 p2 += l2;
4402 }
4403 }
4404 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4405 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4406 return INT2FIX(-1);
4407}
4408
4409/*
4410 * call-seq:
4411 * casecmp?(other_string) -> true, false, or nil
4412 *
4413 * Returns +true+ if +self+ and +other_string+ are equal after
4414 * Unicode case folding, otherwise +false+:
4415 *
4416 * 'foo'.casecmp?('foo') # => true
4417 * 'foo'.casecmp?('food') # => false
4418 * 'food'.casecmp?('foo') # => false
4419 * 'FOO'.casecmp?('foo') # => true
4420 * 'foo'.casecmp?('FOO') # => true
4421 *
4422 * Returns +nil+ if the two values are incomparable:
4423 *
4424 * 'foo'.casecmp?(1) # => nil
4425 *
4426 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4427 *
4428 * Related: String#casecmp.
4429 *
4430 */
4431
4432static VALUE
4433rb_str_casecmp_p(VALUE str1, VALUE str2)
4434{
4435 VALUE s = rb_check_string_type(str2);
4436 if (NIL_P(s)) {
4437 return Qnil;
4438 }
4439 return str_casecmp_p(str1, s);
4440}
4441
4442static VALUE
4443str_casecmp_p(VALUE str1, VALUE str2)
4444{
4445 rb_encoding *enc;
4446 VALUE folded_str1, folded_str2;
4447 VALUE fold_opt = sym_fold;
4448
4449 enc = rb_enc_compatible(str1, str2);
4450 if (!enc) {
4451 return Qnil;
4452 }
4453
4454 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4455 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4456
4457 return rb_str_eql(folded_str1, folded_str2);
4458}
4459
4460static long
4461strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4462 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4463{
4464 const char *search_start = str_ptr;
4465 long pos, search_len = str_len - offset;
4466
4467 for (;;) {
4468 const char *t;
4469 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4470 if (pos < 0) return pos;
4471 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4472 if (t == search_start + pos) break;
4473 search_len -= t - search_start;
4474 if (search_len <= 0) return -1;
4475 offset += t - search_start;
4476 search_start = t;
4477 }
4478 return pos + offset;
4479}
4480
/* found index in byte */
/* Both wrappers forward to rb_strseq_index(); the last argument is its
 * in_byte flag: 0 for rb_str_index, 1 for rb_str_byteindex. */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4484
4485static long
4486rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4487{
4488 const char *str_ptr, *str_ptr_end, *sub_ptr;
4489 long str_len, sub_len;
4490 rb_encoding *enc;
4491
4492 enc = rb_enc_check(str, sub);
4493 if (is_broken_string(sub)) return -1;
4494
4495 str_ptr = RSTRING_PTR(str);
4496 str_ptr_end = RSTRING_END(str);
4497 str_len = RSTRING_LEN(str);
4498 sub_ptr = RSTRING_PTR(sub);
4499 sub_len = RSTRING_LEN(sub);
4500
4501 if (str_len < sub_len) return -1;
4502
4503 if (offset != 0) {
4504 long str_len_char, sub_len_char;
4505 int single_byte = single_byte_optimizable(str);
4506 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4507 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4508 if (offset < 0) {
4509 offset += str_len_char;
4510 if (offset < 0) return -1;
4511 }
4512 if (str_len_char - offset < sub_len_char) return -1;
4513 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4514 str_ptr += offset;
4515 }
4516 if (sub_len == 0) return offset;
4517
4518 /* need proceed one character at a time */
4519 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4520}
4521
4522
4523/*
4524 * call-seq:
4525 * index(substring, offset = 0) -> integer or nil
4526 * index(regexp, offset = 0) -> integer or nil
4527 *
4528 * :include: doc/string/index.rdoc
4529 *
4530 */
4531
4532static VALUE
4533rb_str_index_m(int argc, VALUE *argv, VALUE str)
4534{
4535 VALUE sub;
4536 VALUE initpos;
4537 rb_encoding *enc = STR_ENC_GET(str);
4538 long pos;
4539
4540 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4541 long slen = str_strlen(str, enc); /* str's enc */
4542 pos = NUM2LONG(initpos);
4543 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4544 if (RB_TYPE_P(sub, T_REGEXP)) {
4546 }
4547 return Qnil;
4548 }
4549 }
4550 else {
4551 pos = 0;
4552 }
4553
4554 if (RB_TYPE_P(sub, T_REGEXP)) {
4555 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4556 enc, single_byte_optimizable(str));
4557
4558 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4559 VALUE match = rb_backref_get();
4560 struct re_registers *regs = RMATCH_REGS(match);
4561 pos = rb_str_sublen(str, BEG(0));
4562 return LONG2NUM(pos);
4563 }
4564 }
4565 else {
4566 StringValue(sub);
4567 pos = rb_str_index(str, sub, pos);
4568 if (pos >= 0) {
4569 pos = rb_str_sublen(str, pos);
4570 return LONG2NUM(pos);
4571 }
4572 }
4573 return Qnil;
4574}
4575
4576/* Ensure that the given pos is a valid character boundary.
4577 * Note that in this function, "character" means a code point
4578 * (Unicode scalar value), not a grapheme cluster.
4579 */
4580static void
4581str_ensure_byte_pos(VALUE str, long pos)
4582{
4583 if (!single_byte_optimizable(str)) {
4584 const char *s = RSTRING_PTR(str);
4585 const char *e = RSTRING_END(str);
4586 const char *p = s + pos;
4587 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4588 rb_raise(rb_eIndexError,
4589 "offset %ld does not land on character boundary", pos);
4590 }
4591 }
4592}
4593
4594/*
4595 * call-seq:
4596 * byteindex(object, offset = 0) -> integer or nil
4597 *
4598 * Returns the 0-based integer index of a substring of +self+
4599 * specified by +object+ (a string or Regexp) and +offset+,
4600 * or +nil+ if there is no such substring;
4601 * the returned index is the count of _bytes_ (not characters).
4602 *
4603 * When +object+ is a string,
4604 * returns the index of the first found substring equal to +object+:
4605 *
4606 * s = 'foo' # => "foo"
4607 * s.size # => 3 # Three 1-byte characters.
4608 * s.bytesize # => 3 # Three bytes.
4609 * s.byteindex('f') # => 0
4610 * s.byteindex('o') # => 1
4611 * s.byteindex('oo') # => 1
4612 * s.byteindex('ooo') # => nil
4613 *
4614 * When +object+ is a Regexp,
4615 * returns the index of the first found substring matching +object+;
4616 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4617 *
4618 * s = 'foo'
4619 * s.byteindex(/f/) # => 0
4620 * $~ # => #<MatchData "f">
4621 * s.byteindex(/o/) # => 1
4622 * s.byteindex(/oo/) # => 1
4623 * s.byteindex(/ooo/) # => nil
4624 * $~ # => nil
4625 *
4626 * \Integer argument +offset+, if given, specifies the 0-based index
4627 * of the byte where searching is to begin.
4628 *
4629 * When +offset+ is non-negative,
4630 * searching begins at byte position +offset+:
4631 *
4632 * s = 'foo'
4633 * s.byteindex('o', 1) # => 1
4634 * s.byteindex('o', 2) # => 2
4635 * s.byteindex('o', 3) # => nil
4636 *
4637 * When +offset+ is negative, counts backward from the end of +self+:
4638 *
4639 * s = 'foo'
4640 * s.byteindex('o', -1) # => 2
4641 * s.byteindex('o', -2) # => 1
4642 * s.byteindex('o', -3) # => 1
4643 * s.byteindex('o', -4) # => nil
4644 *
4645 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4646 *
4647 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4648 * s.size # => 2 # Two 3-byte characters.
4649 * s.bytesize # => 6 # Six bytes.
4650 * s.byteindex("\uFFFF") # => 0
4651 * s.byteindex("\uFFFF", 1) # Raises IndexError
4652 * s.byteindex("\uFFFF", 2) # Raises IndexError
4653 * s.byteindex("\uFFFF", 3) # => 3
4654 * s.byteindex("\uFFFF", 4) # Raises IndexError
4655 * s.byteindex("\uFFFF", 5) # Raises IndexError
4656 * s.byteindex("\uFFFF", 6) # => nil
4657 *
4658 * Related: see {Querying}[rdoc-ref:String@Querying].
4659 */
4660
4661static VALUE
4662rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4663{
4664 VALUE sub;
4665 VALUE initpos;
4666 long pos;
4667
4668 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4669 long slen = RSTRING_LEN(str);
4670 pos = NUM2LONG(initpos);
4671 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4672 if (RB_TYPE_P(sub, T_REGEXP)) {
4674 }
4675 return Qnil;
4676 }
4677 }
4678 else {
4679 pos = 0;
4680 }
4681
4682 str_ensure_byte_pos(str, pos);
4683
4684 if (RB_TYPE_P(sub, T_REGEXP)) {
4685 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4686 VALUE match = rb_backref_get();
4687 struct re_registers *regs = RMATCH_REGS(match);
4688 pos = BEG(0);
4689 return LONG2NUM(pos);
4690 }
4691 }
4692 else {
4693 StringValue(sub);
4694 pos = rb_str_byteindex(str, sub, pos);
4695 if (pos >= 0) return LONG2NUM(pos);
4696 }
4697 return Qnil;
4698}
4699
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when the platform does not provide memrchr().
 *
 * Scans the search_len bytes at search_str backwards and returns a pointer
 * to the last byte equal to chr, or NULL when there is none.  As with the
 * library function, chr is reduced to an unsigned char before comparing,
 * so both implementations agree for any int value.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char target = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == target) return (void *)ptr;
    }

    return NULL;
}
#endif
4712
4713static long
4714str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4715{
4716 char *hit, *adjusted;
4717 int c;
4718 long slen, searchlen;
4719 char *sbeg, *e, *t;
4720
4721 sbeg = RSTRING_PTR(str);
4722 slen = RSTRING_LEN(sub);
4723 if (slen == 0) return s - sbeg;
4724 e = RSTRING_END(str);
4725 t = RSTRING_PTR(sub);
4726 c = *t & 0xff;
4727 searchlen = s - sbeg + 1;
4728
4729 if (memcmp(s, t, slen) == 0) {
4730 return s - sbeg;
4731 }
4732
4733 do {
4734 hit = memrchr(sbeg, c, searchlen);
4735 if (!hit) break;
4736 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4737 if (hit != adjusted) {
4738 searchlen = adjusted - sbeg;
4739 continue;
4740 }
4741 if (memcmp(hit, t, slen) == 0)
4742 return hit - sbeg;
4743 searchlen = adjusted - sbeg;
4744 } while (searchlen > 0);
4745
4746 return -1;
4747}
4748
4749/* found index in byte */
4750static long
4751rb_str_rindex(VALUE str, VALUE sub, long pos)
4752{
4753 long len, slen;
4754 char *sbeg, *s;
4755 rb_encoding *enc;
4756 int singlebyte;
4757
4758 enc = rb_enc_check(str, sub);
4759 if (is_broken_string(sub)) return -1;
4760 singlebyte = single_byte_optimizable(str);
4761 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4762 slen = str_strlen(sub, enc); /* rb_enc_check */
4763
4764 /* substring longer than string */
4765 if (len < slen) return -1;
4766 if (len - pos < slen) pos = len - slen;
4767 if (len == 0) return pos;
4768
4769 sbeg = RSTRING_PTR(str);
4770
4771 if (pos == 0) {
4772 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4773 return 0;
4774 else
4775 return -1;
4776 }
4777
4778 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4779 return str_rindex(str, sub, s, enc);
4780}
4781
4782/*
4783 * call-seq:
4784 * rindex(substring, offset = self.length) -> integer or nil
4785 * rindex(regexp, offset = self.length) -> integer or nil
4786 *
4787 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4788 * or +nil+ if none found:
4789 *
4790 * 'foo'.rindex('f') # => 0
4791 * 'foo'.rindex('o') # => 2
4792 * 'foo'.rindex('oo') # => 1
4793 * 'foo'.rindex('ooo') # => nil
4794 *
4795 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4796 * or +nil+ if none found:
4797 *
4798 * 'foo'.rindex(/f/) # => 0
4799 * 'foo'.rindex(/o/) # => 2
4800 * 'foo'.rindex(/oo/) # => 1
4801 * 'foo'.rindex(/ooo/) # => nil
4802 *
4803 * The _last_ match means starting at the possible last position, not
4804 * the last of longest matches.
4805 *
4806 * 'foo'.rindex(/o+/) # => 2
4807 * $~ #=> #<MatchData "o">
4808 *
4809 * To get the last longest match, combine the pattern with a negative
4810 * lookbehind.
4811 *
4812 * 'foo'.rindex(/(?<!o)o+/) # => 1
4813 * $~ #=> #<MatchData "oo">
4814 *
4815 * Or use String#index with a negative lookahead.
4816 *
4817 * 'foo'.index(/o+(?!.*o)/) # => 1
4818 * $~ #=> #<MatchData "oo">
4819 *
4820 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4821 * string to _end_ the search:
4822 *
4823 * 'foo'.rindex('o', 0) # => nil
4824 * 'foo'.rindex('o', 1) # => 1
4825 * 'foo'.rindex('o', 2) # => 2
4826 * 'foo'.rindex('o', 3) # => 2
4827 *
4828 * If +offset+ is a negative Integer, the maximum starting position in the
4829 * string to _end_ the search is the sum of the string's length and +offset+:
4830 *
4831 * 'foo'.rindex('o', -1) # => 2
4832 * 'foo'.rindex('o', -2) # => 1
4833 * 'foo'.rindex('o', -3) # => nil
4834 * 'foo'.rindex('o', -4) # => nil
4835 *
4836 * Related: String#index.
4837 */
4838
4839static VALUE
4840rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4841{
4842 VALUE sub;
4843 VALUE initpos;
4844 rb_encoding *enc = STR_ENC_GET(str);
4845 long pos, len = str_strlen(str, enc); /* str's enc */
4846
4847 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4848 pos = NUM2LONG(initpos);
4849 if (pos < 0 && (pos += len) < 0) {
4850 if (RB_TYPE_P(sub, T_REGEXP)) {
4852 }
4853 return Qnil;
4854 }
4855 if (pos > len) pos = len;
4856 }
4857 else {
4858 pos = len;
4859 }
4860
4861 if (RB_TYPE_P(sub, T_REGEXP)) {
4862 /* enc = rb_enc_check(str, sub); */
4863 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4864 enc, single_byte_optimizable(str));
4865
4866 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4867 VALUE match = rb_backref_get();
4868 struct re_registers *regs = RMATCH_REGS(match);
4869 pos = rb_str_sublen(str, BEG(0));
4870 return LONG2NUM(pos);
4871 }
4872 }
4873 else {
4874 StringValue(sub);
4875 pos = rb_str_rindex(str, sub, pos);
4876 if (pos >= 0) {
4877 pos = rb_str_sublen(str, pos);
4878 return LONG2NUM(pos);
4879 }
4880 }
4881 return Qnil;
4882}
4883
4884static long
4885rb_str_byterindex(VALUE str, VALUE sub, long pos)
4886{
4887 long len, slen;
4888 char *sbeg, *s;
4889 rb_encoding *enc;
4890
4891 enc = rb_enc_check(str, sub);
4892 if (is_broken_string(sub)) return -1;
4893 len = RSTRING_LEN(str);
4894 slen = RSTRING_LEN(sub);
4895
4896 /* substring longer than string */
4897 if (len < slen) return -1;
4898 if (len - pos < slen) pos = len - slen;
4899 if (len == 0) return pos;
4900
4901 sbeg = RSTRING_PTR(str);
4902
4903 if (pos == 0) {
4904 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4905 return 0;
4906 else
4907 return -1;
4908 }
4909
4910 s = sbeg + pos;
4911 return str_rindex(str, sub, s, enc);
4912}
4913
4914/*
4915 * call-seq:
4916 * byterindex(object, offset = self.bytesize) -> integer or nil
4917 *
4918 * Returns the 0-based integer index of a substring of +self+
4919 * that is the _last_ match for the given +object+ (a string or Regexp) and +offset+,
4920 * or +nil+ if there is no such substring;
4921 * the returned index is the count of _bytes_ (not characters).
4922 *
4923 * When +object+ is a string,
4924 * returns the index of the _last_ found substring equal to +object+:
4925 *
4926 * s = 'foo' # => "foo"
4927 * s.size # => 3 # Three 1-byte characters.
4928 * s.bytesize # => 3 # Three bytes.
4929 * s.byterindex('f') # => 0
4930 * s.byterindex('o') # => 2
4931 * s.byterindex('oo') # => 1
4932 * s.byterindex('ooo') # => nil
4933 *
4934 * When +object+ is a Regexp,
4935 * returns the index of the last found substring matching +object+;
4936 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
4937 *
4938 * s = 'foo'
4939 * s.byterindex(/f/) # => 0
4940 * $~ # => #<MatchData "f">
4941 * s.byterindex(/o/) # => 2
4942 * s.byterindex(/oo/) # => 1
4943 * s.byterindex(/ooo/) # => nil
4944 * $~ # => nil
4945 *
4946 * The last match means starting at the possible last position,
4947 * not the last of the longest matches:
4948 *
4949 * s = 'foo'
4950 * s.byterindex(/o+/) # => 2
4951 * $~ #=> #<MatchData "o">
4952 *
4953 * To get the last longest match, use a negative lookbehind:
4954 *
4955 * s = 'foo'
4956 * s.byterindex(/(?<!o)o+/) # => 1
4957 * $~ # => #<MatchData "oo">
4958 *
4959 * Or use method #byteindex with negative lookahead:
4960 *
4961 * s = 'foo'
4962 * s.byteindex(/o+(?!.*o)/) # => 1
4963 * $~ #=> #<MatchData "oo">
4964 *
4965 * \Integer argument +offset+, if given, specifies the 0-based index
4966 * of the byte where searching is to end.
4967 *
4968 * When +offset+ is non-negative,
4969 * searching ends at byte position +offset+:
4970 *
4971 * s = 'foo'
4972 * s.byterindex('o', 0) # => nil
4973 * s.byterindex('o', 1) # => 1
4974 * s.byterindex('o', 2) # => 2
4975 * s.byterindex('o', 3) # => 2
4976 *
4977 * When +offset+ is negative, counts backward from the end of +self+:
4978 *
4979 * s = 'foo'
4980 * s.byterindex('o', -1) # => 2
4981 * s.byterindex('o', -2) # => 1
4982 * s.byterindex('o', -3) # => nil
4983 *
4984 * Raises IndexError if the byte at +offset+ is not the first byte of a character:
4985 *
4986 * s = "\uFFFF\uFFFF" # => "\uFFFF\uFFFF"
4987 * s.size # => 2 # Two 3-byte characters.
4988 * s.bytesize # => 6 # Six bytes.
4989 * s.byterindex("\uFFFF") # => 3
4990 * s.byterindex("\uFFFF", 1) # Raises IndexError
4991 * s.byterindex("\uFFFF", 2) # Raises IndexError
4992 * s.byterindex("\uFFFF", 3) # => 3
4993 * s.byterindex("\uFFFF", 4) # Raises IndexError
4994 * s.byterindex("\uFFFF", 5) # Raises IndexError
4995 * s.byterindex("\uFFFF", 6) # => nil
4996 *
4997 * Related: see {Querying}[rdoc-ref:String@Querying].
4998 */
4999
5000static VALUE
5001rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
5002{
5003 VALUE sub;
5004 VALUE initpos;
5005 long pos, len = RSTRING_LEN(str);
5006
5007 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
5008 pos = NUM2LONG(initpos);
5009 if (pos < 0 && (pos += len) < 0) {
5010 if (RB_TYPE_P(sub, T_REGEXP)) {
5012 }
5013 return Qnil;
5014 }
5015 if (pos > len) pos = len;
5016 }
5017 else {
5018 pos = len;
5019 }
5020
5021 str_ensure_byte_pos(str, pos);
5022
5023 if (RB_TYPE_P(sub, T_REGEXP)) {
5024 if (rb_reg_search(sub, str, pos, 1) >= 0) {
5025 VALUE match = rb_backref_get();
5026 struct re_registers *regs = RMATCH_REGS(match);
5027 pos = BEG(0);
5028 return LONG2NUM(pos);
5029 }
5030 }
5031 else {
5032 StringValue(sub);
5033 pos = rb_str_byterindex(str, sub, pos);
5034 if (pos >= 0) return LONG2NUM(pos);
5035 }
5036 return Qnil;
5037}
5038
5039/*
5040 * call-seq:
5041 * self =~ object -> integer or nil
5042 *
5043 * When +object+ is a Regexp, returns the index of the first substring in +self+
5044 * matched by +object+,
5045 * or +nil+ if no match is found;
5046 * updates {Regexp-related global variables}[rdoc-ref:Regexp@Global+Variables]:
5047 *
5048 * 'foo' =~ /f/ # => 0
5049 * $~ # => #<MatchData "f">
5050 * 'foo' =~ /o/ # => 1
5051 * $~ # => #<MatchData "o">
5052 * 'foo' =~ /x/ # => nil
5053 * $~ # => nil
5054 *
5055 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
5056 * (see Regexp#=~):
5057 *
5058 * number = nil
5059 * 'no. 9' =~ /(?<number>\d+)/ # => 4
5060 * number # => nil # Not assigned.
5061 * /(?<number>\d+)/ =~ 'no. 9' # => 4
5062 * number # => "9" # Assigned.
5063 *
5064 * If +object+ is not a Regexp, returns the value
5065 * returned by <tt>object =~ self</tt>.
5066 *
5067 * Related: see {Querying}[rdoc-ref:String@Querying].
5068 */
5069
5070static VALUE
5071rb_str_match(VALUE x, VALUE y)
5072{
5073 switch (OBJ_BUILTIN_TYPE(y)) {
5074 case T_STRING:
5075 rb_raise(rb_eTypeError, "type mismatch: String given");
5076
5077 case T_REGEXP:
5078 return rb_reg_match(y, x);
5079
5080 default:
5081 return rb_funcall(y, idEqTilde, 1, x);
5082 }
5083}
5084
5085
5086static VALUE get_pat(VALUE);
5087
5088
5089/*
5090 * call-seq:
5091 * match(pattern, offset = 0) -> matchdata or nil
5092 * match(pattern, offset = 0) {|matchdata| ... } -> object
5093 *
5094 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
5095 *
5096 * Note: also updates Regexp@Global+Variables.
5097 *
5098 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5099 * regexp = Regexp.new(pattern)
5100 * - Computes +matchdata+, which will be either a MatchData object or +nil+
5101 * (see Regexp#match):
5102 * matchdata = regexp.match(self)
5103 *
5104 * With no block given, returns the computed +matchdata+:
5105 *
5106 * 'foo'.match('f') # => #<MatchData "f">
5107 * 'foo'.match('o') # => #<MatchData "o">
5108 * 'foo'.match('x') # => nil
5109 *
5110 * If Integer argument +offset+ is given, the search begins at index +offset+:
5111 *
5112 * 'foo'.match('f', 1) # => nil
5113 * 'foo'.match('o', 1) # => #<MatchData "o">
5114 *
5115 * With a block given, calls the block with the computed +matchdata+
5116 * and returns the block's return value:
5117 *
5118 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
5119 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
5120 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
5121 *
5122 */
5123
5124static VALUE
5125rb_str_match_m(int argc, VALUE *argv, VALUE str)
5126{
5127 VALUE re, result;
5128 if (argc < 1)
5129 rb_check_arity(argc, 1, 2);
5130 re = argv[0];
5131 argv[0] = str;
5132 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
5133 if (!NIL_P(result) && rb_block_given_p()) {
5134 return rb_yield(result);
5135 }
5136 return result;
5137}
5138
5139/*
5140 * call-seq:
5141 * match?(pattern, offset = 0) -> true or false
5142 *
5143 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
5144 *
5145 * Note: does not update Regexp@Global+Variables.
5146 *
5147 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5148 * regexp = Regexp.new(pattern)
5149 *
5150 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5151 * +false+ otherwise:
5152 *
5153 * 'foo'.match?(/o/) # => true
5154 * 'foo'.match?('o') # => true
5155 * 'foo'.match?(/x/) # => false
5156 *
5157 * If Integer argument +offset+ is given, the search begins at index +offset+:
5158 * 'foo'.match?('f', 1) # => false
5159 * 'foo'.match?('o', 1) # => true
5160 *
5161 */
5162
5163static VALUE
5164rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5165{
5166 VALUE re;
5167 rb_check_arity(argc, 1, 2);
5168 re = get_pat(argv[0]);
5169 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5170}
5171
/* Result of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* bytes are not a valid character, or none could be produced */
    NEIGHBOR_FOUND    = 1,  /* the neighbouring character was written in place */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range (carry needed) */
};
5177
5178static enum neighbor_char
5179enc_succ_char(char *p, long len, rb_encoding *enc)
5180{
5181 long i;
5182 int l;
5183
5184 if (rb_enc_mbminlen(enc) > 1) {
5185 /* wchar, trivial case */
5186 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5187 if (!MBCLEN_CHARFOUND_P(r)) {
5188 return NEIGHBOR_NOT_CHAR;
5189 }
5190 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5191 l = rb_enc_code_to_mbclen(c, enc);
5192 if (!l) return NEIGHBOR_NOT_CHAR;
5193 if (l != len) return NEIGHBOR_WRAPPED;
5194 rb_enc_mbcput(c, p, enc);
5195 r = rb_enc_precise_mbclen(p, p + len, enc);
5196 if (!MBCLEN_CHARFOUND_P(r)) {
5197 return NEIGHBOR_NOT_CHAR;
5198 }
5199 return NEIGHBOR_FOUND;
5200 }
5201 while (1) {
5202 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5203 p[i] = '\0';
5204 if (i < 0)
5205 return NEIGHBOR_WRAPPED;
5206 ++((unsigned char*)p)[i];
5207 l = rb_enc_precise_mbclen(p, p+len, enc);
5208 if (MBCLEN_CHARFOUND_P(l)) {
5209 l = MBCLEN_CHARFOUND_LEN(l);
5210 if (l == len) {
5211 return NEIGHBOR_FOUND;
5212 }
5213 else {
5214 memset(p+l, 0xff, len-l);
5215 }
5216 }
5217 if (MBCLEN_INVALID_P(l) && i < len-1) {
5218 long len2;
5219 int l2;
5220 for (len2 = len-1; 0 < len2; len2--) {
5221 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5222 if (!MBCLEN_INVALID_P(l2))
5223 break;
5224 }
5225 memset(p+len2+1, 0xff, len-(len2+1));
5226 }
5227 }
5228}
5229
5230static enum neighbor_char
5231enc_pred_char(char *p, long len, rb_encoding *enc)
5232{
5233 long i;
5234 int l;
5235 if (rb_enc_mbminlen(enc) > 1) {
5236 /* wchar, trivial case */
5237 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5238 if (!MBCLEN_CHARFOUND_P(r)) {
5239 return NEIGHBOR_NOT_CHAR;
5240 }
5241 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5242 if (!c) return NEIGHBOR_NOT_CHAR;
5243 --c;
5244 l = rb_enc_code_to_mbclen(c, enc);
5245 if (!l) return NEIGHBOR_NOT_CHAR;
5246 if (l != len) return NEIGHBOR_WRAPPED;
5247 rb_enc_mbcput(c, p, enc);
5248 r = rb_enc_precise_mbclen(p, p + len, enc);
5249 if (!MBCLEN_CHARFOUND_P(r)) {
5250 return NEIGHBOR_NOT_CHAR;
5251 }
5252 return NEIGHBOR_FOUND;
5253 }
5254 while (1) {
5255 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5256 p[i] = '\xff';
5257 if (i < 0)
5258 return NEIGHBOR_WRAPPED;
5259 --((unsigned char*)p)[i];
5260 l = rb_enc_precise_mbclen(p, p+len, enc);
5261 if (MBCLEN_CHARFOUND_P(l)) {
5262 l = MBCLEN_CHARFOUND_LEN(l);
5263 if (l == len) {
5264 return NEIGHBOR_FOUND;
5265 }
5266 else {
5267 memset(p+l, 0, len-l);
5268 }
5269 }
5270 if (MBCLEN_INVALID_P(l) && i < len-1) {
5271 long len2;
5272 int l2;
5273 for (len2 = len-1; 0 < len2; len2--) {
5274 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5275 if (!MBCLEN_INVALID_P(l2))
5276 break;
5277 }
5278 memset(p+len2+1, 0, len-(len2+1));
5279 }
5280 }
5281}
5282
5283/*
5284 overwrite +p+ by succeeding letter in +enc+ and returns
5285 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5286 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5287 assuming each ranges are successive, and mbclen
5288 never change in each ranges.
5289 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5290 character.
5291 */
5292static enum neighbor_char
5293enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5294{
5295 enum neighbor_char ret;
5296 unsigned int c;
5297 int ctype;
5298 int range;
5299 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5300
5301 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5302 int try;
5303 const int max_gaps = 1;
5304
5305 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5306 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5307 ctype = ONIGENC_CTYPE_DIGIT;
5308 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5309 ctype = ONIGENC_CTYPE_ALPHA;
5310 else
5311 return NEIGHBOR_NOT_CHAR;
5312
5313 MEMCPY(save, p, char, len);
5314 for (try = 0; try <= max_gaps; ++try) {
5315 ret = enc_succ_char(p, len, enc);
5316 if (ret == NEIGHBOR_FOUND) {
5317 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5318 if (rb_enc_isctype(c, ctype, enc))
5319 return NEIGHBOR_FOUND;
5320 }
5321 }
5322 MEMCPY(p, save, char, len);
5323 range = 1;
5324 while (1) {
5325 MEMCPY(save, p, char, len);
5326 ret = enc_pred_char(p, len, enc);
5327 if (ret == NEIGHBOR_FOUND) {
5328 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5329 if (!rb_enc_isctype(c, ctype, enc)) {
5330 MEMCPY(p, save, char, len);
5331 break;
5332 }
5333 }
5334 else {
5335 MEMCPY(p, save, char, len);
5336 break;
5337 }
5338 range++;
5339 }
5340 if (range == 1) {
5341 return NEIGHBOR_NOT_CHAR;
5342 }
5343
5344 if (ctype != ONIGENC_CTYPE_DIGIT) {
5345 MEMCPY(carry, p, char, len);
5346 return NEIGHBOR_WRAPPED;
5347 }
5348
5349 MEMCPY(carry, p, char, len);
5350 enc_succ_char(carry, len, enc);
5351 return NEIGHBOR_WRAPPED;
5352}
5353
5354
5355static VALUE str_succ(VALUE str);
5356
5357/*
5358 * call-seq:
5359 * succ -> new_str
5360 *
5361 * Returns the successor to +self+. The successor is calculated by
5362 * incrementing characters.
5363 *
5364 * The first character to be incremented is the rightmost alphanumeric,
5365 * or, if there are no alphanumerics, the rightmost character:
5366 *
5367 * 'THX1138'.succ # => "THX1139"
5368 * '<<koala>>'.succ # => "<<koalb>>"
5369 * '***'.succ # => '**+'
5370 *
5371 * The successor to a digit is another digit, "carrying" to the next-left
5372 * character for a "rollover" from 9 to 0, and prepending another digit
5373 * if necessary:
5374 *
5375 * '00'.succ # => "01"
5376 * '09'.succ # => "10"
5377 * '99'.succ # => "100"
5378 *
5379 * The successor to a letter is another letter of the same case,
5380 * carrying to the next-left character for a rollover,
5381 * and prepending another same-case letter if necessary:
5382 *
5383 * 'aa'.succ # => "ab"
5384 * 'az'.succ # => "ba"
5385 * 'zz'.succ # => "aaa"
5386 * 'AA'.succ # => "AB"
5387 * 'AZ'.succ # => "BA"
5388 * 'ZZ'.succ # => "AAA"
5389 *
5390 * The successor to a non-alphanumeric character is the next character
5391 * in the underlying character set's collating sequence,
5392 * carrying to the next-left character for a rollover,
5393 * and prepending another character if necessary:
5394 *
5395 * s = 0.chr * 3
5396 * s # => "\x00\x00\x00"
5397 * s.succ # => "\x00\x00\x01"
5398 * s = 255.chr * 3
5399 * s # => "\xFF\xFF\xFF"
5400 * s.succ # => "\x01\x00\x00\x00"
5401 *
5402 * Carrying can occur between and among mixtures of alphanumeric characters:
5403 *
5404 * s = 'zz99zz99'
5405 * s.succ # => "aaa00aa00"
5406 * s = '99zz99zz'
5407 * s.succ # => "100aa00aa"
5408 *
5409 * The successor to an empty +String+ is a new empty +String+:
5410 *
5411 * ''.succ # => ""
5412 *
5413 */
5414
5415VALUE
5417{
5418 VALUE str;
5419 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5420 rb_enc_cr_str_copy_for_substr(str, orig);
5421 return str_succ(str);
5422}
5423
5424static VALUE
5425str_succ(VALUE str)
5426{
5427 rb_encoding *enc;
5428 char *sbeg, *s, *e, *last_alnum = 0;
5429 int found_alnum = 0;
5430 long l, slen;
5431 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5432 long carry_pos = 0, carry_len = 1;
5433 enum neighbor_char neighbor = NEIGHBOR_FOUND;
5434
5435 slen = RSTRING_LEN(str);
5436 if (slen == 0) return str;
5437
5438 enc = STR_ENC_GET(str);
5439 sbeg = RSTRING_PTR(str);
5440 s = e = sbeg + slen;
5441
5442 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5443 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5444 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5445 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5446 break;
5447 }
5448 }
5449 l = rb_enc_precise_mbclen(s, e, enc);
5450 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5451 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5452 neighbor = enc_succ_alnum_char(s, l, enc, carry);
5453 switch (neighbor) {
5454 case NEIGHBOR_NOT_CHAR:
5455 continue;
5456 case NEIGHBOR_FOUND:
5457 return str;
5458 case NEIGHBOR_WRAPPED:
5459 last_alnum = s;
5460 break;
5461 }
5462 found_alnum = 1;
5463 carry_pos = s - sbeg;
5464 carry_len = l;
5465 }
5466 if (!found_alnum) { /* str contains no alnum */
5467 s = e;
5468 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5469 enum neighbor_char neighbor;
5470 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5471 l = rb_enc_precise_mbclen(s, e, enc);
5472 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5473 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5474 MEMCPY(tmp, s, char, l);
5475 neighbor = enc_succ_char(tmp, l, enc);
5476 switch (neighbor) {
5477 case NEIGHBOR_FOUND:
5478 MEMCPY(s, tmp, char, l);
5479 return str;
5480 break;
5481 case NEIGHBOR_WRAPPED:
5482 MEMCPY(s, tmp, char, l);
5483 break;
5484 case NEIGHBOR_NOT_CHAR:
5485 break;
5486 }
5487 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5488 /* wrapped to \0...\0. search next valid char. */
5489 enc_succ_char(s, l, enc);
5490 }
5491 if (!rb_enc_asciicompat(enc)) {
5492 MEMCPY(carry, s, char, l);
5493 carry_len = l;
5494 }
5495 carry_pos = s - sbeg;
5496 }
5498 }
5499 RESIZE_CAPA(str, slen + carry_len);
5500 sbeg = RSTRING_PTR(str);
5501 s = sbeg + carry_pos;
5502 memmove(s + carry_len, s, slen - carry_pos);
5503 memmove(s, carry, carry_len);
5504 slen += carry_len;
5505 STR_SET_LEN(str, slen);
5506 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5508 return str;
5509}
5510
5511
5512/*
5513 * call-seq:
5514 * succ! -> self
5515 *
5516 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5517 */
5518
5519static VALUE
5520rb_str_succ_bang(VALUE str)
5521{
5522 rb_str_modify(str);
5523 str_succ(str);
5524 return str;
5525}
5526
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT, 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5536
5537static int
5538str_upto_i(VALUE str, VALUE arg)
5539{
5540 rb_yield(str);
5541 return 0;
5542}
5543
5544/*
5545 * call-seq:
5546 * upto(other_string, exclusive = false) {|string| ... } -> self
5547 * upto(other_string, exclusive = false) -> new_enumerator
5548 *
5549 * With a block given, calls the block with each +String+ value
5550 * returned by successive calls to String#succ;
5551 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5552 * the sequence terminates when value +other_string+ is reached;
5553 * returns +self+:
5554 *
5555 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5556 * Output:
5557 *
5558 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5559 *
5560 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5561 *
5562 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5563 *
5564 * Output:
5565 *
5566 * a8 a9 b0 b1 b2 b3 b4 b5
5567 *
5568 * If +other_string+ would not be reached, does not call the block:
5569 *
5570 * '25'.upto('5') {|s| fail s }
5571 * 'aa'.upto('a') {|s| fail s }
5572 *
5573 * With no block given, returns a new Enumerator:
5574 *
5575 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5576 *
5577 */
5578
5579static VALUE
5580rb_str_upto(int argc, VALUE *argv, VALUE beg)
5581{
5582 VALUE end, exclusive;
5583
5584 rb_scan_args(argc, argv, "11", &end, &exclusive);
5585 RETURN_ENUMERATOR(beg, argc, argv);
5586 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5587}
5588
5589VALUE
5590rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5591{
5592 VALUE current, after_end;
5593 ID succ;
5594 int n, ascii;
5595 rb_encoding *enc;
5596
5597 CONST_ID(succ, "succ");
5598 StringValue(end);
5599 enc = rb_enc_check(beg, end);
5600 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5601 /* single character */
5602 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5603 char c = RSTRING_PTR(beg)[0];
5604 char e = RSTRING_PTR(end)[0];
5605
5606 if (c > e || (excl && c == e)) return beg;
5607 for (;;) {
5608 VALUE str = rb_enc_str_new(&c, 1, enc);
5610 if ((*each)(str, arg)) break;
5611 if (!excl && c == e) break;
5612 c++;
5613 if (excl && c == e) break;
5614 }
5615 return beg;
5616 }
5617 /* both edges are all digits */
5618 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5619 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5620 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5621 VALUE b, e;
5622 int width;
5623
5624 width = RSTRING_LENINT(beg);
5625 b = rb_str_to_inum(beg, 10, FALSE);
5626 e = rb_str_to_inum(end, 10, FALSE);
5627 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5628 long bi = FIX2LONG(b);
5629 long ei = FIX2LONG(e);
5630 rb_encoding *usascii = rb_usascii_encoding();
5631
5632 while (bi <= ei) {
5633 if (excl && bi == ei) break;
5634 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5635 bi++;
5636 }
5637 }
5638 else {
5639 ID op = excl ? '<' : idLE;
5640 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5641
5642 args[0] = INT2FIX(width);
5643 while (rb_funcall(b, op, 1, e)) {
5644 args[1] = b;
5645 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5646 b = rb_funcallv(b, succ, 0, 0);
5647 }
5648 }
5649 return beg;
5650 }
5651 /* normal case */
5652 n = rb_str_cmp(beg, end);
5653 if (n > 0 || (excl && n == 0)) return beg;
5654
5655 after_end = rb_funcallv(end, succ, 0, 0);
5656 current = str_duplicate(rb_cString, beg);
5657 while (!rb_str_equal(current, after_end)) {
5658 VALUE next = Qnil;
5659 if (excl || !rb_str_equal(current, end))
5660 next = rb_funcallv(current, succ, 0, 0);
5661 if ((*each)(current, arg)) break;
5662 if (NIL_P(next)) break;
5663 current = next;
5664 StringValue(current);
5665 if (excl && rb_str_equal(current, end)) break;
5666 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5667 break;
5668 }
5669
5670 return beg;
5671}
5672
5673VALUE
5674rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5675{
5676 VALUE current;
5677 ID succ;
5678
5679 CONST_ID(succ, "succ");
5680 /* both edges are all digits */
5681 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5682 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5683 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5684 int width = RSTRING_LENINT(beg);
5685 b = rb_str_to_inum(beg, 10, FALSE);
5686 if (FIXNUM_P(b)) {
5687 long bi = FIX2LONG(b);
5688 rb_encoding *usascii = rb_usascii_encoding();
5689
5690 while (FIXABLE(bi)) {
5691 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5692 bi++;
5693 }
5694 b = LONG2NUM(bi);
5695 }
5696 args[0] = INT2FIX(width);
5697 while (1) {
5698 args[1] = b;
5699 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5700 b = rb_funcallv(b, succ, 0, 0);
5701 }
5702 }
5703 /* normal case */
5704 current = str_duplicate(rb_cString, beg);
5705 while (1) {
5706 VALUE next = rb_funcallv(current, succ, 0, 0);
5707 if ((*each)(current, arg)) break;
5708 current = next;
5709 StringValue(current);
5710 if (RSTRING_LEN(current) == 0)
5711 break;
5712 }
5713
5714 return beg;
5715}
5716
5717static int
5718include_range_i(VALUE str, VALUE arg)
5719{
5720 VALUE *argp = (VALUE *)arg;
5721 if (!rb_equal(str, *argp)) return 0;
5722 *argp = Qnil;
5723 return 1;
5724}
5725
5726VALUE
5727rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5728{
5729 beg = rb_str_new_frozen(beg);
5730 StringValue(end);
5731 end = rb_str_new_frozen(end);
5732 if (NIL_P(val)) return Qfalse;
5733 val = rb_check_string_type(val);
5734 if (NIL_P(val)) return Qfalse;
5735 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5736 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5737 rb_enc_asciicompat(STR_ENC_GET(val))) {
5738 const char *bp = RSTRING_PTR(beg);
5739 const char *ep = RSTRING_PTR(end);
5740 const char *vp = RSTRING_PTR(val);
5741 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5742 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5743 return Qfalse;
5744 else {
5745 char b = *bp;
5746 char e = *ep;
5747 char v = *vp;
5748
5749 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5750 if (b <= v && v < e) return Qtrue;
5751 return RBOOL(!RTEST(exclusive) && v == e);
5752 }
5753 }
5754 }
5755#if 0
5756 /* both edges are all digits */
5757 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5758 all_digits_p(bp, RSTRING_LEN(beg)) &&
5759 all_digits_p(ep, RSTRING_LEN(end))) {
5760 /* TODO */
5761 }
5762#endif
5763 }
5764 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5765
5766 return RBOOL(NIL_P(val));
5767}
5768
5769static VALUE
5770rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5771{
5772 if (rb_reg_search(re, str, 0, 0) >= 0) {
5773 VALUE match = rb_backref_get();
5774 int nth = rb_reg_backref_number(match, backref);
5775 return rb_reg_nth_match(nth, match);
5776 }
5777 return Qnil;
5778}
5779
5780static VALUE
5781rb_str_aref(VALUE str, VALUE indx)
5782{
5783 long idx;
5784
5785 if (FIXNUM_P(indx)) {
5786 idx = FIX2LONG(indx);
5787 }
5788 else if (RB_TYPE_P(indx, T_REGEXP)) {
5789 return rb_str_subpat(str, indx, INT2FIX(0));
5790 }
5791 else if (RB_TYPE_P(indx, T_STRING)) {
5792 if (rb_str_index(str, indx, 0) != -1)
5793 return str_duplicate(rb_cString, indx);
5794 return Qnil;
5795 }
5796 else {
5797 /* check if indx is Range */
5798 long beg, len = str_strlen(str, NULL);
5799 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5800 case Qfalse:
5801 break;
5802 case Qnil:
5803 return Qnil;
5804 default:
5805 return rb_str_substr(str, beg, len);
5806 }
5807 idx = NUM2LONG(indx);
5808 }
5809
5810 return str_substr(str, idx, 1, FALSE);
5811}
5812
5813
5814/*
5815 * call-seq:
5816 * self[index] -> new_string or nil
5817 * self[start, length] -> new_string or nil
5818 * self[range] -> new_string or nil
5819 * self[regexp, capture = 0] -> new_string or nil
5820 * self[substring] -> new_string or nil
5821 *
5822 * Returns the substring of +self+ specified by the arguments.
5823 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5824 *
5825 * Related: see {Converting to New String}[rdoc-ref:String@Converting+to+New+String].
5826 */
5827
5828static VALUE
5829rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5830{
5831 if (argc == 2) {
5832 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5833 return rb_str_subpat(str, argv[0], argv[1]);
5834 }
5835 else {
5836 return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5837 }
5838 }
5839 rb_check_arity(argc, 1, 2);
5840 return rb_str_aref(str, argv[0]);
5841}
5842
5843VALUE
5845{
5846 char *ptr = RSTRING_PTR(str);
5847 long olen = RSTRING_LEN(str), nlen;
5848
5849 str_modifiable(str);
5850 if (len > olen) len = olen;
5851 nlen = olen - len;
5852 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5853 char *oldptr = ptr;
5854 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5855 STR_SET_EMBED(str);
5856 ptr = RSTRING(str)->as.embed.ary;
5857 memmove(ptr, oldptr + len, nlen);
5858 if (fl == STR_NOEMBED) xfree(oldptr);
5859 }
5860 else {
5861 if (!STR_SHARED_P(str)) {
5862 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5863 rb_enc_cr_str_exact_copy(shared, str);
5864 OBJ_FREEZE(shared);
5865 }
5866 ptr = RSTRING(str)->as.heap.ptr += len;
5867 }
5868 STR_SET_LEN(str, nlen);
5869
5870 if (!SHARABLE_MIDDLE_SUBSTRING) {
5871 TERM_FILL(ptr + nlen, TERM_LEN(str));
5872 }
5874 return str;
5875}
5876
5877static void
5878rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5879{
5880 char *sptr;
5881 long slen;
5882 int cr;
5883
5884 if (beg == 0 && vlen == 0) {
5885 rb_str_drop_bytes(str, len);
5886 return;
5887 }
5888
5889 str_modify_keep_cr(str);
5890 RSTRING_GETMEM(str, sptr, slen);
5891 if (len < vlen) {
5892 /* expand string */
5893 RESIZE_CAPA(str, slen + vlen - len);
5894 sptr = RSTRING_PTR(str);
5895 }
5896
5898 cr = rb_enc_str_coderange(val);
5899 else
5901
5902 if (vlen != len) {
5903 memmove(sptr + beg + vlen,
5904 sptr + beg + len,
5905 slen - (beg + len));
5906 }
5907 if (vlen < beg && len < 0) {
5908 MEMZERO(sptr + slen, char, -len);
5909 }
5910 if (vlen > 0) {
5911 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5912 }
5913 slen += vlen - len;
5914 STR_SET_LEN(str, slen);
5915 TERM_FILL(&sptr[slen], TERM_LEN(str));
5916 ENC_CODERANGE_SET(str, cr);
5917}
5918
5919static inline void
5920rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5921{
5922 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5923}
5924
5925void
5926rb_str_update(VALUE str, long beg, long len, VALUE val)
5927{
5928 long slen;
5929 char *p, *e;
5930 rb_encoding *enc;
5931 int singlebyte = single_byte_optimizable(str);
5932 int cr;
5933
5934 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5935
5936 StringValue(val);
5937 enc = rb_enc_check(str, val);
5938 slen = str_strlen(str, enc); /* rb_enc_check */
5939
5940 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5941 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5942 }
5943 if (beg < 0) {
5944 beg += slen;
5945 }
5946 RUBY_ASSERT(beg >= 0);
5947 RUBY_ASSERT(beg <= slen);
5948
5949 if (len > slen - beg) {
5950 len = slen - beg;
5951 }
5952 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5953 if (!p) p = RSTRING_END(str);
5954 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5955 if (!e) e = RSTRING_END(str);
5956 /* error check */
5957 beg = p - RSTRING_PTR(str); /* physical position */
5958 len = e - p; /* physical length */
5959 rb_str_update_0(str, beg, len, val);
5960 rb_enc_associate(str, enc);
5962 if (cr != ENC_CODERANGE_BROKEN)
5963 ENC_CODERANGE_SET(str, cr);
5964}
5965
5966static void
5967rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5968{
5969 int nth;
5970 VALUE match;
5971 long start, end, len;
5972 rb_encoding *enc;
5973 struct re_registers *regs;
5974
5975 if (rb_reg_search(re, str, 0, 0) < 0) {
5976 rb_raise(rb_eIndexError, "regexp not matched");
5977 }
5978 match = rb_backref_get();
5979 nth = rb_reg_backref_number(match, backref);
5980 regs = RMATCH_REGS(match);
5981 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5982 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5983 }
5984 if (nth < 0) {
5985 nth += regs->num_regs;
5986 }
5987
5988 start = BEG(nth);
5989 if (start == -1) {
5990 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5991 }
5992 end = END(nth);
5993 len = end - start;
5994 StringValue(val);
5995 enc = rb_enc_check_str(str, val);
5996 rb_str_update_0(str, start, len, val);
5997 rb_enc_associate(str, enc);
5998}
5999
6000static VALUE
6001rb_str_aset(VALUE str, VALUE indx, VALUE val)
6002{
6003 long idx, beg;
6004
6005 switch (TYPE(indx)) {
6006 case T_REGEXP:
6007 rb_str_subpat_set(str, indx, INT2FIX(0), val);
6008 return val;
6009
6010 case T_STRING:
6011 beg = rb_str_index(str, indx, 0);
6012 if (beg < 0) {
6013 rb_raise(rb_eIndexError, "string not matched");
6014 }
6015 beg = rb_str_sublen(str, beg);
6016 rb_str_update(str, beg, str_strlen(indx, NULL), val);
6017 return val;
6018
6019 default:
6020 /* check if indx is Range */
6021 {
6022 long beg, len;
6023 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
6024 rb_str_update(str, beg, len, val);
6025 return val;
6026 }
6027 }
6028 /* FALLTHROUGH */
6029
6030 case T_FIXNUM:
6031 idx = NUM2LONG(indx);
6032 rb_str_update(str, idx, 1, val);
6033 return val;
6034 }
6035}
6036
6037/*
6038 * call-seq:
6039 * self[index] = new_string
6040 * self[start, length] = new_string
6041 * self[range] = new_string
6042 * self[regexp, capture = 0] = new_string
6043 * self[substring] = new_string
6044 *
6045 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
6046 * See {String Slices}[rdoc-ref:String@String+Slices].
6047 *
6048 * A few examples:
6049 *
6050 * s = 'foo'
6051 * s[2] = 'rtune' # => "rtune"
6052 * s # => "fortune"
6053 * s[1, 5] = 'init' # => "init"
6054 * s # => "finite"
6055 * s[3..4] = 'al' # => "al"
6056 * s # => "finale"
6057 * s[/e$/] = 'ly' # => "ly"
6058 * s # => "finally"
6059 * s['lly'] = 'ncial' # => "ncial"
6060 * s # => "financial"
6061 *
6062 * Related: see {Modifying}[rdoc-ref:String@Modifying].
6063 */
6064
6065static VALUE
6066rb_str_aset_m(int argc, VALUE *argv, VALUE str)
6067{
6068 if (argc == 3) {
6069 if (RB_TYPE_P(argv[0], T_REGEXP)) {
6070 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
6071 }
6072 else {
6073 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
6074 }
6075 return argv[2];
6076 }
6077 rb_check_arity(argc, 2, 3);
6078 return rb_str_aset(str, argv[0], argv[1]);
6079}
6080
6081/*
6082 * call-seq:
6083 * insert(index, other_string) -> self
6084 *
6085 * Inserts the given +other_string+ into +self+; returns +self+.
6086 *
6087 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
6088 *
6089 * 'foo'.insert(1, 'bar') # => "fbaroo"
6090 *
6091 * If the Integer +index+ is negative, counts backward from the end of +self+
6092 * and inserts +other_string+ at offset <tt>index+1</tt>
6093 * (that is, _after_ <tt>self[index]</tt>):
6094 *
6095 * 'foo'.insert(-2, 'bar') # => "fobaro"
6096 *
6097 */
6098
6099static VALUE
6100rb_str_insert(VALUE str, VALUE idx, VALUE str2)
6101{
6102 long pos = NUM2LONG(idx);
6103
6104 if (pos == -1) {
6105 return rb_str_append(str, str2);
6106 }
6107 else if (pos < 0) {
6108 pos++;
6109 }
6110 rb_str_update(str, pos, 0, str2);
6111 return str;
6112}
6113
6114
6115/*
6116 * call-seq:
6117 * slice!(index) -> new_string or nil
6118 * slice!(start, length) -> new_string or nil
6119 * slice!(range) -> new_string or nil
6120 * slice!(regexp, capture = 0) -> new_string or nil
6121 * slice!(substring) -> new_string or nil
6122 *
6123 * Removes and returns the substring of +self+ specified by the arguments.
6124 * See {String Slices}[rdoc-ref:String@String+Slices].
6125 *
6126 * A few examples:
6127 *
6128 * string = "This is a string"
6129 * string.slice!(2) #=> "i"
6130 * string.slice!(3..6) #=> " is "
6131 * string.slice!(/s.*t/) #=> "sa st"
6132 * string.slice!("r") #=> "r"
6133 * string #=> "Thing"
6134 *
6135 */
6136
6137static VALUE
6138rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
6139{
6140 VALUE result = Qnil;
6141 VALUE indx;
6142 long beg, len = 1;
6143 char *p;
6144
6145 rb_check_arity(argc, 1, 2);
6146 str_modify_keep_cr(str);
6147 indx = argv[0];
6148 if (RB_TYPE_P(indx, T_REGEXP)) {
6149 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6150 VALUE match = rb_backref_get();
6151 struct re_registers *regs = RMATCH_REGS(match);
6152 int nth = 0;
6153 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6154 if ((nth += regs->num_regs) <= 0) return Qnil;
6155 }
6156 else if (nth >= regs->num_regs) return Qnil;
6157 beg = BEG(nth);
6158 len = END(nth) - beg;
6159 goto subseq;
6160 }
6161 else if (argc == 2) {
6162 beg = NUM2LONG(indx);
6163 len = NUM2LONG(argv[1]);
6164 goto num_index;
6165 }
6166 else if (FIXNUM_P(indx)) {
6167 beg = FIX2LONG(indx);
6168 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6169 if (!len) return Qnil;
6170 beg = p - RSTRING_PTR(str);
6171 goto subseq;
6172 }
6173 else if (RB_TYPE_P(indx, T_STRING)) {
6174 beg = rb_str_index(str, indx, 0);
6175 if (beg == -1) return Qnil;
6176 len = RSTRING_LEN(indx);
6177 result = str_duplicate(rb_cString, indx);
6178 goto squash;
6179 }
6180 else {
6181 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6182 case Qnil:
6183 return Qnil;
6184 case Qfalse:
6185 beg = NUM2LONG(indx);
6186 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6187 if (!len) return Qnil;
6188 beg = p - RSTRING_PTR(str);
6189 goto subseq;
6190 default:
6191 goto num_index;
6192 }
6193 }
6194
6195 num_index:
6196 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6197 beg = p - RSTRING_PTR(str);
6198
6199 subseq:
6200 result = rb_str_new(RSTRING_PTR(str)+beg, len);
6201 rb_enc_cr_str_copy_for_substr(result, str);
6202
6203 squash:
6204 if (len > 0) {
6205 if (beg == 0) {
6206 rb_str_drop_bytes(str, len);
6207 }
6208 else {
6209 char *sptr = RSTRING_PTR(str);
6210 long slen = RSTRING_LEN(str);
6211 if (beg + len > slen) /* pathological check */
6212 len = slen - beg;
6213 memmove(sptr + beg,
6214 sptr + beg + len,
6215 slen - (beg + len));
6216 slen -= len;
6217 STR_SET_LEN(str, slen);
6218 TERM_FILL(&sptr[slen], TERM_LEN(str));
6219 }
6220 }
6221 return result;
6222}
6223
6224static VALUE
6225get_pat(VALUE pat)
6226{
6227 VALUE val;
6228
6229 switch (OBJ_BUILTIN_TYPE(pat)) {
6230 case T_REGEXP:
6231 return pat;
6232
6233 case T_STRING:
6234 break;
6235
6236 default:
6237 val = rb_check_string_type(pat);
6238 if (NIL_P(val)) {
6239 Check_Type(pat, T_REGEXP);
6240 }
6241 pat = val;
6242 }
6243
6244 return rb_reg_regcomp(pat);
6245}
6246
6247static VALUE
6248get_pat_quoted(VALUE pat, int check)
6249{
6250 VALUE val;
6251
6252 switch (OBJ_BUILTIN_TYPE(pat)) {
6253 case T_REGEXP:
6254 return pat;
6255
6256 case T_STRING:
6257 break;
6258
6259 default:
6260 val = rb_check_string_type(pat);
6261 if (NIL_P(val)) {
6262 Check_Type(pat, T_REGEXP);
6263 }
6264 pat = val;
6265 }
6266 if (check && is_broken_string(pat)) {
6267 rb_exc_raise(rb_reg_check_preprocess(pat));
6268 }
6269 return pat;
6270}
6271
6272static long
6273rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
6274{
6275 if (BUILTIN_TYPE(pat) == T_STRING) {
6276 pos = rb_str_byteindex(str, pat, pos);
6277 if (set_backref_str) {
6278 if (pos >= 0) {
6279 str = rb_str_new_frozen_String(str);
6280 VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6281 if (match) {
6282 *match = match_data;
6283 }
6284 }
6285 else {
6287 }
6288 }
6289 return pos;
6290 }
6291 else {
6292 return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
6293 }
6294}
6295
6296static long
6297rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6298{
6299 return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6300}
6301
6302
6303/*
6304 * call-seq:
6305 * sub!(pattern, replacement) -> self or nil
6306 * sub!(pattern) {|match| ... } -> self or nil
6307 *
6308 * Replaces the first occurrence (not all occurrences) of the given +pattern+
6309 * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6310 *
6311 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6312 *
6313 * Related: String#sub, String#gsub, String#gsub!.
6314 *
6315 */
6316
6317static VALUE
6318rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6319{
6320 VALUE pat, repl, hash = Qnil;
6321 int iter = 0;
6322 long plen;
6323 int min_arity = rb_block_given_p() ? 1 : 2;
6324 long beg;
6325
6326 rb_check_arity(argc, min_arity, 2);
6327 if (argc == 1) {
6328 iter = 1;
6329 }
6330 else {
6331 repl = argv[1];
6332 hash = rb_check_hash_type(argv[1]);
6333 if (NIL_P(hash)) {
6334 StringValue(repl);
6335 }
6336 }
6337
6338 pat = get_pat_quoted(argv[0], 1);
6339
6340 str_modifiable(str);
6341 beg = rb_pat_search(pat, str, 0, 1);
6342 if (beg >= 0) {
6343 rb_encoding *enc;
6344 int cr = ENC_CODERANGE(str);
6345 long beg0, end0;
6346 VALUE match, match0 = Qnil;
6347 struct re_registers *regs;
6348 char *p, *rp;
6349 long len, rlen;
6350
6351 match = rb_backref_get();
6352 regs = RMATCH_REGS(match);
6353 if (RB_TYPE_P(pat, T_STRING)) {
6354 beg0 = beg;
6355 end0 = beg0 + RSTRING_LEN(pat);
6356 match0 = pat;
6357 }
6358 else {
6359 beg0 = BEG(0);
6360 end0 = END(0);
6361 if (iter) match0 = rb_reg_nth_match(0, match);
6362 }
6363
6364 if (iter || !NIL_P(hash)) {
6365 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6366
6367 if (iter) {
6368 repl = rb_obj_as_string(rb_yield(match0));
6369 }
6370 else {
6371 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6372 repl = rb_obj_as_string(repl);
6373 }
6374 str_mod_check(str, p, len);
6375 rb_check_frozen(str);
6376 }
6377 else {
6378 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6379 }
6380
6381 enc = rb_enc_compatible(str, repl);
6382 if (!enc) {
6383 rb_encoding *str_enc = STR_ENC_GET(str);
6384 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6385 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6386 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6387 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6388 rb_enc_inspect_name(str_enc),
6389 rb_enc_inspect_name(STR_ENC_GET(repl)));
6390 }
6391 enc = STR_ENC_GET(repl);
6392 }
6393 rb_str_modify(str);
6394 rb_enc_associate(str, enc);
6396 int cr2 = ENC_CODERANGE(repl);
6397 if (cr2 == ENC_CODERANGE_BROKEN ||
6398 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6400 else
6401 cr = cr2;
6402 }
6403 plen = end0 - beg0;
6404 rlen = RSTRING_LEN(repl);
6405 len = RSTRING_LEN(str);
6406 if (rlen > plen) {
6407 RESIZE_CAPA(str, len + rlen - plen);
6408 }
6409 p = RSTRING_PTR(str);
6410 if (rlen != plen) {
6411 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6412 }
6413 rp = RSTRING_PTR(repl);
6414 memmove(p + beg0, rp, rlen);
6415 len += rlen - plen;
6416 STR_SET_LEN(str, len);
6417 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6418 ENC_CODERANGE_SET(str, cr);
6419
6420 RB_GC_GUARD(match);
6421
6422 return str;
6423 }
6424 return Qnil;
6425}
6426
6427
6428/*
6429 * call-seq:
6430 * sub(pattern, replacement) -> new_string
6431 * sub(pattern) {|match| ... } -> new_string
6432 *
6433 * Returns a copy of +self+ with only the first occurrence
6434 * (not all occurrences) of the given +pattern+ replaced.
6435 *
6436 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6437 *
6438 * Related: String#sub!, String#gsub, String#gsub!.
6439 *
6440 */
6441
6442static VALUE
6443rb_str_sub(int argc, VALUE *argv, VALUE str)
6444{
6445 str = str_duplicate(rb_cString, str);
6446 rb_str_sub_bang(argc, argv, str);
6447 return str;
6448}
6449
6450static VALUE
6451str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6452{
6453 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
6454 long beg, beg0, end0;
6455 long offset, blen, slen, len, last;
6456 enum {STR, ITER, FAST_MAP, MAP} mode = STR;
6457 char *sp, *cp;
6458 int need_backref_str = -1;
6459 rb_encoding *str_enc;
6460
6461 switch (argc) {
6462 case 1:
6463 RETURN_ENUMERATOR(str, argc, argv);
6464 mode = ITER;
6465 break;
6466 case 2:
6467 repl = argv[1];
6468 hash = rb_check_hash_type(argv[1]);
6469 if (NIL_P(hash)) {
6470 StringValue(repl);
6471 }
6472 else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
6473 mode = FAST_MAP;
6474 }
6475 else {
6476 mode = MAP;
6477 }
6478 break;
6479 default:
6480 rb_error_arity(argc, 1, 2);
6481 }
6482
6483 pat = get_pat_quoted(argv[0], 1);
6484 beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6485
6486 if (beg < 0) {
6487 if (bang) return Qnil; /* no match, no substitution */
6488 return str_duplicate(rb_cString, str);
6489 }
6490
6491 offset = 0;
6492 blen = RSTRING_LEN(str) + 30; /* len + margin */
6493 dest = rb_str_buf_new(blen);
6494 sp = RSTRING_PTR(str);
6495 slen = RSTRING_LEN(str);
6496 cp = sp;
6497 str_enc = STR_ENC_GET(str);
6498 rb_enc_associate(dest, str_enc);
6499 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
6500
6501 do {
6502 struct re_registers *regs = RMATCH_REGS(match);
6503 if (RB_TYPE_P(pat, T_STRING)) {
6504 beg0 = beg;
6505 end0 = beg0 + RSTRING_LEN(pat);
6506 match0 = pat;
6507 }
6508 else {
6509 beg0 = BEG(0);
6510 end0 = END(0);
6511 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6512 }
6513
6514 if (mode != STR) {
6515 if (mode == ITER) {
6516 val = rb_obj_as_string(rb_yield(match0));
6517 }
6518 else {
6519 struct RString fake_str;
6520 VALUE key;
6521 if (mode == FAST_MAP) {
6522 // It is safe to use a fake_str here because we established that it won't escape,
6523 // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
6524 // default proc.
6525 key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
6526 }
6527 else {
6528 key = rb_str_subseq(str, beg0, end0 - beg0);
6529 }
6530 val = rb_hash_aref(hash, key);
6531 val = rb_obj_as_string(val);
6532 }
6533 str_mod_check(str, sp, slen);
6534 if (val == dest) { /* paranoid check [ruby-dev:24827] */
6535 rb_raise(rb_eRuntimeError, "block should not cheat");
6536 }
6537 }
6538 else if (need_backref_str) {
6539 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6540 if (need_backref_str < 0) {
6541 need_backref_str = val != repl;
6542 }
6543 }
6544 else {
6545 val = repl;
6546 }
6547
6548 len = beg0 - offset; /* copy pre-match substr */
6549 if (len) {
6550 rb_enc_str_buf_cat(dest, cp, len, str_enc);
6551 }
6552
6553 rb_str_buf_append(dest, val);
6554
6555 last = offset;
6556 offset = end0;
6557 if (beg0 == end0) {
6558 /*
6559 * Always consume at least one character of the input string
6560 * in order to prevent infinite loops.
6561 */
6562 if (RSTRING_LEN(str) <= end0) break;
6563 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6564 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6565 offset = end0 + len;
6566 }
6567 cp = RSTRING_PTR(str) + offset;
6568 if (offset > RSTRING_LEN(str)) break;
6569
6570 // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6571 if (mode != FAST_MAP && mode != STR) {
6572 match = Qnil;
6573 }
6574 beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
6575
6576 RB_GC_GUARD(match);
6577 } while (beg >= 0);
6578
6579 if (RSTRING_LEN(str) > offset) {
6580 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6581 }
6582 rb_pat_search0(pat, str, last, 1, &match);
6583 if (bang) {
6584 str_shared_replace(str, dest);
6585 }
6586 else {
6587 str = dest;
6588 }
6589
6590 return str;
6591}
6592
6593
6594/*
6595 * call-seq:
6596 * gsub!(pattern, replacement) -> self or nil
6597 * gsub!(pattern) {|match| ... } -> self or nil
6598 * gsub!(pattern) -> an_enumerator
6599 *
6600 * Performs the specified substring replacement(s) on +self+;
6601 * returns +self+ if any replacement occurred, +nil+ otherwise.
6602 *
6603 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6604 *
6605 * Returns an Enumerator if no +replacement+ and no block given.
6606 *
6607 * Related: String#sub, String#gsub, String#sub!.
6608 *
6609 */
6610
6611static VALUE
6612rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6613{
6614 str_modify_keep_cr(str);
6615 return str_gsub(argc, argv, str, 1);
6616}
6617
6618
6619/*
6620 * call-seq:
6621 * gsub(pattern, replacement) -> new_string
6622 * gsub(pattern) {|match| ... } -> new_string
6623 * gsub(pattern) -> enumerator
6624 *
6625 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6626 *
6627 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6628 *
6629 * Returns an Enumerator if no +replacement+ and no block given.
6630 *
6631 * Related: String#sub, String#sub!, String#gsub!.
6632 *
6633 */
6634
6635static VALUE
6636rb_str_gsub(int argc, VALUE *argv, VALUE str)
6637{
6638 return str_gsub(argc, argv, str, 0);
6639}
6640
6641
6642/*
6643 * call-seq:
6644 * replace(other_string) -> self
6645 *
6646 * Replaces the contents of +self+ with the contents of +other_string+:
6647 *
6648 * s = 'foo' # => "foo"
6649 * s.replace('bar') # => "bar"
6650 *
6651 */
6652
6653VALUE
6655{
6656 str_modifiable(str);
6657 if (str == str2) return str;
6658
6659 StringValue(str2);
6660 str_discard(str);
6661 return str_replace(str, str2);
6662}
6663
6664/*
6665 * call-seq:
6666 * clear -> self
6667 *
6668 * Removes the contents of +self+:
6669 *
6670 * s = 'foo' # => "foo"
6671 * s.clear # => ""
6672 *
6673 */
6674
6675static VALUE
6676rb_str_clear(VALUE str)
6677{
6678 str_discard(str);
6679 STR_SET_EMBED(str);
6680 STR_SET_LEN(str, 0);
6681 RSTRING_PTR(str)[0] = 0;
6682 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6684 else
6686 return str;
6687}
6688
6689/*
6690 * call-seq:
6691 * chr -> string
6692 *
6693 * Returns a string containing the first character of +self+:
6694 *
6695 * s = 'foo' # => "foo"
6696 * s.chr # => "f"
6697 *
6698 */
6699
6700static VALUE
6701rb_str_chr(VALUE str)
6702{
6703 return rb_str_substr(str, 0, 1);
6704}
6705
6706/*
6707 * call-seq:
6708 * getbyte(index) -> integer or nil
6709 *
6710 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6711 *
6712 * s = 'abcde' # => "abcde"
6713 * s.getbyte(0) # => 97
6714 * s.getbyte(-1) # => 101
6715 * s.getbyte(5) # => nil
6716 *
6717 * Related: String#setbyte.
6718 */
6719VALUE
6720rb_str_getbyte(VALUE str, VALUE index)
6721{
6722 long pos = NUM2LONG(index);
6723
6724 if (pos < 0)
6725 pos += RSTRING_LEN(str);
6726 if (pos < 0 || RSTRING_LEN(str) <= pos)
6727 return Qnil;
6728
6729 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6730}
6731
6732/*
6733 * call-seq:
6734 * setbyte(index, integer) -> integer
6735 *
6736 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6737 *
6738 * s = 'abcde' # => "abcde"
6739 * s.setbyte(0, 98) # => 98
6740 * s # => "bbcde"
6741 *
6742 * Related: String#getbyte.
6743 */
6744VALUE
6745rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6746{
6747 long pos = NUM2LONG(index);
6748 long len = RSTRING_LEN(str);
6749 char *ptr, *head, *left = 0;
6750 rb_encoding *enc;
6751 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6752
6753 if (pos < -len || len <= pos)
6754 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6755 if (pos < 0)
6756 pos += len;
6757
6758 VALUE v = rb_to_int(value);
6759 VALUE w = rb_int_and(v, INT2FIX(0xff));
6760 char byte = (char)(NUM2INT(w) & 0xFF);
6761
6762 if (!str_independent(str))
6763 str_make_independent(str);
6764 enc = STR_ENC_GET(str);
6765 head = RSTRING_PTR(str);
6766 ptr = &head[pos];
6767 if (!STR_EMBED_P(str)) {
6768 cr = ENC_CODERANGE(str);
6769 switch (cr) {
6770 case ENC_CODERANGE_7BIT:
6771 left = ptr;
6772 *ptr = byte;
6773 if (ISASCII(byte)) goto end;
6774 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6775 if (!MBCLEN_CHARFOUND_P(nlen))
6777 else
6779 goto end;
6781 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6782 width = rb_enc_precise_mbclen(left, head+len, enc);
6783 *ptr = byte;
6784 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6785 if (!MBCLEN_CHARFOUND_P(nlen))
6787 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6789 goto end;
6790 }
6791 }
6793 *ptr = byte;
6794
6795 end:
6796 return value;
6797}
6798
6799static VALUE
6800str_byte_substr(VALUE str, long beg, long len, int empty)
6801{
6802 long n = RSTRING_LEN(str);
6803
6804 if (beg > n || len < 0) return Qnil;
6805 if (beg < 0) {
6806 beg += n;
6807 if (beg < 0) return Qnil;
6808 }
6809 if (len > n - beg)
6810 len = n - beg;
6811 if (len <= 0) {
6812 if (!empty) return Qnil;
6813 len = 0;
6814 }
6815
6816 VALUE str2 = str_subseq(str, beg, len);
6817
6818 str_enc_copy_direct(str2, str);
6819
6820 if (RSTRING_LEN(str2) == 0) {
6821 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6823 else
6825 }
6826 else {
6827 switch (ENC_CODERANGE(str)) {
6828 case ENC_CODERANGE_7BIT:
6830 break;
6831 default:
6833 break;
6834 }
6835 }
6836
6837 return str2;
6838}
6839
6840VALUE
6841rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6842{
6843 return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6844}
6845
6846static VALUE
6847str_byte_aref(VALUE str, VALUE indx)
6848{
6849 long idx;
6850 if (FIXNUM_P(indx)) {
6851 idx = FIX2LONG(indx);
6852 }
6853 else {
6854 /* check if indx is Range */
6855 long beg, len = RSTRING_LEN(str);
6856
6857 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6858 case Qfalse:
6859 break;
6860 case Qnil:
6861 return Qnil;
6862 default:
6863 return str_byte_substr(str, beg, len, TRUE);
6864 }
6865
6866 idx = NUM2LONG(indx);
6867 }
6868 return str_byte_substr(str, idx, 1, FALSE);
6869}
6870
6871/*
6872 * call-seq:
6873 * byteslice(offset, length = 1) -> string or nil
6874 * byteslice(range) -> string or nil
6875 *
6876 * :include: doc/string/byteslice.rdoc
6877 */
6878
6879static VALUE
6880rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6881{
6882 if (argc == 2) {
6883 long beg = NUM2LONG(argv[0]);
6884 long len = NUM2LONG(argv[1]);
6885 return str_byte_substr(str, beg, len, TRUE);
6886 }
6887 rb_check_arity(argc, 1, 2);
6888 return str_byte_aref(str, argv[0]);
6889}
6890
6891static void
6892str_check_beg_len(VALUE str, long *beg, long *len)
6893{
6894 long end, slen = RSTRING_LEN(str);
6895
6896 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6897 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6898 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6899 }
6900 if (*beg < 0) {
6901 *beg += slen;
6902 }
6903 RUBY_ASSERT(*beg >= 0);
6904 RUBY_ASSERT(*beg <= slen);
6905
6906 if (*len > slen - *beg) {
6907 *len = slen - *beg;
6908 }
6909 end = *beg + *len;
6910 str_ensure_byte_pos(str, *beg);
6911 str_ensure_byte_pos(str, end);
6912}
6913
6914/*
6915 * call-seq:
6916 * bytesplice(offset, length, str) -> self
6917 * bytesplice(offset, length, str, str_offset, str_length) -> self
6918 * bytesplice(range, str) -> self
6919 * bytesplice(range, str, str_range) -> self
6920 *
6921 * :include: doc/string/bytesplice.rdoc
6922 */
6923
6924static VALUE
6925rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6926{
6927 long beg, len, vbeg, vlen;
6928 VALUE val;
6929 int cr;
6930
6931 rb_check_arity(argc, 2, 5);
6932 if (!(argc == 2 || argc == 3 || argc == 5)) {
6933 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6934 }
6935 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6936 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6937 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6938 rb_builtin_class_name(argv[0]));
6939 }
6940 val = argv[1];
6941 StringValue(val);
6942 if (argc == 2) {
6943 /* bytesplice(range, str) */
6944 vbeg = 0;
6945 vlen = RSTRING_LEN(val);
6946 }
6947 else {
6948 /* bytesplice(range, str, str_range) */
6949 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6950 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6951 rb_builtin_class_name(argv[2]));
6952 }
6953 }
6954 }
6955 else {
6956 beg = NUM2LONG(argv[0]);
6957 len = NUM2LONG(argv[1]);
6958 val = argv[2];
6959 StringValue(val);
6960 if (argc == 3) {
6961 /* bytesplice(index, length, str) */
6962 vbeg = 0;
6963 vlen = RSTRING_LEN(val);
6964 }
6965 else {
6966 /* bytesplice(index, length, str, str_index, str_length) */
6967 vbeg = NUM2LONG(argv[3]);
6968 vlen = NUM2LONG(argv[4]);
6969 }
6970 }
6971 str_check_beg_len(str, &beg, &len);
6972 str_check_beg_len(val, &vbeg, &vlen);
6973 str_modify_keep_cr(str);
6974
6975 if (RB_UNLIKELY(ENCODING_GET_INLINED(str) != ENCODING_GET_INLINED(val))) {
6976 rb_enc_associate(str, rb_enc_check(str, val));
6977 }
6978
6979 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6981 if (cr != ENC_CODERANGE_BROKEN)
6982 ENC_CODERANGE_SET(str, cr);
6983 return str;
6984}
6985
6986/*
6987 * call-seq:
6988 * reverse -> string
6989 *
6990 * Returns a new string with the characters from +self+ in reverse order.
6991 *
6992 * 'stressed'.reverse # => "desserts"
6993 *
6994 */
6995
6996static VALUE
6997rb_str_reverse(VALUE str)
6998{
6999 rb_encoding *enc;
7000 VALUE rev;
7001 char *s, *e, *p;
7002 int cr;
7003
7004 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
7005 enc = STR_ENC_GET(str);
7006 rev = rb_str_new(0, RSTRING_LEN(str));
7007 s = RSTRING_PTR(str); e = RSTRING_END(str);
7008 p = RSTRING_END(rev);
7009 cr = ENC_CODERANGE(str);
7010
7011 if (RSTRING_LEN(str) > 1) {
7012 if (single_byte_optimizable(str)) {
7013 while (s < e) {
7014 *--p = *s++;
7015 }
7016 }
7017 else if (cr == ENC_CODERANGE_VALID) {
7018 while (s < e) {
7019 int clen = rb_enc_fast_mbclen(s, e, enc);
7020
7021 p -= clen;
7022 memcpy(p, s, clen);
7023 s += clen;
7024 }
7025 }
7026 else {
7027 cr = rb_enc_asciicompat(enc) ?
7029 while (s < e) {
7030 int clen = rb_enc_mbclen(s, e, enc);
7031
7032 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
7033 p -= clen;
7034 memcpy(p, s, clen);
7035 s += clen;
7036 }
7037 }
7038 }
7039 STR_SET_LEN(rev, RSTRING_LEN(str));
7040 str_enc_copy_direct(rev, str);
7041 ENC_CODERANGE_SET(rev, cr);
7042
7043 return rev;
7044}
7045
7046
7047/*
7048 * call-seq:
7049 * reverse! -> self
7050 *
7051 * Returns +self+ with its characters reversed:
7052 *
7053 * s = 'stressed'
7054 * s.reverse! # => "desserts"
7055 * s # => "desserts"
7056 *
7057 */
7058
7059static VALUE
7060rb_str_reverse_bang(VALUE str)
7061{
7062 if (RSTRING_LEN(str) > 1) {
7063 if (single_byte_optimizable(str)) {
7064 char *s, *e, c;
7065
7066 str_modify_keep_cr(str);
7067 s = RSTRING_PTR(str);
7068 e = RSTRING_END(str) - 1;
7069 while (s < e) {
7070 c = *s;
7071 *s++ = *e;
7072 *e-- = c;
7073 }
7074 }
7075 else {
7076 str_shared_replace(str, rb_str_reverse(str));
7077 }
7078 }
7079 else {
7080 str_modify_keep_cr(str);
7081 }
7082 return str;
7083}
7084
7085
7086/*
7087 * call-seq:
7088 * include?(other_string) -> true or false
7089 *
7090 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
7091 *
7092 * s = 'foo'
7093 * s.include?('f') # => true
7094 * s.include?('fo') # => true
7095 * s.include?('food') # => false
7096 *
7097 */
7098
7099VALUE
7100rb_str_include(VALUE str, VALUE arg)
7101{
7102 long i;
7103
7104 StringValue(arg);
7105 i = rb_str_index(str, arg, 0);
7106
7107 return RBOOL(i != -1);
7108}
7109
7110
7111/*
7112 * call-seq:
7113 * to_i(base = 10) -> integer
7114 *
7115 * Returns the result of interpreting leading characters in +self+
7116 * as an integer in the given +base+ (which must be in (0, 2..36)):
7117 *
7118 * '123456'.to_i # => 123456
7119 * '123def'.to_i(16) # => 1195503
7120 *
7121 * With +base+ zero, +self+ may contain leading characters
7122 * to specify the actual base:
7123 *
7124 * '123def'.to_i(0) # => 123
7125 * '0123def'.to_i(0) # => 83
7126 * '0b123def'.to_i(0) # => 1
7127 * '0o123def'.to_i(0) # => 83
7128 * '0d123def'.to_i(0) # => 123
7129 * '0x123def'.to_i(0) # => 1195503
7130 *
7131 * Characters past a leading valid number (in the given +base+) are ignored:
7132 *
7133 * '12.345'.to_i # => 12
7134 * '12345'.to_i(2) # => 1
7135 *
7136 * Returns zero if there is no leading valid number:
7137 *
7138 * 'abcdef'.to_i # => 0
7139 * '2'.to_i(2) # => 0
7140 *
7141 */
7142
7143static VALUE
7144rb_str_to_i(int argc, VALUE *argv, VALUE str)
7145{
7146 int base = 10;
7147
7148 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7149 rb_raise(rb_eArgError, "invalid radix %d", base);
7150 }
7151 return rb_str_to_inum(str, base, FALSE);
7152}
7153
7154
7155/*
7156 * call-seq:
7157 * to_f -> float
7158 *
7159 * Returns the result of interpreting leading characters in +self+ as a Float:
7160 *
7161 * '3.14159'.to_f # => 3.14159
7162 * '1.234e-2'.to_f # => 0.01234
7163 *
7164 * Characters past a leading valid number are ignored:
7165 *
7166 * '3.14 (pi to two places)'.to_f # => 3.14
7167 *
7168 * Returns zero if there is no leading valid number:
7169 *
7170 * 'abcdef'.to_f # => 0.0
7171 *
7172 */
7173
7174static VALUE
7175rb_str_to_f(VALUE str)
7176{
7177 return DBL2NUM(rb_str_to_dbl(str, FALSE));
7178}
7179
7180
7181/*
7182 * call-seq:
7183 * to_s -> self or string
7184 *
7185 * Returns +self+ if +self+ is a +String+,
7186 * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7187 */
7188
7189static VALUE
7190rb_str_to_s(VALUE str)
7191{
7192 if (rb_obj_class(str) != rb_cString) {
7193 return str_duplicate(rb_cString, str);
7194 }
7195 return str;
7196}
7197
#if 0
/* Disabled: encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7209
7210#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7211
7212int
7213rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7214{
7215 char buf[CHAR_ESC_LEN + 1];
7216 int l;
7217
7218#if SIZEOF_INT > 4
7219 c &= 0xffffffff;
7220#endif
7221 if (unicode_p) {
7222 if (c < 0x7F && ISPRINT(c)) {
7223 snprintf(buf, CHAR_ESC_LEN, "%c", c);
7224 }
7225 else if (c < 0x10000) {
7226 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7227 }
7228 else {
7229 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7230 }
7231 }
7232 else {
7233 if (c < 0x100) {
7234 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7235 }
7236 else {
7237 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7238 }
7239 }
7240 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7241 rb_str_buf_cat(result, buf, l);
7242 return l;
7243}
7244
/*
 * Returns the backslash escape used for character code +c+, or NULL when
 * +c+ has no dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
7262
7263VALUE
7264rb_str_escape(VALUE str)
7265{
7266 int encidx = ENCODING_GET(str);
7267 rb_encoding *enc = rb_enc_from_index(encidx);
7268 const char *p = RSTRING_PTR(str);
7269 const char *pend = RSTRING_END(str);
7270 const char *prev = p;
7271 char buf[CHAR_ESC_LEN + 1];
7272 VALUE result = rb_str_buf_new(0);
7273 int unicode_p = rb_enc_unicode_p(enc);
7274 int asciicompat = rb_enc_asciicompat(enc);
7275
7276 while (p < pend) {
7277 unsigned int c;
7278 const char *cc;
7279 int n = rb_enc_precise_mbclen(p, pend, enc);
7280 if (!MBCLEN_CHARFOUND_P(n)) {
7281 if (p > prev) str_buf_cat(result, prev, p - prev);
7282 n = rb_enc_mbminlen(enc);
7283 if (pend < p + n)
7284 n = (int)(pend - p);
7285 while (n--) {
7286 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7287 str_buf_cat(result, buf, strlen(buf));
7288 prev = ++p;
7289 }
7290 continue;
7291 }
7292 n = MBCLEN_CHARFOUND_LEN(n);
7293 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7294 p += n;
7295 cc = ruby_escaped_char(c);
7296 if (cc) {
7297 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7298 str_buf_cat(result, cc, strlen(cc));
7299 prev = p;
7300 }
7301 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7302 }
7303 else {
7304 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7305 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7306 prev = p;
7307 }
7308 }
7309 if (p > prev) str_buf_cat(result, prev, p - prev);
7310 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
7311
7312 return result;
7313}
7314
7315/*
7316 * call-seq:
7317 * inspect -> string
7318 *
7319 * Returns a printable version of +self+, enclosed in double-quotes,
7320 * and with special characters escaped:
7321 *
7322 * s = "foo\tbar\tbaz\n"
7323 * s.inspect
7324 * # => "\"foo\\tbar\\tbaz\\n\""
7325 *
7326 */
7327
7328VALUE
7330{
7331 int encidx = ENCODING_GET(str);
7332 rb_encoding *enc = rb_enc_from_index(encidx);
7333 const char *p, *pend, *prev;
7334 char buf[CHAR_ESC_LEN + 1];
7335 VALUE result = rb_str_buf_new(0);
7336 rb_encoding *resenc = rb_default_internal_encoding();
7337 int unicode_p = rb_enc_unicode_p(enc);
7338 int asciicompat = rb_enc_asciicompat(enc);
7339
7340 if (resenc == NULL) resenc = rb_default_external_encoding();
7341 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7342 rb_enc_associate(result, resenc);
7343 str_buf_cat2(result, "\"");
7344
7345 p = RSTRING_PTR(str); pend = RSTRING_END(str);
7346 prev = p;
7347 while (p < pend) {
7348 unsigned int c, cc;
7349 int n;
7350
7351 n = rb_enc_precise_mbclen(p, pend, enc);
7352 if (!MBCLEN_CHARFOUND_P(n)) {
7353 if (p > prev) str_buf_cat(result, prev, p - prev);
7354 n = rb_enc_mbminlen(enc);
7355 if (pend < p + n)
7356 n = (int)(pend - p);
7357 while (n--) {
7358 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7359 str_buf_cat(result, buf, strlen(buf));
7360 prev = ++p;
7361 }
7362 continue;
7363 }
7364 n = MBCLEN_CHARFOUND_LEN(n);
7365 c = rb_enc_mbc_to_codepoint(p, pend, enc);
7366 p += n;
7367 if ((asciicompat || unicode_p) &&
7368 (c == '"'|| c == '\\' ||
7369 (c == '#' &&
7370 p < pend &&
7371 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
7372 (cc = rb_enc_codepoint(p,pend,enc),
7373 (cc == '$' || cc == '@' || cc == '{'))))) {
7374 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7375 str_buf_cat2(result, "\\");
7376 if (asciicompat || enc == resenc) {
7377 prev = p - n;
7378 continue;
7379 }
7380 }
7381 switch (c) {
7382 case '\n': cc = 'n'; break;
7383 case '\r': cc = 'r'; break;
7384 case '\t': cc = 't'; break;
7385 case '\f': cc = 'f'; break;
7386 case '\013': cc = 'v'; break;
7387 case '\010': cc = 'b'; break;
7388 case '\007': cc = 'a'; break;
7389 case 033: cc = 'e'; break;
7390 default: cc = 0; break;
7391 }
7392 if (cc) {
7393 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7394 buf[0] = '\\';
7395 buf[1] = (char)cc;
7396 str_buf_cat(result, buf, 2);
7397 prev = p;
7398 continue;
7399 }
7400 /* The special casing of 0x85 (NEXT_LINE) here is because
7401 * Oniguruma historically treats it as printable, but it
7402 * doesn't match the print POSIX bracket class or character
7403 * property in regexps.
7404 *
7405 * See Ruby Bug #16842 for details:
7406 * https://bugs.ruby-lang.org/issues/16842
7407 */
7408 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7409 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7410 continue;
7411 }
7412 else {
7413 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7414 rb_str_buf_cat_escaped_char(result, c, unicode_p);
7415 prev = p;
7416 continue;
7417 }
7418 }
7419 if (p > prev) str_buf_cat(result, prev, p - prev);
7420 str_buf_cat2(result, "\"");
7421
7422 return result;
7423}
7424
/* True when p is still before e and points at '$', '@' or '{'; the dump
 * code uses it on the byte after '#' to decide whether that '#' needs a
 * backslash. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7426
7427/*
7428 * call-seq:
7429 * dump -> string
7430 *
7431 * Returns a printable version of +self+, enclosed in double-quotes,
7432 * with special characters escaped, and with non-printing characters
7433 * replaced by hexadecimal notation:
7434 *
7435 * "hello \n ''".dump # => "\"hello \\n ''\""
7436 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7437 *
7438 * Related: String#undump (inverse of String#dump).
7439 *
7440 */
7441
7442VALUE
7444{
7445 int encidx = rb_enc_get_index(str);
7446 rb_encoding *enc = rb_enc_from_index(encidx);
7447 long len;
7448 const char *p, *pend;
7449 char *q, *qend;
7450 VALUE result;
7451 int u8 = (encidx == rb_utf8_encindex());
7452 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7453
7454 len = 2; /* "" */
7455 if (!rb_enc_asciicompat(enc)) {
7456 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7457 len += strlen(enc->name);
7458 }
7459
7460 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7461 while (p < pend) {
7462 int clen;
7463 unsigned char c = *p++;
7464
7465 switch (c) {
7466 case '"': case '\\':
7467 case '\n': case '\r':
7468 case '\t': case '\f':
7469 case '\013': case '\010': case '\007': case '\033':
7470 clen = 2;
7471 break;
7472
7473 case '#':
7474 clen = IS_EVSTR(p, pend) ? 2 : 1;
7475 break;
7476
7477 default:
7478 if (ISPRINT(c)) {
7479 clen = 1;
7480 }
7481 else {
7482 if (u8 && c > 0x7F) { /* \u notation */
7483 int n = rb_enc_precise_mbclen(p-1, pend, enc);
7484 if (MBCLEN_CHARFOUND_P(n)) {
7485 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7486 if (cc <= 0xFFFF)
7487 clen = 6; /* \uXXXX */
7488 else if (cc <= 0xFFFFF)
7489 clen = 9; /* \u{XXXXX} */
7490 else
7491 clen = 10; /* \u{XXXXXX} */
7492 p += MBCLEN_CHARFOUND_LEN(n)-1;
7493 break;
7494 }
7495 }
7496 clen = 4; /* \xNN */
7497 }
7498 break;
7499 }
7500
7501 if (clen > LONG_MAX - len) {
7502 rb_raise(rb_eRuntimeError, "string size too big");
7503 }
7504 len += clen;
7505 }
7506
7507 result = rb_str_new(0, len);
7508 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7509 q = RSTRING_PTR(result); qend = q + len + 1;
7510
7511 *q++ = '"';
7512 while (p < pend) {
7513 unsigned char c = *p++;
7514
7515 if (c == '"' || c == '\\') {
7516 *q++ = '\\';
7517 *q++ = c;
7518 }
7519 else if (c == '#') {
7520 if (IS_EVSTR(p, pend)) *q++ = '\\';
7521 *q++ = '#';
7522 }
7523 else if (c == '\n') {
7524 *q++ = '\\';
7525 *q++ = 'n';
7526 }
7527 else if (c == '\r') {
7528 *q++ = '\\';
7529 *q++ = 'r';
7530 }
7531 else if (c == '\t') {
7532 *q++ = '\\';
7533 *q++ = 't';
7534 }
7535 else if (c == '\f') {
7536 *q++ = '\\';
7537 *q++ = 'f';
7538 }
7539 else if (c == '\013') {
7540 *q++ = '\\';
7541 *q++ = 'v';
7542 }
7543 else if (c == '\010') {
7544 *q++ = '\\';
7545 *q++ = 'b';
7546 }
7547 else if (c == '\007') {
7548 *q++ = '\\';
7549 *q++ = 'a';
7550 }
7551 else if (c == '\033') {
7552 *q++ = '\\';
7553 *q++ = 'e';
7554 }
7555 else if (ISPRINT(c)) {
7556 *q++ = c;
7557 }
7558 else {
7559 *q++ = '\\';
7560 if (u8) {
7561 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7562 if (MBCLEN_CHARFOUND_P(n)) {
7563 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7564 p += n;
7565 if (cc <= 0xFFFF)
7566 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7567 else
7568 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7569 q += strlen(q);
7570 continue;
7571 }
7572 }
7573 snprintf(q, qend-q, "x%02X", c);
7574 q += 3;
7575 }
7576 }
7577 *q++ = '"';
7578 *q = '\0';
7579 if (!rb_enc_asciicompat(enc)) {
7580 snprintf(q, qend-q, nonascii_suffix, enc->name);
7581 encidx = rb_ascii8bit_encindex();
7582 }
7583 /* result from dump is ASCII */
7584 rb_enc_associate_index(result, encidx);
7586 return result;
7587}
7588
/*
 * Maps the letter of a single-letter backslash escape (n, r, t, f, v, b,
 * a, e) to the control character it denotes.  Returns -1 for any other
 * value, so the function never falls off its end without a result.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        /* not one of the escapes handled here */
        return -1;
    }
}
7612
7613static void
7614undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7615{
7616 const char *s = *ss;
7617 unsigned int c;
7618 int codelen;
7619 size_t hexlen;
7620 unsigned char buf[6];
7621 static rb_encoding *enc_utf8 = NULL;
7622
7623 switch (*s) {
7624 case '\\':
7625 case '"':
7626 case '#':
7627 rb_str_cat(undumped, s, 1); /* cat itself */
7628 s++;
7629 break;
7630 case 'n':
7631 case 'r':
7632 case 't':
7633 case 'f':
7634 case 'v':
7635 case 'b':
7636 case 'a':
7637 case 'e':
7638 *buf = unescape_ascii(*s);
7639 rb_str_cat(undumped, (char *)buf, 1);
7640 s++;
7641 break;
7642 case 'u':
7643 if (*binary) {
7644 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7645 }
7646 *utf8 = true;
7647 if (++s >= s_end) {
7648 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7649 }
7650 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7651 if (*penc != enc_utf8) {
7652 *penc = enc_utf8;
7653 rb_enc_associate(undumped, enc_utf8);
7654 }
7655 if (*s == '{') { /* handle \u{...} form */
7656 s++;
7657 for (;;) {
7658 if (s >= s_end) {
7659 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7660 }
7661 if (*s == '}') {
7662 s++;
7663 break;
7664 }
7665 if (ISSPACE(*s)) {
7666 s++;
7667 continue;
7668 }
7669 c = scan_hex(s, s_end-s, &hexlen);
7670 if (hexlen == 0 || hexlen > 6) {
7671 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7672 }
7673 if (c > 0x10ffff) {
7674 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7675 }
7676 if (0xd800 <= c && c <= 0xdfff) {
7677 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7678 }
7679 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7680 rb_str_cat(undumped, (char *)buf, codelen);
7681 s += hexlen;
7682 }
7683 }
7684 else { /* handle \uXXXX form */
7685 c = scan_hex(s, 4, &hexlen);
7686 if (hexlen != 4) {
7687 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7688 }
7689 if (0xd800 <= c && c <= 0xdfff) {
7690 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7691 }
7692 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7693 rb_str_cat(undumped, (char *)buf, codelen);
7694 s += hexlen;
7695 }
7696 break;
7697 case 'x':
7698 if (*utf8) {
7699 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7700 }
7701 *binary = true;
7702 if (++s >= s_end) {
7703 rb_raise(rb_eRuntimeError, "invalid hex escape");
7704 }
7705 *buf = scan_hex(s, 2, &hexlen);
7706 if (hexlen != 2) {
7707 rb_raise(rb_eRuntimeError, "invalid hex escape");
7708 }
7709 rb_str_cat(undumped, (char *)buf, 1);
7710 s += hexlen;
7711 break;
7712 default:
7713 rb_str_cat(undumped, s-1, 2);
7714 s++;
7715 }
7716
7717 *ss = s;
7718}
7719
/* Forward declaration: str_undump() below calls this before its definition. */
static VALUE rb_str_is_ascii_only_p(VALUE str);
7721
7722/*
7723 * call-seq:
7724 * undump -> string
7725 *
7726 * Returns an unescaped version of +self+:
7727 *
7728 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7729 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7730 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7731 * s_undumped == s_orig # => true
7732 *
7733 * Related: String#dump (inverse of String#undump).
7734 *
7735 */
7736
7737static VALUE
7738str_undump(VALUE str)
7739{
7740 const char *s = RSTRING_PTR(str);
7741 const char *s_end = RSTRING_END(str);
7742 rb_encoding *enc = rb_enc_get(str);
7743 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7744 bool utf8 = false;
7745 bool binary = false;
7746 int w;
7747
7749 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7750 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7751 }
7752 if (!str_null_check(str, &w)) {
7753 rb_raise(rb_eRuntimeError, "string contains null byte");
7754 }
7755 if (RSTRING_LEN(str) < 2) goto invalid_format;
7756 if (*s != '"') goto invalid_format;
7757
7758 /* strip '"' at the start */
7759 s++;
7760
7761 for (;;) {
7762 if (s >= s_end) {
7763 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7764 }
7765
7766 if (*s == '"') {
7767 /* epilogue */
7768 s++;
7769 if (s == s_end) {
7770 /* ascii compatible dumped string */
7771 break;
7772 }
7773 else {
7774 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7775 static const char dup_suffix[] = ".dup";
7776 const char *encname;
7777 int encidx;
7778 ptrdiff_t size;
7779
7780 /* check separately for strings dumped by older versions */
7781 size = sizeof(dup_suffix) - 1;
7782 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7783
7784 size = sizeof(force_encoding_suffix) - 1;
7785 if (s_end - s <= size) goto invalid_format;
7786 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7787 s += size;
7788
7789 if (utf8) {
7790 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7791 }
7792
7793 encname = s;
7794 s = memchr(s, '"', s_end-s);
7795 size = s - encname;
7796 if (!s) goto invalid_format;
7797 if (s_end - s != 2) goto invalid_format;
7798 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7799
7800 encidx = rb_enc_find_index2(encname, (long)size);
7801 if (encidx < 0) {
7802 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7803 }
7804 rb_enc_associate_index(undumped, encidx);
7805 }
7806 break;
7807 }
7808
7809 if (*s == '\\') {
7810 s++;
7811 if (s >= s_end) {
7812 rb_raise(rb_eRuntimeError, "invalid escape");
7813 }
7814 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7815 }
7816 else {
7817 rb_str_cat(undumped, s++, 1);
7818 }
7819 }
7820
7821 RB_GC_GUARD(str);
7822
7823 return undumped;
7824invalid_format:
7825 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7826}
7827
7828static void
7829rb_str_check_dummy_enc(rb_encoding *enc)
7830{
7831 if (rb_enc_dummy_p(enc)) {
7832 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7833 rb_enc_name(enc));
7834 }
7835}
7836
7837static rb_encoding *
7838str_true_enc(VALUE str)
7839{
7840 rb_encoding *enc = STR_ENC_GET(str);
7841 rb_str_check_dummy_enc(enc);
7842 return enc;
7843}
7844
7845static OnigCaseFoldType
7846check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7847{
7848 if (argc==0)
7849 return flags;
7850 if (argc>2)
7851 rb_raise(rb_eArgError, "too many options");
7852 if (argv[0]==sym_turkic) {
7853 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7854 if (argc==2) {
7855 if (argv[1]==sym_lithuanian)
7856 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7857 else
7858 rb_raise(rb_eArgError, "invalid second option");
7859 }
7860 }
7861 else if (argv[0]==sym_lithuanian) {
7862 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7863 if (argc==2) {
7864 if (argv[1]==sym_turkic)
7865 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7866 else
7867 rb_raise(rb_eArgError, "invalid second option");
7868 }
7869 }
7870 else if (argc>1)
7871 rb_raise(rb_eArgError, "too many options");
7872 else if (argv[0]==sym_ascii)
7873 flags |= ONIGENC_CASE_ASCII_ONLY;
7874 else if (argv[0]==sym_fold) {
7875 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7876 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7877 else
7878 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7879 }
7880 else
7881 rb_raise(rb_eArgError, "invalid option");
7882 return flags;
7883}
7884
7885static inline bool
7886case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7887{
7888 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7889 return true;
7890 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7891}
7892
/*
 * Extra bytes added to the capacity of every case-mapping buffer.
 * 16 should be long enough to absorb any kind of single character length
 * increase; note that the value actually defined below is 20.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* Non-zero enables the stderr tuning/diagnostic output of the case-mapping code. */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
7898
7899struct mapping_buffer;
7900typedef struct mapping_buffer {
7901 size_t capa;
7902 size_t used;
7903 struct mapping_buffer *next;
7904 OnigUChar space[FLEX_ARY_LEN];
7906
7907static void
7908mapping_buffer_free(void *p)
7909{
7910 mapping_buffer *previous_buffer;
7911 mapping_buffer *current_buffer = p;
7912 while (current_buffer) {
7913 previous_buffer = current_buffer;
7914 current_buffer = current_buffer->next;
7915 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7916 }
7917}
7918
/*
 * Type descriptor of the object rb_str_casemap() wraps around a
 * mapping_buffer chain.  mapping_buffer_free is registered in the
 * function table (presumably the free-callback slot -- confirm against
 * the rb_data_type_t definition), so a chain still attached to the
 * object can be released through it.
 */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",
    {0, mapping_buffer_free,},
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7924
7925static VALUE
7926rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7927{
7928 VALUE target;
7929
7930 const OnigUChar *source_current, *source_end;
7931 int target_length = 0;
7932 VALUE buffer_anchor;
7933 mapping_buffer *current_buffer = 0;
7934 mapping_buffer **pre_buffer;
7935 size_t buffer_count = 0;
7936 int buffer_length_or_invalid;
7937
7938 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7939
7940 source_current = (OnigUChar*)RSTRING_PTR(source);
7941 source_end = (OnigUChar*)RSTRING_END(source);
7942
7943 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7944 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7945 while (source_current < source_end) {
7946 /* increase multiplier using buffer count to converge quickly */
7947 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7948 if (CASEMAP_DEBUG) {
7949 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7950 }
7951 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7952 *pre_buffer = current_buffer;
7953 pre_buffer = &current_buffer->next;
7954 current_buffer->next = NULL;
7955 current_buffer->capa = capa;
7956 buffer_length_or_invalid = enc->case_map(flags,
7957 &source_current, source_end,
7958 current_buffer->space,
7959 current_buffer->space+current_buffer->capa,
7960 enc);
7961 if (buffer_length_or_invalid < 0) {
7962 current_buffer = DATA_PTR(buffer_anchor);
7963 DATA_PTR(buffer_anchor) = 0;
7964 mapping_buffer_free(current_buffer);
7965 rb_raise(rb_eArgError, "input string invalid");
7966 }
7967 target_length += current_buffer->used = buffer_length_or_invalid;
7968 }
7969 if (CASEMAP_DEBUG) {
7970 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7971 }
7972
7973 if (buffer_count==1) {
7974 target = rb_str_new((const char*)current_buffer->space, target_length);
7975 }
7976 else {
7977 char *target_current;
7978
7979 target = rb_str_new(0, target_length);
7980 target_current = RSTRING_PTR(target);
7981 current_buffer = DATA_PTR(buffer_anchor);
7982 while (current_buffer) {
7983 memcpy(target_current, current_buffer->space, current_buffer->used);
7984 target_current += current_buffer->used;
7985 current_buffer = current_buffer->next;
7986 }
7987 }
7988 current_buffer = DATA_PTR(buffer_anchor);
7989 DATA_PTR(buffer_anchor) = 0;
7990 mapping_buffer_free(current_buffer);
7991
7992 RB_GC_GUARD(buffer_anchor);
7993
7994 /* TODO: check about string terminator character */
7995 str_enc_copy_direct(target, source);
7996 /*ENC_CODERANGE_SET(mapped, cr);*/
7997
7998 return target;
7999}
8000
8001static VALUE
8002rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
8003{
8004 const OnigUChar *source_current, *source_end;
8005 OnigUChar *target_current, *target_end;
8006 long old_length = RSTRING_LEN(source);
8007 int length_or_invalid;
8008
8009 if (old_length == 0) return Qnil;
8010
8011 source_current = (OnigUChar*)RSTRING_PTR(source);
8012 source_end = (OnigUChar*)RSTRING_END(source);
8013 if (source == target) {
8014 target_current = (OnigUChar*)source_current;
8015 target_end = (OnigUChar*)source_end;
8016 }
8017 else {
8018 target_current = (OnigUChar*)RSTRING_PTR(target);
8019 target_end = (OnigUChar*)RSTRING_END(target);
8020 }
8021
8022 length_or_invalid = onigenc_ascii_only_case_map(flags,
8023 &source_current, source_end,
8024 target_current, target_end, enc);
8025 if (length_or_invalid < 0)
8026 rb_raise(rb_eArgError, "input string invalid");
8027 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
8028 fprintf(stderr, "problem with rb_str_ascii_casemap"
8029 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8030 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
8031 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
8032 }
8033
8034 str_enc_copy(target, source);
8035
8036 return target;
8037}
8038
8039static bool
8040upcase_single(VALUE str)
8041{
8042 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8043 bool modified = false;
8044
8045 while (s < send) {
8046 unsigned int c = *(unsigned char*)s;
8047
8048 if ('a' <= c && c <= 'z') {
8049 *s = 'A' + (c - 'a');
8050 modified = true;
8051 }
8052 s++;
8053 }
8054 return modified;
8055}
8056
8057/*
8058 * call-seq:
8059 * upcase!(mapping) -> self or nil
8060 *
8061 * Upcases the characters in +self+;
8062 * returns +self+ if any changes were made, +nil+ otherwise:
8063 *
8064 * s = 'Hello World!' # => "Hello World!"
8065 * s.upcase! # => "HELLO WORLD!"
8066 * s # => "HELLO WORLD!"
8067 * s.upcase! # => nil
8068 *
8069 * The casing may be affected by the given +mapping+;
8070 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8071 *
8072 * Related: String#upcase, String#downcase, String#downcase!.
8073 *
8074 */
8075
8076static VALUE
8077rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
8078{
8079 rb_encoding *enc;
8080 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8081
8082 flags = check_case_options(argc, argv, flags);
8083 str_modify_keep_cr(str);
8084 enc = str_true_enc(str);
8085 if (case_option_single_p(flags, enc, str)) {
8086 if (upcase_single(str))
8087 flags |= ONIGENC_CASE_MODIFIED;
8088 }
8089 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8090 rb_str_ascii_casemap(str, str, &flags, enc);
8091 else
8092 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8093
8094 if (ONIGENC_CASE_MODIFIED&flags) return str;
8095 return Qnil;
8096}
8097
8098
8099/*
8100 * call-seq:
8101 * upcase(mapping) -> string
8102 *
8103 * Returns a string containing the upcased characters in +self+:
8104 *
8105 * s = 'Hello World!' # => "Hello World!"
8106 * s.upcase # => "HELLO WORLD!"
8107 *
8108 * The casing may be affected by the given +mapping+;
8109 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8110 *
8111 * Related: String#upcase!, String#downcase, String#downcase!.
8112 *
8113 */
8114
8115static VALUE
8116rb_str_upcase(int argc, VALUE *argv, VALUE str)
8117{
8118 rb_encoding *enc;
8119 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
8120 VALUE ret;
8121
8122 flags = check_case_options(argc, argv, flags);
8123 enc = str_true_enc(str);
8124 if (case_option_single_p(flags, enc, str)) {
8125 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8126 str_enc_copy_direct(ret, str);
8127 upcase_single(ret);
8128 }
8129 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8130 ret = rb_str_new(0, RSTRING_LEN(str));
8131 rb_str_ascii_casemap(str, ret, &flags, enc);
8132 }
8133 else {
8134 ret = rb_str_casemap(str, &flags, enc);
8135 }
8136
8137 return ret;
8138}
8139
8140static bool
8141downcase_single(VALUE str)
8142{
8143 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8144 bool modified = false;
8145
8146 while (s < send) {
8147 unsigned int c = *(unsigned char*)s;
8148
8149 if ('A' <= c && c <= 'Z') {
8150 *s = 'a' + (c - 'A');
8151 modified = true;
8152 }
8153 s++;
8154 }
8155
8156 return modified;
8157}
8158
8159/*
8160 * call-seq:
8161 * downcase!(mapping) -> self or nil
8162 *
8163 * Downcases the characters in +self+;
8164 * returns +self+ if any changes were made, +nil+ otherwise:
8165 *
8166 * s = 'Hello World!' # => "Hello World!"
8167 * s.downcase! # => "hello world!"
8168 * s # => "hello world!"
8169 * s.downcase! # => nil
8170 *
8171 * The casing may be affected by the given +mapping+;
8172 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8173 *
8174 * Related: String#downcase, String#upcase, String#upcase!.
8175 *
8176 */
8177
8178static VALUE
8179rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8180{
8181 rb_encoding *enc;
8182 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8183
8184 flags = check_case_options(argc, argv, flags);
8185 str_modify_keep_cr(str);
8186 enc = str_true_enc(str);
8187 if (case_option_single_p(flags, enc, str)) {
8188 if (downcase_single(str))
8189 flags |= ONIGENC_CASE_MODIFIED;
8190 }
8191 else if (flags&ONIGENC_CASE_ASCII_ONLY)
8192 rb_str_ascii_casemap(str, str, &flags, enc);
8193 else
8194 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8195
8196 if (ONIGENC_CASE_MODIFIED&flags) return str;
8197 return Qnil;
8198}
8199
8200
8201/*
8202 * call-seq:
8203 * downcase(mapping) -> string
8204 *
8205 * Returns a string containing the downcased characters in +self+:
8206 *
8207 * s = 'Hello World!' # => "Hello World!"
8208 * s.downcase # => "hello world!"
8209 *
8210 * The casing may be affected by the given +mapping+;
8211 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8212 *
8213 * Related: String#downcase!, String#upcase, String#upcase!.
8214 *
8215 */
8216
8217static VALUE
8218rb_str_downcase(int argc, VALUE *argv, VALUE str)
8219{
8220 rb_encoding *enc;
8221 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8222 VALUE ret;
8223
8224 flags = check_case_options(argc, argv, flags);
8225 enc = str_true_enc(str);
8226 if (case_option_single_p(flags, enc, str)) {
8227 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8228 str_enc_copy_direct(ret, str);
8229 downcase_single(ret);
8230 }
8231 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8232 ret = rb_str_new(0, RSTRING_LEN(str));
8233 rb_str_ascii_casemap(str, ret, &flags, enc);
8234 }
8235 else {
8236 ret = rb_str_casemap(str, &flags, enc);
8237 }
8238
8239 return ret;
8240}
8241
8242
8243/*
8244 * call-seq:
8245 * capitalize!(mapping) -> self or nil
8246 *
8247 * Upcases the first character in +self+;
8248 * downcases the remaining characters;
8249 * returns +self+ if any changes were made, +nil+ otherwise:
8250 *
8251 * s = 'hello World!' # => "hello World!"
8252 * s.capitalize! # => "Hello world!"
8253 * s # => "Hello world!"
8254 * s.capitalize! # => nil
8255 *
8256 * The casing may be affected by the given +mapping+;
8257 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8258 *
8259 * Related: String#capitalize.
8260 *
8261 */
8262
8263static VALUE
8264rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8265{
8266 rb_encoding *enc;
8267 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8268
8269 flags = check_case_options(argc, argv, flags);
8270 str_modify_keep_cr(str);
8271 enc = str_true_enc(str);
8272 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8273 if (flags&ONIGENC_CASE_ASCII_ONLY)
8274 rb_str_ascii_casemap(str, str, &flags, enc);
8275 else
8276 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8277
8278 if (ONIGENC_CASE_MODIFIED&flags) return str;
8279 return Qnil;
8280}
8281
8282
8283/*
8284 * call-seq:
8285 * capitalize(mapping) -> string
8286 *
8287 * Returns a string containing the characters in +self+;
8288 * the first character is upcased;
8289 * the remaining characters are downcased:
8290 *
8291 * s = 'hello World!' # => "hello World!"
8292 * s.capitalize # => "Hello world!"
8293 *
8294 * The casing may be affected by the given +mapping+;
8295 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8296 *
8297 * Related: String#capitalize!.
8298 *
8299 */
8300
8301static VALUE
8302rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8303{
8304 rb_encoding *enc;
8305 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8306 VALUE ret;
8307
8308 flags = check_case_options(argc, argv, flags);
8309 enc = str_true_enc(str);
8310 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8311 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8312 ret = rb_str_new(0, RSTRING_LEN(str));
8313 rb_str_ascii_casemap(str, ret, &flags, enc);
8314 }
8315 else {
8316 ret = rb_str_casemap(str, &flags, enc);
8317 }
8318 return ret;
8319}
8320
8321
8322/*
8323 * call-seq:
8324 * swapcase!(mapping) -> self or nil
8325 *
8326 * Upcases each lowercase character in +self+;
8327 * downcases uppercase character;
8328 * returns +self+ if any changes were made, +nil+ otherwise:
8329 *
8330 * s = 'Hello World!' # => "Hello World!"
8331 * s.swapcase! # => "hELLO wORLD!"
8332 * s # => "hELLO wORLD!"
8333 * ''.swapcase! # => nil
8334 *
8335 * The casing may be affected by the given +mapping+;
8336 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8337 *
8338 * Related: String#swapcase.
8339 *
8340 */
8341
8342static VALUE
8343rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8344{
8345 rb_encoding *enc;
8346 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8347
8348 flags = check_case_options(argc, argv, flags);
8349 str_modify_keep_cr(str);
8350 enc = str_true_enc(str);
8351 if (flags&ONIGENC_CASE_ASCII_ONLY)
8352 rb_str_ascii_casemap(str, str, &flags, enc);
8353 else
8354 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8355
8356 if (ONIGENC_CASE_MODIFIED&flags) return str;
8357 return Qnil;
8358}
8359
8360
8361/*
8362 * call-seq:
8363 * swapcase(mapping) -> string
8364 *
8365 * Returns a string containing the characters in +self+, with cases reversed;
8366 * each uppercase character is downcased;
8367 * each lowercase character is upcased:
8368 *
8369 * s = 'Hello World!' # => "Hello World!"
8370 * s.swapcase # => "hELLO wORLD!"
8371 *
8372 * The casing may be affected by the given +mapping+;
8373 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8374 *
8375 * Related: String#swapcase!.
8376 *
8377 */
8378
8379static VALUE
8380rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8381{
8382 rb_encoding *enc;
8383 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8384 VALUE ret;
8385
8386 flags = check_case_options(argc, argv, flags);
8387 enc = str_true_enc(str);
8388 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8389 if (flags&ONIGENC_CASE_ASCII_ONLY) {
8390 ret = rb_str_new(0, RSTRING_LEN(str));
8391 rb_str_ascii_casemap(str, ret, &flags, enc);
8392 }
8393 else {
8394 ret = rb_str_casemap(str, &flags, enc);
8395 }
8396 return ret;
8397}
8398
/* Shorthand for a pointer to unsigned bytes. */
typedef unsigned char *USTR;
8400
/* Cursor over a tr-style character specification, advanced by trnext(). */
struct tr {
    int gen;                /* non-zero while the codepoints of a range are being generated */
    unsigned int now, max;  /* codepoint last produced, and the last codepoint of the current range */
    char *p, *pend;         /* read position in the specification, and its end */
};
8406
/*
 * Produces the next codepoint described by the specification in +t+,
 * decoding with +enc+.  Returns (unsigned int)-1 once the specification
 * is exhausted.
 *
 * A backslash that is not the last character makes the following
 * character literal.  "a-b" with a < b puts +t+ into range mode
 * (t->gen), where successive calls yield a+1 .. b after a itself;
 * "a-b" with a > b raises ArgumentError; "a-a" yields a once.
 */
static unsigned int
trnext(struct tr *t, rb_encoding *enc)
{
    int n;

    for (;;) {
      nextpart:
        if (!t->gen) {
            /* not inside a range: read the next item of the spec */
            if (t->p == t->pend) return -1;
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
                /* skip the backslash; the next character is taken as is */
                t->p += n;
            }
            t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
            t->p += n;
            if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
                /* a '-' with something after it: this is a range */
                t->p += n;
                if (t->p < t->pend) {
                    unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
                    t->p += n;
                    if (t->now > c) {
                        /* the endpoints are only printed when both are below 0x80 */
                        if (t->now < 0x80 && c < 0x80) {
                            rb_raise(rb_eArgError,
                                     "invalid range \"%c-%c\" in string transliteration",
                                     t->now, c);
                        }
                        else {
                            rb_raise(rb_eArgError, "invalid range in string transliteration");
                        }
                        continue; /* not reached */
                    }
                    else if (t->now < c) {
                        /* remember the upper bound; later calls generate the rest */
                        t->gen = 1;
                        t->max = c;
                    }
                }
            }
            return t->now;
        }
        else {
            /* range mode: advance, skipping codepoints for which the
             * encoding reports a non-positive byte length */
            while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
                if (t->now == t->max) {
                    /* the range ended on a skipped codepoint */
                    t->gen = 0;
                    goto nextpart;
                }
            }
            if (t->now < t->max) {
                return t->now;
            }
            else {
                /* reached the upper bound: leave range mode */
                t->gen = 0;
                return t->max;
            }
        }
    }
}
8462
/* Forward declaration: tr_trans() below calls this before its definition. */
static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8464
8465static VALUE
8466tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8467{
8468 const unsigned int errc = -1;
8469 unsigned int trans[256];
8470 rb_encoding *enc, *e1, *e2;
8471 struct tr trsrc, trrepl;
8472 int cflag = 0;
8473 unsigned int c, c0, last = 0;
8474 int modify = 0, i, l;
8475 unsigned char *s, *send;
8476 VALUE hash = 0;
8477 int singlebyte = single_byte_optimizable(str);
8478 int termlen;
8479 int cr;
8480
8481#define CHECK_IF_ASCII(c) \
8482 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8483 (cr = ENC_CODERANGE_VALID) : 0)
8484
8485 StringValue(src);
8486 StringValue(repl);
8487 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8488 if (RSTRING_LEN(repl) == 0) {
8489 return rb_str_delete_bang(1, &src, str);
8490 }
8491
8492 cr = ENC_CODERANGE(str);
8493 e1 = rb_enc_check(str, src);
8494 e2 = rb_enc_check(str, repl);
8495 if (e1 == e2) {
8496 enc = e1;
8497 }
8498 else {
8499 enc = rb_enc_check(src, repl);
8500 }
8501 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8502 if (RSTRING_LEN(src) > 1 &&
8503 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8504 trsrc.p + l < trsrc.pend) {
8505 cflag = 1;
8506 trsrc.p += l;
8507 }
8508 trrepl.p = RSTRING_PTR(repl);
8509 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8510 trsrc.gen = trrepl.gen = 0;
8511 trsrc.now = trrepl.now = 0;
8512 trsrc.max = trrepl.max = 0;
8513
8514 if (cflag) {
8515 for (i=0; i<256; i++) {
8516 trans[i] = 1;
8517 }
8518 while ((c = trnext(&trsrc, enc)) != errc) {
8519 if (c < 256) {
8520 trans[c] = errc;
8521 }
8522 else {
8523 if (!hash) hash = rb_hash_new();
8524 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8525 }
8526 }
8527 while ((c = trnext(&trrepl, enc)) != errc)
8528 /* retrieve last replacer */;
8529 last = trrepl.now;
8530 for (i=0; i<256; i++) {
8531 if (trans[i] != errc) {
8532 trans[i] = last;
8533 }
8534 }
8535 }
8536 else {
8537 unsigned int r;
8538
8539 for (i=0; i<256; i++) {
8540 trans[i] = errc;
8541 }
8542 while ((c = trnext(&trsrc, enc)) != errc) {
8543 r = trnext(&trrepl, enc);
8544 if (r == errc) r = trrepl.now;
8545 if (c < 256) {
8546 trans[c] = r;
8547 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8548 }
8549 else {
8550 if (!hash) hash = rb_hash_new();
8551 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8552 }
8553 }
8554 }
8555
8556 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8557 cr = ENC_CODERANGE_7BIT;
8558 str_modify_keep_cr(str);
8559 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8560 termlen = rb_enc_mbminlen(enc);
8561 if (sflag) {
8562 int clen, tlen;
8563 long offset, max = RSTRING_LEN(str);
8564 unsigned int save = -1;
8565 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8566
8567 while (s < send) {
8568 int may_modify = 0;
8569
8570 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8571 if (!MBCLEN_CHARFOUND_P(r)) {
8572 xfree(buf);
8573 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8574 }
8575 clen = MBCLEN_CHARFOUND_LEN(r);
8576 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8577
8578 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8579
8580 s += clen;
8581 if (c < 256) {
8582 c = trans[c];
8583 }
8584 else if (hash) {
8585 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8586 if (NIL_P(tmp)) {
8587 if (cflag) c = last;
8588 else c = errc;
8589 }
8590 else if (cflag) c = errc;
8591 else c = NUM2INT(tmp);
8592 }
8593 else {
8594 c = errc;
8595 }
8596 if (c != (unsigned int)-1) {
8597 if (save == c) {
8598 CHECK_IF_ASCII(c);
8599 continue;
8600 }
8601 save = c;
8602 tlen = rb_enc_codelen(c, enc);
8603 modify = 1;
8604 }
8605 else {
8606 save = -1;
8607 c = c0;
8608 if (enc != e1) may_modify = 1;
8609 }
8610 if ((offset = t - buf) + tlen > max) {
8611 size_t MAYBE_UNUSED(old) = max + termlen;
8612 max = offset + tlen + (send - s);
8613 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8614 t = buf + offset;
8615 }
8616 rb_enc_mbcput(c, t, enc);
8617 if (may_modify && memcmp(s, t, tlen) != 0) {
8618 modify = 1;
8619 }
8620 CHECK_IF_ASCII(c);
8621 t += tlen;
8622 }
8623 if (!STR_EMBED_P(str)) {
8624 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8625 }
8626 TERM_FILL((char *)t, termlen);
8627 RSTRING(str)->as.heap.ptr = (char *)buf;
8628 STR_SET_LEN(str, t - buf);
8629 STR_SET_NOEMBED(str);
8630 RSTRING(str)->as.heap.aux.capa = max;
8631 }
8632 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8633 while (s < send) {
8634 c = (unsigned char)*s;
8635 if (trans[c] != errc) {
8636 if (!cflag) {
8637 c = trans[c];
8638 *s = c;
8639 modify = 1;
8640 }
8641 else {
8642 *s = last;
8643 modify = 1;
8644 }
8645 }
8646 CHECK_IF_ASCII(c);
8647 s++;
8648 }
8649 }
8650 else {
8651 int clen, tlen;
8652 long offset, max = (long)((send - s) * 1.2);
8653 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8654
8655 while (s < send) {
8656 int may_modify = 0;
8657
8658 int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8659 if (!MBCLEN_CHARFOUND_P(r)) {
8660 xfree(buf);
8661 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8662 }
8663 clen = MBCLEN_CHARFOUND_LEN(r);
8664 c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8665
8666 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8667
8668 if (c < 256) {
8669 c = trans[c];
8670 }
8671 else if (hash) {
8672 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8673 if (NIL_P(tmp)) {
8674 if (cflag) c = last;
8675 else c = errc;
8676 }
8677 else if (cflag) c = errc;
8678 else c = NUM2INT(tmp);
8679 }
8680 else {
8681 c = cflag ? last : errc;
8682 }
8683 if (c != errc) {
8684 tlen = rb_enc_codelen(c, enc);
8685 modify = 1;
8686 }
8687 else {
8688 c = c0;
8689 if (enc != e1) may_modify = 1;
8690 }
8691 if ((offset = t - buf) + tlen > max) {
8692 size_t MAYBE_UNUSED(old) = max + termlen;
8693 max = offset + tlen + (long)((send - s) * 1.2);
8694 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8695 t = buf + offset;
8696 }
8697 if (s != t) {
8698 rb_enc_mbcput(c, t, enc);
8699 if (may_modify && memcmp(s, t, tlen) != 0) {
8700 modify = 1;
8701 }
8702 }
8703 CHECK_IF_ASCII(c);
8704 s += clen;
8705 t += tlen;
8706 }
8707 if (!STR_EMBED_P(str)) {
8708 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8709 }
8710 TERM_FILL((char *)t, termlen);
8711 RSTRING(str)->as.heap.ptr = (char *)buf;
8712 STR_SET_LEN(str, t - buf);
8713 STR_SET_NOEMBED(str);
8714 RSTRING(str)->as.heap.aux.capa = max;
8715 }
8716
8717 if (modify) {
8718 if (cr != ENC_CODERANGE_BROKEN)
8719 ENC_CODERANGE_SET(str, cr);
8720 rb_enc_associate(str, enc);
8721 return str;
8722 }
8723 return Qnil;
8724}
8725
8726
8727/*
8728 * call-seq:
8729 * tr!(selector, replacements) -> self or nil
8730 *
8731 * Like String#tr, but modifies +self+ in place.
8732 * Returns +self+ if any changes were made, +nil+ otherwise.
8733 *
8734 */
8735
8736static VALUE
8737rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8738{
8739 return tr_trans(str, src, repl, 0);
8740}
8741
8742
8743/*
8744 * call-seq:
8745 * tr(selector, replacements) -> new_string
8746 *
8747 * Returns a copy of +self+ with each character specified by string +selector+
8748 * translated to the corresponding character in string +replacements+.
8749 * The correspondence is _positional_:
8750 *
8751 * - Each occurrence of the first character specified by +selector+
8752 * is translated to the first character in +replacements+.
8753 * - Each occurrence of the second character specified by +selector+
8754 * is translated to the second character in +replacements+.
8755 * - And so on.
8756 *
8757 * Example:
8758 *
8759 * 'hello'.tr('el', 'ip') #=> "hippo"
8760 *
8761 * If +replacements+ is shorter than +selector+,
8762 * it is implicitly padded with its own last character:
8763 *
8764 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8765 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8766 *
8767 * Arguments +selector+ and +replacements+ must be valid character selectors
8768 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8769 * and may use any of its valid forms, including negation, ranges, and escaping:
8770 *
8771 * # Negation.
8772 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8773 * # Ranges.
8774 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8775 * # Escapes.
8776 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8777 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8778 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8779 *
8780 */
8781
8782static VALUE
8783rb_str_tr(VALUE str, VALUE src, VALUE repl)
8784{
8785 str = str_duplicate(rb_cString, str);
8786 tr_trans(str, src, repl, 0);
8787 return str;
8788}
8789
8790#define TR_TABLE_MAX (UCHAR_MAX+1)
8791#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8792static void
8793tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8794 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8795{
8796 const unsigned int errc = -1;
8797 char buf[TR_TABLE_MAX];
8798 struct tr tr;
8799 unsigned int c;
8800 VALUE table = 0, ptable = 0;
8801 int i, l, cflag = 0;
8802
8803 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8804 tr.gen = tr.now = tr.max = 0;
8805
8806 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8807 cflag = 1;
8808 tr.p += l;
8809 }
8810 if (first) {
8811 for (i=0; i<TR_TABLE_MAX; i++) {
8812 stable[i] = 1;
8813 }
8814 stable[TR_TABLE_MAX] = cflag;
8815 }
8816 else if (stable[TR_TABLE_MAX] && !cflag) {
8817 stable[TR_TABLE_MAX] = 0;
8818 }
8819 for (i=0; i<TR_TABLE_MAX; i++) {
8820 buf[i] = cflag;
8821 }
8822
8823 while ((c = trnext(&tr, enc)) != errc) {
8824 if (c < TR_TABLE_MAX) {
8825 buf[(unsigned char)c] = !cflag;
8826 }
8827 else {
8828 VALUE key = UINT2NUM(c);
8829
8830 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8831 if (cflag) {
8832 ptable = *ctablep;
8833 table = ptable ? ptable : rb_hash_new();
8834 *ctablep = table;
8835 }
8836 else {
8837 table = rb_hash_new();
8838 ptable = *tablep;
8839 *tablep = table;
8840 }
8841 }
8842 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8843 rb_hash_aset(table, key, Qtrue);
8844 }
8845 }
8846 }
8847 for (i=0; i<TR_TABLE_MAX; i++) {
8848 stable[i] = stable[i] && buf[i];
8849 }
8850 if (!table && !cflag) {
8851 *tablep = 0;
8852 }
8853}
8854
8855
8856static int
8857tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8858{
8859 if (c < TR_TABLE_MAX) {
8860 return table[c] != 0;
8861 }
8862 else {
8863 VALUE v = UINT2NUM(c);
8864
8865 if (del) {
8866 if (!NIL_P(rb_hash_lookup(del, v)) &&
8867 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8868 return TRUE;
8869 }
8870 }
8871 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8872 return FALSE;
8873 }
8874 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8875 }
8876}
8877
8878/*
8879 * call-seq:
8880 * delete!(*selectors) -> self or nil
8881 *
8882 * Like String#delete, but modifies +self+ in place.
8883 * Returns +self+ if any changes were made, +nil+ otherwise.
8884 *
8885 */
8886
8887static VALUE
8888rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8889{
8890 char squeez[TR_TABLE_SIZE];
8891 rb_encoding *enc = 0;
8892 char *s, *send, *t;
8893 VALUE del = 0, nodel = 0;
8894 int modify = 0;
8895 int i, ascompat, cr;
8896
8897 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8899 for (i=0; i<argc; i++) {
8900 VALUE s = argv[i];
8901
8902 StringValue(s);
8903 enc = rb_enc_check(str, s);
8904 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8905 }
8906
8907 str_modify_keep_cr(str);
8908 ascompat = rb_enc_asciicompat(enc);
8909 s = t = RSTRING_PTR(str);
8910 send = RSTRING_END(str);
8911 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8912 while (s < send) {
8913 unsigned int c;
8914 int clen;
8915
8916 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8917 if (squeez[c]) {
8918 modify = 1;
8919 }
8920 else {
8921 if (t != s) *t = c;
8922 t++;
8923 }
8924 s++;
8925 }
8926 else {
8927 c = rb_enc_codepoint_len(s, send, &clen, enc);
8928
8929 if (tr_find(c, squeez, del, nodel)) {
8930 modify = 1;
8931 }
8932 else {
8933 if (t != s) rb_enc_mbcput(c, t, enc);
8934 t += clen;
8936 }
8937 s += clen;
8938 }
8939 }
8940 TERM_FILL(t, TERM_LEN(str));
8941 STR_SET_LEN(str, t - RSTRING_PTR(str));
8942 ENC_CODERANGE_SET(str, cr);
8943
8944 if (modify) return str;
8945 return Qnil;
8946}
8947
8948
8949/*
8950 * call-seq:
8951 * delete(*selectors) -> new_string
8952 *
8953 * Returns a copy of +self+ with characters specified by +selectors+ removed
8954 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8955 *
8956 * "hello".delete "l","lo" #=> "heo"
8957 * "hello".delete "lo" #=> "he"
8958 * "hello".delete "aeiou", "^e" #=> "hell"
8959 * "hello".delete "ej-m" #=> "ho"
8960 *
8961 */
8962
8963static VALUE
8964rb_str_delete(int argc, VALUE *argv, VALUE str)
8965{
8966 str = str_duplicate(rb_cString, str);
8967 rb_str_delete_bang(argc, argv, str);
8968 return str;
8969}
8970
8971
8972/*
8973 * call-seq:
8974 * squeeze!(*selectors) -> self or nil
8975 *
8976 * Like String#squeeze, but modifies +self+ in place.
8977 * Returns +self+ if any changes were made, +nil+ otherwise.
8978 */
8979
8980static VALUE
8981rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8982{
8983 char squeez[TR_TABLE_SIZE];
8984 rb_encoding *enc = 0;
8985 VALUE del = 0, nodel = 0;
8986 unsigned char *s, *send, *t;
8987 int i, modify = 0;
8988 int ascompat, singlebyte = single_byte_optimizable(str);
8989 unsigned int save;
8990
8991 if (argc == 0) {
8992 enc = STR_ENC_GET(str);
8993 }
8994 else {
8995 for (i=0; i<argc; i++) {
8996 VALUE s = argv[i];
8997
8998 StringValue(s);
8999 enc = rb_enc_check(str, s);
9000 if (singlebyte && !single_byte_optimizable(s))
9001 singlebyte = 0;
9002 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
9003 }
9004 }
9005
9006 str_modify_keep_cr(str);
9007 s = t = (unsigned char *)RSTRING_PTR(str);
9008 if (!s || RSTRING_LEN(str) == 0) return Qnil;
9009 send = (unsigned char *)RSTRING_END(str);
9010 save = -1;
9011 ascompat = rb_enc_asciicompat(enc);
9012
9013 if (singlebyte) {
9014 while (s < send) {
9015 unsigned int c = *s++;
9016 if (c != save || (argc > 0 && !squeez[c])) {
9017 *t++ = save = c;
9018 }
9019 }
9020 }
9021 else {
9022 while (s < send) {
9023 unsigned int c;
9024 int clen;
9025
9026 if (ascompat && (c = *s) < 0x80) {
9027 if (c != save || (argc > 0 && !squeez[c])) {
9028 *t++ = save = c;
9029 }
9030 s++;
9031 }
9032 else {
9033 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
9034
9035 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
9036 if (t != s) rb_enc_mbcput(c, t, enc);
9037 save = c;
9038 t += clen;
9039 }
9040 s += clen;
9041 }
9042 }
9043 }
9044
9045 TERM_FILL((char *)t, TERM_LEN(str));
9046 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
9047 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
9048 modify = 1;
9049 }
9050
9051 if (modify) return str;
9052 return Qnil;
9053}
9054
9055
9056/*
9057 * call-seq:
9058 * squeeze(*selectors) -> new_string
9059 *
9060 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
9061 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9062 *
9063 * "Squeezed" means that each multiple-character run of a selected character
9064 * is squeezed down to a single character;
9065 * with no arguments given, squeezes all characters:
9066 *
9067 * "yellow moon".squeeze #=> "yelow mon"
9068 * " now is the".squeeze(" ") #=> " now is the"
9069 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
9070 *
9071 */
9072
9073static VALUE
9074rb_str_squeeze(int argc, VALUE *argv, VALUE str)
9075{
9076 str = str_duplicate(rb_cString, str);
9077 rb_str_squeeze_bang(argc, argv, str);
9078 return str;
9079}
9080
9081
9082/*
9083 * call-seq:
9084 * tr_s!(selector, replacements) -> self or nil
9085 *
9086 * Like String#tr_s, but modifies +self+ in place.
9087 * Returns +self+ if any changes were made, +nil+ otherwise.
9088 *
9089 * Related: String#squeeze!.
9090 */
9091
9092static VALUE
9093rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
9094{
9095 return tr_trans(str, src, repl, 1);
9096}
9097
9098
9099/*
9100 * call-seq:
9101 * tr_s(selector, replacements) -> string
9102 *
9103 * Like String#tr, but also squeezes the modified portions of the translated string;
9104 * returns a new string (translated and squeezed).
9105 *
9106 * 'hello'.tr_s('l', 'r') #=> "hero"
9107 * 'hello'.tr_s('el', '-') #=> "h-o"
9108 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
9109 *
9110 * Related: String#squeeze.
9111 *
9112 */
9113
9114static VALUE
9115rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
9116{
9117 str = str_duplicate(rb_cString, str);
9118 tr_trans(str, src, repl, 1);
9119 return str;
9120}
9121
9122
9123/*
9124 * call-seq:
9125 * count(*selectors) -> integer
9126 *
9127 * Returns the total number of characters in +self+
9128 * that are specified by the given +selectors+
9129 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
9130 *
9131 * a = "hello world"
9132 * a.count "lo" #=> 5
9133 * a.count "lo", "o" #=> 2
9134 * a.count "hello", "^l" #=> 4
9135 * a.count "ej-m" #=> 4
9136 *
9137 * "hello^world".count "\\^aeiou" #=> 4
9138 * "hello-world".count "a\\-eo" #=> 4
9139 *
9140 * c = "hello world\\r\\n"
9141 * c.count "\\" #=> 2
9142 * c.count "\\A" #=> 0
9143 * c.count "X-\\w" #=> 3
9144 */
9145
9146static VALUE
9147rb_str_count(int argc, VALUE *argv, VALUE str)
9148{
9149 char table[TR_TABLE_SIZE];
9150 rb_encoding *enc = 0;
9151 VALUE del = 0, nodel = 0, tstr;
9152 char *s, *send;
9153 int i;
9154 int ascompat;
9155 size_t n = 0;
9156
9158
9159 tstr = argv[0];
9160 StringValue(tstr);
9161 enc = rb_enc_check(str, tstr);
9162 if (argc == 1) {
9163 const char *ptstr;
9164 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9165 (ptstr = RSTRING_PTR(tstr),
9166 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9167 !is_broken_string(str)) {
9168 int clen;
9169 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9170
9171 s = RSTRING_PTR(str);
9172 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9173 send = RSTRING_END(str);
9174 while (s < send) {
9175 if (*(unsigned char*)s++ == c) n++;
9176 }
9177 return SIZET2NUM(n);
9178 }
9179 }
9180
9181 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9182 for (i=1; i<argc; i++) {
9183 tstr = argv[i];
9184 StringValue(tstr);
9185 enc = rb_enc_check(str, tstr);
9186 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9187 }
9188
9189 s = RSTRING_PTR(str);
9190 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9191 send = RSTRING_END(str);
9192 ascompat = rb_enc_asciicompat(enc);
9193 while (s < send) {
9194 unsigned int c;
9195
9196 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9197 if (table[c]) {
9198 n++;
9199 }
9200 s++;
9201 }
9202 else {
9203 int clen;
9204 c = rb_enc_codepoint_len(s, send, &clen, enc);
9205 if (tr_find(c, table, del, nodel)) {
9206 n++;
9207 }
9208 s += clen;
9209 }
9210 }
9211
9212 return SIZET2NUM(n);
9213}
9214
9215static VALUE
9216rb_fs_check(VALUE val)
9217{
9218 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9219 val = rb_check_string_type(val);
9220 if (NIL_P(val)) return 0;
9221 }
9222 return val;
9223}
9224
/*
 * Byte-indexed lookup table: non-zero for bytes 0x09-0x0d and 0x20,
 * zero for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9245
9246static long
9247split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9248{
9249 if (empty_count >= 0 && len == 0) {
9250 return empty_count + 1;
9251 }
9252 if (empty_count > 0) {
9253 /* make different substrings */
9254 if (result) {
9255 do {
9256 rb_ary_push(result, str_new_empty_String(str));
9257 } while (--empty_count > 0);
9258 }
9259 else {
9260 do {
9261 rb_yield(str_new_empty_String(str));
9262 } while (--empty_count > 0);
9263 }
9264 }
9265 str = rb_str_subseq(str, beg, len);
9266 if (result) {
9267 rb_ary_push(result, str);
9268 }
9269 else {
9270 rb_yield(str);
9271 }
9272 return empty_count;
9273}
9274
/* Splitting strategy chosen by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal separator string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9278
9279static split_type_t
9280literal_split_pattern(VALUE spat, split_type_t default_type)
9281{
9282 rb_encoding *enc = STR_ENC_GET(spat);
9283 const char *ptr;
9284 long len;
9285 RSTRING_GETMEM(spat, ptr, len);
9286 if (len == 0) {
9287 /* Special case - split into chars */
9288 return SPLIT_TYPE_CHARS;
9289 }
9290 else if (rb_enc_asciicompat(enc)) {
9291 if (len == 1 && ptr[0] == ' ') {
9292 return SPLIT_TYPE_AWK;
9293 }
9294 }
9295 else {
9296 int l;
9297 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9298 return SPLIT_TYPE_AWK;
9299 }
9300 }
9301 return default_type;
9302}
9303
9304/*
9305 * call-seq:
9306 * split(field_sep = $;, limit = 0) -> array
9307 * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9308 *
9309 * :include: doc/string/split.rdoc
9310 *
9311 */
9312
9313static VALUE
9314rb_str_split_m(int argc, VALUE *argv, VALUE str)
9315{
9316 rb_encoding *enc;
9317 VALUE spat;
9318 VALUE limit;
9319 split_type_t split_type;
9320 long beg, end, i = 0, empty_count = -1;
9321 int lim = 0;
9322 VALUE result, tmp;
9323
9324 result = rb_block_given_p() ? Qfalse : Qnil;
9325 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9326 lim = NUM2INT(limit);
9327 if (lim <= 0) limit = Qnil;
9328 else if (lim == 1) {
9329 if (RSTRING_LEN(str) == 0)
9330 return result ? rb_ary_new2(0) : str;
9331 tmp = str_duplicate(rb_cString, str);
9332 if (!result) {
9333 rb_yield(tmp);
9334 return str;
9335 }
9336 return rb_ary_new3(1, tmp);
9337 }
9338 i = 1;
9339 }
9340 if (NIL_P(limit) && !lim) empty_count = 0;
9341
9342 enc = STR_ENC_GET(str);
9343 split_type = SPLIT_TYPE_REGEXP;
9344 if (!NIL_P(spat)) {
9345 spat = get_pat_quoted(spat, 0);
9346 }
9347 else if (NIL_P(spat = rb_fs)) {
9348 split_type = SPLIT_TYPE_AWK;
9349 }
9350 else if (!(spat = rb_fs_check(spat))) {
9351 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9352 }
9353 else {
9354 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9355 }
9356 if (split_type != SPLIT_TYPE_AWK) {
9357 switch (BUILTIN_TYPE(spat)) {
9358 case T_REGEXP:
9359 rb_reg_options(spat); /* check if uninitialized */
9360 tmp = RREGEXP_SRC(spat);
9361 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9362 if (split_type == SPLIT_TYPE_AWK) {
9363 spat = tmp;
9364 split_type = SPLIT_TYPE_STRING;
9365 }
9366 break;
9367
9368 case T_STRING:
9369 mustnot_broken(spat);
9370 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9371 break;
9372
9373 default:
9375 }
9376 }
9377
9378#define SPLIT_STR(beg, len) ( \
9379 empty_count = split_string(result, str, beg, len, empty_count), \
9380 str_mod_check(str, str_start, str_len))
9381
9382 beg = 0;
9383 char *ptr = RSTRING_PTR(str);
9384 char *const str_start = ptr;
9385 const long str_len = RSTRING_LEN(str);
9386 char *const eptr = str_start + str_len;
9387 if (split_type == SPLIT_TYPE_AWK) {
9388 char *bptr = ptr;
9389 int skip = 1;
9390 unsigned int c;
9391
9392 if (result) result = rb_ary_new();
9393 end = beg;
9394 if (is_ascii_string(str)) {
9395 while (ptr < eptr) {
9396 c = (unsigned char)*ptr++;
9397 if (skip) {
9398 if (ascii_isspace(c)) {
9399 beg = ptr - bptr;
9400 }
9401 else {
9402 end = ptr - bptr;
9403 skip = 0;
9404 if (!NIL_P(limit) && lim <= i) break;
9405 }
9406 }
9407 else if (ascii_isspace(c)) {
9408 SPLIT_STR(beg, end-beg);
9409 skip = 1;
9410 beg = ptr - bptr;
9411 if (!NIL_P(limit)) ++i;
9412 }
9413 else {
9414 end = ptr - bptr;
9415 }
9416 }
9417 }
9418 else {
9419 while (ptr < eptr) {
9420 int n;
9421
9422 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9423 ptr += n;
9424 if (skip) {
9425 if (rb_isspace(c)) {
9426 beg = ptr - bptr;
9427 }
9428 else {
9429 end = ptr - bptr;
9430 skip = 0;
9431 if (!NIL_P(limit) && lim <= i) break;
9432 }
9433 }
9434 else if (rb_isspace(c)) {
9435 SPLIT_STR(beg, end-beg);
9436 skip = 1;
9437 beg = ptr - bptr;
9438 if (!NIL_P(limit)) ++i;
9439 }
9440 else {
9441 end = ptr - bptr;
9442 }
9443 }
9444 }
9445 }
9446 else if (split_type == SPLIT_TYPE_STRING) {
9447 char *substr_start = ptr;
9448 char *sptr = RSTRING_PTR(spat);
9449 long slen = RSTRING_LEN(spat);
9450
9451 if (result) result = rb_ary_new();
9452 mustnot_broken(str);
9453 enc = rb_enc_check(str, spat);
9454 while (ptr < eptr &&
9455 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9456 /* Check we are at the start of a char */
9457 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9458 if (t != ptr + end) {
9459 ptr = t;
9460 continue;
9461 }
9462 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9463 str_mod_check(spat, sptr, slen);
9464 ptr += end + slen;
9465 substr_start = ptr;
9466 if (!NIL_P(limit) && lim <= ++i) break;
9467 }
9468 beg = ptr - str_start;
9469 }
9470 else if (split_type == SPLIT_TYPE_CHARS) {
9471 int n;
9472
9473 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9474 mustnot_broken(str);
9475 enc = rb_enc_get(str);
9476 while (ptr < eptr &&
9477 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9478 SPLIT_STR(ptr - str_start, n);
9479 ptr += n;
9480 if (!NIL_P(limit) && lim <= ++i) break;
9481 }
9482 beg = ptr - str_start;
9483 }
9484 else {
9485 if (result) result = rb_ary_new();
9486 long len = RSTRING_LEN(str);
9487 long start = beg;
9488 long idx;
9489 int last_null = 0;
9490 struct re_registers *regs;
9491 VALUE match = 0;
9492
9493 for (; rb_reg_search(spat, str, start, 0) >= 0;
9494 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9495 match = rb_backref_get();
9496 if (!result) rb_match_busy(match);
9497 regs = RMATCH_REGS(match);
9498 end = BEG(0);
9499 if (start == end && BEG(0) == END(0)) {
9500 if (!ptr) {
9501 SPLIT_STR(0, 0);
9502 break;
9503 }
9504 else if (last_null == 1) {
9505 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9506 beg = start;
9507 }
9508 else {
9509 if (start == len)
9510 start++;
9511 else
9512 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9513 last_null = 1;
9514 continue;
9515 }
9516 }
9517 else {
9518 SPLIT_STR(beg, end-beg);
9519 beg = start = END(0);
9520 }
9521 last_null = 0;
9522
9523 for (idx=1; idx < regs->num_regs; idx++) {
9524 if (BEG(idx) == -1) continue;
9525 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9526 }
9527 if (!NIL_P(limit) && lim <= ++i) break;
9528 }
9529 if (match) rb_match_unbusy(match);
9530 }
9531 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9532 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9533 }
9534
9535 return result ? result : str;
9536}
9537
9538VALUE
9539rb_str_split(VALUE str, const char *sep0)
9540{
9541 VALUE sep;
9542
9543 StringValue(str);
9544 sep = rb_str_new_cstr(sep0);
9545 return rb_str_split_m(1, &sep, str);
9546}
9547
9548#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9549
9550static inline int
9551enumerator_element(VALUE ary, VALUE e)
9552{
9553 if (ary) {
9554 rb_ary_push(ary, e);
9555 return 0;
9556 }
9557 else {
9558 rb_yield(e);
9559 return 1;
9560 }
9561}
9562
9563#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9564
9565static const char *
9566chomp_newline(const char *p, const char *e, rb_encoding *enc)
9567{
9568 const char *prev = rb_enc_prev_char(p, e, e, enc);
9569 if (rb_enc_is_newline(prev, e, enc)) {
9570 e = prev;
9571 prev = rb_enc_prev_char(p, e, e, enc);
9572 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9573 e = prev;
9574 }
9575 return e;
9576}
9577
9578static VALUE
9579get_rs(void)
9580{
9581 VALUE rs = rb_rs;
9582 if (!NIL_P(rs) &&
9583 (!RB_TYPE_P(rs, T_STRING) ||
9584 RSTRING_LEN(rs) != 1 ||
9585 RSTRING_PTR(rs)[0] != '\n')) {
9586 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9587 }
9588 return rs;
9589}
9590
9591#define rb_rs get_rs()
9592
9593static VALUE
9594rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9595{
9596 rb_encoding *enc;
9597 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9598 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9599 long pos, len, rslen;
9600 int rsnewline = 0;
9601
9602 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9603 rs = rb_rs;
9604 if (!NIL_P(opts)) {
9605 static ID keywords[1];
9606 if (!keywords[0]) {
9607 keywords[0] = rb_intern_const("chomp");
9608 }
9609 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9610 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9611 }
9612
9613 if (NIL_P(rs)) {
9614 if (!ENUM_ELEM(ary, str)) {
9615 return ary;
9616 }
9617 else {
9618 return orig;
9619 }
9620 }
9621
9622 if (!RSTRING_LEN(str)) goto end;
9623 str = rb_str_new_frozen(str);
9624 ptr = subptr = RSTRING_PTR(str);
9625 pend = RSTRING_END(str);
9626 len = RSTRING_LEN(str);
9627 StringValue(rs);
9628 rslen = RSTRING_LEN(rs);
9629
9630 if (rs == rb_default_rs)
9631 enc = rb_enc_get(str);
9632 else
9633 enc = rb_enc_check(str, rs);
9634
9635 if (rslen == 0) {
9636 /* paragraph mode */
9637 int n;
9638 const char *eol = NULL;
9639 subend = subptr;
9640 while (subend < pend) {
9641 long chomp_rslen = 0;
9642 do {
9643 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9644 n = 0;
9645 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9646 if (rb_enc_is_newline(subend + n, pend, enc)) {
9647 if (eol == subend) break;
9648 subend += rslen;
9649 if (subptr) {
9650 eol = subend;
9651 chomp_rslen = -rslen;
9652 }
9653 }
9654 else {
9655 if (!subptr) subptr = subend;
9656 subend += rslen;
9657 }
9658 rslen = 0;
9659 } while (subend < pend);
9660 if (!subptr) break;
9661 if (rslen == 0) chomp_rslen = 0;
9662 line = rb_str_subseq(str, subptr - ptr,
9663 subend - subptr + (chomp ? chomp_rslen : rslen));
9664 if (ENUM_ELEM(ary, line)) {
9665 str_mod_check(str, ptr, len);
9666 }
9667 subptr = eol = NULL;
9668 }
9669 goto end;
9670 }
9671 else {
9672 rsptr = RSTRING_PTR(rs);
9673 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9674 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9675 rsnewline = 1;
9676 }
9677 }
9678
9679 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9680 rs = rb_str_new(rsptr, rslen);
9681 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9682 rsptr = RSTRING_PTR(rs);
9683 rslen = RSTRING_LEN(rs);
9684 }
9685
9686 while (subptr < pend) {
9687 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9688 if (pos < 0) break;
9689 hit = subptr + pos;
9690 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9691 if (hit != adjusted) {
9692 subptr = adjusted;
9693 continue;
9694 }
9695 subend = hit += rslen;
9696 if (chomp) {
9697 if (rsnewline) {
9698 subend = chomp_newline(subptr, subend, enc);
9699 }
9700 else {
9701 subend -= rslen;
9702 }
9703 }
9704 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9705 if (ENUM_ELEM(ary, line)) {
9706 str_mod_check(str, ptr, len);
9707 }
9708 subptr = hit;
9709 }
9710
9711 if (subptr != pend) {
9712 if (chomp) {
9713 if (rsnewline) {
9714 pend = chomp_newline(subptr, pend, enc);
9715 }
9716 else if (pend - subptr >= rslen &&
9717 memcmp(pend - rslen, rsptr, rslen) == 0) {
9718 pend -= rslen;
9719 }
9720 }
9721 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9722 ENUM_ELEM(ary, line);
9723 RB_GC_GUARD(str);
9724 }
9725
9726 end:
9727 if (ary)
9728 return ary;
9729 else
9730 return orig;
9731}
9732
9733/*
9734 * call-seq:
9735 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9736 * each_line(line_sep = $/, chomp: false) -> enumerator
9737 *
9738 * :include: doc/string/each_line.rdoc
9739 *
9740 */
9741
9742static VALUE
9743rb_str_each_line(int argc, VALUE *argv, VALUE str)
9744{
9745 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9746 return rb_str_enumerate_lines(argc, argv, str, 0);
9747}
9748
9749/*
9750 * call-seq:
9751 * lines(Line_sep = $/, chomp: false) -> array_of_strings
9752 *
9753 * Forms substrings ("lines") of +self+ according to the given arguments
9754 * (see String#each_line for details); returns the lines in an array.
9755 *
9756 */
9757
9758static VALUE
9759rb_str_lines(int argc, VALUE *argv, VALUE str)
9760{
9761 VALUE ary = WANTARRAY("lines", 0);
9762 return rb_str_enumerate_lines(argc, argv, str, ary);
9763}
9764
9765static VALUE
9766rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9767{
9768 return LONG2FIX(RSTRING_LEN(str));
9769}
9770
9771static VALUE
9772rb_str_enumerate_bytes(VALUE str, VALUE ary)
9773{
9774 long i;
9775
9776 for (i=0; i<RSTRING_LEN(str); i++) {
9777 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9778 }
9779 if (ary)
9780 return ary;
9781 else
9782 return str;
9783}
9784
9785/*
9786 * call-seq:
9787 * each_byte {|byte| ... } -> self
9788 * each_byte -> enumerator
9789 *
9790 * :include: doc/string/each_byte.rdoc
9791 *
9792 */
9793
9794static VALUE
9795rb_str_each_byte(VALUE str)
9796{
9797 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9798 return rb_str_enumerate_bytes(str, 0);
9799}
9800
9801/*
9802 * call-seq:
9803 * bytes -> array_of_bytes
9804 *
9805 * :include: doc/string/bytes.rdoc
9806 *
9807 */
9808
9809static VALUE
9810rb_str_bytes(VALUE str)
9811{
9812 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9813 return rb_str_enumerate_bytes(str, ary);
9814}
9815
9816static VALUE
9817rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9818{
9819 return rb_str_length(str);
9820}
9821
9822static VALUE
9823rb_str_enumerate_chars(VALUE str, VALUE ary)
9824{
9825 VALUE orig = str;
9826 long i, len, n;
9827 const char *ptr;
9828 rb_encoding *enc;
9829
9830 str = rb_str_new_frozen(str);
9831 ptr = RSTRING_PTR(str);
9832 len = RSTRING_LEN(str);
9833 enc = rb_enc_get(str);
9834
9836 for (i = 0; i < len; i += n) {
9837 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9838 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9839 }
9840 }
9841 else {
9842 for (i = 0; i < len; i += n) {
9843 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9844 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9845 }
9846 }
9847 RB_GC_GUARD(str);
9848 if (ary)
9849 return ary;
9850 else
9851 return orig;
9852}
9853
9854/*
9855 * call-seq:
9856 * each_char {|c| ... } -> self
9857 * each_char -> enumerator
9858 *
9859 * :include: doc/string/each_char.rdoc
9860 *
9861 */
9862
9863static VALUE
9864rb_str_each_char(VALUE str)
9865{
9866 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9867 return rb_str_enumerate_chars(str, 0);
9868}
9869
9870/*
9871 * call-seq:
9872 * chars -> array_of_characters
9873 *
9874 * :include: doc/string/chars.rdoc
9875 *
9876 */
9877
9878static VALUE
9879rb_str_chars(VALUE str)
9880{
9881 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9882 return rb_str_enumerate_chars(str, ary);
9883}
9884
9885static VALUE
9886rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9887{
9888 VALUE orig = str;
9889 int n;
9890 unsigned int c;
9891 const char *ptr, *end;
9892 rb_encoding *enc;
9893
9894 if (single_byte_optimizable(str))
9895 return rb_str_enumerate_bytes(str, ary);
9896
9897 str = rb_str_new_frozen(str);
9898 ptr = RSTRING_PTR(str);
9899 end = RSTRING_END(str);
9900 enc = STR_ENC_GET(str);
9901
9902 while (ptr < end) {
9903 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9904 ENUM_ELEM(ary, UINT2NUM(c));
9905 ptr += n;
9906 }
9907 RB_GC_GUARD(str);
9908 if (ary)
9909 return ary;
9910 else
9911 return orig;
9912}
9913
9914/*
9915 * call-seq:
9916 * each_codepoint {|integer| ... } -> self
9917 * each_codepoint -> enumerator
9918 *
9919 * :include: doc/string/each_codepoint.rdoc
9920 *
9921 */
9922
9923static VALUE
9924rb_str_each_codepoint(VALUE str)
9925{
9926 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9927 return rb_str_enumerate_codepoints(str, 0);
9928}
9929
9930/*
9931 * call-seq:
9932 * codepoints -> array_of_integers
9933 *
9934 * :include: doc/string/codepoints.rdoc
9935 *
9936 */
9937
9938static VALUE
9939rb_str_codepoints(VALUE str)
9940{
9941 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9942 return rb_str_enumerate_codepoints(str, ary);
9943}
9944
9945static regex_t *
9946get_reg_grapheme_cluster(rb_encoding *enc)
9947{
9948 int encidx = rb_enc_to_index(enc);
9949
9950 const OnigUChar source_ascii[] = "\\X";
9951 const OnigUChar *source = source_ascii;
9952 size_t source_len = sizeof(source_ascii) - 1;
9953
9954 switch (encidx) {
9955#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9956#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9957#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9958#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9959#define CASE_UTF(e) \
9960 case ENCINDEX_UTF_##e: { \
9961 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9962 source = source_UTF_##e; \
9963 source_len = sizeof(source_UTF_##e); \
9964 break; \
9965 }
9966 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9967#undef CASE_UTF
9968#undef CHARS_16BE
9969#undef CHARS_16LE
9970#undef CHARS_32BE
9971#undef CHARS_32LE
9972 }
9973
9974 regex_t *reg_grapheme_cluster;
9975 OnigErrorInfo einfo;
9976 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9977 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9978 if (r) {
9979 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9980 onig_error_code_to_str(message, r, &einfo);
9981 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9982 }
9983
9984 return reg_grapheme_cluster;
9985}
9986
9987static regex_t *
9988get_cached_reg_grapheme_cluster(rb_encoding *enc)
9989{
9990 int encidx = rb_enc_to_index(enc);
9991 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9992
9993 if (encidx == rb_utf8_encindex()) {
9994 if (!reg_grapheme_cluster_utf8) {
9995 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9996 }
9997
9998 return reg_grapheme_cluster_utf8;
9999 }
10000
10001 return NULL;
10002}
10003
10004static VALUE
10005rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
10006{
10007 size_t grapheme_cluster_count = 0;
10008 rb_encoding *enc = get_encoding(str);
10009 const char *ptr, *end;
10010
10011 if (!rb_enc_unicode_p(enc)) {
10012 return rb_str_length(str);
10013 }
10014
10015 bool cached_reg_grapheme_cluster = true;
10016 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10017 if (!reg_grapheme_cluster) {
10018 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10019 cached_reg_grapheme_cluster = false;
10020 }
10021
10022 ptr = RSTRING_PTR(str);
10023 end = RSTRING_END(str);
10024
10025 while (ptr < end) {
10026 OnigPosition len = onig_match(reg_grapheme_cluster,
10027 (const OnigUChar *)ptr, (const OnigUChar *)end,
10028 (const OnigUChar *)ptr, NULL, 0);
10029 if (len <= 0) break;
10030 grapheme_cluster_count++;
10031 ptr += len;
10032 }
10033
10034 if (!cached_reg_grapheme_cluster) {
10035 onig_free(reg_grapheme_cluster);
10036 }
10037
10038 return SIZET2NUM(grapheme_cluster_count);
10039}
10040
10041static VALUE
10042rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
10043{
10044 VALUE orig = str;
10045 rb_encoding *enc = get_encoding(str);
10046 const char *ptr0, *ptr, *end;
10047
10048 if (!rb_enc_unicode_p(enc)) {
10049 return rb_str_enumerate_chars(str, ary);
10050 }
10051
10052 if (!ary) str = rb_str_new_frozen(str);
10053
10054 bool cached_reg_grapheme_cluster = true;
10055 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
10056 if (!reg_grapheme_cluster) {
10057 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
10058 cached_reg_grapheme_cluster = false;
10059 }
10060
10061 ptr0 = ptr = RSTRING_PTR(str);
10062 end = RSTRING_END(str);
10063
10064 while (ptr < end) {
10065 OnigPosition len = onig_match(reg_grapheme_cluster,
10066 (const OnigUChar *)ptr, (const OnigUChar *)end,
10067 (const OnigUChar *)ptr, NULL, 0);
10068 if (len <= 0) break;
10069 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
10070 ptr += len;
10071 }
10072
10073 if (!cached_reg_grapheme_cluster) {
10074 onig_free(reg_grapheme_cluster);
10075 }
10076
10077 RB_GC_GUARD(str);
10078 if (ary)
10079 return ary;
10080 else
10081 return orig;
10082}
10083
10084/*
10085 * call-seq:
10086 * each_grapheme_cluster {|gc| ... } -> self
10087 * each_grapheme_cluster -> enumerator
10088 *
10089 * :include: doc/string/each_grapheme_cluster.rdoc
10090 *
10091 */
10092
10093static VALUE
10094rb_str_each_grapheme_cluster(VALUE str)
10095{
10096 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
10097 return rb_str_enumerate_grapheme_clusters(str, 0);
10098}
10099
10100/*
10101 * call-seq:
10102 * grapheme_clusters -> array_of_grapheme_clusters
10103 *
10104 * :include: doc/string/grapheme_clusters.rdoc
10105 *
10106 */
10107
10108static VALUE
10109rb_str_grapheme_clusters(VALUE str)
10110{
10111 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
10112 return rb_str_enumerate_grapheme_clusters(str, ary);
10113}
10114
10115static long
10116chopped_length(VALUE str)
10117{
10118 rb_encoding *enc = STR_ENC_GET(str);
10119 const char *p, *p2, *beg, *end;
10120
10121 beg = RSTRING_PTR(str);
10122 end = beg + RSTRING_LEN(str);
10123 if (beg >= end) return 0;
10124 p = rb_enc_prev_char(beg, end, end, enc);
10125 if (!p) return 0;
10126 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
10127 p2 = rb_enc_prev_char(beg, p, end, enc);
10128 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
10129 }
10130 return p - beg;
10131}
10132
10133/*
10134 * call-seq:
10135 * chop! -> self or nil
10136 *
10137 * Like String#chop, but modifies +self+ in place;
10138 * returns +nil+ if +self+ is empty, +self+ otherwise.
10139 *
10140 * Related: String#chomp!.
10141 */
10142
10143static VALUE
10144rb_str_chop_bang(VALUE str)
10145{
10146 str_modify_keep_cr(str);
10147 if (RSTRING_LEN(str) > 0) {
10148 long len;
10149 len = chopped_length(str);
10150 STR_SET_LEN(str, len);
10151 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10152 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10154 }
10155 return str;
10156 }
10157 return Qnil;
10158}
10159
10160
10161/*
10162 * call-seq:
10163 * chop -> new_string
10164 *
10165 * :include: doc/string/chop.rdoc
10166 *
10167 */
10168
10169static VALUE
10170rb_str_chop(VALUE str)
10171{
10172 return rb_str_subseq(str, 0, chopped_length(str));
10173}
10174
10175static long
10176smart_chomp(VALUE str, const char *e, const char *p)
10177{
10178 rb_encoding *enc = rb_enc_get(str);
10179 if (rb_enc_mbminlen(enc) > 1) {
10180 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10181 if (rb_enc_is_newline(pp, e, enc)) {
10182 e = pp;
10183 }
10184 pp = e - rb_enc_mbminlen(enc);
10185 if (pp >= p) {
10186 pp = rb_enc_left_char_head(p, pp, e, enc);
10187 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10188 e = pp;
10189 }
10190 }
10191 }
10192 else {
10193 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10194 case '\n':
10195 if (--e > p && *(e-1) == '\r') {
10196 --e;
10197 }
10198 break;
10199 case '\r':
10200 --e;
10201 break;
10202 }
10203 }
10204 return e - p;
10205}
10206
10207static long
10208chompped_length(VALUE str, VALUE rs)
10209{
10210 rb_encoding *enc;
10211 int newline;
10212 char *pp, *e, *rsptr;
10213 long rslen;
10214 char *const p = RSTRING_PTR(str);
10215 long len = RSTRING_LEN(str);
10216
10217 if (len == 0) return 0;
10218 e = p + len;
10219 if (rs == rb_default_rs) {
10220 return smart_chomp(str, e, p);
10221 }
10222
10223 enc = rb_enc_get(str);
10224 RSTRING_GETMEM(rs, rsptr, rslen);
10225 if (rslen == 0) {
10226 if (rb_enc_mbminlen(enc) > 1) {
10227 while (e > p) {
10228 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10229 if (!rb_enc_is_newline(pp, e, enc)) break;
10230 e = pp;
10231 pp -= rb_enc_mbminlen(enc);
10232 if (pp >= p) {
10233 pp = rb_enc_left_char_head(p, pp, e, enc);
10234 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10235 e = pp;
10236 }
10237 }
10238 }
10239 }
10240 else {
10241 while (e > p && *(e-1) == '\n') {
10242 --e;
10243 if (e > p && *(e-1) == '\r')
10244 --e;
10245 }
10246 }
10247 return e - p;
10248 }
10249 if (rslen > len) return len;
10250
10251 enc = rb_enc_get(rs);
10252 newline = rsptr[rslen-1];
10253 if (rslen == rb_enc_mbminlen(enc)) {
10254 if (rslen == 1) {
10255 if (newline == '\n')
10256 return smart_chomp(str, e, p);
10257 }
10258 else {
10259 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10260 return smart_chomp(str, e, p);
10261 }
10262 }
10263
10264 enc = rb_enc_check(str, rs);
10265 if (is_broken_string(rs)) {
10266 return len;
10267 }
10268 pp = e - rslen;
10269 if (p[len-1] == newline &&
10270 (rslen <= 1 ||
10271 memcmp(rsptr, pp, rslen) == 0)) {
10272 if (at_char_boundary(p, pp, e, enc))
10273 return len - rslen;
10274 RB_GC_GUARD(rs);
10275 }
10276 return len;
10277}
10278
10284static VALUE
10285chomp_rs(int argc, const VALUE *argv)
10286{
10287 rb_check_arity(argc, 0, 1);
10288 if (argc > 0) {
10289 VALUE rs = argv[0];
10290 if (!NIL_P(rs)) StringValue(rs);
10291 return rs;
10292 }
10293 else {
10294 return rb_rs;
10295 }
10296}
10297
10298VALUE
10299rb_str_chomp_string(VALUE str, VALUE rs)
10300{
10301 long olen = RSTRING_LEN(str);
10302 long len = chompped_length(str, rs);
10303 if (len >= olen) return Qnil;
10304 str_modify_keep_cr(str);
10305 STR_SET_LEN(str, len);
10306 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10307 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10309 }
10310 return str;
10311}
10312
10313/*
10314 * call-seq:
10315 * chomp!(line_sep = $/) -> self or nil
10316 *
10317 * Like String#chomp, but modifies +self+ in place;
10318 * returns +nil+ if no modification made, +self+ otherwise.
10319 *
10320 */
10321
10322static VALUE
10323rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10324{
10325 VALUE rs;
10326 str_modifiable(str);
10327 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10328 rs = chomp_rs(argc, argv);
10329 if (NIL_P(rs)) return Qnil;
10330 return rb_str_chomp_string(str, rs);
10331}
10332
10333
10334/*
10335 * call-seq:
10336 * chomp(line_sep = $/) -> new_string
10337 *
10338 * :include: doc/string/chomp.rdoc
10339 *
10340 */
10341
10342static VALUE
10343rb_str_chomp(int argc, VALUE *argv, VALUE str)
10344{
10345 VALUE rs = chomp_rs(argc, argv);
10346 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10347 return rb_str_subseq(str, 0, chompped_length(str, rs));
10348}
10349
10350static long
10351lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10352{
10353 const char *const start = s;
10354
10355 if (!s || s >= e) return 0;
10356
10357 /* remove spaces at head */
10358 if (single_byte_optimizable(str)) {
10359 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10360 }
10361 else {
10362 while (s < e) {
10363 int n;
10364 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10365
10366 if (cc && !rb_isspace(cc)) break;
10367 s += n;
10368 }
10369 }
10370 return s - start;
10371}
10372
10373/*
10374 * call-seq:
10375 * lstrip! -> self or nil
10376 *
10377 * Like String#lstrip, except that any modifications are made in +self+;
10378 * returns +self+ if any modification are made, +nil+ otherwise.
10379 *
10380 * Related: String#rstrip!, String#strip!.
10381 */
10382
10383static VALUE
10384rb_str_lstrip_bang(VALUE str)
10385{
10386 rb_encoding *enc;
10387 char *start, *s;
10388 long olen, loffset;
10389
10390 str_modify_keep_cr(str);
10391 enc = STR_ENC_GET(str);
10392 RSTRING_GETMEM(str, start, olen);
10393 loffset = lstrip_offset(str, start, start+olen, enc);
10394 if (loffset > 0) {
10395 long len = olen-loffset;
10396 s = start + loffset;
10397 memmove(start, s, len);
10398 STR_SET_LEN(str, len);
10399 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10400 return str;
10401 }
10402 return Qnil;
10403}
10404
10405
10406/*
10407 * call-seq:
10408 * lstrip -> new_string
10409 *
10410 * Returns a copy of +self+ with leading whitespace removed;
10411 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10412 *
10413 * whitespace = "\x00\t\n\v\f\r "
10414 * s = whitespace + 'abc' + whitespace
10415 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10416 * s.lstrip # => "abc\u0000\t\n\v\f\r "
10417 *
10418 * Related: String#rstrip, String#strip.
10419 */
10420
10421static VALUE
10422rb_str_lstrip(VALUE str)
10423{
10424 char *start;
10425 long len, loffset;
10426 RSTRING_GETMEM(str, start, len);
10427 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10428 if (loffset <= 0) return str_duplicate(rb_cString, str);
10429 return rb_str_subseq(str, loffset, len - loffset);
10430}
10431
10432static long
10433rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10434{
10435 const char *t;
10436
10437 rb_str_check_dummy_enc(enc);
10439 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10440 }
10441 if (!s || s >= e) return 0;
10442 t = e;
10443
10444 /* remove trailing spaces or '\0's */
10445 if (single_byte_optimizable(str)) {
10446 unsigned char c;
10447 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10448 }
10449 else {
10450 char *tp;
10451
10452 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10453 unsigned int c = rb_enc_codepoint(tp, e, enc);
10454 if (c && !rb_isspace(c)) break;
10455 t = tp;
10456 }
10457 }
10458 return e - t;
10459}
10460
10461/*
10462 * call-seq:
10463 * rstrip! -> self or nil
10464 *
10465 * Like String#rstrip, except that any modifications are made in +self+;
10466 * returns +self+ if any modification are made, +nil+ otherwise.
10467 *
10468 * Related: String#lstrip!, String#strip!.
10469 */
10470
10471static VALUE
10472rb_str_rstrip_bang(VALUE str)
10473{
10474 rb_encoding *enc;
10475 char *start;
10476 long olen, roffset;
10477
10478 str_modify_keep_cr(str);
10479 enc = STR_ENC_GET(str);
10480 RSTRING_GETMEM(str, start, olen);
10481 roffset = rstrip_offset(str, start, start+olen, enc);
10482 if (roffset > 0) {
10483 long len = olen - roffset;
10484
10485 STR_SET_LEN(str, len);
10486 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10487 return str;
10488 }
10489 return Qnil;
10490}
10491
10492
10493/*
10494 * call-seq:
10495 * rstrip -> new_string
10496 *
10497 * Returns a copy of the receiver with trailing whitespace removed;
10498 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10499 *
10500 * whitespace = "\x00\t\n\v\f\r "
10501 * s = whitespace + 'abc' + whitespace
10502 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10503 * s.rstrip # => "\u0000\t\n\v\f\r abc"
10504 *
10505 * Related: String#lstrip, String#strip.
10506 */
10507
10508static VALUE
10509rb_str_rstrip(VALUE str)
10510{
10511 rb_encoding *enc;
10512 char *start;
10513 long olen, roffset;
10514
10515 enc = STR_ENC_GET(str);
10516 RSTRING_GETMEM(str, start, olen);
10517 roffset = rstrip_offset(str, start, start+olen, enc);
10518
10519 if (roffset <= 0) return str_duplicate(rb_cString, str);
10520 return rb_str_subseq(str, 0, olen-roffset);
10521}
10522
10523
10524/*
10525 * call-seq:
10526 * strip! -> self or nil
10527 *
10528 * Like String#strip, except that any modifications are made in +self+;
10529 * returns +self+ if any modification are made, +nil+ otherwise.
10530 *
10531 * Related: String#lstrip!, String#strip!.
10532 */
10533
10534static VALUE
10535rb_str_strip_bang(VALUE str)
10536{
10537 char *start;
10538 long olen, loffset, roffset;
10539 rb_encoding *enc;
10540
10541 str_modify_keep_cr(str);
10542 enc = STR_ENC_GET(str);
10543 RSTRING_GETMEM(str, start, olen);
10544 loffset = lstrip_offset(str, start, start+olen, enc);
10545 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10546
10547 if (loffset > 0 || roffset > 0) {
10548 long len = olen-roffset;
10549 if (loffset > 0) {
10550 len -= loffset;
10551 memmove(start, start + loffset, len);
10552 }
10553 STR_SET_LEN(str, len);
10554 TERM_FILL(start+len, rb_enc_mbminlen(enc));
10555 return str;
10556 }
10557 return Qnil;
10558}
10559
10560
10561/*
10562 * call-seq:
10563 * strip -> new_string
10564 *
10565 * Returns a copy of the receiver with leading and trailing whitespace removed;
10566 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10567 *
10568 * whitespace = "\x00\t\n\v\f\r "
10569 * s = whitespace + 'abc' + whitespace
10570 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10571 * s.strip # => "abc"
10572 *
10573 * Related: String#lstrip, String#rstrip.
10574 */
10575
10576static VALUE
10577rb_str_strip(VALUE str)
10578{
10579 char *start;
10580 long olen, loffset, roffset;
10581 rb_encoding *enc = STR_ENC_GET(str);
10582
10583 RSTRING_GETMEM(str, start, olen);
10584 loffset = lstrip_offset(str, start, start+olen, enc);
10585 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10586
10587 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10588 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10589}
10590
10591static VALUE
10592scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10593{
10594 VALUE result = Qnil;
10595 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10596 if (pos >= 0) {
10597 VALUE match;
10598 struct re_registers *regs;
10599 if (BUILTIN_TYPE(pat) == T_STRING) {
10600 regs = NULL;
10601 end = pos + RSTRING_LEN(pat);
10602 }
10603 else {
10604 match = rb_backref_get();
10605 regs = RMATCH_REGS(match);
10606 pos = BEG(0);
10607 end = END(0);
10608 }
10609
10610 if (pos == end) {
10611 rb_encoding *enc = STR_ENC_GET(str);
10612 /*
10613 * Always consume at least one character of the input string
10614 */
10615 if (RSTRING_LEN(str) > end)
10616 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10617 RSTRING_END(str), enc);
10618 else
10619 *start = end + 1;
10620 }
10621 else {
10622 *start = end;
10623 }
10624
10625 if (!regs || regs->num_regs == 1) {
10626 result = rb_str_subseq(str, pos, end - pos);
10627 return result;
10628 }
10629 else {
10630 result = rb_ary_new2(regs->num_regs);
10631 for (int i = 1; i < regs->num_regs; i++) {
10632 VALUE s = Qnil;
10633 if (BEG(i) >= 0) {
10634 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10635 }
10636
10637 rb_ary_push(result, s);
10638 }
10639 }
10640
10641 RB_GC_GUARD(match);
10642 }
10643
10644 return result;
10645}
10646
10647
10648/*
10649 * call-seq:
10650 * scan(string_or_regexp) -> array
10651 * scan(string_or_regexp) {|matches| ... } -> self
10652 *
10653 * Matches a pattern against +self+; the pattern is:
10654 *
10655 * - +string_or_regexp+ itself, if it is a Regexp.
10656 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10657 *
10658 * Iterates through +self+, generating a collection of matching results:
10659 *
10660 * - If the pattern contains no groups, each result is the
10661 * matched string, <code>$&</code>.
10662 * - If the pattern contains groups, each result is an array
10663 * containing one entry per group.
10664 *
10665 * With no block given, returns an array of the results:
10666 *
10667 * s = 'cruel world'
10668 * s.scan(/\w+/) # => ["cruel", "world"]
10669 * s.scan(/.../) # => ["cru", "el ", "wor"]
10670 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10671 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10672 *
10673 * With a block given, calls the block with each result; returns +self+:
10674 *
10675 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10676 * print "\n"
10677 * s.scan(/(.)(.)/) {|x,y| print y, x }
10678 * print "\n"
10679 *
10680 * Output:
10681 *
10682 * <<cruel>> <<world>>
10683 * rceu lowlr
10684 *
10685 */
10686
10687static VALUE
10688rb_str_scan(VALUE str, VALUE pat)
10689{
10690 VALUE result;
10691 long start = 0;
10692 long last = -1, prev = 0;
10693 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10694
10695 pat = get_pat_quoted(pat, 1);
10696 mustnot_broken(str);
10697 if (!rb_block_given_p()) {
10698 VALUE ary = rb_ary_new();
10699
10700 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10701 last = prev;
10702 prev = start;
10703 rb_ary_push(ary, result);
10704 }
10705 if (last >= 0) rb_pat_search(pat, str, last, 1);
10706 else rb_backref_set(Qnil);
10707 return ary;
10708 }
10709
10710 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10711 last = prev;
10712 prev = start;
10713 rb_yield(result);
10714 str_mod_check(str, p, len);
10715 }
10716 if (last >= 0) rb_pat_search(pat, str, last, 1);
10717 return str;
10718}
10719
10720
10721/*
10722 * call-seq:
10723 * hex -> integer
10724 *
10725 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10726 * (with an optional sign and an optional <code>0x</code>) and returns the
10727 * corresponding number;
10728 * returns zero if there is no such leading substring:
10729 *
10730 * '0x0a'.hex # => 10
10731 * '-1234'.hex # => -4660
10732 * '0'.hex # => 0
10733 * 'non-numeric'.hex # => 0
10734 *
10735 * Related: String#oct.
10736 *
10737 */
10738
10739static VALUE
10740rb_str_hex(VALUE str)
10741{
10742 return rb_str_to_inum(str, 16, FALSE);
10743}
10744
10745
10746/*
10747 * call-seq:
10748 * oct -> integer
10749 *
10750 * Interprets the leading substring of +self+ as a string of octal digits
10751 * (with an optional sign) and returns the corresponding number;
10752 * returns zero if there is no such leading substring:
10753 *
10754 * '123'.oct # => 83
10755 * '-377'.oct # => -255
10756 * '0377non-numeric'.oct # => 255
10757 * 'non-numeric'.oct # => 0
10758 *
10759 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10760 * see Kernel#Integer.
10761 *
10762 * Related: String#hex.
10763 *
10764 */
10765
10766static VALUE
10767rb_str_oct(VALUE str)
10768{
10769 return rb_str_to_inum(str, -8, FALSE);
10770}
10771
10772#ifndef HAVE_CRYPT_R
10773# include "ruby/thread_native.h"
10774# include "ruby/atomic.h"
10775
10776static struct {
10777 rb_nativethread_lock_t lock;
10778} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10779#endif
10780
10781/*
10782 * call-seq:
10783 * crypt(salt_str) -> new_string
10784 *
10785 * Returns the string generated by calling <code>crypt(3)</code>
10786 * standard library function with <code>str</code> and
10787 * <code>salt_str</code>, in this order, as its arguments. Please do
10788 * not use this method any longer. It is legacy; provided only for
10789 * backward compatibility with ruby scripts in earlier days. It is
10790 * bad to use in contemporary programs for several reasons:
10791 *
10792 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10793 * run. The generated string lacks data portability.
10794 *
10795 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10796 * (i.e. silently ends up in unexpected results).
10797 *
10798 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10799 * thread safe.
10800 *
10801 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10802 * very very weak. According to its manpage, Linux's traditional
10803 * <code>crypt(3)</code> output has only 2**56 variations; too
10804 * easy to brute force today. And this is the default behaviour.
10805 *
10806 * * In order to make things robust some OSes implement so-called
10807 * "modular" usage. To go through, you have to do a complex
10808 * build-up of the <code>salt_str</code> parameter, by hand.
10809 * Failure in generation of a proper salt string tends not to
10810 * yield any errors; typos in parameters are normally not
10811 * detectable.
10812 *
10813 * * For instance, in the following example, the second invocation
10814 * of String#crypt is wrong; it has a typo in "round=" (lacks
10815 * "s"). However the call does not fail and something unexpected
10816 * is generated.
10817 *
10818 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10819 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10820 *
10821 * * Even in the "modular" mode, some hash functions are considered
10822 * archaic and no longer recommended at all; for instance module
10823 * <code>$1$</code> is officially abandoned by its author: see
10824 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10825 * instance module <code>$3$</code> is considered completely
10826 * broken: see the manpage of FreeBSD.
10827 *
10828 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10829 * written above, <code>crypt(3)</code> on Mac OS never fails.
10830 * This means even if you build up a proper salt string it
10831 * generates a traditional DES hash anyways, and there is no way
10832 * for you to be aware of.
10833 *
10834 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10835 *
10836 * If for some reason you cannot migrate to other secure contemporary
10837 * password hashing algorithms, install the string-crypt gem and
10838 * <code>require 'string/crypt'</code> to continue using it.
10839 */
10840
10841static VALUE
10842rb_str_crypt(VALUE str, VALUE salt)
10843{
10844#ifdef HAVE_CRYPT_R
10845 VALUE databuf;
10846 struct crypt_data *data;
10847# define CRYPT_END() ALLOCV_END(databuf)
10848#else
10849 char *tmp_buf;
10850 extern char *crypt(const char *, const char *);
10851# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10852#endif
10853 VALUE result;
10854 const char *s, *saltp;
10855 char *res;
10856#ifdef BROKEN_CRYPT
10857 char salt_8bit_clean[3];
10858#endif
10859
10860 StringValue(salt);
10861 mustnot_wchar(str);
10862 mustnot_wchar(salt);
10863 s = StringValueCStr(str);
10864 saltp = RSTRING_PTR(salt);
10865 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10866 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10867 }
10868
10869#ifdef BROKEN_CRYPT
10870 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10871 salt_8bit_clean[0] = saltp[0] & 0x7f;
10872 salt_8bit_clean[1] = saltp[1] & 0x7f;
10873 salt_8bit_clean[2] = '\0';
10874 saltp = salt_8bit_clean;
10875 }
10876#endif
10877#ifdef HAVE_CRYPT_R
10878 data = ALLOCV(databuf, sizeof(struct crypt_data));
10879# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10880 data->initialized = 0;
10881# endif
10882 res = crypt_r(s, saltp, data);
10883#else
10884 rb_nativethread_lock_lock(&crypt_mutex.lock);
10885 res = crypt(s, saltp);
10886#endif
10887 if (!res) {
10888 int err = errno;
10889 CRYPT_END();
10890 rb_syserr_fail(err, "crypt");
10891 }
10892#ifdef HAVE_CRYPT_R
10893 result = rb_str_new_cstr(res);
10894 CRYPT_END();
10895#else
10896 // We need to copy this buffer because it's static and we need to unlock the mutex
10897 // before allocating a new object (the string to be returned). If we allocate while
10898 // holding the lock, we could run GC which fires the VM barrier and causes a deadlock
10899 // if other ractors are waiting on this lock.
10900 size_t res_size = strlen(res)+1;
10901 tmp_buf = ALLOCA_N(char, res_size); // should be small enough to alloca
10902 memcpy(tmp_buf, res, res_size);
10903 res = tmp_buf;
10904 CRYPT_END();
10905 result = rb_str_new_cstr(res);
10906#endif
10907 return result;
10908}
10909
10910
10911/*
10912 * call-seq:
10913 * ord -> integer
10914 *
10915 * :include: doc/string/ord.rdoc
10916 *
10917 */
10918
10919static VALUE
10920rb_str_ord(VALUE s)
10921{
10922 unsigned int c;
10923
10924 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10925 return UINT2NUM(c);
10926}
10927/*
10928 * call-seq:
10929 * sum(n = 16) -> integer
10930 *
10931 * :include: doc/string/sum.rdoc
10932 *
10933 */
10934
10935static VALUE
10936rb_str_sum(int argc, VALUE *argv, VALUE str)
10937{
10938 int bits = 16;
10939 char *ptr, *p, *pend;
10940 long len;
10941 VALUE sum = INT2FIX(0);
10942 unsigned long sum0 = 0;
10943
10944 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10945 bits = 0;
10946 }
10947 ptr = p = RSTRING_PTR(str);
10948 len = RSTRING_LEN(str);
10949 pend = p + len;
10950
10951 while (p < pend) {
10952 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10953 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10954 str_mod_check(str, ptr, len);
10955 sum0 = 0;
10956 }
10957 sum0 += (unsigned char)*p;
10958 p++;
10959 }
10960
10961 if (bits == 0) {
10962 if (sum0) {
10963 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10964 }
10965 }
10966 else {
10967 if (sum == INT2FIX(0)) {
10968 if (bits < (int)sizeof(long)*CHAR_BIT) {
10969 sum0 &= (((unsigned long)1)<<bits)-1;
10970 }
10971 sum = LONG2FIX(sum0);
10972 }
10973 else {
10974 VALUE mod;
10975
10976 if (sum0) {
10977 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10978 }
10979
10980 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10981 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10982 sum = rb_funcall(sum, '&', 1, mod);
10983 }
10984 }
10985 return sum;
10986}
10987
10988static VALUE
10989rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10990{
10991 rb_encoding *enc;
10992 VALUE w;
10993 long width, len, flen = 1, fclen = 1;
10994 VALUE res;
10995 char *p;
10996 const char *f = " ";
10997 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10998 VALUE pad;
10999 int singlebyte = 1, cr;
11000 int termlen;
11001
11002 rb_scan_args(argc, argv, "11", &w, &pad);
11003 enc = STR_ENC_GET(str);
11004 termlen = rb_enc_mbminlen(enc);
11005 width = NUM2LONG(w);
11006 if (argc == 2) {
11007 StringValue(pad);
11008 enc = rb_enc_check(str, pad);
11009 f = RSTRING_PTR(pad);
11010 flen = RSTRING_LEN(pad);
11011 fclen = str_strlen(pad, enc); /* rb_enc_check */
11012 singlebyte = single_byte_optimizable(pad);
11013 if (flen == 0 || fclen == 0) {
11014 rb_raise(rb_eArgError, "zero width padding");
11015 }
11016 }
11017 len = str_strlen(str, enc); /* rb_enc_check */
11018 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
11019 n = width - len;
11020 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
11021 rlen = n - llen;
11022 cr = ENC_CODERANGE(str);
11023 if (flen > 1) {
11024 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
11025 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
11026 }
11027 size = RSTRING_LEN(str);
11028 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
11029 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
11030 (len += llen2 + rlen2) >= LONG_MAX - size) {
11031 rb_raise(rb_eArgError, "argument too big");
11032 }
11033 len += size;
11034 res = str_enc_new(rb_cString, 0, len, enc);
11035 p = RSTRING_PTR(res);
11036 if (flen <= 1) {
11037 memset(p, *f, llen);
11038 p += llen;
11039 }
11040 else {
11041 while (llen >= fclen) {
11042 memcpy(p,f,flen);
11043 p += flen;
11044 llen -= fclen;
11045 }
11046 if (llen > 0) {
11047 memcpy(p, f, llen2);
11048 p += llen2;
11049 }
11050 }
11051 memcpy(p, RSTRING_PTR(str), size);
11052 p += size;
11053 if (flen <= 1) {
11054 memset(p, *f, rlen);
11055 p += rlen;
11056 }
11057 else {
11058 while (rlen >= fclen) {
11059 memcpy(p,f,flen);
11060 p += flen;
11061 rlen -= fclen;
11062 }
11063 if (rlen > 0) {
11064 memcpy(p, f, rlen2);
11065 p += rlen2;
11066 }
11067 }
11068 TERM_FILL(p, termlen);
11069 STR_SET_LEN(res, p-RSTRING_PTR(res));
11070
11071 if (argc == 2)
11072 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
11073 if (cr != ENC_CODERANGE_BROKEN)
11074 ENC_CODERANGE_SET(res, cr);
11075
11076 RB_GC_GUARD(pad);
11077 return res;
11078}
11079
11080
11081/*
11082 * call-seq:
11083 * ljust(size, pad_string = ' ') -> new_string
11084 *
11085 * :include: doc/string/ljust.rdoc
11086 *
11087 * Related: String#rjust, String#center.
11088 *
11089 */
11090
11091static VALUE
11092rb_str_ljust(int argc, VALUE *argv, VALUE str)
11093{
11094 return rb_str_justify(argc, argv, str, 'l');
11095}
11096
11097/*
11098 * call-seq:
11099 * rjust(size, pad_string = ' ') -> new_string
11100 *
11101 * :include: doc/string/rjust.rdoc
11102 *
11103 * Related: String#ljust, String#center.
11104 *
11105 */
11106
11107static VALUE
11108rb_str_rjust(int argc, VALUE *argv, VALUE str)
11109{
11110 return rb_str_justify(argc, argv, str, 'r');
11111}
11112
11113
11114/*
11115 * call-seq:
11116 * center(size, pad_string = ' ') -> new_string
11117 *
11118 * :include: doc/string/center.rdoc
11119 *
11120 * Related: String#ljust, String#rjust.
11121 *
11122 */
11123
11124static VALUE
11125rb_str_center(int argc, VALUE *argv, VALUE str)
11126{
11127 return rb_str_justify(argc, argv, str, 'c');
11128}
11129
11130/*
11131 * call-seq:
11132 * partition(string_or_regexp) -> [head, match, tail]
11133 *
11134 * :include: doc/string/partition.rdoc
11135 *
11136 */
11137
11138static VALUE
11139rb_str_partition(VALUE str, VALUE sep)
11140{
11141 long pos;
11142
11143 sep = get_pat_quoted(sep, 0);
11144 if (RB_TYPE_P(sep, T_REGEXP)) {
11145 if (rb_reg_search(sep, str, 0, 0) < 0) {
11146 goto failed;
11147 }
11148 VALUE match = rb_backref_get();
11149 struct re_registers *regs = RMATCH_REGS(match);
11150
11151 pos = BEG(0);
11152 sep = rb_str_subseq(str, pos, END(0) - pos);
11153 }
11154 else {
11155 pos = rb_str_index(str, sep, 0);
11156 if (pos < 0) goto failed;
11157 }
11158 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11159 sep,
11160 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11161 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11162
11163 failed:
11164 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11165}
11166
11167/*
11168 * call-seq:
11169 * rpartition(sep) -> [head, match, tail]
11170 *
11171 * :include: doc/string/rpartition.rdoc
11172 *
11173 */
11174
11175static VALUE
11176rb_str_rpartition(VALUE str, VALUE sep)
11177{
11178 long pos = RSTRING_LEN(str);
11179
11180 sep = get_pat_quoted(sep, 0);
11181 if (RB_TYPE_P(sep, T_REGEXP)) {
11182 if (rb_reg_search(sep, str, pos, 1) < 0) {
11183 goto failed;
11184 }
11185 VALUE match = rb_backref_get();
11186 struct re_registers *regs = RMATCH_REGS(match);
11187
11188 pos = BEG(0);
11189 sep = rb_str_subseq(str, pos, END(0) - pos);
11190 }
11191 else {
11192 pos = rb_str_sublen(str, pos);
11193 pos = rb_str_rindex(str, sep, pos);
11194 if (pos < 0) {
11195 goto failed;
11196 }
11197 }
11198
11199 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11200 sep,
11201 rb_str_subseq(str, pos+RSTRING_LEN(sep),
11202 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11203 failed:
11204 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11205}
11206
11207/*
11208 * call-seq:
11209 * start_with?(*string_or_regexp) -> true or false
11210 *
11211 * :include: doc/string/start_with_p.rdoc
11212 *
11213 */
11214
11215static VALUE
11216rb_str_start_with(int argc, VALUE *argv, VALUE str)
11217{
11218 int i;
11219
11220 for (i=0; i<argc; i++) {
11221 VALUE tmp = argv[i];
11222 if (RB_TYPE_P(tmp, T_REGEXP)) {
11223 if (rb_reg_start_with_p(tmp, str))
11224 return Qtrue;
11225 }
11226 else {
11227 const char *p, *s, *e;
11228 long slen, tlen;
11229 rb_encoding *enc;
11230
11231 StringValue(tmp);
11232 enc = rb_enc_check(str, tmp);
11233 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11234 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11235 p = RSTRING_PTR(str);
11236 e = p + slen;
11237 s = p + tlen;
11238 if (!at_char_right_boundary(p, s, e, enc))
11239 continue;
11240 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11241 return Qtrue;
11242 }
11243 }
11244 return Qfalse;
11245}
11246
11247/*
11248 * call-seq:
11249 * end_with?(*strings) -> true or false
11250 *
11251 * :include: doc/string/end_with_p.rdoc
11252 *
11253 */
11254
11255static VALUE
11256rb_str_end_with(int argc, VALUE *argv, VALUE str)
11257{
11258 int i;
11259
11260 for (i=0; i<argc; i++) {
11261 VALUE tmp = argv[i];
11262 const char *p, *s, *e;
11263 long slen, tlen;
11264 rb_encoding *enc;
11265
11266 StringValue(tmp);
11267 enc = rb_enc_check(str, tmp);
11268 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11269 if ((slen = RSTRING_LEN(str)) < tlen) continue;
11270 p = RSTRING_PTR(str);
11271 e = p + slen;
11272 s = e - tlen;
11273 if (!at_char_boundary(p, s, e, enc))
11274 continue;
11275 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11276 return Qtrue;
11277 }
11278 return Qfalse;
11279}
11280
11290static long
11291deleted_prefix_length(VALUE str, VALUE prefix)
11292{
11293 const char *strptr, *prefixptr;
11294 long olen, prefixlen;
11295 rb_encoding *enc = rb_enc_get(str);
11296
11297 StringValue(prefix);
11298
11299 if (!is_broken_string(prefix) ||
11300 !rb_enc_asciicompat(enc) ||
11301 !rb_enc_asciicompat(rb_enc_get(prefix))) {
11302 enc = rb_enc_check(str, prefix);
11303 }
11304
11305 /* return 0 if not start with prefix */
11306 prefixlen = RSTRING_LEN(prefix);
11307 if (prefixlen <= 0) return 0;
11308 olen = RSTRING_LEN(str);
11309 if (olen < prefixlen) return 0;
11310 strptr = RSTRING_PTR(str);
11311 prefixptr = RSTRING_PTR(prefix);
11312 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11313 if (is_broken_string(prefix)) {
11314 if (!is_broken_string(str)) {
11315 /* prefix in a valid string cannot be broken */
11316 return 0;
11317 }
11318 const char *strend = strptr + olen;
11319 const char *after_prefix = strptr + prefixlen;
11320 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11321 /* prefix does not end at char-boundary */
11322 return 0;
11323 }
11324 }
11325 /* prefix part in `str` also should be valid. */
11326
11327 return prefixlen;
11328}
11329
11330/*
11331 * call-seq:
11332 * delete_prefix!(prefix) -> self or nil
11333 *
11334 * Like String#delete_prefix, except that +self+ is modified in place.
11335 * Returns +self+ if the prefix is removed, +nil+ otherwise.
11336 *
11337 */
11338
11339static VALUE
11340rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11341{
11342 long prefixlen;
11343 str_modify_keep_cr(str);
11344
11345 prefixlen = deleted_prefix_length(str, prefix);
11346 if (prefixlen <= 0) return Qnil;
11347
11348 return rb_str_drop_bytes(str, prefixlen);
11349}
11350
11351/*
11352 * call-seq:
11353 * delete_prefix(prefix) -> new_string
11354 *
11355 * :include: doc/string/delete_prefix.rdoc
11356 *
11357 */
11358
11359static VALUE
11360rb_str_delete_prefix(VALUE str, VALUE prefix)
11361{
11362 long prefixlen;
11363
11364 prefixlen = deleted_prefix_length(str, prefix);
11365 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11366
11367 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11368}
11369
11379static long
11380deleted_suffix_length(VALUE str, VALUE suffix)
11381{
11382 const char *strptr, *suffixptr;
11383 long olen, suffixlen;
11384 rb_encoding *enc;
11385
11386 StringValue(suffix);
11387 if (is_broken_string(suffix)) return 0;
11388 enc = rb_enc_check(str, suffix);
11389
11390 /* return 0 if not start with suffix */
11391 suffixlen = RSTRING_LEN(suffix);
11392 if (suffixlen <= 0) return 0;
11393 olen = RSTRING_LEN(str);
11394 if (olen < suffixlen) return 0;
11395 strptr = RSTRING_PTR(str);
11396 suffixptr = RSTRING_PTR(suffix);
11397 const char *strend = strptr + olen;
11398 const char *before_suffix = strend - suffixlen;
11399 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11400 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11401
11402 return suffixlen;
11403}
11404
11405/*
11406 * call-seq:
11407 * delete_suffix!(suffix) -> self or nil
11408 *
11409 * Like String#delete_suffix, except that +self+ is modified in place.
11410 * Returns +self+ if the suffix is removed, +nil+ otherwise.
11411 *
11412 */
11413
11414static VALUE
11415rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11416{
11417 long olen, suffixlen, len;
11418 str_modifiable(str);
11419
11420 suffixlen = deleted_suffix_length(str, suffix);
11421 if (suffixlen <= 0) return Qnil;
11422
11423 olen = RSTRING_LEN(str);
11424 str_modify_keep_cr(str);
11425 len = olen - suffixlen;
11426 STR_SET_LEN(str, len);
11427 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11428 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11430 }
11431 return str;
11432}
11433
11434/*
11435 * call-seq:
11436 * delete_suffix(suffix) -> new_string
11437 *
11438 * :include: doc/string/delete_suffix.rdoc
11439 *
11440 */
11441
11442static VALUE
11443rb_str_delete_suffix(VALUE str, VALUE suffix)
11444{
11445 long suffixlen;
11446
11447 suffixlen = deleted_suffix_length(str, suffix);
11448 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11449
11450 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11451}
11452
11453void
11454rb_str_setter(VALUE val, ID id, VALUE *var)
11455{
11456 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11457 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11458 }
11459 *var = val;
11460}
11461
11462static void
11463rb_fs_setter(VALUE val, ID id, VALUE *var)
11464{
11465 val = rb_fs_check(val);
11466 if (!val) {
11467 rb_raise(rb_eTypeError,
11468 "value of %"PRIsVALUE" must be String or Regexp",
11469 rb_id2str(id));
11470 }
11471 if (!NIL_P(val)) {
11472 rb_warn_deprecated("'$;'", NULL);
11473 }
11474 *var = val;
11475}
11476
11477
11478/*
11479 * call-seq:
11480 * force_encoding(encoding) -> self
11481 *
11482 * :include: doc/string/force_encoding.rdoc
11483 *
11484 */
11485
11486static VALUE
11487rb_str_force_encoding(VALUE str, VALUE enc)
11488{
11489 str_modifiable(str);
11490
11491 rb_encoding *encoding = rb_to_encoding(enc);
11492 int idx = rb_enc_to_index(encoding);
11493
11494 // If the encoding is unchanged, we do nothing.
11495 if (ENCODING_GET(str) == idx) {
11496 return str;
11497 }
11498
11499 rb_enc_associate_index(str, idx);
11500
11501 // If the coderange was 7bit and the new encoding is ASCII-compatible
11502 // we can keep the coderange.
11503 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11504 return str;
11505 }
11506
11508 return str;
11509}
11510
11511/*
11512 * call-seq:
11513 * b -> new_string
11514 *
11515 * :include: doc/string/b.rdoc
11516 *
11517 */
11518
11519static VALUE
11520rb_str_b(VALUE str)
11521{
11522 VALUE str2;
11523 if (STR_EMBED_P(str)) {
11524 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11525 }
11526 else {
11527 str2 = str_alloc_heap(rb_cString);
11528 }
11529 str_replace_shared_without_enc(str2, str);
11530
11531 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11532 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11533 // If we know the receiver's code range then we know the result's code range.
11534 int cr = ENC_CODERANGE(str);
11535 switch (cr) {
11536 case ENC_CODERANGE_7BIT:
11538 break;
11542 break;
11543 default:
11544 ENC_CODERANGE_CLEAR(str2);
11545 break;
11546 }
11547 }
11548
11549 return str2;
11550}
11551
11552/*
11553 * call-seq:
11554 * valid_encoding? -> true or false
11555 *
11556 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11557 *
11558 * "\xc2\xa1".force_encoding(Encoding::UTF_8).valid_encoding? # => true
11559 * "\xc2".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11560 * "\x80".force_encoding(Encoding::UTF_8).valid_encoding? # => false
11561 */
11562
11563static VALUE
11564rb_str_valid_encoding_p(VALUE str)
11565{
11566 int cr = rb_enc_str_coderange(str);
11567
11568 return RBOOL(cr != ENC_CODERANGE_BROKEN);
11569}
11570
11571/*
11572 * call-seq:
11573 * ascii_only? -> true or false
11574 *
11575 * Returns whether +self+ contains only ASCII characters:
11576 *
11577 * 'abc'.ascii_only? # => true
11578 * "abc\u{6666}".ascii_only? # => false
11579 *
11580 * Related: see {Querying}[rdoc-ref:String@Querying].
11581 */
11582
11583static VALUE
11584rb_str_is_ascii_only_p(VALUE str)
11585{
11586 int cr = rb_enc_str_coderange(str);
11587
11588 return RBOOL(cr == ENC_CODERANGE_7BIT);
11589}
11590
11591VALUE
11593{
11594 static const char ellipsis[] = "...";
11595 const long ellipsislen = sizeof(ellipsis) - 1;
11596 rb_encoding *const enc = rb_enc_get(str);
11597 const long blen = RSTRING_LEN(str);
11598 const char *const p = RSTRING_PTR(str), *e = p + blen;
11599 VALUE estr, ret = 0;
11600
11601 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11602 if (len * rb_enc_mbminlen(enc) >= blen ||
11603 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11604 ret = str;
11605 }
11606 else if (len <= ellipsislen ||
11607 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11608 if (rb_enc_asciicompat(enc)) {
11609 ret = rb_str_new(ellipsis, len);
11610 rb_enc_associate(ret, enc);
11611 }
11612 else {
11613 estr = rb_usascii_str_new(ellipsis, len);
11614 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11615 }
11616 }
11617 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11618 rb_str_cat(ret, ellipsis, ellipsislen);
11619 }
11620 else {
11621 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11622 rb_enc_from_encoding(enc), 0, Qnil);
11623 rb_str_append(ret, estr);
11624 }
11625 return ret;
11626}
11627
11628static VALUE
11629str_compat_and_valid(VALUE str, rb_encoding *enc)
11630{
11631 int cr;
11632 str = StringValue(str);
11633 cr = rb_enc_str_coderange(str);
11634 if (cr == ENC_CODERANGE_BROKEN) {
11635 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11636 }
11637 else {
11638 rb_encoding *e = STR_ENC_GET(str);
11639 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11640 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11641 rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11642 }
11643 }
11644 return str;
11645}
11646
11647static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11648
11649VALUE
11651{
11652 rb_encoding *enc = STR_ENC_GET(str);
11653 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11654}
11655
11656VALUE
11657rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11658{
11659 int cr = ENC_CODERANGE_UNKNOWN;
11660 if (enc == STR_ENC_GET(str)) {
11661 /* cached coderange makes sense only when enc equals the
11662 * actual encoding of str */
11663 cr = ENC_CODERANGE(str);
11664 }
11665 return enc_str_scrub(enc, str, repl, cr);
11666}
11667
11668static VALUE
11669enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11670{
11671 int encidx;
11672 VALUE buf = Qnil;
11673 const char *rep, *p, *e, *p1, *sp;
11674 long replen = -1;
11675 long slen;
11676
11677 if (rb_block_given_p()) {
11678 if (!NIL_P(repl))
11679 rb_raise(rb_eArgError, "both of block and replacement given");
11680 replen = 0;
11681 }
11682
11683 if (ENC_CODERANGE_CLEAN_P(cr))
11684 return Qnil;
11685
11686 if (!NIL_P(repl)) {
11687 repl = str_compat_and_valid(repl, enc);
11688 }
11689
11690 if (rb_enc_dummy_p(enc)) {
11691 return Qnil;
11692 }
11693 encidx = rb_enc_to_index(enc);
11694
11695#define DEFAULT_REPLACE_CHAR(str) do { \
11696 RBIMPL_ATTR_NONSTRING() static const char replace[sizeof(str)-1] = str; \
11697 rep = replace; replen = (int)sizeof(replace); \
11698 } while (0)
11699
11700 slen = RSTRING_LEN(str);
11701 p = RSTRING_PTR(str);
11702 e = RSTRING_END(str);
11703 p1 = p;
11704 sp = p;
11705
11706 if (rb_enc_asciicompat(enc)) {
11707 int rep7bit_p;
11708 if (!replen) {
11709 rep = NULL;
11710 rep7bit_p = FALSE;
11711 }
11712 else if (!NIL_P(repl)) {
11713 rep = RSTRING_PTR(repl);
11714 replen = RSTRING_LEN(repl);
11715 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11716 }
11717 else if (encidx == rb_utf8_encindex()) {
11718 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11719 rep7bit_p = FALSE;
11720 }
11721 else {
11722 DEFAULT_REPLACE_CHAR("?");
11723 rep7bit_p = TRUE;
11724 }
11725 cr = ENC_CODERANGE_7BIT;
11726
11727 p = search_nonascii(p, e);
11728 if (!p) {
11729 p = e;
11730 }
11731 while (p < e) {
11732 int ret = rb_enc_precise_mbclen(p, e, enc);
11733 if (MBCLEN_NEEDMORE_P(ret)) {
11734 break;
11735 }
11736 else if (MBCLEN_CHARFOUND_P(ret)) {
11738 p += MBCLEN_CHARFOUND_LEN(ret);
11739 }
11740 else if (MBCLEN_INVALID_P(ret)) {
11741 /*
11742 * p1~p: valid ascii/multibyte chars
11743 * p ~e: invalid bytes + unknown bytes
11744 */
11745 long clen = rb_enc_mbmaxlen(enc);
11746 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11747 if (p > p1) {
11748 rb_str_buf_cat(buf, p1, p - p1);
11749 }
11750
11751 if (e - p < clen) clen = e - p;
11752 if (clen <= 2) {
11753 clen = 1;
11754 }
11755 else {
11756 const char *q = p;
11757 clen--;
11758 for (; clen > 1; clen--) {
11759 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11760 if (MBCLEN_NEEDMORE_P(ret)) break;
11761 if (MBCLEN_INVALID_P(ret)) continue;
11763 }
11764 }
11765 if (rep) {
11766 rb_str_buf_cat(buf, rep, replen);
11767 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11768 }
11769 else {
11770 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11771 str_mod_check(str, sp, slen);
11772 repl = str_compat_and_valid(repl, enc);
11773 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11776 }
11777 p += clen;
11778 p1 = p;
11779 p = search_nonascii(p, e);
11780 if (!p) {
11781 p = e;
11782 break;
11783 }
11784 }
11785 else {
11787 }
11788 }
11789 if (NIL_P(buf)) {
11790 if (p == e) {
11791 ENC_CODERANGE_SET(str, cr);
11792 return Qnil;
11793 }
11794 buf = rb_str_buf_new(RSTRING_LEN(str));
11795 }
11796 if (p1 < p) {
11797 rb_str_buf_cat(buf, p1, p - p1);
11798 }
11799 if (p < e) {
11800 if (rep) {
11801 rb_str_buf_cat(buf, rep, replen);
11802 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11803 }
11804 else {
11805 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11806 str_mod_check(str, sp, slen);
11807 repl = str_compat_and_valid(repl, enc);
11808 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11811 }
11812 }
11813 }
11814 else {
11815 /* ASCII incompatible */
11816 long mbminlen = rb_enc_mbminlen(enc);
11817 if (!replen) {
11818 rep = NULL;
11819 }
11820 else if (!NIL_P(repl)) {
11821 rep = RSTRING_PTR(repl);
11822 replen = RSTRING_LEN(repl);
11823 }
11824 else if (encidx == ENCINDEX_UTF_16BE) {
11825 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11826 }
11827 else if (encidx == ENCINDEX_UTF_16LE) {
11828 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11829 }
11830 else if (encidx == ENCINDEX_UTF_32BE) {
11831 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11832 }
11833 else if (encidx == ENCINDEX_UTF_32LE) {
11834 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11835 }
11836 else {
11837 DEFAULT_REPLACE_CHAR("?");
11838 }
11839
11840 while (p < e) {
11841 int ret = rb_enc_precise_mbclen(p, e, enc);
11842 if (MBCLEN_NEEDMORE_P(ret)) {
11843 break;
11844 }
11845 else if (MBCLEN_CHARFOUND_P(ret)) {
11846 p += MBCLEN_CHARFOUND_LEN(ret);
11847 }
11848 else if (MBCLEN_INVALID_P(ret)) {
11849 const char *q = p;
11850 long clen = rb_enc_mbmaxlen(enc);
11851 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11852 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11853
11854 if (e - p < clen) clen = e - p;
11855 if (clen <= mbminlen * 2) {
11856 clen = mbminlen;
11857 }
11858 else {
11859 clen -= mbminlen;
11860 for (; clen > mbminlen; clen-=mbminlen) {
11861 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11862 if (MBCLEN_NEEDMORE_P(ret)) break;
11863 if (MBCLEN_INVALID_P(ret)) continue;
11865 }
11866 }
11867 if (rep) {
11868 rb_str_buf_cat(buf, rep, replen);
11869 }
11870 else {
11871 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11872 str_mod_check(str, sp, slen);
11873 repl = str_compat_and_valid(repl, enc);
11874 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11875 }
11876 p += clen;
11877 p1 = p;
11878 }
11879 else {
11881 }
11882 }
11883 if (NIL_P(buf)) {
11884 if (p == e) {
11886 return Qnil;
11887 }
11888 buf = rb_str_buf_new(RSTRING_LEN(str));
11889 }
11890 if (p1 < p) {
11891 rb_str_buf_cat(buf, p1, p - p1);
11892 }
11893 if (p < e) {
11894 if (rep) {
11895 rb_str_buf_cat(buf, rep, replen);
11896 }
11897 else {
11898 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11899 str_mod_check(str, sp, slen);
11900 repl = str_compat_and_valid(repl, enc);
11901 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11902 }
11903 }
11905 }
11906 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11907 return buf;
11908}
11909
11910/*
11911 * call-seq:
11912 * scrub(replacement_string = default_replacement) -> new_string
11913 * scrub{|bytes| ... } -> new_string
11914 *
11915 * :include: doc/string/scrub.rdoc
11916 *
11917 */
11918static VALUE
11919str_scrub(int argc, VALUE *argv, VALUE str)
11920{
11921 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11922 VALUE new = rb_str_scrub(str, repl);
11923 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11924}
11925
11926/*
11927 * call-seq:
11928 * scrub! -> self
11929 * scrub!(replacement_string = default_replacement) -> self
11930 * scrub!{|bytes| ... } -> self
11931 *
11932 * Like String#scrub, except that any replacements are made in +self+.
11933 *
11934 */
11935static VALUE
11936str_scrub_bang(int argc, VALUE *argv, VALUE str)
11937{
11938 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11939 VALUE new = rb_str_scrub(str, repl);
11940 if (!NIL_P(new)) rb_str_replace(str, new);
11941 return str;
11942}
11943
11944static ID id_normalize;
11945static ID id_normalized_p;
11946static VALUE mUnicodeNormalize;
11947
11948static VALUE
11949unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11950{
11951 static int UnicodeNormalizeRequired = 0;
11952 VALUE argv2[2];
11953
11954 if (!UnicodeNormalizeRequired) {
11955 rb_require("unicode_normalize/normalize.rb");
11956 UnicodeNormalizeRequired = 1;
11957 }
11958 argv2[0] = str;
11959 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11960 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11961}
11962
11963/*
11964 * call-seq:
11965 * unicode_normalize(form = :nfc) -> string
11966 *
11967 * Returns a copy of +self+ with
11968 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11969 *
11970 * Argument +form+ must be one of the following symbols
11971 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11972 *
11973 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11974 * - +:nfd+: Canonical decomposition.
11975 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11976 * - +:nfkd+: Compatibility decomposition.
11977 *
11978 * The encoding of +self+ must be one of:
11979 *
11980 * - Encoding::UTF_8
11981 * - Encoding::UTF_16BE
11982 * - Encoding::UTF_16LE
11983 * - Encoding::UTF_32BE
11984 * - Encoding::UTF_32LE
11985 * - Encoding::GB18030
11986 * - Encoding::UCS_2BE
11987 * - Encoding::UCS_4BE
11988 *
11989 * Examples:
11990 *
 *    "a\u0300".unicode_normalize        # => "à" (U+00E0)
 *    "\u00E0".unicode_normalize(:nfd)   # => "à" (U+0061 U+0300)
11993 *
11994 * Related: String#unicode_normalize!, String#unicode_normalized?.
11995 */
11996static VALUE
11997rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11998{
11999 return unicode_normalize_common(argc, argv, str, id_normalize);
12000}
12001
12002/*
12003 * call-seq:
12004 * unicode_normalize!(form = :nfc) -> self
12005 *
12006 * Like String#unicode_normalize, except that the normalization
12007 * is performed on +self+.
12008 *
 *  Related: String#unicode_normalized?.
12010 *
12011 */
12012static VALUE
12013rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
12014{
12015 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
12016}
12017
12018/* call-seq:
12019 * unicode_normalized?(form = :nfc) -> true or false
12020 *
12021 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
12022 * +false+ otherwise.
12023 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
12024 *
12025 * Examples:
12026 *
12027 * "a\u0300".unicode_normalized? # => false
12028 * "a\u0300".unicode_normalized?(:nfd) # => true
12029 * "\u00E0".unicode_normalized? # => true
12030 * "\u00E0".unicode_normalized?(:nfd) # => false
12031 *
12032 *
12033 * Raises an exception if +self+ is not in a Unicode encoding:
12034 *
12035 * s = "\xE0".force_encoding(Encoding::ISO_8859_1)
12036 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
12037 *
12038 * Related: String#unicode_normalize, String#unicode_normalize!.
12039 *
12040 */
12041static VALUE
12042rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
12043{
12044 return unicode_normalize_common(argc, argv, str, id_normalized_p);
12045}
12046
12047/**********************************************************************
12048 * Document-class: Symbol
12049 *
12050 * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
12051 *
12052 * You can create a +Symbol+ object explicitly with:
12053 *
12054 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
12055 *
12056 * The same +Symbol+ object will be
12057 * created for a given name or string for the duration of a program's
12058 * execution, regardless of the context or meaning of that name. Thus
12059 * if <code>Fred</code> is a constant in one context, a method in
12060 * another, and a class in a third, the +Symbol+ <code>:Fred</code>
12061 * will be the same object in all three contexts.
12062 *
12063 * module One
12064 * class Fred
12065 * end
12066 * $f1 = :Fred
12067 * end
12068 * module Two
12069 * Fred = 1
12070 * $f2 = :Fred
12071 * end
12072 * def Fred()
12073 * end
12074 * $f3 = :Fred
12075 * $f1.object_id #=> 2514190
12076 * $f2.object_id #=> 2514190
12077 * $f3.object_id #=> 2514190
12078 *
12079 * Constant, method, and variable names are returned as symbols:
12080 *
12081 * module One
12082 * Two = 2
12083 * def three; 3 end
12084 * @four = 4
12085 * @@five = 5
12086 * $six = 6
12087 * end
12088 * seven = 7
12089 *
12090 * One.constants
12091 * # => [:Two]
12092 * One.instance_methods(true)
12093 * # => [:three]
12094 * One.instance_variables
12095 * # => [:@four]
12096 * One.class_variables
12097 * # => [:@@five]
12098 * global_variables.grep(/six/)
12099 * # => [:$six]
12100 * local_variables
12101 * # => [:seven]
12102 *
12103 * A +Symbol+ object differs from a String object in that
12104 * a +Symbol+ object represents an identifier, while a String object
12105 * represents text or data.
12106 *
12107 * == What's Here
12108 *
12109 * First, what's elsewhere. Class +Symbol+:
12110 *
12111 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
12112 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
12113 *
12114 * Here, class +Symbol+ provides methods that are useful for:
12115 *
12116 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
12117 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
12118 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
12119 *
12120 * === Methods for Querying
12121 *
12122 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
12123 * - #=~: Returns the index of the first substring in symbol that matches a
12124 * given Regexp or other object; returns +nil+ if no match is found.
12125 * - #[], #slice : Returns a substring of symbol
12126 * determined by a given index, start/length, or range, or string.
12127 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
12128 * - #encoding: Returns the Encoding object that represents the encoding
12129 * of symbol.
12130 * - #end_with?: Returns +true+ if symbol ends with
12131 * any of the given strings.
12132 * - #match: Returns a MatchData object if symbol
12133 * matches a given Regexp; +nil+ otherwise.
12134 * - #match?: Returns +true+ if symbol
12135 * matches a given Regexp; +false+ otherwise.
12136 * - #length, #size: Returns the number of characters in symbol.
12137 * - #start_with?: Returns +true+ if symbol starts with
12138 * any of the given strings.
12139 *
12140 * === Methods for Comparing
12141 *
12142 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12143 * or larger than symbol.
12144 * - #==, #===: Returns +true+ if a given symbol has the same content and
12145 * encoding.
12146 * - #casecmp: Ignoring case, returns -1, 0, or 1 as symbol
12147 * is smaller than, equal to, or larger than a given symbol.
12148 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12149 * after Unicode case folding; +false+ otherwise.
12150 *
12151 * === Methods for Converting
12152 *
12153 * - #capitalize: Returns symbol with the first character upcased
12154 * and all other characters downcased.
12155 * - #downcase: Returns symbol with all characters downcased.
12156 * - #inspect: Returns the string representation of +self+ as a symbol literal.
12157 * - #name: Returns the frozen string corresponding to symbol.
12158 * - #succ, #next: Returns the symbol that is the successor to symbol.
12159 * - #swapcase: Returns symbol with all upcase characters downcased
12160 * and all downcase characters upcased.
12161 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12162 * - #to_s, #id2name: Returns the string corresponding to +self+.
12163 * - #to_sym, #intern: Returns +self+.
12164 * - #upcase: Returns symbol with all characters upcased.
12165 *
12166 */
12167
12168
12169/*
12170 * call-seq:
12171 * symbol == object -> true or false
12172 *
12173 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12174 */
12175
12176#define sym_equal rb_obj_equal /* alias: Init_String registers sym_equal for both Symbol#== and Symbol#=== */
12177
12178static int
12179sym_printable(const char *s, const char *send, rb_encoding *enc)
12180{
12181 while (s < send) {
12182 int n;
12183 int c = rb_enc_precise_mbclen(s, send, enc);
12184
12185 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12186 n = MBCLEN_CHARFOUND_LEN(c);
12187 c = rb_enc_mbc_to_codepoint(s, send, enc);
12188 if (!rb_enc_isprint(c, enc)) return FALSE;
12189 s += n;
12190 }
12191 return TRUE;
12192}
12193
12194int
12195rb_str_symname_p(VALUE sym)
12196{
12197 rb_encoding *enc;
12198 const char *ptr;
12199 long len;
12200 rb_encoding *resenc = rb_default_internal_encoding();
12201
12202 if (resenc == NULL) resenc = rb_default_external_encoding();
12203 enc = STR_ENC_GET(sym);
12204 ptr = RSTRING_PTR(sym);
12205 len = RSTRING_LEN(sym);
12206 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12207 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12208 return FALSE;
12209 }
12210 return TRUE;
12211}
12212
12213VALUE
12214rb_str_quote_unprintable(VALUE str)
12215{
12216 rb_encoding *enc;
12217 const char *ptr;
12218 long len;
12219 rb_encoding *resenc;
12220
12221 Check_Type(str, T_STRING);
12222 resenc = rb_default_internal_encoding();
12223 if (resenc == NULL) resenc = rb_default_external_encoding();
12224 enc = STR_ENC_GET(str);
12225 ptr = RSTRING_PTR(str);
12226 len = RSTRING_LEN(str);
12227 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12228 !sym_printable(ptr, ptr + len, enc)) {
12229 return rb_str_escape(str);
12230 }
12231 return str;
12232}
12233
12234VALUE
12235rb_id_quote_unprintable(ID id)
12236{
12237 VALUE str = rb_id2str(id);
12238 if (!rb_str_symname_p(str)) {
12239 return rb_str_escape(str);
12240 }
12241 return str;
12242}
12243
12244/*
12245 * call-seq:
12246 * inspect -> string
12247 *
12248 * Returns a string representation of +self+ (including the leading colon):
12249 *
12250 * :foo.inspect # => ":foo"
12251 *
12252 * Related: Symbol#to_s, Symbol#name.
12253 *
12254 */
12255
12256static VALUE
12257sym_inspect(VALUE sym)
12258{
12259 VALUE str = rb_sym2str(sym);
12260 const char *ptr;
12261 long len;
12262 char *dest;
12263
12264 if (!rb_str_symname_p(str)) {
12265 str = rb_str_inspect(str);
12266 len = RSTRING_LEN(str);
12267 rb_str_resize(str, len + 1);
12268 dest = RSTRING_PTR(str);
12269 memmove(dest + 1, dest, len);
12270 }
12271 else {
12272 rb_encoding *enc = STR_ENC_GET(str);
12273 VALUE orig_str = str;
12274
12275 len = RSTRING_LEN(orig_str);
12276 str = rb_enc_str_new(0, len + 1, enc);
12277
12278 // Get data pointer after allocation
12279 ptr = RSTRING_PTR(orig_str);
12280 dest = RSTRING_PTR(str);
12281 memcpy(dest + 1, ptr, len);
12282
12283 RB_GC_GUARD(orig_str);
12284 }
12285 dest[0] = ':';
12286
12288
12289 return str;
12290}
12291
12292VALUE
12294{
12295 VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12296 FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12297 return str;
12298}
12299
12300VALUE
12301rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12302{
12303 VALUE obj;
12304
12305 if (argc < 1) {
12306 rb_raise(rb_eArgError, "no receiver given");
12307 }
12308 obj = argv[0];
12309 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12310}
12311
12312/*
12313 * call-seq:
12314 * succ
12315 *
12316 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12317 *
12318 * :foo.succ # => :fop
12319 *
12320 * Related: String#succ.
12321 */
12322
12323static VALUE
12324sym_succ(VALUE sym)
12325{
12326 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12327}
12328
12329/*
12330 * call-seq:
12331 * symbol <=> object -> -1, 0, +1, or nil
12332 *
12333 * If +object+ is a symbol,
12334 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12335 *
12336 * :bar <=> :foo # => -1
12337 * :foo <=> :foo # => 0
12338 * :foo <=> :bar # => 1
12339 *
12340 * Otherwise, returns +nil+:
12341 *
12342 * :foo <=> 'bar' # => nil
12343 *
12344 * Related: String#<=>.
12345 */
12346
12347static VALUE
12348sym_cmp(VALUE sym, VALUE other)
12349{
12350 if (!SYMBOL_P(other)) {
12351 return Qnil;
12352 }
12353 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12354}
12355
12356/*
12357 * call-seq:
12358 * casecmp(object) -> -1, 0, 1, or nil
12359 *
12360 * :include: doc/symbol/casecmp.rdoc
12361 *
12362 */
12363
12364static VALUE
12365sym_casecmp(VALUE sym, VALUE other)
12366{
12367 if (!SYMBOL_P(other)) {
12368 return Qnil;
12369 }
12370 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12371}
12372
12373/*
12374 * call-seq:
12375 * casecmp?(object) -> true, false, or nil
12376 *
12377 * :include: doc/symbol/casecmp_p.rdoc
12378 *
12379 */
12380
12381static VALUE
12382sym_casecmp_p(VALUE sym, VALUE other)
12383{
12384 if (!SYMBOL_P(other)) {
12385 return Qnil;
12386 }
12387 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12388}
12389
12390/*
12391 * call-seq:
12392 * symbol =~ object -> integer or nil
12393 *
12394 * Equivalent to <tt>symbol.to_s =~ object</tt>,
12395 * including possible updates to global variables;
12396 * see String#=~.
12397 *
12398 */
12399
12400static VALUE
12401sym_match(VALUE sym, VALUE other)
12402{
12403 return rb_str_match(rb_sym2str(sym), other);
12404}
12405
12406/*
12407 * call-seq:
12408 * match(pattern, offset = 0) -> matchdata or nil
12409 * match(pattern, offset = 0) {|matchdata| } -> object
12410 *
12411 * Equivalent to <tt>self.to_s.match</tt>,
12412 * including possible updates to global variables;
12413 * see String#match.
12414 *
12415 */
12416
12417static VALUE
12418sym_match_m(int argc, VALUE *argv, VALUE sym)
12419{
12420 return rb_str_match_m(argc, argv, rb_sym2str(sym));
12421}
12422
12423/*
12424 * call-seq:
12425 * match?(pattern, offset) -> true or false
12426 *
12427 * Equivalent to <tt>sym.to_s.match?</tt>;
12428 * see String#match.
12429 *
12430 */
12431
12432static VALUE
12433sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12434{
12435 return rb_str_match_m_p(argc, argv, sym);
12436}
12437
12438/*
12439 * call-seq:
12440 * symbol[index] -> string or nil
12441 * symbol[start, length] -> string or nil
12442 * symbol[range] -> string or nil
12443 * symbol[regexp, capture = 0] -> string or nil
12444 * symbol[substring] -> string or nil
12445 *
12446 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12447 *
12448 */
12449
12450static VALUE
12451sym_aref(int argc, VALUE *argv, VALUE sym)
12452{
12453 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12454}
12455
12456/*
12457 * call-seq:
12458 * length -> integer
12459 *
12460 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12461 */
12462
12463static VALUE
12464sym_length(VALUE sym)
12465{
12466 return rb_str_length(rb_sym2str(sym));
12467}
12468
12469/*
12470 * call-seq:
12471 * empty? -> true or false
12472 *
12473 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12474 *
12475 */
12476
12477static VALUE
12478sym_empty(VALUE sym)
12479{
12480 return rb_str_empty(rb_sym2str(sym));
12481}
12482
12483/*
12484 * call-seq:
12485 * upcase(mapping) -> symbol
12486 *
12487 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12488 *
12489 * See String#upcase.
12490 *
12491 */
12492
12493static VALUE
12494sym_upcase(int argc, VALUE *argv, VALUE sym)
12495{
12496 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12497}
12498
12499/*
12500 * call-seq:
12501 * downcase(mapping) -> symbol
12502 *
12503 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12504 *
12505 * See String#downcase.
12506 *
12507 * Related: Symbol#upcase.
12508 *
12509 */
12510
12511static VALUE
12512sym_downcase(int argc, VALUE *argv, VALUE sym)
12513{
12514 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12515}
12516
12517/*
12518 * call-seq:
12519 * capitalize(mapping) -> symbol
12520 *
12521 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12522 *
12523 * See String#capitalize.
12524 *
12525 */
12526
12527static VALUE
12528sym_capitalize(int argc, VALUE *argv, VALUE sym)
12529{
12530 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12531}
12532
12533/*
12534 * call-seq:
12535 * swapcase(mapping) -> symbol
12536 *
12537 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12538 *
12539 * See String#swapcase.
12540 *
12541 */
12542
12543static VALUE
12544sym_swapcase(int argc, VALUE *argv, VALUE sym)
12545{
12546 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12547}
12548
12549/*
12550 * call-seq:
12551 * start_with?(*string_or_regexp) -> true or false
12552 *
12553 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12554 *
12555 */
12556
12557static VALUE
12558sym_start_with(int argc, VALUE *argv, VALUE sym)
12559{
12560 return rb_str_start_with(argc, argv, rb_sym2str(sym));
12561}
12562
12563/*
12564 * call-seq:
12565 * end_with?(*strings) -> true or false
12566 *
12567 *
12568 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12569 *
12570 */
12571
12572static VALUE
12573sym_end_with(int argc, VALUE *argv, VALUE sym)
12574{
12575 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12576}
12577
12578/*
12579 * call-seq:
12580 * encoding -> encoding
12581 *
12582 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12583 *
12584 */
12585
12586static VALUE
12587sym_encoding(VALUE sym)
12588{
12589 return rb_obj_encoding(rb_sym2str(sym));
12590}
12591
12592static VALUE
12593string_for_symbol(VALUE name)
12594{
12595 if (!RB_TYPE_P(name, T_STRING)) {
12596 VALUE tmp = rb_check_string_type(name);
12597 if (NIL_P(tmp)) {
12598 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12599 name);
12600 }
12601 name = tmp;
12602 }
12603 return name;
12604}
12605
12606ID
12608{
12609 if (SYMBOL_P(name)) {
12610 return SYM2ID(name);
12611 }
12612 name = string_for_symbol(name);
12613 return rb_intern_str(name);
12614}
12615
12616VALUE
12618{
12619 if (SYMBOL_P(name)) {
12620 return name;
12621 }
12622 name = string_for_symbol(name);
12623 return rb_str_intern(name);
12624}
12625
12626/*
12627 * call-seq:
12628 * Symbol.all_symbols -> array_of_symbols
12629 *
12630 * Returns an array of all symbols currently in Ruby's symbol table:
12631 *
12632 * Symbol.all_symbols.size # => 9334
12633 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12634 *
12635 */
12636
12637static VALUE
12638sym_all_symbols(VALUE _)
12639{
12640 return rb_sym_all_symbols();
12641}
12642
12643VALUE
12644rb_str_to_interned_str(VALUE str)
12645{
12646 return rb_fstring(str);
12647}
12648
12649VALUE
12650rb_interned_str(const char *ptr, long len)
12651{
12652 struct RString fake_str;
12653 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12654}
12655
12656VALUE
12658{
12659 return rb_interned_str(ptr, strlen(ptr));
12660}
12661
12662VALUE
12663rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12664{
12665 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12666 rb_enc_autoload(enc);
12667 }
12668
12669 struct RString fake_str;
12670 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12671}
12672
12673VALUE
12674rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12675{
12676 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12677 rb_enc_autoload(enc);
12678 }
12679
12680 struct RString fake_str;
12681 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12682}
12683
12684VALUE
12686{
12687 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12688}
12689
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * Fast path: when +str+ has the ASCII-8BIT encoding index and the value is
 * in the range 0...0xff, a single byte is appended directly.  Everything
 * else is delegated to rb_str_concat().
 *
 * NOTE(review): 0xff itself is excluded from the fast path by the `< 0xff`
 * bound and takes the generic route -- confirm whether that is intended.
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        const ssize_t byte = RB_NUM2SSIZE(codepoint);
        const int single_byte = (0 <= byte && byte < 0xff);

        if (RB_LIKELY(single_byte)) {
            rb_str_buf_cat_byte(str, (char) byte);
            return;
        }
    }

    /* generic route */
    rb_str_concat(str, codepoint);
}
#endif
12706
12707static int
12708fstring_set_class_i(VALUE *str, void *data)
12709{
12710 RBASIC_SET_CLASS(*str, rb_cString);
12711
12712 return ST_CONTINUE;
12713}
12714
12715void
12716Init_String(void)
12717{
12718 rb_cString = rb_define_class("String", rb_cObject);
12719
12720 rb_ractor_safe_set_foreach_with_replace(fstring_table_obj, fstring_set_class_i, NULL);
12721
12723 rb_define_alloc_func(rb_cString, empty_str_alloc);
12724 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12725 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12726 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12727 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12728 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12731 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12732 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12733 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12734 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12737 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12738 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12739 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12740 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12743 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12744 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12745 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12746 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12747 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12749 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12751 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12752 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12753 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12754 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12755 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12756 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12758 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12759 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12760 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12761 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12762 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12763 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12764 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12765 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12767 rb_define_method(rb_cString, "+@", str_uplus, 0);
12768 rb_define_method(rb_cString, "-@", str_uminus, 0);
12769 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12770 rb_define_alias(rb_cString, "dedup", "-@");
12771
12772 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12773 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12774 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12775 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12778 rb_define_method(rb_cString, "undump", str_undump, 0);
12779
12780 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12781 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12782 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12783 sym_fold = ID2SYM(rb_intern_const("fold"));
12784
12785 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12786 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12787 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12788 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12789
12790 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12791 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12792 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12793 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12794
12795 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12796 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12797 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12798 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12799 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12800 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12801 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12802 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12803 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12804 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12805 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12806 rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12808 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12809 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12810 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12811 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12812 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12813
12814 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12815 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12816 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12817
12818 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12819
12820 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12821 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12822 rb_define_method(rb_cString, "center", rb_str_center, -1);
12823
12824 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12825 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12826 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12827 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12828 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12829 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12830 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12831 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12832 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12833
12834 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12835 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12836 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12837 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12838 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12839 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12840 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12841 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12842 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12843
12844 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12845 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12846 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12847 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12848 rb_define_method(rb_cString, "count", rb_str_count, -1);
12849
12850 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12851 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12852 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12853 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12854
12855 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12856 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12857 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12858 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12859 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12860
12861 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12862
12863 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12864 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12865
12866 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12867 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12868
12869 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12870 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12871 rb_define_method(rb_cString, "b", rb_str_b, 0);
12872 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12873 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12874
12875 /* define UnicodeNormalize module here so that we don't have to look it up */
12876 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12877 id_normalize = rb_intern_const("normalize");
12878 id_normalized_p = rb_intern_const("normalized?");
12879
12880 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12881 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12882 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12883
12884 rb_fs = Qnil;
12885 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12886 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12887 rb_gc_register_address(&rb_fs);
12888
12889 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12893 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12894
12895 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12896 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12897 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12898 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12899 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12900 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12901
12902 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12903 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12904 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12905 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12906
12907 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12908 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12909 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12910 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12911 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12912 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12913 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12914
12915 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12916 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12917 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12918 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12919
12920 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12921 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12922
12923 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12924}
12925
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:219
Atomic operations.
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:877
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition fl_type.h:463
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1696
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:1479
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1597
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2843
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2663
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:943
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2922
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:133
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:404
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:205
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:401
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:131
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:128
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition fl_type.h:125
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition memory.h:405
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:130
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:66
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:132
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:129
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:137
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:476
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3905
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1434
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1437
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1432
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:646
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2125
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2143
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1311
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3539
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:243
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:553
VALUE rb_cSymbol
Symbol class.
Definition string.c:84
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:175
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1299
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3223
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:603
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:683
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:704
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:571
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:447
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:99
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:619
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:726
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1316
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:931
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1181
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2986
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1200
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12663
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:253
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2293
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3690
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1129
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1421
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1322
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:950
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12685
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:815
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:444
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1490
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2670
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2934
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1746
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1117
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1204
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:479
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should alloc...
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:700
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1861
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1071
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1867
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1926
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1236
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4225
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3722
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1490
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1927
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1716
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1486
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2445
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3755
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1397
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:12293
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2518
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1373
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1710
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:3014
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:5416
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:4131
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:3111
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11592
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1768
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1752
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1163
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:985
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1492
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1955
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:4117
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3523
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2382
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1973
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6654
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:3119
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12657
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1403
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3721
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:3061
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:4240
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3345
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:7329
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2748
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12650
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:4187
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:4004
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:4162
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string to be a NUL terminated ...
Definition string.c:3697
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3236
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5926
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11650
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1666
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2908
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:3208
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3327
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1175
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2702
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7443
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1385
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1682
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2396
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5844
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:9539
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1169
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:911
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1814
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2090
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition variable.c:2167
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3094
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1419
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:999
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12617
ID rb_to_id(VALUE str)
Definition string.c:12607
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1866
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3501
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4469
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1372
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:360
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE type(ANYARGS)
ANYARGS-ed function type.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
Defines RBIMPL_ATTR_NONSTRING.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:163
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1415
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2885
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2767
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1409
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2780
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1743
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1580
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:81
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:203
Definition string.c:8401
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:295
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376
ruby_value_type
C-level type of an object.
Definition value_type.h:113