Ruby  3.4.0dev (2024-11-05 revision 348a53415339076afc4a02fcd09f3ae36e9c4c61)
string.c (348a53415339076afc4a02fcd09f3ae36e9c4c61)
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author$
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/internal/config.h"
15 
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
19 
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
23 
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "id.h"
27 #include "internal.h"
28 #include "internal/array.h"
29 #include "internal/compar.h"
30 #include "internal/compilers.h"
31 #include "internal/encoding.h"
32 #include "internal/error.h"
33 #include "internal/gc.h"
34 #include "internal/numeric.h"
35 #include "internal/object.h"
36 #include "internal/proc.h"
37 #include "internal/re.h"
38 #include "internal/sanitizers.h"
39 #include "internal/string.h"
40 #include "internal/transcode.h"
41 #include "probes.h"
42 #include "ruby/encoding.h"
43 #include "ruby/re.h"
44 #include "ruby/util.h"
45 #include "ruby_assert.h"
46 #include "vm_sync.h"
47 
48 #if defined HAVE_CRYPT_R
49 # if defined HAVE_CRYPT_H
50 # include <crypt.h>
51 # endif
52 #elif !defined HAVE_CRYPT
53 # include "missing/crypt.h"
54 # define HAVE_CRYPT_R 1
55 #endif
56 
57 #define BEG(no) (regs->beg[(no)])
58 #define END(no) (regs->end[(no)])
59 
60 #undef rb_str_new
61 #undef rb_usascii_str_new
62 #undef rb_utf8_str_new
63 #undef rb_enc_str_new
64 #undef rb_str_new_cstr
65 #undef rb_usascii_str_new_cstr
66 #undef rb_utf8_str_new_cstr
67 #undef rb_enc_str_new_cstr
68 #undef rb_external_str_new_cstr
69 #undef rb_locale_str_new_cstr
70 #undef rb_str_dup_frozen
71 #undef rb_str_buf_new_cstr
72 #undef rb_str_buf_cat
73 #undef rb_str_buf_cat2
74 #undef rb_str_cat2
75 #undef rb_str_cat_cstr
76 #undef rb_fstring_cstr
77 
80 
81 /* Flags of RString
82  *
83  * 1: RSTRING_NOEMBED
84  * The string is not embedded. When a string is embedded, the contents
85  * follow the header. When a string is not embedded, the contents are
86  * in a separately allocated buffer.
87  * 2: STR_SHARED (equal to ELTS_SHARED)
88  * The string is shared. The buffer this string points to is owned by
89  * another string (the shared root).
90  * 3: STR_CHILLED (will be frozen in a future version)
91  * The string appears frozen but can be mutated with a warning.
92  * 4: STR_PRECOMPUTED_HASH
93  * The string is embedded and has its precomputed hash code stored
94  * after the terminator.
95  * 5: STR_SHARED_ROOT
96  * Other strings may point to the contents of this string. When this
97  * flag is set, STR_SHARED must not be set.
98  * 6: STR_BORROWED
99  * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
100  * to be unshared by rb_str_tmp_frozen_release.
101  * 7: STR_TMPLOCK
102  * The pointer to the buffer is passed to a system call such as
103  * read(2). Any modification and realloc is prohibited.
104  * 8-9: ENC_CODERANGE
105  * Stores the coderange of the string.
106  * 10-16: ENCODING
107  * Stores the encoding of the string.
108  * 17: RSTRING_FSTR
109  * The string is a fstring. The string is deduplicated in the fstring
110  * table.
111  * 18: STR_NOFREE
112  * Do not free this string's buffer when the string is reclaimed
113  * by the garbage collector. Used for when the string buffer is a C
114  * string literal.
115  * 19: STR_FAKESTR
116  * The string is not allocated or managed by the garbage collector.
117  * Typically, the string object header (struct RString) is temporarily
118  * allocated on C stack.
119  */
120 
#define RUBY_MAX_CHAR_LEN 16
/* String-specific flag bits.  The N in FL_USERn is the flag number listed
 * in the "Flags of RString" table. */
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
#define STR_NOFREE FL_USER18
#define STR_FAKESTR FL_USER19

/* Mark +str+ as not embedded and clear the shared, shared-root and
 * borrowed flags. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)
/* Mark +str+ as embedded by clearing the not-embedded, shared and no-free
 * flags. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)

/* Store +n+ as the length of +str+; the buffer itself is not touched. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
138 
139 static inline bool
140 str_encindex_fastpath(int encindex)
141 {
142  // The overwhelming majority of strings are in one of these 3 encodings.
143  switch (encindex) {
144  case ENCINDEX_ASCII_8BIT:
145  case ENCINDEX_UTF_8:
146  case ENCINDEX_US_ASCII:
147  return true;
148  default:
149  return false;
150  }
151 }
152 
153 static inline bool
154 str_enc_fastpath(VALUE str)
155 {
156  return str_encindex_fastpath(ENCODING_GET_INLINED(str));
157 }
158 
/* Terminator width in bytes for +str+: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of +termlen+ zero bytes at +ptr+.  The first byte is
 * always cleared; the memset only runs for terminators wider than one
 * byte.  Both arguments are evaluated exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
} while (0)
167 
/* Resize the buffer of +str+ to hold +capacity+ bytes plus the terminator
 * implied by the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of +str+ to hold +capacity+ bytes plus +termlen+
 * terminator bytes.
 *
 * - Embedded string: nothing happens if the slot is already large enough;
 *   otherwise the contents are copied to a new heap buffer and the string
 *   becomes non-embedded.
 * - Heap string: the buffer is reallocated.  The string must not be shared.
 *
 * The macro arguments are parenthesized wherever they take part in
 * arithmetic, so expression arguments (e.g. a conditional expression)
 * are evaluated as a unit. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
191 
/* Make +str+ a shared string whose buffer is owned by +shared_str+.
 * Does nothing when +str+ is a fake string.  Otherwise it records
 * +shared_str+ in +str+ through the write barrier, flags +str+ as shared
 * and +shared_str+ as a shared root, and additionally flags a classless
 * root as borrowed.  The assertions check that the pointer of +str+ lies
 * within the contents of +shared_str+. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
203 
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

/* Unless SHARABLE_MIDDLE_SUBSTRING is enabled, only a substring that
 * reaches the end of its source (beg + len == end) is considered
 * sharable. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif
218 
219 
220 static inline long
221 str_embed_capa(VALUE str)
222 {
223  return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
224 }
225 
226 bool
227 rb_str_reembeddable_p(VALUE str)
228 {
229  return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
230 }
231 
232 static inline size_t
233 rb_str_embed_size(long capa)
234 {
235  return offsetof(struct RString, as.embed.ary) + capa;
236 }
237 
238 size_t
239 rb_str_size_as_embedded(VALUE str)
240 {
241  size_t real_size;
242  if (STR_EMBED_P(str)) {
243  real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
244  }
245  /* if the string is not currently embedded, but it can be embedded, how
246  * much space would it require */
247  else if (rb_str_reembeddable_p(str)) {
248  real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
249  }
250  else {
251  real_size = sizeof(struct RString);
252  }
253 
254  if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
255  real_size += sizeof(st_index_t);
256  }
257 
258  return real_size;
259 }
260 
/* Tell whether an object embedding +len+ content bytes plus +termlen+
 * terminator bytes has a size the GC can allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
266 
267 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
268 static VALUE str_new_frozen(VALUE klass, VALUE orig);
269 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
270 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
271 static VALUE str_new(VALUE klass, const char *ptr, long len);
272 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
273 static inline void str_modifiable(VALUE str);
274 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
275 static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
276 
277 static inline void
278 str_make_independent(VALUE str)
279 {
280  long len = RSTRING_LEN(str);
281  int termlen = TERM_LEN(str);
282  str_make_independent_expand((str), len, 0L, termlen);
283 }
284 
285 static inline int str_dependent_p(VALUE str);
286 
287 void
288 rb_str_make_independent(VALUE str)
289 {
290  if (str_dependent_p(str)) {
291  str_make_independent(str);
292  }
293 }
294 
295 void
296 rb_str_make_embedded(VALUE str)
297 {
298  RUBY_ASSERT(rb_str_reembeddable_p(str));
299  RUBY_ASSERT(!STR_EMBED_P(str));
300 
301  char *buf = RSTRING(str)->as.heap.ptr;
302  long len = RSTRING(str)->len;
303 
304  STR_SET_EMBED(str);
305  STR_SET_LEN(str, len);
306 
307  if (len > 0) {
308  memcpy(RSTRING_PTR(str), buf, len);
309  ruby_xfree(buf);
310  }
311 
312  TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
313 }
314 
/* Print a diagnostic to stderr saying that +func+ is about to return
 * NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
324 
325 /* symbols for [up|down|swap]case/capitalize options */
326 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
327 
328 static rb_encoding *
329 get_encoding(VALUE str)
330 {
331  return rb_enc_from_index(ENCODING_GET(str));
332 }
333 
334 static void
335 mustnot_broken(VALUE str)
336 {
337  if (is_broken_string(str)) {
338  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
339  }
340 }
341 
342 static void
343 mustnot_wchar(VALUE str)
344 {
345  rb_encoding *enc = STR_ENC_GET(str);
346  if (rb_enc_mbminlen(enc) > 1) {
347  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
348  }
349 }
350 
351 static int fstring_cmp(VALUE a, VALUE b);
352 
353 static VALUE register_fstring(VALUE str, bool copy, bool precompute_hash);
354 
355 #if SIZEOF_LONG == SIZEOF_VOIDP
356 static st_index_t
357 fstring_hash(VALUE str)
358 {
359  if (FL_TEST_RAW(str, STR_FAKESTR)) {
360  // register_fstring precomputes the hash and stores it in capa for fake strings
361  return (st_index_t)RSTRING(str)->as.heap.aux.capa;
362  }
363  else {
364  return rb_str_hash(str);
365  }
366 }
367 #else
368 #define fstring_hash rb_str_hash
369 #endif
/* Hash type of the fstring table: entries are compared with fstring_cmp
 * and hashed with fstring_hash. */
const struct st_hash_type rb_fstring_hash_type = {
    fstring_cmp,
    fstring_hash,
};

/* True when +str+ has no FL_EXIVAR flag and its class is exactly String. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
376 
377 static inline st_index_t
378 str_do_hash(VALUE str)
379 {
380  st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
381  int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
382  if (e && !is_ascii_string(str)) {
383  h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
384  }
385  return h;
386 }
387 
388 static VALUE
389 str_store_precomputed_hash(VALUE str, st_index_t hash)
390 {
391  RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
392  RUBY_ASSERT(STR_EMBED_P(str));
393 
394 #if RUBY_DEBUG
395  size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
396  size_t free_bytes = str_embed_capa(str) - used_bytes;
397  RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
398 #endif
399 
400  memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
401 
402  FL_SET(str, STR_PRECOMPUTED_HASH);
403 
404  return str;
405 }
406 
408  VALUE fstr;
409  bool copy;
410  bool precompute_hash;
411 };
412 
413 static int
414 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
415 {
416  struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
417  VALUE str = (VALUE)*key;
418 
419  if (existing) {
420  /* because of lazy sweep, str may be unmarked already and swept
421  * at next time */
422 
423  if (rb_objspace_garbage_object_p(str)) {
424  arg->fstr = Qundef;
425  return ST_DELETE;
426  }
427 
428  arg->fstr = str;
429  return ST_STOP;
430  }
431  else {
432  if (FL_TEST_RAW(str, STR_FAKESTR)) {
433  if (arg->copy) {
434  VALUE new_str;
435  long len = RSTRING_LEN(str);
436  long capa = len + sizeof(st_index_t);
437  int term_len = TERM_LEN(str);
438 
439  if (arg->precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
440  new_str = str_alloc_embed(rb_cString, capa + term_len);
441  memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
442  STR_SET_LEN(new_str, RSTRING_LEN(str));
443  TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
444  rb_enc_copy(new_str, str);
445  str_store_precomputed_hash(new_str, fstring_hash(str));
446  }
447  else {
448  new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
449  rb_enc_copy(new_str, str);
450  }
451  str = new_str;
452  }
453  else {
454  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
455  RSTRING(str)->len,
456  ENCODING_GET(str));
457  }
458  OBJ_FREEZE(str);
459  }
460  else {
461  if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
462  str = str_new_frozen(rb_cString, str);
463  }
464  if (STR_SHARED_P(str)) { /* str should not be shared */
465  /* shared substring */
466  str_make_independent(str);
467  RUBY_ASSERT(OBJ_FROZEN(str));
468  }
469  if (!BARE_STRING_P(str)) {
470  str = str_new_frozen(rb_cString, str);
471  }
472  }
473  RBASIC(str)->flags |= RSTRING_FSTR;
474 
475  *key = *value = arg->fstr = str;
476  return ST_CONTINUE;
477  }
478 }
479 
480 VALUE
481 rb_fstring(VALUE str)
482 {
483  VALUE fstr;
484  int bare;
485 
486  Check_Type(str, T_STRING);
487 
488  if (FL_TEST(str, RSTRING_FSTR))
489  return str;
490 
491  bare = BARE_STRING_P(str);
492  if (!bare) {
493  if (STR_EMBED_P(str)) {
494  OBJ_FREEZE(str);
495  return str;
496  }
497 
498  if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
499  RUBY_ASSERT(OBJ_FROZEN(str));
500  return str;
501  }
502  }
503 
504  if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
505  rb_str_resize(str, RSTRING_LEN(str));
506 
507  fstr = register_fstring(str, false, false);
508 
509  if (!bare) {
510  str_replace_shared_without_enc(str, fstr);
511  OBJ_FREEZE(str);
512  return str;
513  }
514  return fstr;
515 }
516 
/* Look +str+ up in the VM's fstring table, inserting it when absent, and
 * return the table's string.
 *
 * +copy+ and +precompute_hash+ are forwarded to fstr_update_callback,
 * which uses them when the key is a fake string.  The table is accessed
 * with the VM lock held. */
static VALUE
register_fstring(VALUE str, bool copy, bool precompute_hash)
{
    struct fstr_update_arg args = {
        .copy = copy,
        .precompute_hash = precompute_hash
    };

#if SIZEOF_VOIDP == SIZEOF_LONG
    if (FL_TEST_RAW(str, STR_FAKESTR)) {
        // if the string hasn't been interned, we'll need the hash twice, so we
        // compute it once and store it in capa
        RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
    }
#endif

    RB_VM_LOCK_ENTER();
    {
        st_table *frozen_strings = rb_vm_fstring_table();
        /* The callback reports Qundef after deleting an entry whose string
         * is a garbage object; in that case update again. */
        do {
            args.fstr = str;
            st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
        } while (UNDEF_P(args.fstr));
    }
    RB_VM_LOCK_LEAVE();

    /* The result must be a frozen, real, bare String. */
    RUBY_ASSERT(OBJ_FROZEN(args.fstr));
    RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
    RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
    RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);

    return args.fstr;
}
550 
551 static VALUE
552 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
553 {
554  fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
555 
556  if (!name) {
557  RUBY_ASSERT_ALWAYS(len == 0);
558  name = "";
559  }
560 
561  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
562 
563  RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
564  fake_str->len = len;
565  fake_str->as.heap.ptr = (char *)name;
566  fake_str->as.heap.aux.capa = len;
567  return (VALUE)fake_str;
568 }
569 
570 /*
571  * set up a fake string which refers a static string literal.
572  */
573 VALUE
574 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
575 {
576  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
577 }
578 
579 /*
580  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
581  * shared string which refers a static string literal. `ptr` must
582  * point a constant string.
583  */
584 VALUE
585 rb_fstring_new(const char *ptr, long len)
586 {
587  struct RString fake_str;
588  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
589 }
590 
591 VALUE
592 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
593 {
594  struct RString fake_str;
595  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
596 }
597 
598 VALUE
599 rb_fstring_cstr(const char *ptr)
600 {
601  return rb_fstring_new(ptr, strlen(ptr));
602 }
603 
604 static int
605 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
606 {
607  RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
608  return ST_CONTINUE;
609 }
610 
611 static int
612 fstring_cmp(VALUE a, VALUE b)
613 {
614  long alen, blen;
615  const char *aptr, *bptr;
616  RSTRING_GETMEM(a, aptr, alen);
617  RSTRING_GETMEM(b, bptr, blen);
618  return (alen != blen ||
619  ENCODING_GET(a) != ENCODING_GET(b) ||
620  memcmp(aptr, bptr, alen) != 0);
621 }
622 
623 static inline bool
624 single_byte_optimizable(VALUE str)
625 {
626  int encindex = ENCODING_GET(str);
627  switch (encindex) {
628  case ENCINDEX_ASCII_8BIT:
629  case ENCINDEX_US_ASCII:
630  return true;
631  case ENCINDEX_UTF_8:
632  // For UTF-8 it's worth scanning the string coderange when unknown.
634  }
635  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
636  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
637  return true;
638  }
639 
640  if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
641  return true;
642  }
643 
644  /* Conservative. Possibly single byte.
645  * "\xa1" in Shift_JIS for example. */
646  return false;
647 }
648 
650 
/* Find the first byte with the high bit (0x80) set in [p, e).
 * Returns a pointer to that byte, or NULL when there is none.
 *
 * Where possible, the middle of the range is scanned one uintptr_t word
 * at a time by testing against NONASCII_MASK (0x80 in every byte); the
 * unaligned head and the tail shorter than a word are checked byte by
 * byte. */
static inline const char *
search_nonascii(const char *p, const char *e)
{
    const uintptr_t *s, *t;

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK UINT64_C(0x8080808080808080)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK UINT32_C(0x80808080)
# else
#  error "don't know what to do."
# endif
#else
# if SIZEOF_UINTPTR_T == 8
#  define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
# elif SIZEOF_UINTPTR_T == 4
#  define NONASCII_MASK 0x80808080UL /* or...? */
# else
#  error "don't know what to do."
# endif
#endif

    if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
#if !UNALIGNED_WORD_ACCESS
        /* Check the bytes before the first word boundary one by one.
         * p is advanced to the boundary first, so the cases index
         * backwards from it; every case deliberately falls through to
         * the next one. */
        if ((uintptr_t)p % SIZEOF_VOIDP) {
            int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
            p += l;
            switch (l) {
              default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
              case 7: if (p[-7]&0x80) return p-7;
              case 6: if (p[-6]&0x80) return p-6;
              case 5: if (p[-5]&0x80) return p-5;
              case 4: if (p[-4]&0x80) return p-4;
#endif
              case 3: if (p[-3]&0x80) return p-3;
              case 2: if (p[-2]&0x80) return p-2;
              case 1: if (p[-1]&0x80) return p-1;
              case 0: break;
            }
        }
#endif
#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
#define aligned_ptr(value) \
        __builtin_assume_aligned((value), sizeof(uintptr_t))
#else
#define aligned_ptr(value) (uintptr_t *)(value)
#endif
        s = aligned_ptr(p);
        /* t bounds the word loop so that every word read ends at or
         * before e */
        t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
#undef aligned_ptr
        for (;s < t; s++) {
            if (*s & NONASCII_MASK) {
                /* locate the first flagged byte inside the word; which
                 * end of the word comes first in memory depends on the
                 * byte order */
#ifdef WORDS_BIGENDIAN
                return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
#else
                return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
#endif
            }
        }
        p = (const char *)s;
    }

    /* Check the remaining bytes (fewer than a word), indexing backwards
     * from e; again every case falls through to the next one. */
    switch (e - p) {
      default: UNREACHABLE;
#if SIZEOF_VOIDP > 4
      case 7: if (e[-7]&0x80) return e-7;
      case 6: if (e[-6]&0x80) return e-6;
      case 5: if (e[-5]&0x80) return e-5;
      case 4: if (e[-4]&0x80) return e-4;
#endif
      case 3: if (e[-3]&0x80) return e-3;
      case 2: if (e[-2]&0x80) return e-2;
      case 1: if (e[-1]&0x80) return e-1;
      case 0: return NULL;
    }
}
729 
730 static int
731 coderange_scan(const char *p, long len, rb_encoding *enc)
732 {
733  const char *e = p + len;
734 
735  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
736  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
737  p = search_nonascii(p, e);
739  }
740 
741  if (rb_enc_asciicompat(enc)) {
742  p = search_nonascii(p, e);
743  if (!p) return ENC_CODERANGE_7BIT;
744  for (;;) {
745  int ret = rb_enc_precise_mbclen(p, e, enc);
746  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
747  p += MBCLEN_CHARFOUND_LEN(ret);
748  if (p == e) break;
749  p = search_nonascii(p, e);
750  if (!p) break;
751  }
752  }
753  else {
754  while (p < e) {
755  int ret = rb_enc_precise_mbclen(p, e, enc);
756  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
757  p += MBCLEN_CHARFOUND_LEN(ret);
758  }
759  }
760  return ENC_CODERANGE_VALID;
761 }
762 
763 long
764 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
765 {
766  const char *p = s;
767 
768  if (*cr == ENC_CODERANGE_BROKEN)
769  return e - s;
770 
771  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
772  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
773  if (*cr == ENC_CODERANGE_VALID) return e - s;
774  p = search_nonascii(p, e);
776  return e - s;
777  }
778  else if (rb_enc_asciicompat(enc)) {
779  p = search_nonascii(p, e);
780  if (!p) {
781  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
782  return e - s;
783  }
784  for (;;) {
785  int ret = rb_enc_precise_mbclen(p, e, enc);
786  if (!MBCLEN_CHARFOUND_P(ret)) {
788  return p - s;
789  }
790  p += MBCLEN_CHARFOUND_LEN(ret);
791  if (p == e) break;
792  p = search_nonascii(p, e);
793  if (!p) break;
794  }
795  }
796  else {
797  while (p < e) {
798  int ret = rb_enc_precise_mbclen(p, e, enc);
799  if (!MBCLEN_CHARFOUND_P(ret)) {
801  return p - s;
802  }
803  p += MBCLEN_CHARFOUND_LEN(ret);
804  }
805  }
806  *cr = ENC_CODERANGE_VALID;
807  return e - s;
808 }
809 
810 static inline void
811 str_enc_copy(VALUE str1, VALUE str2)
812 {
813  rb_enc_set_index(str1, ENCODING_GET(str2));
814 }
815 
816 /* Like str_enc_copy, but does not check frozen status of str1.
817  * You should use this only if you're certain that str1 is not frozen. */
818 static inline void
819 str_enc_copy_direct(VALUE str1, VALUE str2)
820 {
821  int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
822  if (inlined_encoding == ENCODING_INLINE_MAX) {
823  rb_enc_set_index(str1, rb_enc_get_index(str2));
824  }
825  else {
826  ENCODING_SET_INLINED(str1, inlined_encoding);
827  }
828 }
829 
830 static void
831 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
832 {
833  /* this function is designed for copying encoding and coderange
834  * from src to new string "dest" which is made from the part of src.
835  */
836  str_enc_copy(dest, src);
837  if (RSTRING_LEN(dest) == 0) {
838  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
840  else
842  return;
843  }
844  switch (ENC_CODERANGE(src)) {
845  case ENC_CODERANGE_7BIT:
847  break;
848  case ENC_CODERANGE_VALID:
849  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
850  search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
852  else
854  break;
855  default:
856  break;
857  }
858 }
859 
860 static void
861 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
862 {
863  str_enc_copy(dest, src);
864  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
865 }
866 
867 static int
868 enc_coderange_scan(VALUE str, rb_encoding *enc)
869 {
870  return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
871 }
872 
873 int
874 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
875 {
876  return enc_coderange_scan(str, enc);
877 }
878 
879 int
881 {
882  int cr = ENC_CODERANGE(str);
883 
884  if (cr == ENC_CODERANGE_UNKNOWN) {
885  cr = enc_coderange_scan(str, get_encoding(str));
886  ENC_CODERANGE_SET(str, cr);
887  }
888  return cr;
889 }
890 
891 static inline bool
892 rb_enc_str_asciicompat(VALUE str)
893 {
894  int encindex = ENCODING_GET_INLINED(str);
895  return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
896 }
897 
898 int
900 {
901  switch(ENC_CODERANGE(str)) {
903  return rb_enc_str_asciicompat(str) && is_ascii_string(str);
904  case ENC_CODERANGE_7BIT:
905  return true;
906  default:
907  return false;
908  }
909 }
910 
911 static inline void
912 str_mod_check(VALUE s, const char *p, long len)
913 {
914  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
915  rb_raise(rb_eRuntimeError, "string modified");
916  }
917 }
918 
919 static size_t
920 str_capacity(VALUE str, const int termlen)
921 {
922  if (STR_EMBED_P(str)) {
923  return str_embed_capa(str) - termlen;
924  }
925  else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
926  return RSTRING(str)->len;
927  }
928  else {
929  return RSTRING(str)->as.heap.aux.capa;
930  }
931 }
932 
933 size_t
935 {
936  return str_capacity(str, TERM_LEN(str));
937 }
938 
939 static inline void
940 must_not_null(const char *ptr)
941 {
942  if (!ptr) {
943  rb_raise(rb_eArgError, "NULL pointer given");
944  }
945 }
946 
947 static inline VALUE
948 str_alloc_embed(VALUE klass, size_t capa)
949 {
950  size_t size = rb_str_embed_size(capa);
951  RUBY_ASSERT(size > 0);
952  RUBY_ASSERT(rb_gc_size_allocatable_p(size));
953 
954  NEWOBJ_OF(str, struct RString, klass,
956 
957  return (VALUE)str;
958 }
959 
960 static inline VALUE
961 str_alloc_heap(VALUE klass)
962 {
963  NEWOBJ_OF(str, struct RString, klass,
964  T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
965 
966  return (VALUE)str;
967 }
968 
969 static inline VALUE
970 empty_str_alloc(VALUE klass)
971 {
972  RUBY_DTRACE_CREATE_HOOK(STRING, 0);
973  VALUE str = str_alloc_embed(klass, 0);
974  memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
976  return str;
977 }
978 
979 static VALUE
980 str_new0(VALUE klass, const char *ptr, long len, int termlen)
981 {
982  VALUE str;
983 
984  if (len < 0) {
985  rb_raise(rb_eArgError, "negative string size (or size too big)");
986  }
987 
988  RUBY_DTRACE_CREATE_HOOK(STRING, len);
989 
990  if (STR_EMBEDDABLE_P(len, termlen)) {
991  str = str_alloc_embed(klass, len + termlen);
992  if (len == 0) {
994  }
995  }
996  else {
997  str = str_alloc_heap(klass);
998  RSTRING(str)->as.heap.aux.capa = len;
999  /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1000  * integer overflow. If we can STATIC_ASSERT that, the following
1001  * mul_add_mul can be reverted to a simple ALLOC_N. */
1002  RSTRING(str)->as.heap.ptr =
1003  rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1004  }
1005  if (ptr) {
1006  memcpy(RSTRING_PTR(str), ptr, len);
1007  }
1008  STR_SET_LEN(str, len);
1009  TERM_FILL(RSTRING_PTR(str) + len, termlen);
1010  return str;
1011 }
1012 
1013 static VALUE
1014 str_new(VALUE klass, const char *ptr, long len)
1015 {
1016  return str_new0(klass, ptr, len, 1);
1017 }
1018 
1019 VALUE
1020 rb_str_new(const char *ptr, long len)
1021 {
1022  return str_new(rb_cString, ptr, len);
1023 }
1024 
1025 VALUE
1026 rb_usascii_str_new(const char *ptr, long len)
1027 {
1028  VALUE str = rb_str_new(ptr, len);
1030  return str;
1031 }
1032 
1033 VALUE
1034 rb_utf8_str_new(const char *ptr, long len)
1035 {
1036  VALUE str = str_new(rb_cString, ptr, len);
1038  return str;
1039 }
1040 
1041 VALUE
1042 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1043 {
1044  VALUE str;
1045 
1046  if (!enc) return rb_str_new(ptr, len);
1047 
1048  str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
1049  rb_enc_associate(str, enc);
1050  return str;
1051 }
1052 
1053 VALUE
1054 rb_str_new_cstr(const char *ptr)
1055 {
1056  must_not_null(ptr);
1057  /* rb_str_new_cstr() can take pointer from non-malloc-generated
1058  * memory regions, and that cannot be detected by the MSAN. Just
1059  * trust the programmer that the argument passed here is a sane C
1060  * string. */
1061  __msan_unpoison_string(ptr);
1062  return rb_str_new(ptr, strlen(ptr));
1063 }
1064 
1065 VALUE
1067 {
1068  VALUE str = rb_str_new_cstr(ptr);
1070  return str;
1071 }
1072 
1073 VALUE
1075 {
1076  VALUE str = rb_str_new_cstr(ptr);
1078  return str;
1079 }
1080 
1081 VALUE
1083 {
1084  must_not_null(ptr);
1085  if (rb_enc_mbminlen(enc) != 1) {
1086  rb_raise(rb_eArgError, "wchar encoding given");
1087  }
1088  return rb_enc_str_new(ptr, strlen(ptr), enc);
1089 }
1090 
1091 static VALUE
1092 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1093 {
1094  VALUE str;
1095 
1096  if (len < 0) {
1097  rb_raise(rb_eArgError, "negative string size (or size too big)");
1098  }
1099 
1100  if (!ptr) {
1101  rb_encoding *enc = rb_enc_get_from_index(encindex);
1102  str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1103  }
1104  else {
1105  RUBY_DTRACE_CREATE_HOOK(STRING, len);
1106  str = str_alloc_heap(klass);
1107  RSTRING(str)->len = len;
1108  RSTRING(str)->as.heap.ptr = (char *)ptr;
1109  RSTRING(str)->as.heap.aux.capa = len;
1110  RBASIC(str)->flags |= STR_NOFREE;
1111  }
1112  rb_enc_associate_index(str, encindex);
1113  return str;
1114 }
1115 
1116 VALUE
1117 rb_str_new_static(const char *ptr, long len)
1118 {
1119  return str_new_static(rb_cString, ptr, len, 0);
1120 }
1121 
1122 VALUE
1124 {
1125  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1126 }
1127 
1128 VALUE
1129 rb_utf8_str_new_static(const char *ptr, long len)
1130 {
1131  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1132 }
1133 
1134 VALUE
1135 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1136 {
1137  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1138 }
1139 
1140 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1141  rb_encoding *from, rb_encoding *to,
1142  int ecflags, VALUE ecopts);
1143 
1144 static inline bool
1145 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1146 {
1147  int encidx = rb_enc_to_index(enc);
1148  if (rb_enc_get_index(str) == encidx)
1149  return is_ascii_string(str);
1150  return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1151 }
1152 
1153 VALUE
1154 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1155 {
1156  long len;
1157  const char *ptr;
1158  VALUE newstr;
1159 
1160  if (!to) return str;
1161  if (!from) from = rb_enc_get(str);
1162  if (from == to) return str;
1163  if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1164  rb_is_ascii8bit_enc(to)) {
1165  if (STR_ENC_GET(str) != to) {
1166  str = rb_str_dup(str);
1167  rb_enc_associate(str, to);
1168  }
1169  return str;
1170  }
1171 
1172  RSTRING_GETMEM(str, ptr, len);
1173  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1174  from, to, ecflags, ecopts);
1175  if (NIL_P(newstr)) {
1176  /* some error, return original */
1177  return str;
1178  }
1179  return newstr;
1180 }
1181 
1182 VALUE
1183 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1184  rb_encoding *from, int ecflags, VALUE ecopts)
1185 {
1186  long olen;
1187 
1188  olen = RSTRING_LEN(newstr);
1189  if (ofs < -olen || olen < ofs)
1190  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1191  if (ofs < 0) ofs += olen;
1192  if (!from) {
1193  STR_SET_LEN(newstr, ofs);
1194  return rb_str_cat(newstr, ptr, len);
1195  }
1196 
1197  rb_str_modify(newstr);
1198  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1199  rb_enc_get(newstr),
1200  ecflags, ecopts);
1201 }
1202 
1203 VALUE
1204 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1205 {
1206  STR_SET_LEN(str, 0);
1207  rb_enc_associate(str, enc);
1208  rb_str_cat(str, ptr, len);
1209  return str;
1210 }
1211 
1212 static VALUE
1213 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1214  rb_encoding *from, rb_encoding *to,
1215  int ecflags, VALUE ecopts)
1216 {
1217  rb_econv_t *ec;
1218  rb_econv_result_t ret;
1219  long olen;
1220  VALUE econv_wrapper;
1221  const unsigned char *start, *sp;
1222  unsigned char *dest, *dp;
1223  size_t converted_output = (size_t)ofs;
1224 
1225  olen = rb_str_capacity(newstr);
1226 
1227  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1228  RBASIC_CLEAR_CLASS(econv_wrapper);
1229  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1230  if (!ec) return Qnil;
1231  DATA_PTR(econv_wrapper) = ec;
1232 
1233  sp = (unsigned char*)ptr;
1234  start = sp;
1235  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1236  (dp = dest + converted_output),
1237  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1239  /* destination buffer short */
1240  size_t converted_input = sp - start;
1241  size_t rest = len - converted_input;
1242  converted_output = dp - dest;
1243  rb_str_set_len(newstr, converted_output);
1244  if (converted_input && converted_output &&
1245  rest < (LONG_MAX / converted_output)) {
1246  rest = (rest * converted_output) / converted_input;
1247  }
1248  else {
1249  rest = olen;
1250  }
1251  olen += rest < 2 ? 2 : rest;
1252  rb_str_resize(newstr, olen);
1253  }
1254  DATA_PTR(econv_wrapper) = 0;
1255  RB_GC_GUARD(econv_wrapper);
1256  rb_econv_close(ec);
1257  switch (ret) {
1258  case econv_finished:
1259  len = dp - (unsigned char*)RSTRING_PTR(newstr);
1260  rb_str_set_len(newstr, len);
1261  rb_enc_associate(newstr, to);
1262  return newstr;
1263 
1264  default:
1265  return Qnil;
1266  }
1267 }
1268 
1269 VALUE
1271 {
1272  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1273 }
1274 
1275 VALUE
1277 {
1278  rb_encoding *ienc;
1279  VALUE str;
1280  const int eidx = rb_enc_to_index(eenc);
1281 
1282  if (!ptr) {
1283  return rb_enc_str_new(ptr, len, eenc);
1284  }
1285 
1286  /* ASCII-8BIT case, no conversion */
1287  if ((eidx == rb_ascii8bit_encindex()) ||
1288  (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1289  return rb_str_new(ptr, len);
1290  }
1291  /* no default_internal or same encoding, no conversion */
1293  if (!ienc || eenc == ienc) {
1294  return rb_enc_str_new(ptr, len, eenc);
1295  }
1296  /* ASCII compatible, and ASCII only string, no conversion in
1297  * default_internal */
1298  if ((eidx == rb_ascii8bit_encindex()) ||
1299  (eidx == rb_usascii_encindex()) ||
1300  (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1301  return rb_enc_str_new(ptr, len, ienc);
1302  }
1303  /* convert from the given encoding to default_internal */
1304  str = rb_enc_str_new(NULL, 0, ienc);
1305  /* when the conversion failed for some reason, just ignore the
1306  * default_internal and result in the given encoding as-is. */
1307  if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1308  rb_str_initialize(str, ptr, len, eenc);
1309  }
1310  return str;
1311 }
1312 
1313 VALUE
1314 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1315 {
1316  int eidx = rb_enc_to_index(eenc);
1317  if (eidx == rb_usascii_encindex() &&
1318  !is_ascii_string(str)) {
1320  return str;
1321  }
1322  rb_enc_associate_index(str, eidx);
1323  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1324 }
1325 
1326 VALUE
1327 rb_external_str_new(const char *ptr, long len)
1328 {
1330 }
1331 
1332 VALUE
1334 {
1336 }
1337 
1338 VALUE
1339 rb_locale_str_new(const char *ptr, long len)
1340 {
1342 }
1343 
1344 VALUE
1346 {
1348 }
1349 
1350 VALUE
1351 rb_filesystem_str_new(const char *ptr, long len)
1352 {
1354 }
1355 
1356 VALUE
1358 {
1360 }
1361 
1362 VALUE
1364 {
1366 }
1367 
1368 VALUE
1370 {
1372 }
1373 
1374 VALUE
1376 {
1377  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1378 }
1379 
1380 static VALUE
1381 str_replace_shared_without_enc(VALUE str2, VALUE str)
1382 {
1383  const int termlen = TERM_LEN(str);
1384  char *ptr;
1385  long len;
1386 
1387  RSTRING_GETMEM(str, ptr, len);
1388  if (str_embed_capa(str2) >= len + termlen) {
1389  char *ptr2 = RSTRING(str2)->as.embed.ary;
1390  STR_SET_EMBED(str2);
1391  memcpy(ptr2, RSTRING_PTR(str), len);
1392  TERM_FILL(ptr2+len, termlen);
1393  }
1394  else {
1395  VALUE root;
1396  if (STR_SHARED_P(str)) {
1397  root = RSTRING(str)->as.heap.aux.shared;
1398  RSTRING_GETMEM(str, ptr, len);
1399  }
1400  else {
1401  root = rb_str_new_frozen(str);
1402  RSTRING_GETMEM(root, ptr, len);
1403  }
1404  RUBY_ASSERT(OBJ_FROZEN(root));
1405 
1406  if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1407  if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1408  rb_fatal("about to free a possible shared root");
1409  }
1410  char *ptr2 = STR_HEAP_PTR(str2);
1411  if (ptr2 != ptr) {
1412  ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1413  }
1414  }
1415  FL_SET(str2, STR_NOEMBED);
1416  RSTRING(str2)->as.heap.ptr = ptr;
1417  STR_SET_SHARED(str2, root);
1418  }
1419 
1420  STR_SET_LEN(str2, len);
1421 
1422  return str2;
1423 }
1424 
1425 static VALUE
1426 str_replace_shared(VALUE str2, VALUE str)
1427 {
1428  str_replace_shared_without_enc(str2, str);
1429  rb_enc_cr_str_exact_copy(str2, str);
1430  return str2;
1431 }
1432 
1433 static VALUE
1434 str_new_shared(VALUE klass, VALUE str)
1435 {
1436  return str_replace_shared(str_alloc_heap(klass), str);
1437 }
1438 
1439 VALUE
1441 {
1442  return str_new_shared(rb_obj_class(str), str);
1443 }
1444 
1445 VALUE
1447 {
1448  if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1449  return str_new_frozen(rb_obj_class(orig), orig);
1450 }
1451 
1452 static VALUE
1453 rb_str_new_frozen_String(VALUE orig)
1454 {
1455  if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1456  return str_new_frozen(rb_cString, orig);
1457 }
1458 
1459 VALUE
1460 rb_str_tmp_frozen_acquire(VALUE orig)
1461 {
1462  if (OBJ_FROZEN_RAW(orig)) return orig;
1463  return str_new_frozen_buffer(0, orig, FALSE);
1464 }
1465 
1466 VALUE
1467 rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1468 {
1469  if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1470  if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1471 
1472  VALUE str = str_alloc_heap(0);
1473  OBJ_FREEZE(str);
1474  /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1475  FL_SET(str, STR_SHARED_ROOT);
1476 
1477  size_t capa = str_capacity(orig, TERM_LEN(orig));
1478 
1479  /* If the string is embedded then we want to create a copy that is heap
1480  * allocated. If the string is shared then the shared root must be
1481  * embedded, so we want to create a copy. If the string is a shared root
1482  * then it must be embedded, so we want to create a copy. */
1483  if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1484  RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1485  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1486  }
1487  else {
1488  /* orig must be heap allocated and not shared, so we can safely transfer
1489  * the pointer to str. */
1490  RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1491  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1492  RBASIC(orig)->flags &= ~STR_NOFREE;
1493  STR_SET_SHARED(orig, str);
1494  }
1495 
1496  RSTRING(str)->len = RSTRING(orig)->len;
1497  RSTRING(str)->as.heap.aux.capa = capa;
1498 
1499  return str;
1500 }
1501 
1502 void
1503 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1504 {
1505  if (RBASIC_CLASS(tmp) != 0)
1506  return;
1507 
1508  if (STR_EMBED_P(tmp)) {
1510  }
1511  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1512  !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1513  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1514 
1515  if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1516  RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1517  RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1518 
1519  /* Unshare orig since the root (tmp) only has this one child. */
1520  FL_UNSET_RAW(orig, STR_SHARED);
1521  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1522  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1524 
1525  /* Make tmp embedded and empty so it is safe for sweeping. */
1526  STR_SET_EMBED(tmp);
1527  STR_SET_LEN(tmp, 0);
1528  }
1529  }
1530 }
1531 
1532 static VALUE
1533 str_new_frozen(VALUE klass, VALUE orig)
1534 {
1535  return str_new_frozen_buffer(klass, orig, TRUE);
1536 }
1537 
1538 static VALUE
1539 heap_str_make_shared(VALUE klass, VALUE orig)
1540 {
1541  RUBY_ASSERT(!STR_EMBED_P(orig));
1542  RUBY_ASSERT(!STR_SHARED_P(orig));
1543 
1544  VALUE str = str_alloc_heap(klass);
1545  STR_SET_LEN(str, RSTRING_LEN(orig));
1546  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1547  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1548  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1549  RBASIC(orig)->flags &= ~STR_NOFREE;
1550  STR_SET_SHARED(orig, str);
1551  if (klass == 0)
1552  FL_UNSET_RAW(str, STR_BORROWED);
1553  return str;
1554 }
1555 
1556 static VALUE
1557 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1558 {
1559  VALUE str;
1560 
1561  long len = RSTRING_LEN(orig);
1562  int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1563 
1564  if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1565  str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1566  RUBY_ASSERT(STR_EMBED_P(str));
1567  }
1568  else {
1569  if (FL_TEST_RAW(orig, STR_SHARED)) {
1570  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1571  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1572  long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1573  RUBY_ASSERT(ofs >= 0);
1574  RUBY_ASSERT(rest >= 0);
1575  RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1577 
1578  if ((ofs > 0) || (rest > 0) ||
1579  (klass != RBASIC(shared)->klass) ||
1580  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1581  str = str_new_shared(klass, shared);
1582  RUBY_ASSERT(!STR_EMBED_P(str));
1583  RSTRING(str)->as.heap.ptr += ofs;
1584  STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1585  }
1586  else {
1587  if (RBASIC_CLASS(shared) == 0)
1588  FL_SET_RAW(shared, STR_BORROWED);
1589  return shared;
1590  }
1591  }
1592  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1593  str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1594  STR_SET_EMBED(str);
1595  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1596  STR_SET_LEN(str, RSTRING_LEN(orig));
1597  ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1598  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1599  }
1600  else {
1601  str = heap_str_make_shared(klass, orig);
1602  }
1603  }
1604 
1605  if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1606  OBJ_FREEZE(str);
1607  return str;
1608 }
1609 
1610 VALUE
1611 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1612 {
1613  return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1614 }
1615 
1616 static VALUE
1617 str_new_empty_String(VALUE str)
1618 {
1619  VALUE v = rb_str_new(0, 0);
1620  rb_enc_copy(v, str);
1621  return v;
1622 }
1623 
/* Smallest capacity rb_str_init() will use for an explicit capacity: request. */
#define STR_BUF_MIN_SIZE 63
1625 
1626 VALUE
1628 {
1629  if (STR_EMBEDDABLE_P(capa, 1)) {
1630  return str_alloc_embed(rb_cString, capa + 1);
1631  }
1632 
1633  VALUE str = str_alloc_heap(rb_cString);
1634 
1635  RSTRING(str)->as.heap.aux.capa = capa;
1636  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1637  RSTRING(str)->as.heap.ptr[0] = '\0';
1638 
1639  return str;
1640 }
1641 
1642 VALUE
1644 {
1645  VALUE str;
1646  long len = strlen(ptr);
1647 
1648  str = rb_str_buf_new(len);
1649  rb_str_buf_cat(str, ptr, len);
1650 
1651  return str;
1652 }
1653 
1654 VALUE
1656 {
1657  return str_new(0, 0, len);
1658 }
1659 
1660 void
1662 {
1663  if (FL_TEST(str, RSTRING_FSTR)) {
1664  st_data_t fstr = (st_data_t)str;
1665 
1666  RB_VM_LOCK_ENTER();
1667  {
1668  st_delete(rb_vm_fstring_table(), &fstr, NULL);
1669  RB_DEBUG_COUNTER_INC(obj_str_fstr);
1670  }
1671  RB_VM_LOCK_LEAVE();
1672  }
1673 
1674  if (STR_EMBED_P(str)) {
1675  RB_DEBUG_COUNTER_INC(obj_str_embed);
1676  }
1677  else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1678  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1679  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1680  }
1681  else {
1682  RB_DEBUG_COUNTER_INC(obj_str_ptr);
1683  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1684  }
1685 }
1686 
1687 size_t
1688 rb_str_memsize(VALUE str)
1689 {
1690  if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1691  return STR_HEAP_SIZE(str);
1692  }
1693  else {
1694  return 0;
1695  }
1696 }
1697 
1698 VALUE
1700 {
1701  return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1702 }
1703 
1704 static inline void str_discard(VALUE str);
1705 static void str_shared_replace(VALUE str, VALUE str2);
1706 
1707 void
1709 {
1710  if (str != str2) str_shared_replace(str, str2);
1711 }
1712 
1713 static void
1714 str_shared_replace(VALUE str, VALUE str2)
1715 {
1716  rb_encoding *enc;
1717  int cr;
1718  int termlen;
1719 
1720  RUBY_ASSERT(str2 != str);
1721  enc = STR_ENC_GET(str2);
1722  cr = ENC_CODERANGE(str2);
1723  str_discard(str);
1724  termlen = rb_enc_mbminlen(enc);
1725 
1726  STR_SET_LEN(str, RSTRING_LEN(str2));
1727 
1728  if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1729  STR_SET_EMBED(str);
1730  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1731  rb_enc_associate(str, enc);
1732  ENC_CODERANGE_SET(str, cr);
1733  }
1734  else {
1735  if (STR_EMBED_P(str2)) {
1736  RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1737  long len = RSTRING_LEN(str2);
1738  RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1739 
1740  char *new_ptr = ALLOC_N(char, len + termlen);
1741  memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1742  RSTRING(str2)->as.heap.ptr = new_ptr;
1743  STR_SET_LEN(str2, len);
1744  RSTRING(str2)->as.heap.aux.capa = len;
1745  STR_SET_NOEMBED(str2);
1746  }
1747 
1748  STR_SET_NOEMBED(str);
1749  FL_UNSET(str, STR_SHARED);
1750  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1751 
1752  if (FL_TEST(str2, STR_SHARED)) {
1753  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1754  STR_SET_SHARED(str, shared);
1755  }
1756  else {
1757  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1758  }
1759 
1760  /* abandon str2 */
1761  STR_SET_EMBED(str2);
1762  RSTRING_PTR(str2)[0] = 0;
1763  STR_SET_LEN(str2, 0);
1764  rb_enc_associate(str, enc);
1765  ENC_CODERANGE_SET(str, cr);
1766  }
1767 }
1768 
1769 VALUE
1771 {
1772  VALUE str;
1773 
1774  if (RB_TYPE_P(obj, T_STRING)) {
1775  return obj;
1776  }
1777  str = rb_funcall(obj, idTo_s, 0);
1778  return rb_obj_as_string_result(str, obj);
1779 }
1780 
1781 VALUE
1782 rb_obj_as_string_result(VALUE str, VALUE obj)
1783 {
1784  if (!RB_TYPE_P(str, T_STRING))
1785  return rb_any_to_s(obj);
1786  return str;
1787 }
1788 
1789 static VALUE
1790 str_replace(VALUE str, VALUE str2)
1791 {
1792  long len;
1793 
1794  len = RSTRING_LEN(str2);
1795  if (STR_SHARED_P(str2)) {
1796  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1798  STR_SET_NOEMBED(str);
1799  STR_SET_LEN(str, len);
1800  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1801  STR_SET_SHARED(str, shared);
1802  rb_enc_cr_str_exact_copy(str, str2);
1803  }
1804  else {
1805  str_replace_shared(str, str2);
1806  }
1807 
1808  return str;
1809 }
1810 
1811 static inline VALUE
1812 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1813 {
1814  size_t size = rb_str_embed_size(capa);
1815  RUBY_ASSERT(size > 0);
1816  RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1817 
1818  NEWOBJ_OF(str, struct RString, klass,
1820 
1821  return (VALUE)str;
1822 }
1823 
1824 static inline VALUE
1825 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1826 {
1827  NEWOBJ_OF(str, struct RString, klass,
1828  T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1829 
1830  return (VALUE)str;
1831 }
1832 
1833 static inline VALUE
1834 str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1835 {
1836  int encidx = 0;
1837  if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1838  encidx = rb_enc_get_index(str);
1839  flags &= ~ENCODING_MASK;
1840  }
1841  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1842  if (encidx) rb_enc_associate_index(dup, encidx);
1843  return dup;
1844 }
1845 
1846 static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1847 
1848 static inline VALUE
1849 str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1850 {
1851  VALUE flags = FL_TEST_RAW(str, flag_mask);
1852  long len = RSTRING_LEN(str);
1853 
1854  RUBY_ASSERT(STR_EMBED_P(dup));
1855  RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1856  MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1857  STR_SET_LEN(dup, RSTRING_LEN(str));
1858  return str_duplicate_setup_encoding(str, dup, flags);
1859 }
1860 
1861 static inline VALUE
1862 str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1863 {
1864  VALUE flags = FL_TEST_RAW(str, flag_mask);
1865  VALUE root = str;
1866  if (FL_TEST_RAW(str, STR_SHARED)) {
1867  root = RSTRING(str)->as.heap.aux.shared;
1868  }
1869  else if (UNLIKELY(!(flags & FL_FREEZE))) {
1870  root = str = str_new_frozen(klass, str);
1871  flags = FL_TEST_RAW(str, flag_mask);
1872  }
1873  RUBY_ASSERT(!STR_SHARED_P(root));
1875 
1876  RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1877  FL_SET(root, STR_SHARED_ROOT);
1878  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1879  flags |= RSTRING_NOEMBED | STR_SHARED;
1880 
1881  STR_SET_LEN(dup, RSTRING_LEN(str));
1882  return str_duplicate_setup_encoding(str, dup, flags);
1883 }
1884 
1885 static inline VALUE
1886 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1887 {
1888  if (STR_EMBED_P(str)) {
1889  return str_duplicate_setup_embed(klass, str, dup);
1890  }
1891  else {
1892  return str_duplicate_setup_heap(klass, str, dup);
1893  }
1894 }
1895 
1896 static inline VALUE
1897 str_duplicate(VALUE klass, VALUE str)
1898 {
1899  VALUE dup;
1900  if (STR_EMBED_P(str)) {
1901  dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1902  }
1903  else {
1904  dup = str_alloc_heap(klass);
1905  }
1906 
1907  return str_duplicate_setup(klass, str, dup);
1908 }
1909 
1910 VALUE
1912 {
1913  return str_duplicate(rb_obj_class(str), str);
1914 }
1915 
1916 /* :nodoc: */
1917 VALUE
1918 rb_str_dup_m(VALUE str)
1919 {
1920  if (LIKELY(BARE_STRING_P(str))) {
1921  return str_duplicate(rb_obj_class(str), str);
1922  }
1923  else {
1924  return rb_obj_dup(str);
1925  }
1926 }
1927 
1928 VALUE
1930 {
1931  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1932  return str_duplicate(rb_cString, str);
1933 }
1934 
1935 VALUE
1936 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1937 {
1938  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1939  VALUE new_str, klass = rb_cString;
1940 
1941  if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1942  new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1943  str_duplicate_setup_embed(klass, str, new_str);
1944  }
1945  else {
1946  new_str = ec_str_alloc_heap(ec, klass);
1947  str_duplicate_setup_heap(klass, str, new_str);
1948  }
1949  if (chilled) {
1950  STR_CHILL_RAW(new_str);
1951  }
1952  return new_str;
1953 }
1954 
1955 VALUE
1956 rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1957 {
1958  VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1959  if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1960  rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1961  STR_CHILL_RAW(str);
1962  return rb_str_freeze(str);
1963 }
1964 
1965 /*
1966  *
1967  * call-seq:
1968  * String.new(string = '', **opts) -> new_string
1969  *
1970  * :include: doc/string/new.rdoc
1971  *
1972  */
1973 
1974 static VALUE
1975 rb_str_init(int argc, VALUE *argv, VALUE str)
1976 {
1977  static ID keyword_ids[2];
1978  VALUE orig, opt, venc, vcapa;
1979  VALUE kwargs[2];
1980  rb_encoding *enc = 0;
1981  int n;
1982 
1983  if (!keyword_ids[0]) {
1984  keyword_ids[0] = rb_id_encoding();
1985  CONST_ID(keyword_ids[1], "capacity");
1986  }
1987 
1988  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1989  if (!NIL_P(opt)) {
1990  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1991  venc = kwargs[0];
1992  vcapa = kwargs[1];
1993  if (!UNDEF_P(venc) && !NIL_P(venc)) {
1994  enc = rb_to_encoding(venc);
1995  }
1996  if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1997  long capa = NUM2LONG(vcapa);
1998  long len = 0;
1999  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2000 
2001  if (capa < STR_BUF_MIN_SIZE) {
2002  capa = STR_BUF_MIN_SIZE;
2003  }
2004  if (n == 1) {
2005  StringValue(orig);
2006  len = RSTRING_LEN(orig);
2007  if (capa < len) {
2008  capa = len;
2009  }
2010  if (orig == str) n = 0;
2011  }
2012  str_modifiable(str);
2013  if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2014  /* make noembed always */
2015  const size_t size = (size_t)capa + termlen;
2016  const char *const old_ptr = RSTRING_PTR(str);
2017  const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2018  char *new_ptr = ALLOC_N(char, size);
2019  if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2020  memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2021  FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2022  RSTRING(str)->as.heap.ptr = new_ptr;
2023  }
2024  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2025  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2026  (size_t)capa + termlen, STR_HEAP_SIZE(str));
2027  }
2028  STR_SET_LEN(str, len);
2029  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2030  if (n == 1) {
2031  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2032  rb_enc_cr_str_exact_copy(str, orig);
2033  }
2034  FL_SET(str, STR_NOEMBED);
2035  RSTRING(str)->as.heap.aux.capa = capa;
2036  }
2037  else if (n == 1) {
2038  rb_str_replace(str, orig);
2039  }
2040  if (enc) {
2041  rb_enc_associate(str, enc);
2042  ENC_CODERANGE_CLEAR(str);
2043  }
2044  }
2045  else if (n == 1) {
2046  rb_str_replace(str, orig);
2047  }
2048  return str;
2049 }
2050 
2051 /* :nodoc: */
2052 static VALUE
2053 rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2054 {
2055  if (klass != rb_cString) {
2056  return rb_class_new_instance_pass_kw(argc, argv, klass);
2057  }
2058 
2059  static ID keyword_ids[2];
2060  VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2061  VALUE kwargs[2];
2062  rb_encoding *enc = NULL;
2063 
2064  int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2065  if (NIL_P(opt)) {
2066  return rb_class_new_instance_pass_kw(argc, argv, klass);
2067  }
2068 
2069  keyword_ids[0] = rb_id_encoding();
2070  CONST_ID(keyword_ids[1], "capacity");
2071  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2072  encoding = kwargs[0];
2073  capacity = kwargs[1];
2074 
2075  int termlen = 1;
2076 
2077  if (n == 1) {
2078  orig = StringValue(orig);
2079  }
2080  else {
2081  orig = Qnil;
2082  }
2083 
2084  if (UNDEF_P(encoding)) {
2085  if (!NIL_P(orig)) {
2086  encoding = rb_obj_encoding(orig);
2087  }
2088  }
2089 
2090  if (!UNDEF_P(encoding)) {
2091  enc = rb_to_encoding(encoding);
2092  termlen = rb_enc_mbminlen(enc);
2093  }
2094 
2095  // If capacity is nil, we're basically just duping `orig`.
2096  if (UNDEF_P(capacity)) {
2097  if (NIL_P(orig)) {
2098  VALUE empty_str = str_new(klass, "", 0);
2099  if (enc) {
2100  rb_enc_associate(empty_str, enc);
2101  }
2102  return empty_str;
2103  }
2104  VALUE copy = str_duplicate(klass, orig);
2105  rb_enc_associate(copy, enc);
2106  ENC_CODERANGE_CLEAR(copy);
2107  return copy;
2108  }
2109 
2110  long capa = 0;
2111  capa = NUM2LONG(capacity);
2112  if (capa < 0) {
2113  capa = 0;
2114  }
2115 
2116  if (!NIL_P(orig)) {
2117  long orig_capa = rb_str_capacity(orig);
2118  if (orig_capa > capa) {
2119  capa = orig_capa;
2120  }
2121  }
2122 
2123  VALUE str = str_new0(klass, NULL, capa, termlen);
2124  STR_SET_LEN(str, 0);
2125  TERM_FILL(RSTRING_PTR(str), termlen);
2126 
2127  if (enc) {
2128  rb_enc_associate(str, enc);
2129  }
2130 
2131  if (!NIL_P(orig)) {
2132  rb_str_buf_append(str, orig);
2133  }
2134 
2135  return str;
2136 }
2137 
#ifdef NONASCII_MASK
/* True for every byte whose top two bits are not 10. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * A UTF-8 leading byte looks like 0xxxxxxx or 11xxxxxx
 * (see https://en.wikipedia.org/wiki/UTF-8), so per byte:
 *
 *   if (!(byte & 0x80))
 *     byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);  // bit6 now says "leading byte"
 *
 * This function applies that test to all bytes of the word *s at once
 * and returns how many of them are leading bytes.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    const uintptr_t word = *s;

    /* After this, bit0 of each byte is set iff that byte is a leading byte. */
    uintptr_t flags = ((word>>6) | (~word>>7)) & (NONASCII_MASK >> 7);

    /* Sum the per-byte flags. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(flags);
#else
    flags += (flags>>8);
    flags += (flags>>16);
# if SIZEOF_VOIDP == 8
    flags += (flags>>32);
# endif
    return (flags&0xF);
#endif
}
#endif
2177 
2178 static inline long
2179 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2180 {
2181  long c;
2182  const char *q;
2183 
2184  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2185  long diff = (long)(e - p);
2186  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2187  }
2188 #ifdef NONASCII_MASK
2189  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2190  uintptr_t len = 0;
2191  if ((int)sizeof(uintptr_t) * 2 < e - p) {
2192  const uintptr_t *s, *t;
2193  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2194  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2195  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2196  while (p < (const char *)s) {
2197  if (is_utf8_lead_byte(*p)) len++;
2198  p++;
2199  }
2200  while (s < t) {
2201  len += count_utf8_lead_bytes_with_word(s);
2202  s++;
2203  }
2204  p = (const char *)s;
2205  }
2206  while (p < e) {
2207  if (is_utf8_lead_byte(*p)) len++;
2208  p++;
2209  }
2210  return (long)len;
2211  }
2212 #endif
2213  else if (rb_enc_asciicompat(enc)) {
2214  c = 0;
2215  if (ENC_CODERANGE_CLEAN_P(cr)) {
2216  while (p < e) {
2217  if (ISASCII(*p)) {
2218  q = search_nonascii(p, e);
2219  if (!q)
2220  return c + (e - p);
2221  c += q - p;
2222  p = q;
2223  }
2224  p += rb_enc_fast_mbclen(p, e, enc);
2225  c++;
2226  }
2227  }
2228  else {
2229  while (p < e) {
2230  if (ISASCII(*p)) {
2231  q = search_nonascii(p, e);
2232  if (!q)
2233  return c + (e - p);
2234  c += q - p;
2235  p = q;
2236  }
2237  p += rb_enc_mbclen(p, e, enc);
2238  c++;
2239  }
2240  }
2241  return c;
2242  }
2243 
2244  for (c=0; p<e; c++) {
2245  p += rb_enc_mbclen(p, e, enc);
2246  }
2247  return c;
2248 }
2249 
2250 long
2251 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2252 {
2253  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2254 }
2255 
2256 /* To get strlen with cr
2257  * Note that given cr is not used.
2258  */
2259 long
2260 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2261 {
2262  long c;
2263  const char *q;
2264  int ret;
2265 
2266  *cr = 0;
2267  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2268  long diff = (long)(e - p);
2269  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2270  }
2271  else if (rb_enc_asciicompat(enc)) {
2272  c = 0;
2273  while (p < e) {
2274  if (ISASCII(*p)) {
2275  q = search_nonascii(p, e);
2276  if (!q) {
2277  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2278  return c + (e - p);
2279  }
2280  c += q - p;
2281  p = q;
2282  }
2283  ret = rb_enc_precise_mbclen(p, e, enc);
2284  if (MBCLEN_CHARFOUND_P(ret)) {
2285  *cr |= ENC_CODERANGE_VALID;
2286  p += MBCLEN_CHARFOUND_LEN(ret);
2287  }
2288  else {
2289  *cr = ENC_CODERANGE_BROKEN;
2290  p++;
2291  }
2292  c++;
2293  }
2294  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2295  return c;
2296  }
2297 
2298  for (c=0; p<e; c++) {
2299  ret = rb_enc_precise_mbclen(p, e, enc);
2300  if (MBCLEN_CHARFOUND_P(ret)) {
2301  *cr |= ENC_CODERANGE_VALID;
2302  p += MBCLEN_CHARFOUND_LEN(ret);
2303  }
2304  else {
2305  *cr = ENC_CODERANGE_BROKEN;
2306  if (p + rb_enc_mbminlen(enc) <= e)
2307  p += rb_enc_mbminlen(enc);
2308  else
2309  p = e;
2310  }
2311  }
2312  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2313  return c;
2314 }
2315 
2316 /* enc must be str's enc or rb_enc_check(str, str2) */
2317 static long
2318 str_strlen(VALUE str, rb_encoding *enc)
2319 {
2320  const char *p, *e;
2321  int cr;
2322 
2323  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2324  if (!enc) enc = STR_ENC_GET(str);
2325  p = RSTRING_PTR(str);
2326  e = RSTRING_END(str);
2327  cr = ENC_CODERANGE(str);
2328 
2329  if (cr == ENC_CODERANGE_UNKNOWN) {
2330  long n = rb_enc_strlen_cr(p, e, enc, &cr);
2331  if (cr) ENC_CODERANGE_SET(str, cr);
2332  return n;
2333  }
2334  else {
2335  return enc_strlen(p, e, enc, cr);
2336  }
2337 }
2338 
2339 long
2341 {
2342  return str_strlen(str, NULL);
2343 }
2344 
2345 /*
2346  * call-seq:
2347  * length -> integer
2348  *
2349  * :include: doc/string/length.rdoc
2350  *
2351  */
2352 
2353 VALUE
2355 {
2356  return LONG2NUM(str_strlen(str, NULL));
2357 }
2358 
2359 /*
2360  * call-seq:
2361  * bytesize -> integer
2362  *
2363  * :include: doc/string/bytesize.rdoc
2364  *
2365  */
2366 
2367 VALUE
2368 rb_str_bytesize(VALUE str)
2369 {
2370  return LONG2NUM(RSTRING_LEN(str));
2371 }
2372 
2373 /*
2374  * call-seq:
2375  * empty? -> true or false
2376  *
2377  * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2378  *
2379  * "hello".empty? # => false
2380  * " ".empty? # => false
2381  * "".empty? # => true
2382  *
2383  */
2384 
2385 static VALUE
2386 rb_str_empty(VALUE str)
2387 {
2388  return RBOOL(RSTRING_LEN(str) == 0);
2389 }
2390 
2391 /*
2392  * call-seq:
2393  * string + other_string -> new_string
2394  *
2395  * Returns a new +String+ containing +other_string+ concatenated to +self+:
2396  *
2397  * "Hello from " + self.to_s # => "Hello from main"
2398  *
2399  */
2400 
2401 VALUE
2403 {
    /* Builds a new string holding the bytes of str1 followed by those of str2.
     * NOTE(review): the declarator line (embedded line 2402) is missing from
     * this listing; presumably rb_str_plus(VALUE str1, VALUE str2), since the
     * variant that follows calls rb_str_plus(str1, str2) -- confirm. */
2404  VALUE str3;
2405  rb_encoding *enc;
2406  char *ptr1, *ptr2, *ptr3;
2407  long len1, len2;
2408  int termlen;
2409 
    /* Coerce the right operand, then let rb_enc_check_str() pick the
     * encoding used for the result's terminator width. */
2410  StringValue(str2);
2411  enc = rb_enc_check_str(str1, str2);
2412  RSTRING_GETMEM(str1, ptr1, len1);
2413  RSTRING_GETMEM(str2, ptr2, len2);
2414  termlen = rb_enc_mbminlen(enc);
    /* Overflow check is done by subtraction so that len1+len2 is never
     * computed when it would not fit in a long. */
2415  if (len1 > LONG_MAX - len2) {
2416  rb_raise(rb_eArgError, "string size too big");
2417  }
    /* Allocate len1+len2 bytes, copy both operands back to back and write
     * a termlen-byte terminator after them. */
2418  str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2419  ptr3 = RSTRING_PTR(str3);
2420  memcpy(ptr3, ptr1, len1);
2421  memcpy(ptr3+len1, ptr2, len2);
2422  TERM_FILL(&ptr3[len1+len2], termlen);
2423 
    /* NOTE(review): embedded lines 2424-2425 are missing from this listing;
     * as shown, nothing here sets the encoding/coderange of str3. Restore the
     * dropped statements from upstream. */
    /* ptr1/ptr2 point into the operands' buffers; the guards keep the
     * operands referenced up to this point. */
2426  RB_GC_GUARD(str1);
2427  RB_GC_GUARD(str2);
2428  return str3;
2429 }
2430 
2431 /* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2432 VALUE
2433 rb_str_opt_plus(VALUE str1, VALUE str2)
2434 {
    /* NOTE(review): embedded lines 2435-2436 are missing from this listing;
     * restore them from upstream. */
2437  long len1, len2;
2438  MAYBE_UNUSED(char) *ptr1, *ptr2;
2439  RSTRING_GETMEM(str1, ptr1, len1);
2440  RSTRING_GETMEM(str2, ptr2, len2);
2441  int enc1 = rb_enc_get_index(str1);
2442  int enc2 = rb_enc_get_index(str2);
2443 
    /* Bail out with Qundef when either encoding index is negative, when the
     * two indexes differ, or when the combined length would overflow a long;
     * only the remaining case is forwarded to rb_str_plus(). */
2444  if (enc1 < 0) {
2445  return Qundef;
2446  }
2447  else if (enc2 < 0) {
2448  return Qundef;
2449  }
2450  else if (enc1 != enc2) {
2451  return Qundef;
2452  }
2453  else if (len1 > LONG_MAX - len2) {
2454  return Qundef;
2455  }
2456  else {
2457  return rb_str_plus(str1, str2);
2458  }
2459 
2460 }
2461 
2462 /*
2463  * call-seq:
2464  * string * integer -> new_string
2465  *
2466  * Returns a new +String+ containing +integer+ copies of +self+:
2467  *
2468  * "Ho! " * 3 # => "Ho! Ho! Ho! "
2469  * "Ho! " * 0 # => ""
2470  *
2471  */
2472 
2473 VALUE
2475 {
    /* Builds a new string made of `times` copies of str.
     * NOTE(review): the declarator line (embedded line 2474) is missing from
     * this listing; the body uses parameters named `str` and `times`. */
2476  VALUE str2;
2477  long n, len;
2478  char *ptr2;
2479  int termlen;
2480 
    /* Fixnum 1: a duplicate of the receiver. */
2481  if (times == INT2FIX(1)) {
2482  return str_duplicate(rb_cString, str);
2483  }
    /* Fixnum 0: an empty embedded string that takes over str's encoding. */
2484  if (times == INT2FIX(0)) {
2485  str2 = str_alloc_embed(rb_cString, 0);
2486  rb_enc_copy(str2, str);
2487  return str2;
2488  }
2489  len = NUM2LONG(times);
2490  if (len < 0) {
2491  rb_raise(rb_eArgError, "negative argument");
2492  }
    /* Special case: the receiver is a single zero byte. The result is `len`
     * zero bytes, so a zero-filled buffer is produced directly (memset for the
     * embedded form, ZALLOC_N for the heap form) instead of copying. */
2493  if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2494  if (STR_EMBEDDABLE_P(len, 1)) {
2495  str2 = str_alloc_embed(rb_cString, len + 1);
2496  memset(RSTRING_PTR(str2), 0, len + 1);
2497  }
2498  else {
2499  str2 = str_alloc_heap(rb_cString);
2500  RSTRING(str2)->as.heap.aux.capa = len;
2501  RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2502  }
2503  STR_SET_LEN(str2, len);
2504  rb_enc_copy(str2, str);
2505  return str2;
2506  }
    /* Division-based check: refuse before len * RSTRING_LEN(str) can
     * overflow a long. */
2507  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2508  rb_raise(rb_eArgError, "argument too big");
2509  }
2510 
    /* From here on `len` is the result's byte length, not the repeat count. */
2511  len *= RSTRING_LEN(str);
2512  termlen = TERM_LEN(str);
2513  str2 = str_new0(rb_cString, 0, len, termlen);
2514  ptr2 = RSTRING_PTR(str2);
2515  if (len) {
    /* Copy one instance, then keep doubling the filled prefix by copying
     * it onto itself while it fits in half the result; a final memcpy
     * fills whatever remains (len-n bytes). */
2516  n = RSTRING_LEN(str);
2517  memcpy(ptr2, RSTRING_PTR(str), n);
2518  while (n <= len/2) {
2519  memcpy(ptr2 + n, ptr2, n);
2520  n *= 2;
2521  }
2522  memcpy(ptr2 + n, ptr2, len-n);
2523  }
2524  STR_SET_LEN(str2, len);
2525  TERM_FILL(&ptr2[len], termlen);
2526  rb_enc_cr_str_copy_for_substr(str2, str);
2527 
2528  return str2;
2529 }
2530 
2531 /*
2532  * call-seq:
2533  * string % object -> new_string
2534  *
2535  * Returns the result of formatting +object+ into the format specification +self+
2536  * (see Kernel#sprintf for formatting details):
2537  *
2538  * "%05d" % 123 # => "00123"
2539  *
2540  * If +self+ contains multiple substitutions, +object+ must be
2541  * an Array or Hash containing the values to be substituted:
2542  *
2543  * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2544  * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2545  * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2546  *
2547  */
2548 
2549 static VALUE
2550 rb_str_format_m(VALUE str, VALUE arg)
2551 {
2552  VALUE tmp = rb_check_array_type(arg);
2553 
2554  if (!NIL_P(tmp)) {
2555  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2556  }
2557  return rb_str_format(1, &arg, str);
2558 }
2559 
2560 static inline void
2561 rb_check_lockedtmp(VALUE str)
2562 {
2563  if (FL_TEST(str, STR_TMPLOCK)) {
2564  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2565  }
2566 }
2567 
2568 // If none of these flags are set, we know we have an modifiable string.
2569 // If any is set, we need to do more detailed checks.
2570 #define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2571 static inline void
2572 str_modifiable(VALUE str)
2573 {
2574  if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2575  if (CHILLED_STRING_P(str)) {
2576  CHILLED_STRING_MUTATED(str);
2577  }
2578  rb_check_lockedtmp(str);
2579  rb_check_frozen(str);
2580  }
2581 }
2582 
2583 static inline int
2584 str_dependent_p(VALUE str)
2585 {
2586  if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2587  return FALSE;
2588  }
2589  else {
2590  return TRUE;
2591  }
2592 }
2593 
2594 // If none of these flags are set, we know we have an independent string.
2595 // If any is set, we need to do more detailed checks.
2596 #define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2597 static inline int
2598 str_independent(VALUE str)
2599 {
2600  if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2601  str_modifiable(str);
2602  return !str_dependent_p(str);
2603  }
2604  return TRUE;
2605 }
2606 
2607 static void
2608 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2609 {
2610  char *ptr;
2611  char *oldptr;
2612  long capa = len + expand;
2613 
2614  if (len > capa) len = capa;
2615 
2616  if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2617  ptr = RSTRING(str)->as.heap.ptr;
2618  STR_SET_EMBED(str);
2619  memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2620  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2621  STR_SET_LEN(str, len);
2622  return;
2623  }
2624 
2625  ptr = ALLOC_N(char, (size_t)capa + termlen);
2626  oldptr = RSTRING_PTR(str);
2627  if (oldptr) {
2628  memcpy(ptr, oldptr, len);
2629  }
2630  if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2631  xfree(oldptr);
2632  }
2633  STR_SET_NOEMBED(str);
2634  FL_UNSET(str, STR_SHARED|STR_NOFREE);
2635  TERM_FILL(ptr + len, termlen);
2636  RSTRING(str)->as.heap.ptr = ptr;
2637  STR_SET_LEN(str, len);
2638  RSTRING(str)->as.heap.aux.capa = capa;
2639 }
2640 
/* Makes str independent when str_independent() reports that it is not, then
 * clears the cached coderange.
 * NOTE(review): the declarator line (embedded line 2642) is missing from this
 * listing; presumably rb_str_modify(VALUE str), which a later comment in this
 * file refers to -- confirm. */
2641 void
2643 {
2644  if (!str_independent(str))
2645  str_make_independent(str);
2646  ENC_CODERANGE_CLEAR(str);
2647 }
2648 
2649 void
2650 rb_str_modify_expand(VALUE str, long expand)
2651 {
2652  int termlen = TERM_LEN(str);
2653  long len = RSTRING_LEN(str);
2654 
2655  if (expand < 0) {
2656  rb_raise(rb_eArgError, "negative expanding string size");
2657  }
2658  if (expand >= LONG_MAX - len) {
2659  rb_raise(rb_eArgError, "string size too big");
2660  }
2661 
2662  if (!str_independent(str)) {
2663  str_make_independent_expand(str, len, expand, termlen);
2664  }
2665  else if (expand > 0) {
2666  RESIZE_CAPA_TERM(str, len + expand, termlen);
2667  }
2668  ENC_CODERANGE_CLEAR(str);
2669 }
2670 
2671 /* As rb_str_modify(), but don't clear coderange */
2672 static void
2673 str_modify_keep_cr(VALUE str)
2674 {
2675  if (!str_independent(str))
2676  str_make_independent(str);
2677  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2678  /* Force re-scan later */
2679  ENC_CODERANGE_CLEAR(str);
2680 }
2681 
2682 static inline void
2683 str_discard(VALUE str)
2684 {
2685  str_modifiable(str);
2686  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2687  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2688  RSTRING(str)->as.heap.ptr = 0;
2689  STR_SET_LEN(str, 0);
2690  }
2691 }
2692 
/* Raises TypeError when rb_enc_get() yields no encoding for str, and
 * EncCompatError when that encoding is not ASCII-compatible; returns
 * normally otherwise.
 * NOTE(review): the declarator line (embedded line 2694) is missing from this
 * listing; restore the name and parameter list from upstream. */
2693 void
2695 {
2696  rb_encoding *enc = rb_enc_get(str);
2697  if (!enc) {
2698  rb_raise(rb_eTypeError, "not encoding capable object");
2699  }
2700  if (!rb_enc_asciicompat(enc)) {
2701  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2702  }
2703 }
2704 
/* Ensures *ptr holds a T_STRING: a non-string is converted with
 * rb_str_to_str() and the converted object is written back through ptr.
 * Returns the (possibly converted) string.
 * NOTE(review): the declarator line (embedded line 2706) is missing from this
 * listing; presumably rb_string_value, which the following function calls
 * with a `ptr` argument -- confirm. */
2705 VALUE
2707 {
2708  VALUE s = *ptr;
2709  if (!RB_TYPE_P(s, T_STRING)) {
2710  s = rb_str_to_str(s);
2711  *ptr = s;
2712  }
2713  return s;
2714 }
2715 
/* Runs rb_string_value() on ptr and returns the byte pointer of the
 * resulting string.
 * NOTE(review): the declarator line (embedded line 2717) is missing from this
 * listing; restore the name and parameter list from upstream. */
2716 char *
2718 {
2719  VALUE str = rb_string_value(ptr);
2720  return RSTRING_PTR(str);
2721 }
2722 
/* Returns 1 when the first n bytes at s are all zero (also for n <= 0),
 * and 0 as soon as a non-zero byte is seen. */
static int
zero_filled(const char *s, int n)
{
    const char *cursor = s;
    int remaining = n;

    while (remaining > 0) {
        if (*cursor != '\0') {
            return 0;
        }
        cursor++;
        remaining--;
    }
    return 1;
}
2731 
2732 static const char *
2733 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2734 {
2735  const char *e = s + len;
2736 
2737  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2738  if (zero_filled(s, minlen)) return s;
2739  }
2740  return 0;
2741 }
2742 
2743 static char *
2744 str_fill_term(VALUE str, char *s, long len, int termlen)
2745 {
2746  /* This function assumes that (capa + termlen) bytes of memory
2747  * is allocated, like many other functions in this file.
2748  */
2749  if (str_dependent_p(str)) {
2750  if (!zero_filled(s + len, termlen))
2751  str_make_independent_expand(str, len, 0L, termlen);
2752  }
2753  else {
2754  TERM_FILL(s + len, termlen);
2755  return s;
2756  }
2757  return RSTRING_PTR(str);
2758 }
2759 
2760 void
2761 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2762 {
2763  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2764  long len = RSTRING_LEN(str);
2765 
2766  RUBY_ASSERT(capa >= len);
2767  if (capa - len < termlen) {
2768  rb_check_lockedtmp(str);
2769  str_make_independent_expand(str, len, 0L, termlen);
2770  }
2771  else if (str_dependent_p(str)) {
2772  if (termlen > oldtermlen)
2773  str_make_independent_expand(str, len, 0L, termlen);
2774  }
2775  else {
2776  if (!STR_EMBED_P(str)) {
2777  /* modify capa instead of realloc */
2778  RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2779  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2780  }
2781  if (termlen > oldtermlen) {
2782  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2783  }
2784  }
2785 
2786  return;
2787 }
2788 
2789 static char *
2790 str_null_check(VALUE str, int *w)
2791 {
2792  char *s = RSTRING_PTR(str);
2793  long len = RSTRING_LEN(str);
2794  rb_encoding *enc = rb_enc_get(str);
2795  const int minlen = rb_enc_mbminlen(enc);
2796 
2797  if (minlen > 1) {
2798  *w = 1;
2799  if (str_null_char(s, len, minlen, enc)) {
2800  return NULL;
2801  }
2802  return str_fill_term(str, s, len, minlen);
2803  }
2804  *w = 0;
2805  if (!s || memchr(s, 0, len)) {
2806  return NULL;
2807  }
2808  if (s[len]) {
2809  s = str_fill_term(str, s, len, minlen);
2810  }
2811  return s;
2812 }
2813 
2814 char *
2815 rb_str_to_cstr(VALUE str)
2816 {
2817  int w;
2818  return str_null_check(str, &w);
2819 }
2820 
/* Converts *ptr to a string via rb_string_value() and returns its byte
 * pointer after str_null_check(). Raises ArgumentError when the check fails;
 * the message depends on the flag reported through w.
 * NOTE(review): the declarator line (embedded line 2822) is missing from this
 * listing; restore the name and parameter list from upstream. */
2821 char *
2823 {
2824  VALUE str = rb_string_value(ptr);
2825  int w;
2826  char *s = str_null_check(str, &w);
2827  if (!s) {
    /* w != 0: the encoding's minimum character length is above one byte. */
2828  if (w) {
2829  rb_raise(rb_eArgError, "string contains null char");
2830  }
2831  rb_raise(rb_eArgError, "string contains null byte");
2832  }
2833  return s;
2834 }
2835 
2836 char *
2837 rb_str_fill_terminator(VALUE str, const int newminlen)
2838 {
2839  char *s = RSTRING_PTR(str);
2840  long len = RSTRING_LEN(str);
2841  return str_fill_term(str, s, len, newminlen);
2842 }
2843 
/* Returns the result of rb_check_convert_type_with_id() for T_STRING using
 * idTo_str.
 * NOTE(review): the declarator line (embedded line 2845) is missing from this
 * listing; presumably rb_check_string_type(VALUE str), which
 * rb_str_s_try_convert below calls -- confirm. */
2844 VALUE
2846 {
2847  str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2848  return str;
2849 }
2850 
2851 /*
2852  * call-seq:
2853  * String.try_convert(object) -> object, new_string, or nil
2854  *
2855  * If +object+ is a +String+ object, returns +object+.
2856  *
2857  * Otherwise if +object+ responds to <tt>:to_str</tt>,
2858  * calls <tt>object.to_str</tt> and returns the result.
2859  *
2860  * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2861  *
2862  * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2863  */
2864 static VALUE
2865 rb_str_s_try_convert(VALUE dummy, VALUE str)
2866 {
2867  return rb_check_string_type(str);
2868 }
2869 
/* Advances from p by up to *nthp characters without going past e and returns
 * the resulting position. The count left over is written back through nthp;
 * in the two fixed-width branches nth is never decremented, so *nthp is
 * written back unchanged there. */
2870 static char*
2871 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2872 {
2873  long nth = *nthp;
    /* Single-byte encoding: characters and bytes coincide. */
2874  if (rb_enc_mbmaxlen(enc) == 1) {
2875  p += nth;
2876  }
    /* Fixed-width encoding: plain multiplication. */
2877  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2878  p += nth * rb_enc_mbmaxlen(enc);
2879  }
2880  else if (rb_enc_asciicompat(enc)) {
2881  const char *p2, *e2;
2882  int n;
2883 
2884  while (p < e && 0 < nth) {
    /* e2: where we would land if every remaining character were one
     * byte. If that is already past e, the request cannot be met. */
2885  e2 = p + nth;
2886  if (e < e2) {
2887  *nthp = nth;
2888  return (char *)e;
2889  }
2890  if (ISASCII(*p)) {
    /* Skip a run of ASCII bytes in one step; each counts as one
     * character. */
2891  p2 = search_nonascii(p, e2);
2892  if (!p2) {
2893  nth -= e2 - p;
2894  *nthp = nth;
2895  return (char *)e2;
2896  }
2897  nth -= p2 - p;
2898  p = p2;
2899  }
    /* p is at a non-ASCII byte here: step over one character. */
2900  n = rb_enc_mbclen(p, e, enc);
2901  p += n;
2902  nth--;
2903  }
2904  *nthp = nth;
2905  if (nth != 0) {
2906  return (char *)e;
2907  }
2908  return (char *)p;
2909  }
2910  else {
    /* Generic variable-width encoding: one character at a time. */
2911  while (p < e && nth--) {
2912  p += rb_enc_mbclen(p, e, enc);
2913  }
2914  }
    /* Clamp to the end for the branches that may have overshot. */
2915  if (p > e) p = e;
2916  *nthp = nth;
2917  return (char*)p;
2918 }
2919 
2920 char*
2921 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2922 {
2923  return str_nth_len(p, e, &nth, enc);
2924 }
2925 
2926 static char*
2927 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2928 {
2929  if (singlebyte)
2930  p += nth;
2931  else {
2932  p = str_nth_len(p, e, &nth, enc);
2933  }
2934  if (!p) return 0;
2935  if (p > e) p = e;
2936  return (char *)p;
2937 }
2938 
2939 /* char offset to byte offset */
2940 static long
2941 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2942 {
2943  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2944  if (!pp) return e - p;
2945  return pp - p;
2946 }
2947 
2948 long
2949 rb_str_offset(VALUE str, long pos)
2950 {
2951  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2952  STR_ENC_GET(str), single_byte_optimizable(str));
2953 }
2954 
2955 #ifdef NONASCII_MASK
/* Advances from p over *nthp characters, counting only bytes for which
 * is_utf8_lead_byte() is true, and stops at e at the latest. The leftover
 * count is written back through nthp. */
2956 static char *
2957 str_utf8_nth(const char *p, const char *e, long *nthp)
2958 {
2959  long nth = *nthp;
    /* Word-at-a-time path, taken only when both the input and the requested
     * count exceed two pointer-sized words. */
2960  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2961  const uintptr_t *s, *t;
2962  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
    /* s: p rounded up to a word boundary; t: e rounded down to one. */
2963  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2964  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
    /* Byte-wise up to the first aligned word. */
2965  while (p < (const char *)s) {
2966  if (is_utf8_lead_byte(*p)) nth--;
2967  p++;
2968  }
    /* Whole words, for as long as at least a word's worth of characters
     * is still wanted (so a single word cannot overshoot the count). */
2969  do {
2970  nth -= count_utf8_lead_bytes_with_word(s);
2971  s++;
2972  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2973  p = (char *)s;
2974  }
    /* Byte-wise remainder: stop on the lead byte reached when the count
     * has dropped to zero. */
2975  while (p < e) {
2976  if (is_utf8_lead_byte(*p)) {
2977  if (nth == 0) break;
2978  nth--;
2979  }
2980  p++;
2981  }
2982  *nthp = nth;
2983  return (char *)p;
2984 }
2985 
/* Byte distance from p to the position str_utf8_nth() reports for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *hit = str_utf8_nth(p, e, &remaining);

    return hit - p;
}
2992 #endif
2993 
2994 /* byte offset to char offset */
2995 long
2996 rb_str_sublen(VALUE str, long pos)
2997 {
2998  if (single_byte_optimizable(str) || pos < 0)
2999  return pos;
3000  else {
3001  char *p = RSTRING_PTR(str);
3002  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3003  }
3004 }
3005 
3006 static VALUE
3007 str_subseq(VALUE str, long beg, long len)
3008 {
3009  VALUE str2;
3010 
3011  RUBY_ASSERT(beg >= 0);
3012  RUBY_ASSERT(len >= 0);
3013  RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3014 
3015  const int termlen = TERM_LEN(str);
3016  if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3017  str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3018  RB_GC_GUARD(str);
3019  return str2;
3020  }
3021 
3022  str2 = str_alloc_heap(rb_cString);
3023  if (str_embed_capa(str2) >= len + termlen) {
3024  char *ptr2 = RSTRING(str2)->as.embed.ary;
3025  STR_SET_EMBED(str2);
3026  memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3027  TERM_FILL(ptr2+len, termlen);
3028 
3029  STR_SET_LEN(str2, len);
3030  RB_GC_GUARD(str);
3031  }
3032  else {
3033  str_replace_shared(str2, str);
3034  RUBY_ASSERT(!STR_EMBED_P(str2));
3035  ENC_CODERANGE_CLEAR(str2);
3036  RSTRING(str2)->as.heap.ptr += beg;
3037  if (RSTRING_LEN(str2) > len) {
3038  STR_SET_LEN(str2, len);
3039  }
3040  }
3041 
3042  return str2;
3043 }
3044 
3045 VALUE
3046 rb_str_subseq(VALUE str, long beg, long len)
3047 {
3048  VALUE str2 = str_subseq(str, beg, len);
3049  rb_enc_cr_str_copy_for_substr(str2, str);
3050  return str2;
3051 }
3052 
3053 char *
3054 rb_str_subpos(VALUE str, long beg, long *lenp)
3055 {
3056  long len = *lenp;
3057  long slen = -1L;
3058  long blen = RSTRING_LEN(str);
3059  rb_encoding *enc = STR_ENC_GET(str);
3060  char *p, *s = RSTRING_PTR(str), *e = s + blen;
3061 
3062  if (len < 0) return 0;
3063  if (!blen) {
3064  len = 0;
3065  }
3066  if (single_byte_optimizable(str)) {
3067  if (beg > blen) return 0;
3068  if (beg < 0) {
3069  beg += blen;
3070  if (beg < 0) return 0;
3071  }
3072  if (len > blen - beg)
3073  len = blen - beg;
3074  if (len < 0) return 0;
3075  p = s + beg;
3076  goto end;
3077  }
3078  if (beg < 0) {
3079  if (len > -beg) len = -beg;
3080  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
3081  beg = -beg;
3082  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3083  p = e;
3084  if (!p) return 0;
3085  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3086  if (!p) return 0;
3087  len = e - p;
3088  goto end;
3089  }
3090  else {
3091  slen = str_strlen(str, enc);
3092  beg += slen;
3093  if (beg < 0) return 0;
3094  p = s + beg;
3095  if (len == 0) goto end;
3096  }
3097  }
3098  else if (beg > 0 && beg > RSTRING_LEN(str)) {
3099  return 0;
3100  }
3101  if (len == 0) {
3102  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3103  p = s + beg;
3104  }
3105 #ifdef NONASCII_MASK
3106  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3107  enc == rb_utf8_encoding()) {
3108  p = str_utf8_nth(s, e, &beg);
3109  if (beg > 0) return 0;
3110  len = str_utf8_offset(p, e, len);
3111  }
3112 #endif
3113  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3114  int char_sz = rb_enc_mbmaxlen(enc);
3115 
3116  p = s + beg * char_sz;
3117  if (p > e) {
3118  return 0;
3119  }
3120  else if (len * char_sz > e - p)
3121  len = e - p;
3122  else
3123  len *= char_sz;
3124  }
3125  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3126  if (beg > 0) return 0;
3127  len = 0;
3128  }
3129  else {
3130  len = str_offset(p, e, len, enc, 0);
3131  }
3132  end:
3133  *lenp = len;
3134  RB_GC_GUARD(str);
3135  return p;
3136 }
3137 
3138 static VALUE str_substr(VALUE str, long beg, long len, int empty);
3139 
3140 VALUE
3141 rb_str_substr(VALUE str, long beg, long len)
3142 {
3143  return str_substr(str, beg, len, TRUE);
3144 }
3145 
3146 static VALUE
3147 str_substr(VALUE str, long beg, long len, int empty)
3148 {
3149  char *p = rb_str_subpos(str, beg, &len);
3150 
3151  if (!p) return Qnil;
3152  if (!len && !empty) return Qnil;
3153 
3154  beg = p - RSTRING_PTR(str);
3155 
3156  VALUE str2 = str_subseq(str, beg, len);
3157  rb_enc_cr_str_copy_for_substr(str2, str);
3158  return str2;
3159 }
3160 
3161 /* :nodoc: */
3162 VALUE
3164 {
    /* NOTE(review): the declarator line (embedded line 3163) is missing from
     * this listing; restore the name and parameter list from upstream. */
    /* The chilled flag is dropped first, so the checks below see a string
     * that is either frozen or plainly mutable. */
3165  if (CHILLED_STRING_P(str)) {
3166  FL_UNSET_RAW(str, STR_CHILLED);
3167  }
3168 
3169  if (OBJ_FROZEN(str)) return str;
    /* Resize to the current length before freezing.
     * NOTE(review): presumably this trims unused capacity -- confirm. */
3170  rb_str_resize(str, RSTRING_LEN(str));
3171  return rb_obj_freeze(str);
3172 }
3173 
3174 /*
3175  * call-seq:
3176  * +string -> new_string or self
3177  *
3178  * Returns +self+ if +self+ is not frozen.
3179  *
3180  * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3181  */
3182 static VALUE
3183 str_uplus(VALUE str)
3184 {
3185  if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3186  return rb_str_dup(str);
3187  }
3188  else {
3189  return str;
3190  }
3191 }
3192 
3193 /*
3194  * call-seq:
3195  * -string -> frozen_string
3196  * dedup -> frozen_string
3197  *
3198  * Returns a frozen, possibly pre-existing copy of the string.
3199  *
3200  * The returned +String+ will be deduplicated as long as it does not have
3201  * any instance variables set on it and is not a String subclass.
3202  *
3203  * Note that <tt>-string</tt> variant is more convenient for defining
3204  * constants:
3205  *
3206  * FILENAME = -'config/database.yml'
3207  *
3208  * while +dedup+ is better suitable for using the method in chains
3209  * of calculations:
3210  *
3211  * @url_list.concat(urls.map(&:dedup))
3212  *
3213  */
3214 static VALUE
3215 str_uminus(VALUE str)
3216 {
3217  if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3218  str = rb_str_dup(str);
3219  }
3220  return rb_fstring(str);
3221 }
3222 
/* NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk; presumably
 * it emits rb_str_dup_frozen(str) as a function forwarding to
 * rb_str_new_frozen(str) -- confirm. */
3223 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
/* From here on, uses of rb_str_dup_frozen in this file expand to
 * rb_str_new_frozen. */
3224 #define rb_str_dup_frozen rb_str_new_frozen
3225 
3226 VALUE
3227 rb_str_locktmp(VALUE str)
3228 {
3229  if (FL_TEST(str, STR_TMPLOCK)) {
3230  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3231  }
3232  FL_SET(str, STR_TMPLOCK);
3233  return str;
3234 }
3235 
/* Clears STR_TMPLOCK on str and returns str; raises RuntimeError when the
 * flag is not set.
 * NOTE(review): the declarator line (embedded line 3237) is missing from this
 * listing; presumably rb_str_unlocktmp(VALUE str), which
 * rb_str_locktmp_ensure below passes to rb_ensure() -- confirm. */
3236 VALUE
3238 {
3239  if (!FL_TEST(str, STR_TMPLOCK)) {
3240  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3241  }
3242  FL_UNSET(str, STR_TMPLOCK);
3243  return str;
3244 }
3245 
3246 VALUE
3247 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3248 {
3249  rb_str_locktmp(str);
3250  return rb_ensure(func, arg, rb_str_unlocktmp, str);
3251 }
3252 
/* Sets the byte length of str to len without reallocating, writes the
 * terminator after the new end, and adjusts the cached coderange.
 * NOTE(review): the declarator line (embedded line 3254) is missing from this
 * listing, and so are embedded lines 3270, 3286 and 3293 inside the body
 * (each directly after a comment announcing a coderange update). Restore
 * them from upstream; as shown, those three branches are empty. */
3253 void
3255 {
3256  long capa;
3257  const int termlen = TERM_LEN(str);
3258 
3259  str_modifiable(str);
3260  if (STR_SHARED_P(str)) {
3261  rb_raise(rb_eRuntimeError, "can't set length of shared string");
3262  }
    /* A length outside [0, capacity] is treated as an interpreter bug, not
     * as a Ruby-level error. */
3263  if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3264  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3265  }
3266 
3267  int cr = ENC_CODERANGE(str);
3268  if (len == 0) {
3269  /* Empty string does not contain non-ASCII */
3271  }
3272  else if (cr == ENC_CODERANGE_UNKNOWN) {
3273  /* Leave unknown. */
3274  }
3275  else if (len > RSTRING_LEN(str)) {
3276  if (ENC_CODERANGE_CLEAN_P(cr)) {
3277  /* Update the coderange regarding the extended part. */
    /* Only the newly exposed bytes [prev_end, new_end) are scanned; cr
     * carries the state of the part that was already classified. */
3278  const char *const prev_end = RSTRING_END(str);
3279  const char *const new_end = RSTRING_PTR(str) + len;
3280  rb_encoding *enc = rb_enc_get(str);
3281  rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3282  ENC_CODERANGE_SET(str, cr);
3283  }
3284  else if (cr == ENC_CODERANGE_BROKEN) {
3285  /* May be valid now, by appended part. */
3287  }
3288  }
3289  else if (len < RSTRING_LEN(str)) {
3290  if (cr != ENC_CODERANGE_7BIT) {
3291  /* ASCII-only string is keeping after truncated. Valid
3292  * and broken may be invalid or valid, leave unknown. */
3294  }
3295  }
3296 
3297  STR_SET_LEN(str, len);
3298  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3299 }
3300 
/* Changes the byte length of str to len, moving between the embedded and the
 * heap representation and reallocating as needed. Returns str. Raises
 * ArgumentError for a negative len.
 * NOTE(review): the declarator line (embedded line 3302) is missing from this
 * listing; presumably rb_str_resize(VALUE str, long len), which is called
 * with that shape elsewhere in this file -- confirm. */
3301 VALUE
3303 {
3304  if (len < 0) {
3305  rb_raise(rb_eArgError, "negative string size (or size too big)");
3306  }
3307 
3308  int independent = str_independent(str);
3309  long slen = RSTRING_LEN(str);
3310  const int termlen = TERM_LEN(str);
3311 
    /* The coderange is dropped when shrinking, and when growing a string
     * whose terminator is wider than one byte. */
3312  if (slen > len || (termlen != 1 && slen < len)) {
3313  ENC_CODERANGE_CLEAR(str);
3314  }
3315 
3316  {
3317  long capa;
    /* Case 1: currently embedded. Stay embedded when the new size fits,
     * otherwise move to a heap buffer. */
3318  if (STR_EMBED_P(str)) {
3319  if (len == slen) return str;
3320  if (str_embed_capa(str) >= len + termlen) {
3321  STR_SET_LEN(str, len);
3322  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3323  return str;
3324  }
3325  str_make_independent_expand(str, slen, len - slen, termlen);
3326  }
    /* Case 2: on the heap, but the new size fits the embedded area: copy
     * back and free the old buffer only when this string owned it. */
3327  else if (str_embed_capa(str) >= len + termlen) {
3328  char *ptr = STR_HEAP_PTR(str);
3329  STR_SET_EMBED(str);
3330  if (slen > len) slen = len;
3331  if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3332  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3333  STR_SET_LEN(str, len);
3334  if (independent) ruby_xfree(ptr);
3335  return str;
3336  }
    /* Case 3: heap buffer not owned by this string: take a private copy. */
3337  else if (!independent) {
3338  if (len == slen) return str;
3339  str_make_independent_expand(str, slen, len - slen, termlen);
3340  }
    /* Case 4: own heap buffer. Reallocate when it is too small, or when
     * the slack would exceed min(len, 1024) bytes. */
3341  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3342  (capa - len) > (len < 1024 ? len : 1024)) {
3343  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3344  (size_t)len + termlen, STR_HEAP_SIZE(str));
3345  RSTRING(str)->as.heap.aux.capa = len;
3346  }
3347  else if (len == slen) return str;
3348  STR_SET_LEN(str, len);
3349  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3350  }
3351  return str;
3352 }
3353 
3354 static void
3355 str_ensure_available_capa(VALUE str, long len)
3356 {
3357  str_modify_keep_cr(str);
3358 
3359  const int termlen = TERM_LEN(str);
3360  long olen = RSTRING_LEN(str);
3361 
3362  if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3363  rb_raise(rb_eArgError, "string sizes too big");
3364  }
3365 
3366  long total = olen + len;
3367  long capa = str_capacity(str, termlen);
3368 
3369  if (capa < total) {
3370  if (total >= LONG_MAX / 2) {
3371  capa = total;
3372  }
3373  while (total > capa) {
3374  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3375  }
3376  RESIZE_CAPA_TERM(str, capa, termlen);
3377  }
3378 }
3379 
3380 static VALUE
3381 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3382 {
3383  if (keep_cr) {
3384  str_modify_keep_cr(str);
3385  }
3386  else {
3387  rb_str_modify(str);
3388  }
3389  if (len == 0) return 0;
3390 
3391  long total, olen, off = -1;
3392  char *sptr;
3393  const int termlen = TERM_LEN(str);
3394 
3395  RSTRING_GETMEM(str, sptr, olen);
3396  if (ptr >= sptr && ptr <= sptr + olen) {
3397  off = ptr - sptr;
3398  }
3399 
3400  long capa = str_capacity(str, termlen);
3401 
3402  if (olen > LONG_MAX - len) {
3403  rb_raise(rb_eArgError, "string sizes too big");
3404  }
3405  total = olen + len;
3406  if (capa < total) {
3407  if (total >= LONG_MAX / 2) {
3408  capa = total;
3409  }
3410  while (total > capa) {
3411  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3412  }
3413  RESIZE_CAPA_TERM(str, capa, termlen);
3414  sptr = RSTRING_PTR(str);
3415  }
3416  if (off != -1) {
3417  ptr = sptr + off;
3418  }
3419  memcpy(sptr + olen, ptr, len);
3420  STR_SET_LEN(str, total);
3421  TERM_FILL(sptr + total, termlen); /* sentinel */
3422 
3423  return str;
3424 }
3425 
/* Shorthands for str_buf_cat4() with keep_cr == false. */
3426 #define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
/* Length is taken with rb_strlen_lit(ptr).
 * NOTE(review): presumably meant for string literals only -- confirm. */
3427 #define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3428 
3429 VALUE
3430 rb_str_cat(VALUE str, const char *ptr, long len)
3431 {
3432  if (len == 0) return str;
3433  if (len < 0) {
3434  rb_raise(rb_eArgError, "negative string size (or size too big)");
3435  }
3436  return str_buf_cat(str, ptr, len);
3437 }
3438 
3439 VALUE
3440 rb_str_cat_cstr(VALUE str, const char *ptr)
3441 {
3442  must_not_null(ptr);
3443  return rb_str_buf_cat(str, ptr, strlen(ptr));
3444 }
3445 
/* Appends a single byte to str, which must be ASCII-8BIT or US-ASCII
 * (asserted).
 * NOTE(review): embedded lines 3489 and 3492 are missing from this listing
 * (the first statement of each branch of the ISASCII test near the end);
 * restore them from upstream. As shown, neither branch updates the
 * coderange that the preceding comment describes. */
3446 static void
3447 rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3448 {
3449  RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3450 
3451  // We can't write directly to shared strings without impacting others, so we must make the string independent.
3452  if (UNLIKELY(!str_independent(str))) {
3453  str_make_independent(str);
3454  }
3455 
3456  long string_length = -1;
3457  const int null_terminator_length = 1;
3458  char *sptr;
3459  RSTRING_GETMEM(str, sptr, string_length);
3460 
3461  // Ensure the resulting string wouldn't be too long.
3462  if (UNLIKELY(string_length > LONG_MAX - 1)) {
3463  rb_raise(rb_eArgError, "string sizes too big");
3464  }
3465 
3466  long string_capacity = str_capacity(str, null_terminator_length);
3467 
3468  // Get the code range before any modifications since those might clear the code range.
3469  int cr = ENC_CODERANGE(str);
3470 
3471  // Check if the string has spare string_capacity to write the new byte.
3472  if (LIKELY(string_capacity >= string_length + 1)) {
3473  // In fast path we can write the new byte and note the string's new length.
3474  sptr[string_length] = byte;
3475  STR_SET_LEN(str, string_length + 1);
3476  TERM_FILL(sptr + string_length + 1, null_terminator_length);
3477  }
3478  else {
3479  // If there's not enough string_capacity, make a call into the general string concatenation function.
3480  str_buf_cat(str, (char *)&byte, 1);
3481  }
3482 
3483  // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3484  // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3485  // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3486  // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3487  if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3488  if (ISASCII(byte)) {
3490  }
3491  else {
3493 
3494  // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3495  if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3496  rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3497  }
3498  }
3499  }
3500 }
3501 
/* NOTE(review): RUBY_ALIAS_FUNCTION is defined outside this chunk; presumably
 * each line emits the first name as a function forwarding to the second with
 * the listed arguments (rb_str_buf_cat -> rb_str_cat; rb_str_buf_cat2 and
 * rb_str_cat2 -> rb_str_cat_cstr) -- confirm. */
3502 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3503 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3504 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3505 
/* Appends len bytes at ptr, tagged with encoding index ptr_encindex and
 * coderange ptr_cr, to str, and sets the encoding index and coderange of the
 * result. When ptr_cr_ret is non-NULL it receives the coderange determined
 * for the appended bytes. Raises EncCompatError for incompatible encodings
 * and ArgumentError for a negative len. Returns str. */
3506 static VALUE
3507 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3508  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3509 {
3510  int str_encindex = ENCODING_GET(str);
3511  int res_encindex;
3512  int str_cr, res_cr;
3513  rb_encoding *str_enc, *ptr_enc;
3514 
    /* An empty receiver is treated as 7bit regardless of its stored
     * coderange. */
3515  str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3516 
3517  if (str_encindex == ptr_encindex) {
    /* Same encoding: the appended bytes are scanned only when the
     * receiver's coderange is known (otherwise the result stays unknown). */
3518  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3519  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3520  }
3521  }
3522  else {
3523  str_enc = rb_enc_from_index(str_encindex);
3524  ptr_enc = rb_enc_from_index(ptr_encindex);
    /* At least one side is not ASCII-compatible: only trivial cases work.
     * Appending nothing is a no-op; appending to an empty receiver makes
     * it take over the appended data's encoding and terminator width. */
3525  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3526  if (len == 0)
3527  return str;
3528  if (RSTRING_LEN(str) == 0) {
3529  rb_str_buf_cat(str, ptr, len);
3530  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3531  rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3532  return str;
3533  }
3534  goto incompatible;
3535  }
3536  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3537  ptr_cr = coderange_scan(ptr, len, ptr_enc);
3538  }
3539  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3540  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3541  str_cr = rb_enc_str_coderange(str);
3542  }
3543  }
3544  }
3545  if (ptr_cr_ret)
3546  *ptr_cr_ret = ptr_cr;
3547 
    /* Different encodings are acceptable only when at least one side is
     * 7bit. */
3548  if (str_encindex != ptr_encindex &&
3549  str_cr != ENC_CODERANGE_7BIT &&
3550  ptr_cr != ENC_CODERANGE_7BIT) {
3551  str_enc = rb_enc_from_index(str_encindex);
3552  ptr_enc = rb_enc_from_index(ptr_encindex);
3553  goto incompatible;
3554  }
3555 
    /* Decide the result's encoding index and coderange from the receiver's
     * coderange. */
3556  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3557  res_encindex = str_encindex;
3558  res_cr = ENC_CODERANGE_UNKNOWN;
3559  }
3560  else if (str_cr == ENC_CODERANGE_7BIT) {
3561  if (ptr_cr == ENC_CODERANGE_7BIT) {
3562  res_encindex = str_encindex;
3563  res_cr = ENC_CODERANGE_7BIT;
3564  }
3565  else {
    /* 7bit receiver, non-7bit data: the data's encoding wins. */
3566  res_encindex = ptr_encindex;
3567  res_cr = ptr_cr;
3568  }
3569  }
3570  else if (str_cr == ENC_CODERANGE_VALID) {
3571  res_encindex = str_encindex;
3572  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3573  res_cr = str_cr;
3574  else
3575  res_cr = ptr_cr;
3576  }
3577  else { /* str_cr == ENC_CODERANGE_BROKEN */
3578  res_encindex = str_encindex;
3579  res_cr = str_cr;
    /* Appended bytes may complete a previously broken sequence, so the
     * coderange has to be recomputed later. */
3580  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3581  }
3582 
3583  if (len < 0) {
3584  rb_raise(rb_eArgError, "negative string size (or size too big)");
3585  }
3586  str_buf_cat(str, ptr, len);
3587  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3588  return str;
3589 
3590  incompatible:
3591  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3592  rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
    /* NOTE(review): embedded line 3593 is missing from this listing; as
     * shown, control would fall off the end of a VALUE function if
     * rb_raise() returned. Restore the line from upstream. */
3594 }
3595 
3596 VALUE
3597 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3598 {
3599  return rb_enc_cr_str_buf_cat(str, ptr, len,
3600  rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3601 }
3602 
3603 VALUE
3604 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3605 {
3606  /* ptr must reference NUL terminated ASCII string. */
3607  int encindex = ENCODING_GET(str);
3608  rb_encoding *enc = rb_enc_from_index(encindex);
3609  if (rb_enc_asciicompat(enc)) {
3610  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3611  encindex, ENC_CODERANGE_7BIT, 0);
3612  }
3613  else {
3614  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3615  while (*ptr) {
3616  unsigned int c = (unsigned char)*ptr;
3617  int len = rb_enc_codelen(c, enc);
3618  rb_enc_mbcput(c, buf, enc);
3619  rb_enc_cr_str_buf_cat(str, buf, len,
3620  encindex, ENC_CODERANGE_VALID, 0);
3621  ptr++;
3622  }
3623  return str;
3624  }
3625 }
3626 
/*
 * NOTE(review): the declarator line (function name and parameter list) is
 * absent from this listing -- the numbering skips from 3627 to 3629.  The
 * body uses `str` and `str2`, and nearby code calls
 * rb_str_buf_append(str, str2); presumably this is that function.  Confirm
 * against the upstream file.
 *
 * Appends the bytes of str2 to str and returns str.
 *
 * When str_enc_fastpath(str) holds, a 7bit str2, or a valid str2 with the
 * same encoding index as str, is appended directly.  Every other case goes
 * through rb_enc_cr_str_buf_cat(), and the code range it reports back is
 * stored on str2.
 */
3627 VALUE
3629 {
3630  int str2_cr = rb_enc_str_coderange(str2);
3631 
3632  if (str_enc_fastpath(str)) {
3633  switch (str2_cr) {
3634  case ENC_CODERANGE_7BIT:
3635  // If RHS is 7bit we can do simple concatenation
3636  str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3637  RB_GC_GUARD(str2);
3638  return str;
3639  case ENC_CODERANGE_VALID:
3640  // If RHS is valid, we can do simple concatenation if encodings are the same
3641  if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3642  str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3643  int str_cr = ENC_CODERANGE(str);
 /* str's cached code range is only rewritten when it is not already VALID */
3644  if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3645  ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3646  }
3647  RB_GC_GUARD(str2);
3648  return str;
3649  }
 /* different encodings, or any other code range (the switch has no
  * default): leave the switch and take the general path below */
3650  }
3651  }
3652 
 /* General path; str2_cr is passed both as the input code range and as the
  * location that receives the code range reported back. */
3653  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3654  ENCODING_GET(str2), str2_cr, &str2_cr);
3655 
3656  ENC_CODERANGE_SET(str2, str2_cr);
3657 
3658  return str;
3659 }
3660 
/*
 * NOTE(review): the declarator line is absent from this listing (numbering
 * skips from 3661 to 3663).  The body uses `str` and `str2`, and nearby code
 * calls rb_str_append(str1, str2); presumably this is that function --
 * confirm against the upstream file.
 *
 * Runs StringValue() on str2, then appends it to str with
 * rb_str_buf_append() and returns that call's result.
 */
3661 VALUE
3663 {
3664  StringValue(str2);
3665  return rb_str_buf_append(str, str2);
3666 }
3667 
3668 VALUE
3669 rb_str_concat_literals(size_t num, const VALUE *strary)
3670 {
3671  VALUE str;
3672  size_t i, s = 0;
3673  unsigned long len = 1;
3674 
3675  if (UNLIKELY(!num)) return rb_str_new(0, 0);
3676  if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3677 
3678  for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3679  str = rb_str_buf_new(len);
3680  str_enc_copy_direct(str, strary[0]);
3681 
3682  for (i = s; i < num; ++i) {
3683  const VALUE v = strary[i];
3684  int encidx = ENCODING_GET(v);
3685 
3686  rb_str_buf_append(str, v);
3687  if (encidx != ENCINDEX_US_ASCII) {
3688  if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3689  rb_enc_set_index(str, encidx);
3690  }
3691  }
3692  return str;
3693 }
3694 
3695 /*
3696  * call-seq:
3697  * concat(*objects) -> string
3698  *
3699  * Concatenates each object in +objects+ to +self+ and returns +self+:
3700  *
3701  * s = 'foo'
3702  * s.concat('bar', 'baz') # => "foobarbaz"
3703  * s # => "foobarbaz"
3704  *
3705  * For each given object +object+ that is an Integer,
3706  * the value is considered a codepoint and converted to a character before concatenation:
3707  *
3708  * s = 'foo'
3709  * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3710  *
3711  * Related: String#<<, which takes a single argument.
3712  */
3713 static VALUE
3714 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3715 {
3716  str_modifiable(str);
3717 
3718  if (argc == 1) {
3719  return rb_str_concat(str, argv[0]);
3720  }
3721  else if (argc > 1) {
3722  int i;
3723  VALUE arg_str = rb_str_tmp_new(0);
3724  rb_enc_copy(arg_str, str);
3725  for (i = 0; i < argc; i++) {
3726  rb_str_concat(arg_str, argv[i]);
3727  }
3728  rb_str_buf_append(str, arg_str);
3729  }
3730 
3731  return str;
3732 }
3733 
3734 /*
3735  * call-seq:
3736  * append_as_bytes(*objects) -> string
3737  *
3738  * Concatenates each object in +objects+ into +self+ without any encoding
3739  * validation or conversion and returns +self+:
3740  *
3741  * s = 'foo'
3742  * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3743  * s.valid_encoding? # => false
3744  * s.append_as_bytes("\xAC 12")
3745  * s.valid_encoding? # => true
3746  *
3747  * For each given object +object+ that is an Integer,
3748  * the value is considered a Byte. If the Integer is bigger
3749  * than one byte, only the lower byte is considered, similar to String#setbyte:
3750  *
3751  * s = ""
3752  * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3753  *
3754  * Related: String#<<, String#concat, which do an encoding aware concatenation.
3755  */
3756 
3757 VALUE
3758 rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3759 {
3760  long needed_capacity = 0;
3761  volatile VALUE t0;
3762  enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3763 
3764  for (int index = 0; index < argc; index++) {
3765  VALUE obj = argv[index];
3766  enum ruby_value_type type = types[index] = rb_type(obj);
3767  switch (type) {
3768  case T_FIXNUM:
3769  case T_BIGNUM:
3770  needed_capacity++;
3771  break;
3772  case T_STRING:
3773  needed_capacity += RSTRING_LEN(obj);
3774  break;
3775  default:
3776  rb_raise(
3777  rb_eTypeError,
3778  "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3779  rb_obj_class(obj)
3780  );
3781  break;
3782  }
3783  }
3784 
3785  str_ensure_available_capa(str, needed_capacity);
3786  char *sptr = RSTRING_END(str);
3787 
3788  for (int index = 0; index < argc; index++) {
3789  VALUE obj = argv[index];
3790  enum ruby_value_type type = types[index];
3791  switch (type) {
3792  case T_FIXNUM:
3793  case T_BIGNUM: {
3794  argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3795  char byte = (char)(NUM2INT(obj) & 0xFF);
3796  *sptr = byte;
3797  sptr++;
3798  break;
3799  }
3800  case T_STRING: {
3801  const char *ptr;
3802  long len;
3803  RSTRING_GETMEM(obj, ptr, len);
3804  memcpy(sptr, ptr, len);
3805  sptr += len;
3806  break;
3807  }
3808  default:
3809  rb_bug("append_as_bytes arguments should have been validated");
3810  }
3811  }
3812 
3813  STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3814  TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3815 
3816  int cr = ENC_CODERANGE(str);
3817  switch (cr) {
3818  case ENC_CODERANGE_7BIT: {
3819  for (int index = 0; index < argc; index++) {
3820  VALUE obj = argv[index];
3821  enum ruby_value_type type = types[index];
3822  switch (type) {
3823  case T_FIXNUM:
3824  case T_BIGNUM: {
3825  if (!ISASCII(NUM2INT(obj))) {
3826  goto clear_cr;
3827  }
3828  break;
3829  }
3830  case T_STRING: {
3831  if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3832  goto clear_cr;
3833  }
3834  break;
3835  }
3836  default:
3837  rb_bug("append_as_bytes arguments should have been validated");
3838  }
3839  }
3840  break;
3841  }
3842  case ENC_CODERANGE_VALID:
3843  if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3844  goto keep_cr;
3845  }
3846  else {
3847  goto clear_cr;
3848  }
3849  break;
3850  default:
3851  goto clear_cr;
3852  break;
3853  }
3854 
3855  RB_GC_GUARD(t0);
3856 
3857  clear_cr:
3858  // If no fast path was hit, we clear the coderange.
3859  // append_as_bytes is predominently meant to be used in
3860  // buffering situation, hence it's likely the coderange
3861  // will never be scanned, so it's not worth spending time
3862  // precomputing the coderange except for simple and common
3863  // situations.
3864  ENC_CODERANGE_CLEAR(str);
3865  keep_cr:
3866  return str;
3867 }
3868 
3869 /*
3870  * call-seq:
3871  * string << object -> string
3872  *
3873  * Concatenates +object+ to +self+ and returns +self+:
3874  *
3875  * s = 'foo'
3876  * s << 'bar' # => "foobar"
3877  * s # => "foobar"
3878  *
3879  * If +object+ is an Integer,
3880  * the value is considered a codepoint and converted to a character before concatenation:
3881  *
3882  * s = 'foo'
3883  * s << 33 # => "foo!"
3884  *
3885  * If that codepoint is not representable in the encoding of
3886  * _string_, RangeError is raised.
3887  *
3888  * s = 'foo'
3889  * s.encoding # => <Encoding:UTF-8>
3890  * s << 0x00110000 # 1114112 out of char range (RangeError)
3891  * s = 'foo'.encode('EUC-JP')
3892  * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3893  *
3894  * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3895  * is automatically promoted to ASCII-8BIT.
3896  *
3897  * s = 'foo'.encode('US-ASCII')
3898  * s << 0xff
3899  * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3900  *
3901  * Related: String#concat, which takes multiple arguments.
3902  */
3903 VALUE
3905 {
 /*
  * NOTE(review): the declarator line is absent from this listing (numbering
  * skips from 3903 to 3905).  The body uses `str1` and `str2`, and
  * rb_str_concat_multi calls rb_str_concat(str, argv[0]); presumably this is
  * that function -- confirm against the upstream file.
  *
  * Non-Integer str2 is delegated to rb_str_append().  An Integer str2 is
  * treated as a code point and appended to str1 as one character.
  */
3906  unsigned int code;
3907  rb_encoding *enc = STR_ENC_GET(str1);
3908  int encidx;
3909 
3910  if (RB_INTEGER_TYPE_P(str2)) {
 /* a zero result leaves the converted value in `code`; anything else is
  * reported as out of range */
3911  if (rb_num_to_uint(str2, &code) == 0) {
3912  }
3913  else if (FIXNUM_P(str2)) {
3914  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3915  }
3916  else {
3917  rb_raise(rb_eRangeError, "bignum out of char range");
3918  }
3919  }
3920  else {
3921  return rb_str_append(str1, str2);
3922  }
3923 
 /* a non-negative index means the code can be appended as a single raw
  * byte (ASCII-8BIT / US-ASCII strings) */
3924  encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3925 
3926  if (encidx >= 0) {
3927  rb_str_buf_cat_byte(str1, (unsigned char)code);
3928  }
3929  else {
3930  long pos = RSTRING_LEN(str1);
3931  int cr = ENC_CODERANGE(str1);
3932  int len;
3933  char *buf;
3934 
 /* rb_enc_codelen() yields the encoded length, or one of the error values
  * handled here */
3935  switch (len = rb_enc_codelen(code, enc)) {
3936  case ONIGERR_INVALID_CODE_POINT_VALUE:
3937  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3938  break;
3939  case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3940  case 0:
3941  rb_raise(rb_eRangeError, "%u out of char range", code);
3942  break;
3943  }
 /* encode into a scratch buffer first and check that it reads back as
  * exactly one character of `len` bytes before touching str1 */
3944  buf = ALLOCA_N(char, len + 1);
3945  rb_enc_mbcput(code, buf, enc);
3946  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3947  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3948  }
3949  rb_str_resize(str1, pos+len);
3950  memcpy(RSTRING_PTR(str1) + pos, buf, len);
 /* code range update: 7bit plus a non-ASCII character becomes VALID; a
  * BROKEN string becomes UNKNOWN; other values are written back unchanged */
3951  if (cr == ENC_CODERANGE_7BIT && code > 127) {
3952  cr = ENC_CODERANGE_VALID;
3953  }
3954  else if (cr == ENC_CODERANGE_BROKEN) {
3955  cr = ENC_CODERANGE_UNKNOWN;
3956  }
3957  ENC_CODERANGE_SET(str1, cr);
3958  }
3959  return str1;
3960 }
3961 
3962 int
3963 rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3964 {
3965  int encidx = rb_enc_to_index(enc);
3966 
3967  if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3968  /* US-ASCII automatically extended to ASCII-8BIT */
3969  if (code > 0xFF) {
3970  rb_raise(rb_eRangeError, "%u out of char range", code);
3971  }
3972  if (encidx == ENCINDEX_US_ASCII && code > 127) {
3973  return ENCINDEX_ASCII_8BIT;
3974  }
3975  return encidx;
3976  }
3977  else {
3978  return -1;
3979  }
3980 }
3981 
3982 /*
3983  * call-seq:
3984  * prepend(*other_strings) -> string
3985  *
3986  * Prepends each string in +other_strings+ to +self+ and returns +self+:
3987  *
3988  * s = 'foo'
3989  * s.prepend('bar', 'baz') # => "barbazfoo"
3990  * s # => "barbazfoo"
3991  *
3992  * Related: String#concat.
3993  */
3994 
3995 static VALUE
3996 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3997 {
3998  str_modifiable(str);
3999 
4000  if (argc == 1) {
4001  rb_str_update(str, 0L, 0L, argv[0]);
4002  }
4003  else if (argc > 1) {
4004  int i;
4005  VALUE arg_str = rb_str_tmp_new(0);
4006  rb_enc_copy(arg_str, str);
4007  for (i = 0; i < argc; i++) {
4008  rb_str_append(arg_str, argv[i]);
4009  }
4010  rb_str_update(str, 0L, 0L, arg_str);
4011  }
4012 
4013  return str;
4014 }
4015 
/*
 * NOTE(review): the declarator line is absent from this listing (numbering
 * skips from 4016 to 4018).  The body uses `str`, and rb_str_hash_m calls
 * rb_str_hash(str) expecting an st_index_t; presumably this is that
 * function -- confirm against the upstream file.
 *
 * Returns the hash value of str.  When the STR_PRECOMPUTED_HASH flag is set
 * the value is read from the bytes stored directly after the string's
 * terminator; otherwise it is computed with str_do_hash().
 */
4016 st_index_t
4018 {
4019  if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4020  st_index_t precomputed_hash;
 /* copied out with memcpy rather than dereferenced -- presumably because
  * that location need not be aligned for st_index_t; verify */
4021  memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4022 
 /* the stored value is cross-checked against a fresh computation via
  * RUBY_ASSERT */
4023  RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4024  return precomputed_hash;
4025  }
4026 
4027  return str_do_hash(str);
4028 }
4029 
/*
 * NOTE(review): the declarator line is absent from this listing (numbering
 * skips from 4030 to 4032), and the function name is not visible anywhere in
 * this part of the file.  The body uses `str1` and `str2`.  Presumably this
 * is the comparison paired with rb_str_hash for hash tables -- confirm
 * against the upstream file.
 *
 * Returns 0 when str1 and str2 have the same length, are comparable
 * according to rb_str_comparable(), and hold identical bytes; returns 1
 * otherwise.
 */
4030 int
4032 {
4033  long len1, len2;
4034  const char *ptr1, *ptr2;
4035  RSTRING_GETMEM(str1, ptr1, len1);
4036  RSTRING_GETMEM(str2, ptr2, len2);
 /* short-circuit order matters: memcmp only runs once the lengths are known
  * to be equal, so reading len1 bytes from both buffers is in bounds */
4037  return (len1 != len2 ||
4038  !rb_str_comparable(str1, str2) ||
4039  memcmp(ptr1, ptr2, len1) != 0);
4040 }
4041 
4042 /*
4043  * call-seq:
4044  * hash -> integer
4045  *
4046  * Returns the integer hash value for +self+.
4047  * The value is based on the length, content and encoding of +self+.
4048  *
4049  * Related: Object#hash.
4050  */
4051 
4052 static VALUE
4053 rb_str_hash_m(VALUE str)
4054 {
4055  st_index_t hval = rb_str_hash(str);
4056  return ST2FIX(hval);
4057 }
4058 
/* Yields the smaller of a and b (b when they are equal).  One of the two
 * arguments is evaluated twice, so pass expressions without side effects. */
4059 #define lesser(a,b) (((a)>(b))?(b):(a))
4060 
/*
 * NOTE(review): the declarator line is absent from this listing (numbering
 * skips from 4061 to 4063).  The body uses `str1` and `str2`, and nearby
 * code calls rb_str_comparable(str1, str2); presumably this is that
 * function -- confirm against the upstream file.
 *
 * Returns TRUE when either string is empty or both have the same encoding
 * index.  Otherwise the decision depends on the code ranges, see below.
 */
4061 int
4063 {
4064  int idx1, idx2;
4065  int rc1, rc2;
4066 
4067  if (RSTRING_LEN(str1) == 0) return TRUE;
4068  if (RSTRING_LEN(str2) == 0) return TRUE;
4069  idx1 = ENCODING_GET(str1);
4070  idx2 = ENCODING_GET(str2);
4071  if (idx1 == idx2) return TRUE;
4072  rc1 = rb_enc_str_coderange(str1);
4073  rc2 = rb_enc_str_coderange(str2);
 /*
  * NOTE(review): the numbering skips 4076 and 4080, so one line is missing
  * before each of the two `return TRUE;` statements below.  As listed, both
  * returns are unconditional (which makes the inner rc2 test redundant), so
  * the function returns TRUE whenever either string is 7bit.  The missing
  * lines presumably held an extra condition on the other string's
  * encoding -- verify against the upstream file before relying on this.
  */
4074  if (rc1 == ENC_CODERANGE_7BIT) {
4075  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4077  return TRUE;
4078  }
4079  if (rc2 == ENC_CODERANGE_7BIT) {
4081  return TRUE;
4082  }
4083  return FALSE;
4084 }
4085 
/*
 * NOTE(review): the declarator line is absent from this listing (numbering
 * skips from 4086 to 4088).  The body uses `str1` and `str2`, and
 * rb_str_cmp_m calls rb_str_cmp(str1, s) expecting an int; presumably this
 * is that function -- confirm against the upstream file.
 *
 * Three-way comparison returning -1, 0 or 1.  The bytes of the common
 * prefix are compared first.  If they are equal the longer string is
 * greater.  If the lengths are equal as well, the result is 0 unless the
 * strings are not comparable, in which case the encoding indexes decide.
 */
4086 int
4088 {
4089  long len1, len2;
4090  const char *ptr1, *ptr2;
4091  int retval;
4092 
4093  if (str1 == str2) return 0;
4094  RSTRING_GETMEM(str1, ptr1, len1);
4095  RSTRING_GETMEM(str2, ptr2, len2);
 /* retval is assigned only when the pointers differ; when they are the same
  * every path inside this block returns, so the use of retval after the
  * block always sees an assigned value */
4096  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4097  if (len1 == len2) {
4098  if (!rb_str_comparable(str1, str2)) {
 /* identical bytes but not comparable: order by encoding index */
4099  if (ENCODING_GET(str1) > ENCODING_GET(str2))
4100  return 1;
4101  return -1;
4102  }
4103  return 0;
4104  }
4105  if (len1 > len2) return 1;
4106  return -1;
4107  }
 /* normalize the memcmp result to exactly 1 or -1 */
4108  if (retval > 0) return 1;
4109  return -1;
4110 }
4111 
4112 /*
4113  * call-seq:
4114  * string == object -> true or false
4115  * string === object -> true or false
4116  *
4117  * Returns +true+ if +object+ has the same length and content
4118  * as +self+; +false+ otherwise:
4119  *
4120  * s = 'foo'
4121  * s == 'foo' # => true
4122  * s == 'food' # => false
4123  * s == 'FOO' # => false
4124  *
4125  * Returns +false+ if the two strings' encodings are not compatible:
4126  * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4127  *
4128  * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4129  * two strings are compared using <code>object.==</code>.
4130  */
4131 
4132 VALUE
4134 {
 /*
  * NOTE(review): the declarator line is absent from this listing (numbering
  * skips from 4132 to 4134), and the function name is not visible in this
  * part of the file.  The body uses `str1` and `str2`; presumably this is
  * the implementation behind the `string == object` documentation above --
  * confirm against the upstream file.
  */
4135  if (str1 == str2) return Qtrue;
4136  if (!RB_TYPE_P(str2, T_STRING)) {
 /* a non-String that responds to to_str gets to decide, with the operands
  * swapped; any other object is never equal */
4137  if (!rb_respond_to(str2, idTo_str)) {
4138  return Qfalse;
4139  }
4140  return rb_equal(str2, str1);
4141  }
4142  return rb_str_eql_internal(str1, str2);
4143 }
4144 
4145 /*
4146  * call-seq:
4147  * eql?(object) -> true or false
4148  *
4149  * Returns +true+ if +object+ has the same length and content
4150  * as +self+; +false+ otherwise:
4151  *
4152  * s = 'foo'
4153  * s.eql?('foo') # => true
4154  * s.eql?('food') # => false
4155  * s.eql?('FOO') # => false
4156  *
4157  * Returns +false+ if the two strings' encodings are not compatible:
4158  *
4159  * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4160  *
4161  */
4162 
4163 VALUE
4164 rb_str_eql(VALUE str1, VALUE str2)
4165 {
4166  if (str1 == str2) return Qtrue;
4167  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4168  return rb_str_eql_internal(str1, str2);
4169 }
4170 
4171 /*
4172  * call-seq:
4173  * string <=> other_string -> -1, 0, 1, or nil
4174  *
4175  * Compares +self+ and +other_string+, returning:
4176  *
4177  * - -1 if +other_string+ is larger.
4178  * - 0 if the two are equal.
4179  * - 1 if +other_string+ is smaller.
4180  * - +nil+ if the two are incomparable.
4181  *
4182  * Examples:
4183  *
4184  * 'foo' <=> 'foo' # => 0
4185  * 'foo' <=> 'food' # => -1
4186  * 'food' <=> 'foo' # => 1
4187  * 'FOO' <=> 'foo' # => -1
4188  * 'foo' <=> 'FOO' # => 1
4189  * 'foo' <=> 1 # => nil
4190  *
4191  */
4192 
4193 static VALUE
4194 rb_str_cmp_m(VALUE str1, VALUE str2)
4195 {
4196  int result;
4197  VALUE s = rb_check_string_type(str2);
4198  if (NIL_P(s)) {
4199  return rb_invcmp(str1, str2);
4200  }
4201  result = rb_str_cmp(str1, s);
4202  return INT2FIX(result);
4203 }
4204 
4205 static VALUE str_casecmp(VALUE str1, VALUE str2);
4206 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4207 
4208 /*
4209  * call-seq:
4210  * casecmp(other_string) -> -1, 0, 1, or nil
4211  *
4212  * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4213  *
4214  * - -1 if <tt>other_string.downcase</tt> is larger.
4215  * - 0 if the two are equal.
4216  * - 1 if <tt>other_string.downcase</tt> is smaller.
4217  * - +nil+ if the two are incomparable.
4218  *
4219  * Examples:
4220  *
4221  * 'foo'.casecmp('foo') # => 0
4222  * 'foo'.casecmp('food') # => -1
4223  * 'food'.casecmp('foo') # => 1
4224  * 'FOO'.casecmp('foo') # => 0
4225  * 'foo'.casecmp('FOO') # => 0
4226  * 'foo'.casecmp(1) # => nil
4227  *
4228  * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4229  *
4230  * Related: String#casecmp?.
4231  *
4232  */
4233 
4234 static VALUE
4235 rb_str_casecmp(VALUE str1, VALUE str2)
4236 {
4237  VALUE s = rb_check_string_type(str2);
4238  if (NIL_P(s)) {
4239  return Qnil;
4240  }
4241  return str_casecmp(str1, s);
4242 }
4243 
/*
 * Compares str1 and str2 with ASCII letters lowercased through TOLOWER.
 *
 * Returns Qnil when rb_enc_compatible() finds no common encoding; otherwise
 * a Fixnum -1, 0 or 1.  If one string runs out before a difference is
 * found, the longer string is the greater one.
 */
4244 static VALUE
4245 str_casecmp(VALUE str1, VALUE str2)
4246 {
4247  long len;
4248  rb_encoding *enc;
4249  const char *p1, *p1end, *p2, *p2end;
4250 
4251  enc = rb_enc_compatible(str1, str2);
4252  if (!enc) {
4253  return Qnil;
4254  }
4255 
4256  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4257  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
 /* byte-at-a-time loop when both strings are single byte optimizable */
4258  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4259  while (p1 < p1end && p2 < p2end) {
4260  if (*p1 != *p2) {
 /* masked with 0xff so the bytes compare as non-negative values */
4261  unsigned int c1 = TOLOWER(*p1 & 0xff);
4262  unsigned int c2 = TOLOWER(*p2 & 0xff);
4263  if (c1 != c2)
4264  return INT2FIX(c1 < c2 ? -1 : 1);
4265  }
4266  p1++;
4267  p2++;
4268  }
4269  }
4270  else {
 /* character-at-a-time loop; l1/l2 receive the byte length of the
  * character at p1/p2 */
4271  while (p1 < p1end && p2 < p2end) {
4272  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4273  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4274 
 /* both values non-negative: compare them after TOLOWER */
4275  if (0 <= c1 && 0 <= c2) {
4276  c1 = TOLOWER(c1);
4277  c2 = TOLOWER(c2);
4278  if (c1 != c2)
4279  return INT2FIX(c1 < c2 ? -1 : 1);
4280  }
4281  else {
 /* at least one side gave a negative value: compare the raw bytes of the
  * two characters, with the shorter character sorting first on a tie */
4282  int r;
4283  l1 = rb_enc_mbclen(p1, p1end, enc);
4284  l2 = rb_enc_mbclen(p2, p2end, enc);
4285  len = l1 < l2 ? l1 : l2;
4286  r = memcmp(p1, p2, len);
4287  if (r != 0)
4288  return INT2FIX(r < 0 ? -1 : 1);
4289  if (l1 != l2)
4290  return INT2FIX(l1 < l2 ? -1 : 1);
4291  }
4292  p1 += l1;
4293  p2 += l2;
4294  }
4295  }
 /* no difference within the common part: decide by byte length */
4296  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4297  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4298  return INT2FIX(-1);
4299 }
4300 
4301 /*
4302  * call-seq:
4303  * casecmp?(other_string) -> true, false, or nil
4304  *
4305  * Returns +true+ if +self+ and +other_string+ are equal after
4306  * Unicode case folding, otherwise +false+:
4307  *
4308  * 'foo'.casecmp?('foo') # => true
4309  * 'foo'.casecmp?('food') # => false
4310  * 'food'.casecmp?('foo') # => false
4311  * 'FOO'.casecmp?('foo') # => true
4312  * 'foo'.casecmp?('FOO') # => true
4313  *
4314  * Returns +nil+ if the two values are incomparable:
4315  *
4316  * 'foo'.casecmp?(1) # => nil
4317  *
4318  * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4319  *
4320  * Related: String#casecmp.
4321  *
4322  */
4323 
4324 static VALUE
4325 rb_str_casecmp_p(VALUE str1, VALUE str2)
4326 {
4327  VALUE s = rb_check_string_type(str2);
4328  if (NIL_P(s)) {
4329  return Qnil;
4330  }
4331  return str_casecmp_p(str1, s);
4332 }
4333 
4334 static VALUE
4335 str_casecmp_p(VALUE str1, VALUE str2)
4336 {
4337  rb_encoding *enc;
4338  VALUE folded_str1, folded_str2;
4339  VALUE fold_opt = sym_fold;
4340 
4341  enc = rb_enc_compatible(str1, str2);
4342  if (!enc) {
4343  return Qnil;
4344  }
4345 
4346  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4347  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4348 
4349  return rb_str_eql(folded_str1, folded_str2);
4350 }
4351 
/*
 * Searches for the sub_len bytes at sub_ptr in the search_len = str_len -
 * offset bytes starting at str_ptr, accepting only hits that begin on a
 * character head of encoding enc.
 *
 * Returns the position of the hit plus the accumulated offset, the negative
 * value from rb_memsearch() when there is no hit at all, or -1 when the
 * remaining search range is used up after skipping a misaligned hit.
 */
4352 static long
4353 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4354  const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4355 {
4356  const char *search_start = str_ptr;
4357  long pos, search_len = str_len - offset;
4358 
4359  for (;;) {
4360  const char *t;
4361  pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4362  if (pos < 0) return pos;
 /* t is the character head at or after the hit; if it equals the hit the
  * match starts on a character boundary and is accepted */
4363  t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4364  if (t == search_start + pos) break;
 /* misaligned hit: restart the search from t, shrinking the remaining
  * length and growing offset by the bytes skipped */
4365  search_len -= t - search_start;
4366  if (search_len <= 0) return -1;
4367  offset += t - search_start;
4368  search_start = t;
4369  }
4370  return pos + offset;
4371 }
4372 
4373 /* found index in byte */
4374 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4375 #define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4376 
/*
 * Finds sub in str starting at offset and returns the value produced by
 * strseq_core(), or -1 when sub is a broken string, is longer than str, or
 * the offset leaves too little room.  An empty sub yields the offset itself.
 *
 * With in_byte set, offset is taken as a byte count; otherwise it is
 * converted with str_offset() before use.  A negative offset has the length
 * of str added to it first.  rb_enc_check() is called on the pair first;
 * presumably it raises for incompatible encodings -- verify.
 */
4377 static long
4378 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4379 {
4380  const char *str_ptr, *str_ptr_end, *sub_ptr;
4381  long str_len, sub_len;
4382  rb_encoding *enc;
4383 
4384  enc = rb_enc_check(str, sub);
4385  if (is_broken_string(sub)) return -1;
4386 
4387  str_ptr = RSTRING_PTR(str);
4388  str_ptr_end = RSTRING_END(str);
4389  str_len = RSTRING_LEN(str);
4390  sub_ptr = RSTRING_PTR(sub);
4391  sub_len = RSTRING_LEN(sub);
4392 
4393  if (str_len < sub_len) return -1;
4394 
4395  if (offset != 0) {
4396  long str_len_char, sub_len_char;
4397  int single_byte = single_byte_optimizable(str);
 /* lengths are in the same unit as offset: bytes when in_byte (or, for str,
  * when it is single byte optimizable), otherwise str_strlen() results */
4398  str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4399  sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4400  if (offset < 0) {
4401  offset += str_len_char;
4402  if (offset < 0) return -1;
4403  }
4404  if (str_len_char - offset < sub_len_char) return -1;
 /* from here on offset is a byte count */
4405  if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4406  str_ptr += offset;
4407  }
4408  if (sub_len == 0) return offset;
4409 
4410  /* need proceed one character at a time */
4411  return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4412 }
4413 
4414 
4415 /*
4416  * call-seq:
4417  * index(substring, offset = 0) -> integer or nil
4418  * index(regexp, offset = 0) -> integer or nil
4419  *
4420  * :include: doc/string/index.rdoc
4421  *
4422  */
4423 
4424 static VALUE
4425 rb_str_index_m(int argc, VALUE *argv, VALUE str)
4426 {
4427  VALUE sub;
4428  VALUE initpos;
4429  rb_encoding *enc = STR_ENC_GET(str);
4430  long pos;
4431 
 /* one required argument (the pattern) and one optional (start position) */
4432  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4433  long slen = str_strlen(str, enc); /* str's enc */
4434  pos = NUM2LONG(initpos);
 /* a negative position has slen added to it; a position still negative, or
  * beyond slen, finds nothing */
4435  if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
 /* NOTE(review): this branch is empty in this listing; the numbering skips
  * 4437, so a statement appears to be missing.  Presumably it reset the
  * last-match data for Regexp patterns -- confirm against the upstream
  * file. */
4436  if (RB_TYPE_P(sub, T_REGEXP)) {
4438  }
4439  return Qnil;
4440  }
4441  }
4442  else {
4443  pos = 0;
4444  }
4445 
4446  if (RB_TYPE_P(sub, T_REGEXP)) {
 /* pos is converted with str_offset() before the regexp search */
4447  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4448  enc, single_byte_optimizable(str));
4449 
4450  if (rb_reg_search(sub, str, pos, 0) >= 0) {
4451  VALUE match = rb_backref_get();
 /* `regs` is referenced implicitly by the BEG() macro */
4452  struct re_registers *regs = RMATCH_REGS(match);
4453  pos = rb_str_sublen(str, BEG(0));
4454  return LONG2NUM(pos);
4455  }
4456  }
4457  else {
4458  StringValue(sub);
4459  pos = rb_str_index(str, sub, pos);
4460  if (pos >= 0) {
 /* rb_str_index() reports a byte index; rb_str_sublen() is applied before
  * returning -- presumably converting it to a character index */
4461  pos = rb_str_sublen(str, pos);
4462  return LONG2NUM(pos);
4463  }
4464  }
4465  return Qnil;
4466 }
4467 
4468 /* Ensure that the given pos is a valid character boundary.
4469  * Note that in this function, "character" means a code point
4470  * (Unicode scalar value), not a grapheme cluster.
4471  */
4472 static void
4473 str_ensure_byte_pos(VALUE str, long pos)
4474 {
4475  if (!single_byte_optimizable(str)) {
4476  const char *s = RSTRING_PTR(str);
4477  const char *e = RSTRING_END(str);
4478  const char *p = s + pos;
4479  if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4481  "offset %ld does not land on character boundary", pos);
4482  }
4483  }
4484 }
4485 
4486 /*
4487  * call-seq:
4488  * byteindex(substring, offset = 0) -> integer or nil
4489  * byteindex(regexp, offset = 0) -> integer or nil
4490  *
4491  * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4492  * or +nil+ if none found:
4493  *
4494  * 'foo'.byteindex('f') # => 0
4495  * 'foo'.byteindex('o') # => 1
4496  * 'foo'.byteindex('oo') # => 1
4497  * 'foo'.byteindex('ooo') # => nil
4498  *
4499  * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4500  * or +nil+ if none found:
4501  *
4502  * 'foo'.byteindex(/f/) # => 0
4503  * 'foo'.byteindex(/o/) # => 1
4504  * 'foo'.byteindex(/oo/) # => 1
4505  * 'foo'.byteindex(/ooo/) # => nil
4506  *
4507  * Integer argument +offset+, if given, specifies the byte-based position in the
4508  * string to begin the search:
4509  *
4510  * 'foo'.byteindex('o', 1) # => 1
4511  * 'foo'.byteindex('o', 2) # => 2
4512  * 'foo'.byteindex('o', 3) # => nil
4513  *
4514  * If +offset+ is negative, counts backward from the end of +self+:
4515  *
4516  * 'foo'.byteindex('o', -1) # => 2
4517  * 'foo'.byteindex('o', -2) # => 1
4518  * 'foo'.byteindex('o', -3) # => 1
4519  * 'foo'.byteindex('o', -4) # => nil
4520  *
4521  * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4522  * raised.
4523  *
4524  * Related: String#index, String#byterindex.
4525  */
4526 
4527 static VALUE
4528 rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4529 {
4530  VALUE sub;
4531  VALUE initpos;
4532  long pos;
4533 
 /* one required argument (the pattern) and one optional (byte offset) */
4534  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4535  long slen = RSTRING_LEN(str);
4536  pos = NUM2LONG(initpos);
 /* a negative offset has the byte length added to it; an offset still
  * negative, or beyond the byte length, finds nothing */
4537  if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
 /* NOTE(review): this branch is empty in this listing; the numbering skips
  * 4539, so a statement appears to be missing.  Presumably it reset the
  * last-match data for Regexp patterns -- confirm against the upstream
  * file. */
4538  if (RB_TYPE_P(sub, T_REGEXP)) {
4540  }
4541  return Qnil;
4542  }
4543  }
4544  else {
4545  pos = 0;
4546  }
4547 
 /* checks that the byte offset does not fall inside a character */
4548  str_ensure_byte_pos(str, pos);
4549 
4550  if (RB_TYPE_P(sub, T_REGEXP)) {
4551  if (rb_reg_search(sub, str, pos, 0) >= 0) {
4552  VALUE match = rb_backref_get();
 /* `regs` is referenced implicitly by the BEG() macro */
4553  struct re_registers *regs = RMATCH_REGS(match);
 /* the match start is returned as is, without the rb_str_sublen() step used
  * by String#index */
4554  pos = BEG(0);
4555  return LONG2NUM(pos);
4556  }
4557  }
4558  else {
4559  StringValue(sub);
4560  pos = rb_str_byteindex(str, sub, pos);
4561  if (pos >= 0) return LONG2NUM(pos);
4562  }
4563  return Qnil;
4564 }
4565 
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: scans the +search_len+
 * bytes starting at +search_str+ from the last byte towards the first and
 * returns a pointer to the last byte equal to +chr+, or NULL when no byte
 * matches or +search_len+ is zero.
 *
 * As with the libc function, +chr+ is converted to unsigned char before
 * comparing, so a needle passed as a negative +char+ value, or with bits
 * set above the low byte, still matches.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char needle = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == needle) return (void *)ptr;
    }

    return NULL;
}
#endif
4578 
/*
 * Backward search for sub in str.  s points at the last candidate start
 * position inside str.  Returns the byte offset of the match from the start
 * of str, or -1 when there is none.  An empty sub matches at s itself.
 *
 * The callers in this file clamp the start position before calling --
 * presumably so that comparing RSTRING_LEN(sub) bytes at s stays inside
 * str; verify.
 */
4579 static long
4580 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4581 {
4582  char *hit, *adjusted;
4583  int c;
4584  long slen, searchlen;
4585  char *sbeg, *e, *t;
4586 
4587  sbeg = RSTRING_PTR(str);
4588  slen = RSTRING_LEN(sub);
4589  if (slen == 0) return s - sbeg;
4590  e = RSTRING_END(str);
4591  t = RSTRING_PTR(sub);
 /* first byte of sub, as a non-negative value, used as the memrchr needle */
4592  c = *t & 0xff;
 /* the backward scan covers sbeg up to and including s */
4593  searchlen = s - sbeg + 1;
4594 
 /* try the starting position itself before scanning */
4595  if (memcmp(s, t, slen) == 0) {
4596  return s - sbeg;
4597  }
4598 
4599  do {
4600  hit = memrchr(sbeg, c, searchlen);
4601  if (!hit) break;
 /* a hit inside a character is skipped: the scan continues to the left of
  * that character's head */
4602  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4603  if (hit != adjusted) {
4604  searchlen = adjusted - sbeg;
4605  continue;
4606  }
4607  if (memcmp(hit, t, slen) == 0)
4608  return hit - sbeg;
 /* first byte matched but the rest did not: continue left of this hit */
4609  searchlen = adjusted - sbeg;
4610  } while (searchlen > 0);
4611 
4612  return -1;
4613 }
4614 
/*
 * Backward search for sub in str, starting no later than pos.  pos and the
 * length checks use str_strlen() units (or bytes when str is single byte
 * optimizable).  Returns the index found, in bytes (the value from
 * str_rindex()), or -1.
 */
4615 /* found index in byte */
4616 static long
4617 rb_str_rindex(VALUE str, VALUE sub, long pos)
4618 {
4619  long len, slen;
4620  char *sbeg, *s;
4621  rb_encoding *enc;
4622  int singlebyte;
4623 
4624  enc = rb_enc_check(str, sub);
4625  if (is_broken_string(sub)) return -1;
4626  singlebyte = single_byte_optimizable(str);
4627  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4628  slen = str_strlen(sub, enc); /* rb_enc_check */
4629 
4630  /* substring longer than string */
4631  if (len < slen) return -1;
 /* clamp pos so that sub still fits between pos and the end of str */
4632  if (len - pos < slen) pos = len - slen;
4633  if (len == 0) return pos;
4634 
4635  sbeg = RSTRING_PTR(str);
4636 
 /* only one candidate position left: compare directly at the start */
4637  if (pos == 0) {
4638  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4639  return 0;
4640  else
4641  return -1;
4642  }
4643 
 /* turn pos into a pointer into str and search backwards from there */
4644  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4645  return str_rindex(str, sub, s, enc);
4646 }
4647 
4648 /*
4649  * call-seq:
4650  * rindex(substring, offset = self.length) -> integer or nil
4651  * rindex(regexp, offset = self.length) -> integer or nil
4652  *
4653  * Returns the Integer index of the _last_ occurrence of the given +substring+,
4654  * or +nil+ if none found:
4655  *
4656  * 'foo'.rindex('f') # => 0
4657  * 'foo'.rindex('o') # => 2
4658  * 'foo'.rindex('oo') # => 1
4659  * 'foo'.rindex('ooo') # => nil
4660  *
4661  * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4662  * or +nil+ if none found:
4663  *
4664  * 'foo'.rindex(/f/) # => 0
4665  * 'foo'.rindex(/o/) # => 2
4666  * 'foo'.rindex(/oo/) # => 1
4667  * 'foo'.rindex(/ooo/) # => nil
4668  *
4669  * The _last_ match means starting at the possible last position, not
4670  * the last of longest matches.
4671  *
4672  * 'foo'.rindex(/o+/) # => 2
4673  * $~ #=> #<MatchData "o">
4674  *
4675  * To get the last longest match, needs to combine with negative
4676  * lookbehind.
4677  *
4678  * 'foo'.rindex(/(?<!o)o+/) # => 1
4679  * $~ #=> #<MatchData "oo">
4680  *
4681  * Or String#index with negative lookforward.
4682  *
4683  * 'foo'.index(/o+(?!.*o)/) # => 1
4684  * $~ #=> #<MatchData "oo">
4685  *
4686  * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4687  * string to _end_ the search:
4688  *
4689  * 'foo'.rindex('o', 0) # => nil
4690  * 'foo'.rindex('o', 1) # => 1
4691  * 'foo'.rindex('o', 2) # => 2
4692  * 'foo'.rindex('o', 3) # => 2
4693  *
4694  * If +offset+ is a negative Integer, the maximum starting position in the
4695  * string to _end_ the search is the sum of the string's length and +offset+:
4696  *
4697  * 'foo'.rindex('o', -1) # => 2
4698  * 'foo'.rindex('o', -2) # => 1
4699  * 'foo'.rindex('o', -3) # => nil
4700  * 'foo'.rindex('o', -4) # => nil
4701  *
4702  * Related: String#index.
4703  */
4704 
4705 static VALUE
4706 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4707 {
4708  VALUE sub;
4709  VALUE initpos;
4710  rb_encoding *enc = STR_ENC_GET(str);
4711  long pos, len = str_strlen(str, enc); /* str's enc */
4712 
 /* one required argument (the pattern) and one optional (start position) */
4713  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4714  pos = NUM2LONG(initpos);
 /* a negative position has len added to it; if it is still negative there
  * is nothing to search */
4715  if (pos < 0 && (pos += len) < 0) {
 /* NOTE(review): this branch is empty in this listing; the numbering skips
  * 4717, so a statement appears to be missing.  Presumably it reset the
  * last-match data for Regexp patterns -- confirm against the upstream
  * file. */
4716  if (RB_TYPE_P(sub, T_REGEXP)) {
4718  }
4719  return Qnil;
4720  }
 /* positions past the end are clamped to the end instead of failing */
4721  if (pos > len) pos = len;
4722  }
4723  else {
4724  pos = len;
4725  }
4726 
4727  if (RB_TYPE_P(sub, T_REGEXP)) {
4728  /* enc = rb_enc_check(str, sub); */
4729  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4730  enc, single_byte_optimizable(str));
4731 
 /* last argument is 1 here and 0 in the forward index functions --
  * presumably it selects a backward search; verify */
4732  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4733  VALUE match = rb_backref_get();
 /* `regs` is referenced implicitly by the BEG() macro */
4734  struct re_registers *regs = RMATCH_REGS(match);
4735  pos = rb_str_sublen(str, BEG(0));
4736  return LONG2NUM(pos);
4737  }
4738  }
4739  else {
4740  StringValue(sub);
4741  pos = rb_str_rindex(str, sub, pos);
4742  if (pos >= 0) {
 /* rb_str_rindex() reports a byte index; rb_str_sublen() is applied before
  * returning -- presumably converting it to a character index */
4743  pos = rb_str_sublen(str, pos);
4744  return LONG2NUM(pos);
4745  }
4746  }
4747  return Qnil;
4748 }
4749 
4750 static long
4751 rb_str_byterindex(VALUE str, VALUE sub, long pos)
4752 {
4753  long len, slen;
4754  char *sbeg, *s;
4755  rb_encoding *enc;
4756 
4757  enc = rb_enc_check(str, sub);
4758  if (is_broken_string(sub)) return -1;
4759  len = RSTRING_LEN(str);
4760  slen = RSTRING_LEN(sub);
4761 
4762  /* substring longer than string */
4763  if (len < slen) return -1;
4764  if (len - pos < slen) pos = len - slen;
4765  if (len == 0) return pos;
4766 
4767  sbeg = RSTRING_PTR(str);
4768 
4769  if (pos == 0) {
4770  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4771  return 0;
4772  else
4773  return -1;
4774  }
4775 
4776  s = sbeg + pos;
4777  return str_rindex(str, sub, s, enc);
4778 }
4779 
4780 
4781 /*
4782  * call-seq:
4783  * byterindex(substring, offset = self.bytesize) -> integer or nil
4784  * byterindex(regexp, offset = self.bytesize) -> integer or nil
4785  *
4786  * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4787  * or +nil+ if none found:
4788  *
4789  * 'foo'.byterindex('f') # => 0
4790  * 'foo'.byterindex('o') # => 2
4791  * 'foo'.byterindex('oo') # => 1
4792  * 'foo'.byterindex('ooo') # => nil
4793  *
4794  * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4795  * or +nil+ if none found:
4796  *
4797  * 'foo'.byterindex(/f/) # => 0
4798  * 'foo'.byterindex(/o/) # => 2
4799  * 'foo'.byterindex(/oo/) # => 1
4800  * 'foo'.byterindex(/ooo/) # => nil
4801  *
4802  * The _last_ match means starting at the possible last position, not
4803  * the last of longest matches.
4804  *
4805  * 'foo'.byterindex(/o+/) # => 2
4806  * $~ #=> #<MatchData "o">
4807  *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind.
4810  *
4811  * 'foo'.byterindex(/(?<!o)o+/) # => 1
4812  * $~ #=> #<MatchData "oo">
4813  *
 * Or use String#byteindex with a negative lookahead.
4815  *
4816  * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4817  * $~ #=> #<MatchData "oo">
4818  *
4819  * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4820  * string to _end_ the search:
4821  *
4822  * 'foo'.byterindex('o', 0) # => nil
4823  * 'foo'.byterindex('o', 1) # => 1
4824  * 'foo'.byterindex('o', 2) # => 2
4825  * 'foo'.byterindex('o', 3) # => 2
4826  *
4827  * If +offset+ is a negative Integer, the maximum starting position in the
4828  * string to _end_ the search is the sum of the string's length and +offset+:
4829  *
4830  * 'foo'.byterindex('o', -1) # => 2
4831  * 'foo'.byterindex('o', -2) # => 1
4832  * 'foo'.byterindex('o', -3) # => nil
4833  * 'foo'.byterindex('o', -4) # => nil
4834  *
4835  * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4836  * raised.
4837  *
4838  * Related: String#byteindex.
4839  */
4840 
4841 static VALUE
4842 rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4843 {
4844  VALUE sub;
4845  VALUE initpos;
4846  long pos, len = RSTRING_LEN(str);
4847 
4848  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4849  pos = NUM2LONG(initpos);
4850  if (pos < 0 && (pos += len) < 0) {
4851  if (RB_TYPE_P(sub, T_REGEXP)) {
4853  }
4854  return Qnil;
4855  }
4856  if (pos > len) pos = len;
4857  }
4858  else {
4859  pos = len;
4860  }
4861 
4862  str_ensure_byte_pos(str, pos);
4863 
4864  if (RB_TYPE_P(sub, T_REGEXP)) {
4865  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4866  VALUE match = rb_backref_get();
4867  struct re_registers *regs = RMATCH_REGS(match);
4868  pos = BEG(0);
4869  return LONG2NUM(pos);
4870  }
4871  }
4872  else {
4873  StringValue(sub);
4874  pos = rb_str_byterindex(str, sub, pos);
4875  if (pos >= 0) return LONG2NUM(pos);
4876  }
4877  return Qnil;
4878 }
4879 
4880 /*
4881  * call-seq:
4882  * string =~ regexp -> integer or nil
4883  * string =~ object -> integer or nil
4884  *
4885  * Returns the Integer index of the first substring that matches
4886  * the given +regexp+, or +nil+ if no match found:
4887  *
4888  * 'foo' =~ /f/ # => 0
4889  * 'foo' =~ /o/ # => 1
4890  * 'foo' =~ /x/ # => nil
4891  *
4892  * Note: also updates Regexp@Global+Variables.
4893  *
4894  * If the given +object+ is not a Regexp, returns the value
4895  * returned by <tt>object =~ self</tt>.
4896  *
4897  * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4898  * (see Regexp#=~):
4899  *
4900  * number= nil
4901  * "no. 9" =~ /(?<number>\d+)/
4902  * number # => nil (not assigned)
4903  * /(?<number>\d+)/ =~ "no. 9"
4904  * number #=> "9"
4905  *
4906  */
4907 
4908 static VALUE
4909 rb_str_match(VALUE x, VALUE y)
4910 {
4911  switch (OBJ_BUILTIN_TYPE(y)) {
4912  case T_STRING:
4913  rb_raise(rb_eTypeError, "type mismatch: String given");
4914 
4915  case T_REGEXP:
4916  return rb_reg_match(y, x);
4917 
4918  default:
4919  return rb_funcall(y, idEqTilde, 1, x);
4920  }
4921 }
4922 
4923 
4924 static VALUE get_pat(VALUE);
4925 
4926 
4927 /*
4928  * call-seq:
4929  * match(pattern, offset = 0) -> matchdata or nil
4930  * match(pattern, offset = 0) {|matchdata| ... } -> object
4931  *
4932  * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4933  *
4934  * Note: also updates Regexp@Global+Variables.
4935  *
4936  * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4937  * regexp = Regexp.new(pattern)
4938  * - Computes +matchdata+, which will be either a MatchData object or +nil+
4939  * (see Regexp#match):
 * matchdata = regexp.match(self)
4941  *
4942  * With no block given, returns the computed +matchdata+:
4943  *
4944  * 'foo'.match('f') # => #<MatchData "f">
4945  * 'foo'.match('o') # => #<MatchData "o">
4946  * 'foo'.match('x') # => nil
4947  *
4948  * If Integer argument +offset+ is given, the search begins at index +offset+:
4949  *
4950  * 'foo'.match('f', 1) # => nil
4951  * 'foo'.match('o', 1) # => #<MatchData "o">
4952  *
4953  * With a block given, calls the block with the computed +matchdata+
4954  * and returns the block's return value:
4955  *
4956  * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4957  * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4958  * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4959  *
4960  */
4961 
4962 static VALUE
4963 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4964 {
4965  VALUE re, result;
4966  if (argc < 1)
4967  rb_check_arity(argc, 1, 2);
4968  re = argv[0];
4969  argv[0] = str;
4970  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4971  if (!NIL_P(result) && rb_block_given_p()) {
4972  return rb_yield(result);
4973  }
4974  return result;
4975 }
4976 
4977 /*
4978  * call-seq:
4979  * match?(pattern, offset = 0) -> true or false
4980  *
4981  * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4982  *
4983  * Note: does not update Regexp@Global+Variables.
4984  *
4985  * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4986  * regexp = Regexp.new(pattern)
4987  *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4989  * +false+ otherwise:
4990  *
4991  * 'foo'.match?(/o/) # => true
4992  * 'foo'.match?('o') # => true
4993  * 'foo'.match?(/x/) # => false
4994  *
4995  * If Integer argument +offset+ is given, the search begins at index +offset+:
4996  * 'foo'.match?('f', 1) # => false
4997  * 'foo'.match?('o', 1) # => true
4998  *
4999  */
5000 
5001 static VALUE
5002 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5003 {
5004  VALUE re;
5005  rb_check_arity(argc, 1, 2);
5006  re = get_pat(argv[0]);
5007  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5008 }
5009 
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* the bytes are not a (steppable) character */
    NEIGHBOR_FOUND    = 1, /* a neighbor of the same byte length was written */
    NEIGHBOR_WRAPPED  = 2  /* stepping ran off the end of the range */
};
5015 
5016 static enum neighbor_char
5017 enc_succ_char(char *p, long len, rb_encoding *enc)
5018 {
5019  long i;
5020  int l;
5021 
5022  if (rb_enc_mbminlen(enc) > 1) {
5023  /* wchar, trivial case */
5024  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5025  if (!MBCLEN_CHARFOUND_P(r)) {
5026  return NEIGHBOR_NOT_CHAR;
5027  }
5028  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5029  l = rb_enc_code_to_mbclen(c, enc);
5030  if (!l) return NEIGHBOR_NOT_CHAR;
5031  if (l != len) return NEIGHBOR_WRAPPED;
5032  rb_enc_mbcput(c, p, enc);
5033  r = rb_enc_precise_mbclen(p, p + len, enc);
5034  if (!MBCLEN_CHARFOUND_P(r)) {
5035  return NEIGHBOR_NOT_CHAR;
5036  }
5037  return NEIGHBOR_FOUND;
5038  }
5039  while (1) {
5040  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5041  p[i] = '\0';
5042  if (i < 0)
5043  return NEIGHBOR_WRAPPED;
5044  ++((unsigned char*)p)[i];
5045  l = rb_enc_precise_mbclen(p, p+len, enc);
5046  if (MBCLEN_CHARFOUND_P(l)) {
5047  l = MBCLEN_CHARFOUND_LEN(l);
5048  if (l == len) {
5049  return NEIGHBOR_FOUND;
5050  }
5051  else {
5052  memset(p+l, 0xff, len-l);
5053  }
5054  }
5055  if (MBCLEN_INVALID_P(l) && i < len-1) {
5056  long len2;
5057  int l2;
5058  for (len2 = len-1; 0 < len2; len2--) {
5059  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5060  if (!MBCLEN_INVALID_P(l2))
5061  break;
5062  }
5063  memset(p+len2+1, 0xff, len-(len2+1));
5064  }
5065  }
5066 }
5067 
5068 static enum neighbor_char
5069 enc_pred_char(char *p, long len, rb_encoding *enc)
5070 {
5071  long i;
5072  int l;
5073  if (rb_enc_mbminlen(enc) > 1) {
5074  /* wchar, trivial case */
5075  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5076  if (!MBCLEN_CHARFOUND_P(r)) {
5077  return NEIGHBOR_NOT_CHAR;
5078  }
5079  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5080  if (!c) return NEIGHBOR_NOT_CHAR;
5081  --c;
5082  l = rb_enc_code_to_mbclen(c, enc);
5083  if (!l) return NEIGHBOR_NOT_CHAR;
5084  if (l != len) return NEIGHBOR_WRAPPED;
5085  rb_enc_mbcput(c, p, enc);
5086  r = rb_enc_precise_mbclen(p, p + len, enc);
5087  if (!MBCLEN_CHARFOUND_P(r)) {
5088  return NEIGHBOR_NOT_CHAR;
5089  }
5090  return NEIGHBOR_FOUND;
5091  }
5092  while (1) {
5093  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5094  p[i] = '\xff';
5095  if (i < 0)
5096  return NEIGHBOR_WRAPPED;
5097  --((unsigned char*)p)[i];
5098  l = rb_enc_precise_mbclen(p, p+len, enc);
5099  if (MBCLEN_CHARFOUND_P(l)) {
5100  l = MBCLEN_CHARFOUND_LEN(l);
5101  if (l == len) {
5102  return NEIGHBOR_FOUND;
5103  }
5104  else {
5105  memset(p+l, 0, len-l);
5106  }
5107  }
5108  if (MBCLEN_INVALID_P(l) && i < len-1) {
5109  long len2;
5110  int l2;
5111  for (len2 = len-1; 0 < len2; len2--) {
5112  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5113  if (!MBCLEN_INVALID_P(l2))
5114  break;
5115  }
5116  memset(p+len2+1, 0, len-(len2+1));
5117  }
5118  }
5119 }
5120 
5121 /*
5122  overwrite +p+ by succeeding letter in +enc+ and returns
5123  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5124  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5125  assuming each ranges are successive, and mbclen
5126  never change in each ranges.
5127  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5128  character.
5129  */
5130 static enum neighbor_char
5131 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5132 {
5133  enum neighbor_char ret;
5134  unsigned int c;
5135  int ctype;
5136  int range;
5137  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5138 
5139  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5140  int try;
5141  const int max_gaps = 1;
5142 
5143  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5144  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5145  ctype = ONIGENC_CTYPE_DIGIT;
5146  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5147  ctype = ONIGENC_CTYPE_ALPHA;
5148  else
5149  return NEIGHBOR_NOT_CHAR;
5150 
5151  MEMCPY(save, p, char, len);
5152  for (try = 0; try <= max_gaps; ++try) {
5153  ret = enc_succ_char(p, len, enc);
5154  if (ret == NEIGHBOR_FOUND) {
5155  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5156  if (rb_enc_isctype(c, ctype, enc))
5157  return NEIGHBOR_FOUND;
5158  }
5159  }
5160  MEMCPY(p, save, char, len);
5161  range = 1;
5162  while (1) {
5163  MEMCPY(save, p, char, len);
5164  ret = enc_pred_char(p, len, enc);
5165  if (ret == NEIGHBOR_FOUND) {
5166  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5167  if (!rb_enc_isctype(c, ctype, enc)) {
5168  MEMCPY(p, save, char, len);
5169  break;
5170  }
5171  }
5172  else {
5173  MEMCPY(p, save, char, len);
5174  break;
5175  }
5176  range++;
5177  }
5178  if (range == 1) {
5179  return NEIGHBOR_NOT_CHAR;
5180  }
5181 
5182  if (ctype != ONIGENC_CTYPE_DIGIT) {
5183  MEMCPY(carry, p, char, len);
5184  return NEIGHBOR_WRAPPED;
5185  }
5186 
5187  MEMCPY(carry, p, char, len);
5188  enc_succ_char(carry, len, enc);
5189  return NEIGHBOR_WRAPPED;
5190 }
5191 
5192 
5193 static VALUE str_succ(VALUE str);
5194 
5195 /*
5196  * call-seq:
5197  * succ -> new_str
5198  *
5199  * Returns the successor to +self+. The successor is calculated by
5200  * incrementing characters.
5201  *
 * The first character to be incremented is the rightmost alphanumeric
 * or, if there are no alphanumerics, the rightmost character:
5204  *
5205  * 'THX1138'.succ # => "THX1139"
5206  * '<<koala>>'.succ # => "<<koalb>>"
5207  * '***'.succ # => '**+'
5208  *
5209  * The successor to a digit is another digit, "carrying" to the next-left
5210  * character for a "rollover" from 9 to 0, and prepending another digit
5211  * if necessary:
5212  *
5213  * '00'.succ # => "01"
5214  * '09'.succ # => "10"
5215  * '99'.succ # => "100"
5216  *
5217  * The successor to a letter is another letter of the same case,
5218  * carrying to the next-left character for a rollover,
5219  * and prepending another same-case letter if necessary:
5220  *
5221  * 'aa'.succ # => "ab"
5222  * 'az'.succ # => "ba"
5223  * 'zz'.succ # => "aaa"
5224  * 'AA'.succ # => "AB"
5225  * 'AZ'.succ # => "BA"
5226  * 'ZZ'.succ # => "AAA"
5227  *
5228  * The successor to a non-alphanumeric character is the next character
5229  * in the underlying character set's collating sequence,
5230  * carrying to the next-left character for a rollover,
5231  * and prepending another character if necessary:
5232  *
5233  * s = 0.chr * 3
5234  * s # => "\x00\x00\x00"
5235  * s.succ # => "\x00\x00\x01"
5236  * s = 255.chr * 3
5237  * s # => "\xFF\xFF\xFF"
5238  * s.succ # => "\x01\x00\x00\x00"
5239  *
5240  * Carrying can occur between and among mixtures of alphanumeric characters:
5241  *
5242  * s = 'zz99zz99'
5243  * s.succ # => "aaa00aa00"
5244  * s = '99zz99zz'
5245  * s.succ # => "100aa00aa"
5246  *
5247  * The successor to an empty +String+ is a new empty +String+:
5248  *
5249  * ''.succ # => ""
5250  *
5251  */
5252 
5253 VALUE
5255 {
5256  VALUE str;
5257  str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5258  rb_enc_cr_str_copy_for_substr(str, orig);
5259  return str_succ(str);
5260 }
5261 
5262 static VALUE
5263 str_succ(VALUE str)
5264 {
5265  rb_encoding *enc;
5266  char *sbeg, *s, *e, *last_alnum = 0;
5267  int found_alnum = 0;
5268  long l, slen;
5269  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5270  long carry_pos = 0, carry_len = 1;
5271  enum neighbor_char neighbor = NEIGHBOR_FOUND;
5272 
5273  slen = RSTRING_LEN(str);
5274  if (slen == 0) return str;
5275 
5276  enc = STR_ENC_GET(str);
5277  sbeg = RSTRING_PTR(str);
5278  s = e = sbeg + slen;
5279 
5280  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5281  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5282  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5283  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5284  break;
5285  }
5286  }
5287  l = rb_enc_precise_mbclen(s, e, enc);
5288  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5289  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5290  neighbor = enc_succ_alnum_char(s, l, enc, carry);
5291  switch (neighbor) {
5292  case NEIGHBOR_NOT_CHAR:
5293  continue;
5294  case NEIGHBOR_FOUND:
5295  return str;
5296  case NEIGHBOR_WRAPPED:
5297  last_alnum = s;
5298  break;
5299  }
5300  found_alnum = 1;
5301  carry_pos = s - sbeg;
5302  carry_len = l;
5303  }
5304  if (!found_alnum) { /* str contains no alnum */
5305  s = e;
5306  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5307  enum neighbor_char neighbor;
5308  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5309  l = rb_enc_precise_mbclen(s, e, enc);
5310  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5311  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5312  MEMCPY(tmp, s, char, l);
5313  neighbor = enc_succ_char(tmp, l, enc);
5314  switch (neighbor) {
5315  case NEIGHBOR_FOUND:
5316  MEMCPY(s, tmp, char, l);
5317  return str;
5318  break;
5319  case NEIGHBOR_WRAPPED:
5320  MEMCPY(s, tmp, char, l);
5321  break;
5322  case NEIGHBOR_NOT_CHAR:
5323  break;
5324  }
5325  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5326  /* wrapped to \0...\0. search next valid char. */
5327  enc_succ_char(s, l, enc);
5328  }
5329  if (!rb_enc_asciicompat(enc)) {
5330  MEMCPY(carry, s, char, l);
5331  carry_len = l;
5332  }
5333  carry_pos = s - sbeg;
5334  }
5336  }
5337  RESIZE_CAPA(str, slen + carry_len);
5338  sbeg = RSTRING_PTR(str);
5339  s = sbeg + carry_pos;
5340  memmove(s + carry_len, s, slen - carry_pos);
5341  memmove(s, carry, carry_len);
5342  slen += carry_len;
5343  STR_SET_LEN(str, slen);
5344  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5345  rb_enc_str_coderange(str);
5346  return str;
5347 }
5348 
5349 
5350 /*
5351  * call-seq:
5352  * succ! -> self
5353  *
5354  * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5355  */
5356 
5357 static VALUE
5358 rb_str_succ_bang(VALUE str)
5359 {
5360  rb_str_modify(str);
5361  str_succ(str);
5362  return str;
5363 }
5364 
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT(), 0 as soon as one does not.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5374 
5375 static int
5376 str_upto_i(VALUE str, VALUE arg)
5377 {
5378  rb_yield(str);
5379  return 0;
5380 }
5381 
5382 /*
5383  * call-seq:
5384  * upto(other_string, exclusive = false) {|string| ... } -> self
5385  * upto(other_string, exclusive = false) -> new_enumerator
5386  *
5387  * With a block given, calls the block with each +String+ value
5388  * returned by successive calls to String#succ;
5389  * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5390  * the sequence terminates when value +other_string+ is reached;
5391  * returns +self+:
5392  *
5393  * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5394  * Output:
5395  *
5396  * a8 a9 b0 b1 b2 b3 b4 b5 b6
5397  *
5398  * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5399  *
5400  * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5401  *
5402  * Output:
5403  *
5404  * a8 a9 b0 b1 b2 b3 b4 b5
5405  *
5406  * If +other_string+ would not be reached, does not call the block:
5407  *
5408  * '25'.upto('5') {|s| fail s }
5409  * 'aa'.upto('a') {|s| fail s }
5410  *
5411  * With no block given, returns a new Enumerator:
5412  *
5413  * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5414  *
5415  */
5416 
5417 static VALUE
5418 rb_str_upto(int argc, VALUE *argv, VALUE beg)
5419 {
5420  VALUE end, exclusive;
5421 
5422  rb_scan_args(argc, argv, "11", &end, &exclusive);
5423  RETURN_ENUMERATOR(beg, argc, argv);
5424  return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5425 }
5426 
5427 VALUE
5428 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5429 {
5430  VALUE current, after_end;
5431  ID succ;
5432  int n, ascii;
5433  rb_encoding *enc;
5434 
5435  CONST_ID(succ, "succ");
5436  StringValue(end);
5437  enc = rb_enc_check(beg, end);
5438  ascii = (is_ascii_string(beg) && is_ascii_string(end));
5439  /* single character */
5440  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5441  char c = RSTRING_PTR(beg)[0];
5442  char e = RSTRING_PTR(end)[0];
5443 
5444  if (c > e || (excl && c == e)) return beg;
5445  for (;;) {
5446  VALUE str = rb_enc_str_new(&c, 1, enc);
5448  if ((*each)(str, arg)) break;
5449  if (!excl && c == e) break;
5450  c++;
5451  if (excl && c == e) break;
5452  }
5453  return beg;
5454  }
5455  /* both edges are all digits */
5456  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5457  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5458  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5459  VALUE b, e;
5460  int width;
5461 
5462  width = RSTRING_LENINT(beg);
5463  b = rb_str_to_inum(beg, 10, FALSE);
5464  e = rb_str_to_inum(end, 10, FALSE);
5465  if (FIXNUM_P(b) && FIXNUM_P(e)) {
5466  long bi = FIX2LONG(b);
5467  long ei = FIX2LONG(e);
5468  rb_encoding *usascii = rb_usascii_encoding();
5469 
5470  while (bi <= ei) {
5471  if (excl && bi == ei) break;
5472  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5473  bi++;
5474  }
5475  }
5476  else {
5477  ID op = excl ? '<' : idLE;
5478  VALUE args[2], fmt = rb_fstring_lit("%.*d");
5479 
5480  args[0] = INT2FIX(width);
5481  while (rb_funcall(b, op, 1, e)) {
5482  args[1] = b;
5483  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5484  b = rb_funcallv(b, succ, 0, 0);
5485  }
5486  }
5487  return beg;
5488  }
5489  /* normal case */
5490  n = rb_str_cmp(beg, end);
5491  if (n > 0 || (excl && n == 0)) return beg;
5492 
5493  after_end = rb_funcallv(end, succ, 0, 0);
5494  current = str_duplicate(rb_cString, beg);
5495  while (!rb_str_equal(current, after_end)) {
5496  VALUE next = Qnil;
5497  if (excl || !rb_str_equal(current, end))
5498  next = rb_funcallv(current, succ, 0, 0);
5499  if ((*each)(current, arg)) break;
5500  if (NIL_P(next)) break;
5501  current = next;
5502  StringValue(current);
5503  if (excl && rb_str_equal(current, end)) break;
5504  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5505  break;
5506  }
5507 
5508  return beg;
5509 }
5510 
5511 VALUE
5512 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5513 {
5514  VALUE current;
5515  ID succ;
5516 
5517  CONST_ID(succ, "succ");
5518  /* both edges are all digits */
5519  if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5520  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5521  VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5522  int width = RSTRING_LENINT(beg);
5523  b = rb_str_to_inum(beg, 10, FALSE);
5524  if (FIXNUM_P(b)) {
5525  long bi = FIX2LONG(b);
5526  rb_encoding *usascii = rb_usascii_encoding();
5527 
5528  while (FIXABLE(bi)) {
5529  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5530  bi++;
5531  }
5532  b = LONG2NUM(bi);
5533  }
5534  args[0] = INT2FIX(width);
5535  while (1) {
5536  args[1] = b;
5537  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5538  b = rb_funcallv(b, succ, 0, 0);
5539  }
5540  }
5541  /* normal case */
5542  current = str_duplicate(rb_cString, beg);
5543  while (1) {
5544  VALUE next = rb_funcallv(current, succ, 0, 0);
5545  if ((*each)(current, arg)) break;
5546  current = next;
5547  StringValue(current);
5548  if (RSTRING_LEN(current) == 0)
5549  break;
5550  }
5551 
5552  return beg;
5553 }
5554 
5555 static int
5556 include_range_i(VALUE str, VALUE arg)
5557 {
5558  VALUE *argp = (VALUE *)arg;
5559  if (!rb_equal(str, *argp)) return 0;
5560  *argp = Qnil;
5561  return 1;
5562 }
5563 
5564 VALUE
5565 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5566 {
5567  beg = rb_str_new_frozen(beg);
5568  StringValue(end);
5569  end = rb_str_new_frozen(end);
5570  if (NIL_P(val)) return Qfalse;
5571  val = rb_check_string_type(val);
5572  if (NIL_P(val)) return Qfalse;
5573  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5574  rb_enc_asciicompat(STR_ENC_GET(end)) &&
5575  rb_enc_asciicompat(STR_ENC_GET(val))) {
5576  const char *bp = RSTRING_PTR(beg);
5577  const char *ep = RSTRING_PTR(end);
5578  const char *vp = RSTRING_PTR(val);
5579  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5580  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5581  return Qfalse;
5582  else {
5583  char b = *bp;
5584  char e = *ep;
5585  char v = *vp;
5586 
5587  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5588  if (b <= v && v < e) return Qtrue;
5589  return RBOOL(!RTEST(exclusive) && v == e);
5590  }
5591  }
5592  }
5593 #if 0
5594  /* both edges are all digits */
5595  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5596  all_digits_p(bp, RSTRING_LEN(beg)) &&
5597  all_digits_p(ep, RSTRING_LEN(end))) {
5598  /* TODO */
5599  }
5600 #endif
5601  }
5602  rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5603 
5604  return RBOOL(NIL_P(val));
5605 }
5606 
5607 static VALUE
5608 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5609 {
5610  if (rb_reg_search(re, str, 0, 0) >= 0) {
5611  VALUE match = rb_backref_get();
5612  int nth = rb_reg_backref_number(match, backref);
5613  return rb_reg_nth_match(nth, match);
5614  }
5615  return Qnil;
5616 }
5617 
5618 static VALUE
5619 rb_str_aref(VALUE str, VALUE indx)
5620 {
5621  long idx;
5622 
5623  if (FIXNUM_P(indx)) {
5624  idx = FIX2LONG(indx);
5625  }
5626  else if (RB_TYPE_P(indx, T_REGEXP)) {
5627  return rb_str_subpat(str, indx, INT2FIX(0));
5628  }
5629  else if (RB_TYPE_P(indx, T_STRING)) {
5630  if (rb_str_index(str, indx, 0) != -1)
5631  return str_duplicate(rb_cString, indx);
5632  return Qnil;
5633  }
5634  else {
5635  /* check if indx is Range */
5636  long beg, len = str_strlen(str, NULL);
5637  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5638  case Qfalse:
5639  break;
5640  case Qnil:
5641  return Qnil;
5642  default:
5643  return rb_str_substr(str, beg, len);
5644  }
5645  idx = NUM2LONG(indx);
5646  }
5647 
5648  return str_substr(str, idx, 1, FALSE);
5649 }
5650 
5651 
5652 /*
5653  * call-seq:
5654  * string[index] -> new_string or nil
5655  * string[start, length] -> new_string or nil
5656  * string[range] -> new_string or nil
5657  * string[regexp, capture = 0] -> new_string or nil
5658  * string[substring] -> new_string or nil
5659  *
5660  * Returns the substring of +self+ specified by the arguments.
5661  * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5662  *
5663  *
5664  */
5665 
5666 static VALUE
5667 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5668 {
5669  if (argc == 2) {
5670  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5671  return rb_str_subpat(str, argv[0], argv[1]);
5672  }
5673  else {
5674  long beg = NUM2LONG(argv[0]);
5675  long len = NUM2LONG(argv[1]);
5676  return rb_str_substr(str, beg, len);
5677  }
5678  }
5679  rb_check_arity(argc, 1, 2);
5680  return rb_str_aref(str, argv[0]);
5681 }
5682 
5683 VALUE
5685 {
5686  char *ptr = RSTRING_PTR(str);
5687  long olen = RSTRING_LEN(str), nlen;
5688 
5689  str_modifiable(str);
5690  if (len > olen) len = olen;
5691  nlen = olen - len;
5692  if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5693  char *oldptr = ptr;
5694  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5695  STR_SET_EMBED(str);
5696  ptr = RSTRING(str)->as.embed.ary;
5697  memmove(ptr, oldptr + len, nlen);
5698  if (fl == STR_NOEMBED) xfree(oldptr);
5699  }
5700  else {
5701  if (!STR_SHARED_P(str)) {
5702  VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5703  rb_enc_cr_str_exact_copy(shared, str);
5704  OBJ_FREEZE(shared);
5705  }
5706  ptr = RSTRING(str)->as.heap.ptr += len;
5707  }
5708  STR_SET_LEN(str, nlen);
5709 
5710  if (!SHARABLE_MIDDLE_SUBSTRING) {
5711  TERM_FILL(ptr + nlen, TERM_LEN(str));
5712  }
5713  ENC_CODERANGE_CLEAR(str);
5714  return str;
5715 }
5716 
5717 static void
5718 rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5719 {
5720  char *sptr;
5721  long slen;
5722  int cr;
5723 
5724  if (beg == 0 && vlen == 0) {
5725  rb_str_drop_bytes(str, len);
5726  return;
5727  }
5728 
5729  str_modify_keep_cr(str);
5730  RSTRING_GETMEM(str, sptr, slen);
5731  if (len < vlen) {
5732  /* expand string */
5733  RESIZE_CAPA(str, slen + vlen - len);
5734  sptr = RSTRING_PTR(str);
5735  }
5736 
5737  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5738  cr = rb_enc_str_coderange(val);
5739  else
5740  cr = ENC_CODERANGE_UNKNOWN;
5741 
5742  if (vlen != len) {
5743  memmove(sptr + beg + vlen,
5744  sptr + beg + len,
5745  slen - (beg + len));
5746  }
5747  if (vlen < beg && len < 0) {
5748  MEMZERO(sptr + slen, char, -len);
5749  }
5750  if (vlen > 0) {
5751  memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5752  }
5753  slen += vlen - len;
5754  STR_SET_LEN(str, slen);
5755  TERM_FILL(&sptr[slen], TERM_LEN(str));
5756  ENC_CODERANGE_SET(str, cr);
5757 }
5758 
5759 static inline void
5760 rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5761 {
5762  rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5763 }
5764 
5765 void
5766 rb_str_update(VALUE str, long beg, long len, VALUE val)
5767 {
5768  long slen;
5769  char *p, *e;
5770  rb_encoding *enc;
5771  int singlebyte = single_byte_optimizable(str);
5772  int cr;
5773 
5774  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5775 
5776  StringValue(val);
5777  enc = rb_enc_check(str, val);
5778  slen = str_strlen(str, enc); /* rb_enc_check */
5779 
5780  if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5781  rb_raise(rb_eIndexError, "index %ld out of string", beg);
5782  }
5783  if (beg < 0) {
5784  beg += slen;
5785  }
5786  RUBY_ASSERT(beg >= 0);
5787  RUBY_ASSERT(beg <= slen);
5788 
5789  if (len > slen - beg) {
5790  len = slen - beg;
5791  }
5792  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5793  if (!p) p = RSTRING_END(str);
5794  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5795  if (!e) e = RSTRING_END(str);
5796  /* error check */
5797  beg = p - RSTRING_PTR(str); /* physical position */
5798  len = e - p; /* physical length */
5799  rb_str_update_0(str, beg, len, val);
5800  rb_enc_associate(str, enc);
5802  if (cr != ENC_CODERANGE_BROKEN)
5803  ENC_CODERANGE_SET(str, cr);
5804 }
5805 
5806 static void
5807 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5808 {
5809  int nth;
5810  VALUE match;
5811  long start, end, len;
5812  rb_encoding *enc;
5813  struct re_registers *regs;
5814 
5815  if (rb_reg_search(re, str, 0, 0) < 0) {
5816  rb_raise(rb_eIndexError, "regexp not matched");
5817  }
5818  match = rb_backref_get();
5819  nth = rb_reg_backref_number(match, backref);
5820  regs = RMATCH_REGS(match);
5821  if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5822  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5823  }
5824  if (nth < 0) {
5825  nth += regs->num_regs;
5826  }
5827 
5828  start = BEG(nth);
5829  if (start == -1) {
5830  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5831  }
5832  end = END(nth);
5833  len = end - start;
5834  StringValue(val);
5835  enc = rb_enc_check_str(str, val);
5836  rb_str_update_0(str, start, len, val);
5837  rb_enc_associate(str, enc);
5838 }
5839 
5840 static VALUE
5841 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5842 {
5843  long idx, beg;
5844 
5845  switch (TYPE(indx)) {
5846  case T_REGEXP:
5847  rb_str_subpat_set(str, indx, INT2FIX(0), val);
5848  return val;
5849 
5850  case T_STRING:
5851  beg = rb_str_index(str, indx, 0);
5852  if (beg < 0) {
5853  rb_raise(rb_eIndexError, "string not matched");
5854  }
5855  beg = rb_str_sublen(str, beg);
5856  rb_str_update(str, beg, str_strlen(indx, NULL), val);
5857  return val;
5858 
5859  default:
5860  /* check if indx is Range */
5861  {
5862  long beg, len;
5863  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5864  rb_str_update(str, beg, len, val);
5865  return val;
5866  }
5867  }
5868  /* FALLTHROUGH */
5869 
5870  case T_FIXNUM:
5871  idx = NUM2LONG(indx);
5872  rb_str_update(str, idx, 1, val);
5873  return val;
5874  }
5875 }
5876 
5877 /*
5878  * call-seq:
5879  * string[index] = new_string
5880  * string[start, length] = new_string
5881  * string[range] = new_string
5882  * string[regexp, capture = 0] = new_string
5883  * string[substring] = new_string
5884  *
5885  * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5886  * See {String Slices}[rdoc-ref:String@String+Slices].
5887  *
5888  * A few examples:
5889  *
5890  * s = 'foo'
5891  * s[2] = 'rtune' # => "rtune"
5892  * s # => "fortune"
5893  * s[1, 5] = 'init' # => "init"
5894  * s # => "finite"
5895  * s[3..4] = 'al' # => "al"
5896  * s # => "finale"
5897  * s[/e$/] = 'ly' # => "ly"
5898  * s # => "finally"
5899  * s['lly'] = 'ncial' # => "ncial"
5900  * s # => "financial"
5901  *
5902  */
5903 
5904 static VALUE
5905 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5906 {
5907  if (argc == 3) {
5908  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5909  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5910  }
5911  else {
5912  rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5913  }
5914  return argv[2];
5915  }
5916  rb_check_arity(argc, 2, 3);
5917  return rb_str_aset(str, argv[0], argv[1]);
5918 }
5919 
5920 /*
5921  * call-seq:
5922  * insert(index, other_string) -> self
5923  *
5924  * Inserts the given +other_string+ into +self+; returns +self+.
5925  *
5926  * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5927  *
5928  * 'foo'.insert(1, 'bar') # => "fbaroo"
5929  *
5930  * If the Integer +index+ is negative, counts backward from the end of +self+
5931  * and inserts +other_string+ at offset <tt>index+1</tt>
5932  * (that is, _after_ <tt>self[index]</tt>):
5933  *
5934  * 'foo'.insert(-2, 'bar') # => "fobaro"
5935  *
5936  */
5937 
5938 static VALUE
5939 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5940 {
5941  long pos = NUM2LONG(idx);
5942 
5943  if (pos == -1) {
5944  return rb_str_append(str, str2);
5945  }
5946  else if (pos < 0) {
5947  pos++;
5948  }
5949  rb_str_update(str, pos, 0, str2);
5950  return str;
5951 }
5952 
5953 
5954 /*
5955  * call-seq:
5956  * slice!(index) -> new_string or nil
5957  * slice!(start, length) -> new_string or nil
5958  * slice!(range) -> new_string or nil
5959  * slice!(regexp, capture = 0) -> new_string or nil
5960  * slice!(substring) -> new_string or nil
5961  *
5962  * Removes and returns the substring of +self+ specified by the arguments.
5963  * See {String Slices}[rdoc-ref:String@String+Slices].
5964  *
5965  * A few examples:
5966  *
5967  * string = "This is a string"
5968  * string.slice!(2) #=> "i"
5969  * string.slice!(3..6) #=> " is "
5970  * string.slice!(/s.*t/) #=> "sa st"
5971  * string.slice!("r") #=> "r"
5972  * string #=> "Thing"
5973  *
5974  */
5975 
5976 static VALUE
5977 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5978 {
5979  VALUE result = Qnil;
5980  VALUE indx;
5981  long beg, len = 1;
5982  char *p;
5983 
5984  rb_check_arity(argc, 1, 2);
5985  str_modify_keep_cr(str);
5986  indx = argv[0];
5987  if (RB_TYPE_P(indx, T_REGEXP)) {
5988  if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5989  VALUE match = rb_backref_get();
5990  struct re_registers *regs = RMATCH_REGS(match);
5991  int nth = 0;
5992  if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5993  if ((nth += regs->num_regs) <= 0) return Qnil;
5994  }
5995  else if (nth >= regs->num_regs) return Qnil;
5996  beg = BEG(nth);
5997  len = END(nth) - beg;
5998  goto subseq;
5999  }
6000  else if (argc == 2) {
6001  beg = NUM2LONG(indx);
6002  len = NUM2LONG(argv[1]);
6003  goto num_index;
6004  }
6005  else if (FIXNUM_P(indx)) {
6006  beg = FIX2LONG(indx);
6007  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6008  if (!len) return Qnil;
6009  beg = p - RSTRING_PTR(str);
6010  goto subseq;
6011  }
6012  else if (RB_TYPE_P(indx, T_STRING)) {
6013  beg = rb_str_index(str, indx, 0);
6014  if (beg == -1) return Qnil;
6015  len = RSTRING_LEN(indx);
6016  result = str_duplicate(rb_cString, indx);
6017  goto squash;
6018  }
6019  else {
6020  switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6021  case Qnil:
6022  return Qnil;
6023  case Qfalse:
6024  beg = NUM2LONG(indx);
6025  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6026  if (!len) return Qnil;
6027  beg = p - RSTRING_PTR(str);
6028  goto subseq;
6029  default:
6030  goto num_index;
6031  }
6032  }
6033 
6034  num_index:
6035  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6036  beg = p - RSTRING_PTR(str);
6037 
6038  subseq:
6039  result = rb_str_new(RSTRING_PTR(str)+beg, len);
6040  rb_enc_cr_str_copy_for_substr(result, str);
6041 
6042  squash:
6043  if (len > 0) {
6044  if (beg == 0) {
6045  rb_str_drop_bytes(str, len);
6046  }
6047  else {
6048  char *sptr = RSTRING_PTR(str);
6049  long slen = RSTRING_LEN(str);
6050  if (beg + len > slen) /* pathological check */
6051  len = slen - beg;
6052  memmove(sptr + beg,
6053  sptr + beg + len,
6054  slen - (beg + len));
6055  slen -= len;
6056  STR_SET_LEN(str, slen);
6057  TERM_FILL(&sptr[slen], TERM_LEN(str));
6058  }
6059  }
6060  return result;
6061 }
6062 
6063 static VALUE
6064 get_pat(VALUE pat)
6065 {
6066  VALUE val;
6067 
6068  switch (OBJ_BUILTIN_TYPE(pat)) {
6069  case T_REGEXP:
6070  return pat;
6071 
6072  case T_STRING:
6073  break;
6074 
6075  default:
6076  val = rb_check_string_type(pat);
6077  if (NIL_P(val)) {
6078  Check_Type(pat, T_REGEXP);
6079  }
6080  pat = val;
6081  }
6082 
6083  return rb_reg_regcomp(pat);
6084 }
6085 
6086 static VALUE
6087 get_pat_quoted(VALUE pat, int check)
6088 {
6089  VALUE val;
6090 
6091  switch (OBJ_BUILTIN_TYPE(pat)) {
6092  case T_REGEXP:
6093  return pat;
6094 
6095  case T_STRING:
6096  break;
6097 
6098  default:
6099  val = rb_check_string_type(pat);
6100  if (NIL_P(val)) {
6101  Check_Type(pat, T_REGEXP);
6102  }
6103  pat = val;
6104  }
6105  if (check && is_broken_string(pat)) {
6106  rb_exc_raise(rb_reg_check_preprocess(pat));
6107  }
6108  return pat;
6109 }
6110 
6111 static long
6112 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6113 {
6114  if (BUILTIN_TYPE(pat) == T_STRING) {
6115  pos = rb_str_byteindex(str, pat, pos);
6116  if (set_backref_str) {
6117  if (pos >= 0) {
6118  str = rb_str_new_frozen_String(str);
6119  rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6120  }
6121  else {
6123  }
6124  }
6125  return pos;
6126  }
6127  else {
6128  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6129  }
6130 }
6131 
6132 
6133 /*
6134  * call-seq:
6135  * sub!(pattern, replacement) -> self or nil
6136  * sub!(pattern) {|match| ... } -> self or nil
6137  *
6138  * Replaces the first occurrence (not all occurrences) of the given +pattern+
6139  * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6140  *
6141  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6142  *
6143  * Related: String#sub, String#gsub, String#gsub!.
6144  *
6145  */
6146 
6147 static VALUE
6148 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6149 {
6150  VALUE pat, repl, hash = Qnil;
6151  int iter = 0;
6152  long plen;
6153  int min_arity = rb_block_given_p() ? 1 : 2;
6154  long beg;
6155 
6156  rb_check_arity(argc, min_arity, 2);
6157  if (argc == 1) {
6158  iter = 1;
6159  }
6160  else {
6161  repl = argv[1];
6162  hash = rb_check_hash_type(argv[1]);
6163  if (NIL_P(hash)) {
6164  StringValue(repl);
6165  }
6166  }
6167 
6168  pat = get_pat_quoted(argv[0], 1);
6169 
6170  str_modifiable(str);
6171  beg = rb_pat_search(pat, str, 0, 1);
6172  if (beg >= 0) {
6173  rb_encoding *enc;
6174  int cr = ENC_CODERANGE(str);
6175  long beg0, end0;
6176  VALUE match, match0 = Qnil;
6177  struct re_registers *regs;
6178  char *p, *rp;
6179  long len, rlen;
6180 
6181  match = rb_backref_get();
6182  regs = RMATCH_REGS(match);
6183  if (RB_TYPE_P(pat, T_STRING)) {
6184  beg0 = beg;
6185  end0 = beg0 + RSTRING_LEN(pat);
6186  match0 = pat;
6187  }
6188  else {
6189  beg0 = BEG(0);
6190  end0 = END(0);
6191  if (iter) match0 = rb_reg_nth_match(0, match);
6192  }
6193 
6194  if (iter || !NIL_P(hash)) {
6195  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6196 
6197  if (iter) {
6198  repl = rb_obj_as_string(rb_yield(match0));
6199  }
6200  else {
6201  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6202  repl = rb_obj_as_string(repl);
6203  }
6204  str_mod_check(str, p, len);
6205  rb_check_frozen(str);
6206  }
6207  else {
6208  repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6209  }
6210 
6211  enc = rb_enc_compatible(str, repl);
6212  if (!enc) {
6213  rb_encoding *str_enc = STR_ENC_GET(str);
6214  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6215  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6216  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6217  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6218  rb_enc_inspect_name(str_enc),
6219  rb_enc_inspect_name(STR_ENC_GET(repl)));
6220  }
6221  enc = STR_ENC_GET(repl);
6222  }
6223  rb_str_modify(str);
6224  rb_enc_associate(str, enc);
6225  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
6226  int cr2 = ENC_CODERANGE(repl);
6227  if (cr2 == ENC_CODERANGE_BROKEN ||
6228  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6229  cr = ENC_CODERANGE_UNKNOWN;
6230  else
6231  cr = cr2;
6232  }
6233  plen = end0 - beg0;
6234  rlen = RSTRING_LEN(repl);
6235  len = RSTRING_LEN(str);
6236  if (rlen > plen) {
6237  RESIZE_CAPA(str, len + rlen - plen);
6238  }
6239  p = RSTRING_PTR(str);
6240  if (rlen != plen) {
6241  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6242  }
6243  rp = RSTRING_PTR(repl);
6244  memmove(p + beg0, rp, rlen);
6245  len += rlen - plen;
6246  STR_SET_LEN(str, len);
6247  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6248  ENC_CODERANGE_SET(str, cr);
6249 
6250  RB_GC_GUARD(match);
6251 
6252  return str;
6253  }
6254  return Qnil;
6255 }
6256 
6257 
6258 /*
6259  * call-seq:
6260  * sub(pattern, replacement) -> new_string
6261  * sub(pattern) {|match| ... } -> new_string
6262  *
6263  * Returns a copy of +self+ with only the first occurrence
6264  * (not all occurrences) of the given +pattern+ replaced.
6265  *
6266  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6267  *
6268  * Related: String#sub!, String#gsub, String#gsub!.
6269  *
6270  */
6271 
6272 static VALUE
6273 rb_str_sub(int argc, VALUE *argv, VALUE str)
6274 {
6275  str = str_duplicate(rb_cString, str);
6276  rb_str_sub_bang(argc, argv, str);
6277  return str;
6278 }
6279 
6280 static VALUE
6281 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6282 {
6283  VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6284  long beg, beg0, end0;
6285  long offset, blen, slen, len, last;
6286  enum {STR, ITER, MAP} mode = STR;
6287  char *sp, *cp;
6288  int need_backref = -1;
6289  rb_encoding *str_enc;
6290 
6291  switch (argc) {
6292  case 1:
6293  RETURN_ENUMERATOR(str, argc, argv);
6294  mode = ITER;
6295  break;
6296  case 2:
6297  repl = argv[1];
6298  hash = rb_check_hash_type(argv[1]);
6299  if (NIL_P(hash)) {
6300  StringValue(repl);
6301  }
6302  else {
6303  mode = MAP;
6304  }
6305  break;
6306  default:
6307  rb_error_arity(argc, 1, 2);
6308  }
6309 
6310  pat = get_pat_quoted(argv[0], 1);
6311  beg = rb_pat_search(pat, str, 0, need_backref);
6312  if (beg < 0) {
6313  if (bang) return Qnil; /* no match, no substitution */
6314  return str_duplicate(rb_cString, str);
6315  }
6316 
6317  offset = 0;
6318  blen = RSTRING_LEN(str) + 30; /* len + margin */
6319  dest = rb_str_buf_new(blen);
6320  sp = RSTRING_PTR(str);
6321  slen = RSTRING_LEN(str);
6322  cp = sp;
6323  str_enc = STR_ENC_GET(str);
6324  rb_enc_associate(dest, str_enc);
6326 
6327  do {
6328  VALUE match = rb_backref_get();
6329  struct re_registers *regs = RMATCH_REGS(match);
6330  if (RB_TYPE_P(pat, T_STRING)) {
6331  beg0 = beg;
6332  end0 = beg0 + RSTRING_LEN(pat);
6333  match0 = pat;
6334  }
6335  else {
6336  beg0 = BEG(0);
6337  end0 = END(0);
6338  if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6339  }
6340 
6341  if (mode) {
6342  if (mode == ITER) {
6343  val = rb_obj_as_string(rb_yield(match0));
6344  }
6345  else {
6346  val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6347  val = rb_obj_as_string(val);
6348  }
6349  str_mod_check(str, sp, slen);
6350  if (val == dest) { /* paranoid check [ruby-dev:24827] */
6351  rb_raise(rb_eRuntimeError, "block should not cheat");
6352  }
6353  }
6354  else if (need_backref) {
6355  val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6356  if (need_backref < 0) {
6357  need_backref = val != repl;
6358  }
6359  }
6360  else {
6361  val = repl;
6362  }
6363 
6364  len = beg0 - offset; /* copy pre-match substr */
6365  if (len) {
6366  rb_enc_str_buf_cat(dest, cp, len, str_enc);
6367  }
6368 
6369  rb_str_buf_append(dest, val);
6370 
6371  last = offset;
6372  offset = end0;
6373  if (beg0 == end0) {
6374  /*
6375  * Always consume at least one character of the input string
6376  * in order to prevent infinite loops.
6377  */
6378  if (RSTRING_LEN(str) <= end0) break;
6379  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6380  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6381  offset = end0 + len;
6382  }
6383  cp = RSTRING_PTR(str) + offset;
6384  if (offset > RSTRING_LEN(str)) break;
6385  beg = rb_pat_search(pat, str, offset, need_backref);
6386 
6387  RB_GC_GUARD(match);
6388  } while (beg >= 0);
6389  if (RSTRING_LEN(str) > offset) {
6390  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6391  }
6392  rb_pat_search(pat, str, last, 1);
6393  if (bang) {
6394  str_shared_replace(str, dest);
6395  }
6396  else {
6397  str = dest;
6398  }
6399 
6400  return str;
6401 }
6402 
6403 
6404 /*
6405  * call-seq:
6406  * gsub!(pattern, replacement) -> self or nil
6407  * gsub!(pattern) {|match| ... } -> self or nil
6408  * gsub!(pattern) -> an_enumerator
6409  *
6410  * Performs the specified substring replacement(s) on +self+;
6411  * returns +self+ if any replacement occurred, +nil+ otherwise.
6412  *
6413  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6414  *
6415  * Returns an Enumerator if no +replacement+ and no block given.
6416  *
6417  * Related: String#sub, String#gsub, String#sub!.
6418  *
6419  */
6420 
6421 static VALUE
6422 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6423 {
6424  str_modify_keep_cr(str);
6425  return str_gsub(argc, argv, str, 1);
6426 }
6427 
6428 
6429 /*
6430  * call-seq:
6431  * gsub(pattern, replacement) -> new_string
6432  * gsub(pattern) {|match| ... } -> new_string
6433  * gsub(pattern) -> enumerator
6434  *
6435  * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6436  *
6437  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6438  *
6439  * Returns an Enumerator if no +replacement+ and no block given.
6440  *
6441  * Related: String#sub, String#sub!, String#gsub!.
6442  *
6443  */
6444 
6445 static VALUE
6446 rb_str_gsub(int argc, VALUE *argv, VALUE str)
6447 {
6448  return str_gsub(argc, argv, str, 0);
6449 }
6450 
6451 
6452 /*
6453  * call-seq:
6454  * replace(other_string) -> self
6455  *
6456  * Replaces the contents of +self+ with the contents of +other_string+:
6457  *
6458  * s = 'foo' # => "foo"
6459  * s.replace('bar') # => "bar"
6460  *
6461  */
6462 
6463 VALUE
6465 {
6466  str_modifiable(str);
6467  if (str == str2) return str;
6468 
6469  StringValue(str2);
6470  str_discard(str);
6471  return str_replace(str, str2);
6472 }
6473 
6474 /*
6475  * call-seq:
6476  * clear -> self
6477  *
6478  * Removes the contents of +self+:
6479  *
6480  * s = 'foo' # => "foo"
6481  * s.clear # => ""
6482  *
6483  */
6484 
6485 static VALUE
6486 rb_str_clear(VALUE str)
6487 {
6488  str_discard(str);
6489  STR_SET_EMBED(str);
6490  STR_SET_LEN(str, 0);
6491  RSTRING_PTR(str)[0] = 0;
6492  if (rb_enc_asciicompat(STR_ENC_GET(str)))
6494  else
6496  return str;
6497 }
6498 
6499 /*
6500  * call-seq:
6501  * chr -> string
6502  *
6503  * Returns a string containing the first character of +self+:
6504  *
6505  * s = 'foo' # => "foo"
6506  * s.chr # => "f"
6507  *
6508  */
6509 
6510 static VALUE
6511 rb_str_chr(VALUE str)
6512 {
6513  return rb_str_substr(str, 0, 1);
6514 }
6515 
6516 /*
6517  * call-seq:
6518  * getbyte(index) -> integer or nil
6519  *
6520  * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6521  *
6522  * s = 'abcde' # => "abcde"
6523  * s.getbyte(0) # => 97
6524  * s.getbyte(-1) # => 101
6525  * s.getbyte(5) # => nil
6526  *
6527  * Related: String#setbyte.
6528  */
6529 VALUE
6530 rb_str_getbyte(VALUE str, VALUE index)
6531 {
6532  long pos = NUM2LONG(index);
6533 
6534  if (pos < 0)
6535  pos += RSTRING_LEN(str);
6536  if (pos < 0 || RSTRING_LEN(str) <= pos)
6537  return Qnil;
6538 
6539  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6540 }
6541 
6542 /*
6543  * call-seq:
6544  * setbyte(index, integer) -> integer
6545  *
6546  * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6547  *
6548  * s = 'abcde' # => "abcde"
6549  * s.setbyte(0, 98) # => 98
6550  * s # => "bbcde"
6551  *
6552  * Related: String#getbyte.
6553  */
6554 VALUE
6555 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6556 {
6557  long pos = NUM2LONG(index);
6558  long len = RSTRING_LEN(str);
6559  char *ptr, *head, *left = 0;
6560  rb_encoding *enc;
6561  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6562 
6563  if (pos < -len || len <= pos)
6564  rb_raise(rb_eIndexError, "index %ld out of string", pos);
6565  if (pos < 0)
6566  pos += len;
6567 
6568  VALUE v = rb_to_int(value);
6569  VALUE w = rb_int_and(v, INT2FIX(0xff));
6570  char byte = (char)(NUM2INT(w) & 0xFF);
6571 
6572  if (!str_independent(str))
6573  str_make_independent(str);
6574  enc = STR_ENC_GET(str);
6575  head = RSTRING_PTR(str);
6576  ptr = &head[pos];
6577  if (!STR_EMBED_P(str)) {
6578  cr = ENC_CODERANGE(str);
6579  switch (cr) {
6580  case ENC_CODERANGE_7BIT:
6581  left = ptr;
6582  *ptr = byte;
6583  if (ISASCII(byte)) goto end;
6584  nlen = rb_enc_precise_mbclen(left, head+len, enc);
6585  if (!MBCLEN_CHARFOUND_P(nlen))
6587  else
6589  goto end;
6590  case ENC_CODERANGE_VALID:
6591  left = rb_enc_left_char_head(head, ptr, head+len, enc);
6592  width = rb_enc_precise_mbclen(left, head+len, enc);
6593  *ptr = byte;
6594  nlen = rb_enc_precise_mbclen(left, head+len, enc);
6595  if (!MBCLEN_CHARFOUND_P(nlen))
6597  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6598  ENC_CODERANGE_CLEAR(str);
6599  goto end;
6600  }
6601  }
6602  ENC_CODERANGE_CLEAR(str);
6603  *ptr = byte;
6604 
6605  end:
6606  return value;
6607 }
6608 
6609 static VALUE
6610 str_byte_substr(VALUE str, long beg, long len, int empty)
6611 {
6612  long n = RSTRING_LEN(str);
6613 
6614  if (beg > n || len < 0) return Qnil;
6615  if (beg < 0) {
6616  beg += n;
6617  if (beg < 0) return Qnil;
6618  }
6619  if (len > n - beg)
6620  len = n - beg;
6621  if (len <= 0) {
6622  if (!empty) return Qnil;
6623  len = 0;
6624  }
6625 
6626  VALUE str2 = str_subseq(str, beg, len);
6627 
6628  str_enc_copy_direct(str2, str);
6629 
6630  if (RSTRING_LEN(str2) == 0) {
6631  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6633  else
6635  }
6636  else {
6637  switch (ENC_CODERANGE(str)) {
6638  case ENC_CODERANGE_7BIT:
6640  break;
6641  default:
6643  break;
6644  }
6645  }
6646 
6647  return str2;
6648 }
6649 
6650 VALUE
6651 rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6652 {
6653  return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6654 }
6655 
6656 static VALUE
6657 str_byte_aref(VALUE str, VALUE indx)
6658 {
6659  long idx;
6660  if (FIXNUM_P(indx)) {
6661  idx = FIX2LONG(indx);
6662  }
6663  else {
6664  /* check if indx is Range */
6665  long beg, len = RSTRING_LEN(str);
6666 
6667  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6668  case Qfalse:
6669  break;
6670  case Qnil:
6671  return Qnil;
6672  default:
6673  return str_byte_substr(str, beg, len, TRUE);
6674  }
6675 
6676  idx = NUM2LONG(indx);
6677  }
6678  return str_byte_substr(str, idx, 1, FALSE);
6679 }
6680 
6681 /*
6682  * call-seq:
6683  * byteslice(index, length = 1) -> string or nil
6684  * byteslice(range) -> string or nil
6685  *
6686  * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6687  *
6688  * With integer arguments +index+ and +length+ given,
6689  * returns the substring beginning at the given +index+
6690  * of the given +length+ (if possible),
6691  * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6692  *
6693  * s = '0123456789' # => "0123456789"
6694  * s.byteslice(2) # => "2"
6695  * s.byteslice(200) # => nil
6696  * s.byteslice(4, 3) # => "456"
6697  * s.byteslice(4, 30) # => "456789"
6698  * s.byteslice(4, -1) # => nil
6699  * s.byteslice(40, 2) # => nil
6700  *
6701  * In either case above, counts backwards from the end of +self+
6702  * if +index+ is negative:
6703  *
6704  * s = '0123456789' # => "0123456789"
6705  * s.byteslice(-4) # => "6"
6706  * s.byteslice(-4, 3) # => "678"
6707  *
6708  * With Range argument +range+ given, returns
6709  * <tt>byteslice(range.begin, range.size)</tt>:
6710  *
6711  * s = '0123456789' # => "0123456789"
6712  * s.byteslice(4..6) # => "456"
6713  * s.byteslice(-6..-4) # => "456"
6714  * s.byteslice(5..2) # => "" # range.size is zero.
6715  * s.byteslice(40..42) # => nil
6716  *
6717  * In all cases, a returned string has the same encoding as +self+:
6718  *
6719  * s.encoding # => #<Encoding:UTF-8>
6720  * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6721  *
6722  */
6723 
6724 static VALUE
6725 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6726 {
6727  if (argc == 2) {
6728  long beg = NUM2LONG(argv[0]);
6729  long len = NUM2LONG(argv[1]);
6730  return str_byte_substr(str, beg, len, TRUE);
6731  }
6732  rb_check_arity(argc, 1, 2);
6733  return str_byte_aref(str, argv[0]);
6734 }
6735 
6736 static void
6737 str_check_beg_len(VALUE str, long *beg, long *len)
6738 {
6739  long end, slen = RSTRING_LEN(str);
6740 
6741  if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6742  if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6743  rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6744  }
6745  if (*beg < 0) {
6746  *beg += slen;
6747  }
6748  RUBY_ASSERT(*beg >= 0);
6749  RUBY_ASSERT(*beg <= slen);
6750 
6751  if (*len > slen - *beg) {
6752  *len = slen - *beg;
6753  }
6754  end = *beg + *len;
6755  str_ensure_byte_pos(str, *beg);
6756  str_ensure_byte_pos(str, end);
6757 }
6758 
6759 /*
6760  * call-seq:
6761  * bytesplice(index, length, str) -> string
6762  * bytesplice(index, length, str, str_index, str_length) -> string
6763  * bytesplice(range, str) -> string
6764  * bytesplice(range, str, str_range) -> string
6765  *
6766  * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6767  * The portion of the string affected is determined using
6768  * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6769  * If the replacement string is not the same length as the text it is replacing,
6770  * the string will be adjusted accordingly.
6771  *
6772  * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6773  *
6774  * The forms that take an Integer will raise an IndexError if the value is out
6775  * of range; the Range forms will raise a RangeError.
6776  * If the beginning or ending offset does not land on a character (codepoint)
6777  * boundary, an IndexError will be raised.
6778  */
6779 
6780 static VALUE
6781 rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6782 {
6783  long beg, len, vbeg, vlen;
6784  VALUE val;
6785  int cr;
6786 
6787  rb_check_arity(argc, 2, 5);
6788  if (!(argc == 2 || argc == 3 || argc == 5)) {
6789  rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6790  }
6791  if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6792  if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6793  rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6794  rb_builtin_class_name(argv[0]));
6795  }
6796  val = argv[1];
6797  StringValue(val);
6798  if (argc == 2) {
6799  /* bytesplice(range, str) */
6800  vbeg = 0;
6801  vlen = RSTRING_LEN(val);
6802  }
6803  else {
6804  /* bytesplice(range, str, str_range) */
6805  if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6806  rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6807  rb_builtin_class_name(argv[2]));
6808  }
6809  }
6810  }
6811  else {
6812  beg = NUM2LONG(argv[0]);
6813  len = NUM2LONG(argv[1]);
6814  val = argv[2];
6815  StringValue(val);
6816  if (argc == 3) {
6817  /* bytesplice(index, length, str) */
6818  vbeg = 0;
6819  vlen = RSTRING_LEN(val);
6820  }
6821  else {
6822  /* bytesplice(index, length, str, str_index, str_length) */
6823  vbeg = NUM2LONG(argv[3]);
6824  vlen = NUM2LONG(argv[4]);
6825  }
6826  }
6827  str_check_beg_len(str, &beg, &len);
6828  str_check_beg_len(val, &vbeg, &vlen);
6829  str_modify_keep_cr(str);
6830 
6832  rb_enc_associate(str, rb_enc_check(str, val));
6833  }
6834 
6835  rb_str_update_1(str, beg, len, val, vbeg, vlen);
6837  if (cr != ENC_CODERANGE_BROKEN)
6838  ENC_CODERANGE_SET(str, cr);
6839  return str;
6840 }
6841 
6842 /*
6843  * call-seq:
6844  * reverse -> string
6845  *
6846  * Returns a new string with the characters from +self+ in reverse order.
6847  *
6848  * 'stressed'.reverse # => "desserts"
6849  *
6850  */
6851 
6852 static VALUE
6853 rb_str_reverse(VALUE str)
6854 {
6855  rb_encoding *enc;
6856  VALUE rev;
6857  char *s, *e, *p;
6858  int cr;
6859 
6860  if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6861  enc = STR_ENC_GET(str);
6862  rev = rb_str_new(0, RSTRING_LEN(str));
6863  s = RSTRING_PTR(str); e = RSTRING_END(str);
6864  p = RSTRING_END(rev);
6865  cr = ENC_CODERANGE(str);
6866 
6867  if (RSTRING_LEN(str) > 1) {
6868  if (single_byte_optimizable(str)) {
6869  while (s < e) {
6870  *--p = *s++;
6871  }
6872  }
6873  else if (cr == ENC_CODERANGE_VALID) {
6874  while (s < e) {
6875  int clen = rb_enc_fast_mbclen(s, e, enc);
6876 
6877  p -= clen;
6878  memcpy(p, s, clen);
6879  s += clen;
6880  }
6881  }
6882  else {
6883  cr = rb_enc_asciicompat(enc) ?
6885  while (s < e) {
6886  int clen = rb_enc_mbclen(s, e, enc);
6887 
6888  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6889  p -= clen;
6890  memcpy(p, s, clen);
6891  s += clen;
6892  }
6893  }
6894  }
6895  STR_SET_LEN(rev, RSTRING_LEN(str));
6896  str_enc_copy_direct(rev, str);
6897  ENC_CODERANGE_SET(rev, cr);
6898 
6899  return rev;
6900 }
6901 
6902 
6903 /*
6904  * call-seq:
6905  * reverse! -> self
6906  *
6907  * Returns +self+ with its characters reversed:
6908  *
6909  * s = 'stressed'
6910  * s.reverse! # => "desserts"
6911  * s # => "desserts"
6912  *
6913  */
6914 
6915 static VALUE
6916 rb_str_reverse_bang(VALUE str)
6917 {
6918  if (RSTRING_LEN(str) > 1) {
6919  if (single_byte_optimizable(str)) {
6920  char *s, *e, c;
6921 
6922  str_modify_keep_cr(str);
6923  s = RSTRING_PTR(str);
6924  e = RSTRING_END(str) - 1;
6925  while (s < e) {
6926  c = *s;
6927  *s++ = *e;
6928  *e-- = c;
6929  }
6930  }
6931  else {
6932  str_shared_replace(str, rb_str_reverse(str));
6933  }
6934  }
6935  else {
6936  str_modify_keep_cr(str);
6937  }
6938  return str;
6939 }
6940 
6941 
6942 /*
6943  * call-seq:
6944  * include?(other_string) -> true or false
6945  *
6946  * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6947  *
6948  * s = 'foo'
6949  * s.include?('f') # => true
6950  * s.include?('fo') # => true
6951  * s.include?('food') # => false
6952  *
6953  */
6954 
6955 VALUE
6956 rb_str_include(VALUE str, VALUE arg)
6957 {
6958  long i;
6959 
6960  StringValue(arg);
6961  i = rb_str_index(str, arg, 0);
6962 
6963  return RBOOL(i != -1);
6964 }
6965 
6966 
6967 /*
6968  * call-seq:
6969  * to_i(base = 10) -> integer
6970  *
6971  * Returns the result of interpreting leading characters in +self+
6972  * as an integer in the given +base+ (which must be in (0, 2..36)):
6973  *
6974  * '123456'.to_i # => 123456
6975  * '123def'.to_i(16) # => 1195503
6976  *
6977  * With +base+ zero, +self+ may contain leading characters
6978  * to specify the actual base:
6979  *
6980  * '123def'.to_i(0) # => 123
6981  * '0123def'.to_i(0) # => 83
6982  * '0b123def'.to_i(0) # => 1
6983  * '0o123def'.to_i(0) # => 83
6984  * '0d123def'.to_i(0) # => 123
6985  * '0x123def'.to_i(0) # => 1195503
6986  *
6987  * Characters past a leading valid number (in the given +base+) are ignored:
6988  *
6989  * '12.345'.to_i # => 12
6990  * '12345'.to_i(2) # => 1
6991  *
6992  * Returns zero if there is no leading valid number:
6993  *
6994  * 'abcdef'.to_i # => 0
6995  * '2'.to_i(2) # => 0
6996  *
6997  */
6998 
6999 static VALUE
7000 rb_str_to_i(int argc, VALUE *argv, VALUE str)
7001 {
7002  int base = 10;
7003 
7004  if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7005  rb_raise(rb_eArgError, "invalid radix %d", base);
7006  }
7007  return rb_str_to_inum(str, base, FALSE);
7008 }
7009 
7010 
7011 /*
7012  * call-seq:
7013  * to_f -> float
7014  *
7015  * Returns the result of interpreting leading characters in +self+ as a Float:
7016  *
7017  * '3.14159'.to_f # => 3.14159
7018  * '1.234e-2'.to_f # => 0.01234
7019  *
7020  * Characters past a leading valid number are ignored:
7021  *
7022  * '3.14 (pi to two places)'.to_f # => 3.14
7023  *
7024  * Returns zero if there is no leading valid number:
7025  *
7026  * 'abcdef'.to_f # => 0.0
7027  *
7028  */
7029 
7030 static VALUE
7031 rb_str_to_f(VALUE str)
7032 {
7033  return DBL2NUM(rb_str_to_dbl(str, FALSE));
7034 }
7035 
7036 
7037 /*
7038  * call-seq:
7039  * to_s -> self or string
7040  *
7041  * Returns +self+ if +self+ is a +String+,
7042  * or +self+ converted to a +String+ if +self+ is an instance of a subclass of +String+.
7043  */
7044 
7045 static VALUE
7046 rb_str_to_s(VALUE str)
7047 {
7048  if (rb_obj_class(str) != rb_cString) {
7049  return str_duplicate(rb_cString, str);
7050  }
7051  return str;
7052 }
7053 
#if 0
/* Compiled out: encodes code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7065 
7066 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7067 
7068 int
7069 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7070 {
7071  char buf[CHAR_ESC_LEN + 1];
7072  int l;
7073 
7074 #if SIZEOF_INT > 4
7075  c &= 0xffffffff;
7076 #endif
7077  if (unicode_p) {
7078  if (c < 0x7F && ISPRINT(c)) {
7079  snprintf(buf, CHAR_ESC_LEN, "%c", c);
7080  }
7081  else if (c < 0x10000) {
7082  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7083  }
7084  else {
7085  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7086  }
7087  }
7088  else {
7089  if (c < 0x100) {
7090  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7091  }
7092  else {
7093  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7094  }
7095  }
7096  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7097  rb_str_buf_cat(result, buf, l);
7098  return l;
7099 }
7100 
/*
 * Returns the backslash notation (as a static C string) for the control
 * character +c+, or NULL when +c+ has no dedicated short escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *notation;
    } short_escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(short_escapes) / sizeof(short_escapes[0]); i++) {
        if (short_escapes[i].code == c) {
            return short_escapes[i].notation;
        }
    }
    return NULL;
}
7118 
7119 VALUE
7120 rb_str_escape(VALUE str)
7121 {
7122  int encidx = ENCODING_GET(str);
7123  rb_encoding *enc = rb_enc_from_index(encidx);
7124  const char *p = RSTRING_PTR(str);
7125  const char *pend = RSTRING_END(str);
7126  const char *prev = p;
7127  char buf[CHAR_ESC_LEN + 1];
7128  VALUE result = rb_str_buf_new(0);
7129  int unicode_p = rb_enc_unicode_p(enc);
7130  int asciicompat = rb_enc_asciicompat(enc);
7131 
7132  while (p < pend) {
7133  unsigned int c;
7134  const char *cc;
7135  int n = rb_enc_precise_mbclen(p, pend, enc);
7136  if (!MBCLEN_CHARFOUND_P(n)) {
7137  if (p > prev) str_buf_cat(result, prev, p - prev);
7138  n = rb_enc_mbminlen(enc);
7139  if (pend < p + n)
7140  n = (int)(pend - p);
7141  while (n--) {
7142  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7143  str_buf_cat(result, buf, strlen(buf));
7144  prev = ++p;
7145  }
7146  continue;
7147  }
7148  n = MBCLEN_CHARFOUND_LEN(n);
7149  c = rb_enc_mbc_to_codepoint(p, pend, enc);
7150  p += n;
7151  cc = ruby_escaped_char(c);
7152  if (cc) {
7153  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7154  str_buf_cat(result, cc, strlen(cc));
7155  prev = p;
7156  }
7157  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7158  }
7159  else {
7160  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7161  rb_str_buf_cat_escaped_char(result, c, unicode_p);
7162  prev = p;
7163  }
7164  }
7165  if (p > prev) str_buf_cat(result, prev, p - prev);
7167 
7168  return result;
7169 }
7170 
7171 /*
7172  * call-seq:
7173  * inspect -> string
7174  *
7175  * Returns a printable version of +self+, enclosed in double-quotes,
7176  * and with special characters escaped:
7177  *
7178  * s = "foo\tbar\tbaz\n"
7179  * s.inspect
7180  * # => "\"foo\\tbar\\tbaz\\n\""
7181  *
7182  */
7183 
7184 VALUE
7186 {
7187  int encidx = ENCODING_GET(str);
7188  rb_encoding *enc = rb_enc_from_index(encidx);
7189  const char *p, *pend, *prev;
7190  char buf[CHAR_ESC_LEN + 1];
7191  VALUE result = rb_str_buf_new(0);
7193  int unicode_p = rb_enc_unicode_p(enc);
7194  int asciicompat = rb_enc_asciicompat(enc);
7195 
7196  if (resenc == NULL) resenc = rb_default_external_encoding();
7197  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7198  rb_enc_associate(result, resenc);
7199  str_buf_cat2(result, "\"");
7200 
7201  p = RSTRING_PTR(str); pend = RSTRING_END(str);
7202  prev = p;
7203  while (p < pend) {
7204  unsigned int c, cc;
7205  int n;
7206 
7207  n = rb_enc_precise_mbclen(p, pend, enc);
7208  if (!MBCLEN_CHARFOUND_P(n)) {
7209  if (p > prev) str_buf_cat(result, prev, p - prev);
7210  n = rb_enc_mbminlen(enc);
7211  if (pend < p + n)
7212  n = (int)(pend - p);
7213  while (n--) {
7214  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7215  str_buf_cat(result, buf, strlen(buf));
7216  prev = ++p;
7217  }
7218  continue;
7219  }
7220  n = MBCLEN_CHARFOUND_LEN(n);
7221  c = rb_enc_mbc_to_codepoint(p, pend, enc);
7222  p += n;
7223  if ((asciicompat || unicode_p) &&
7224  (c == '"'|| c == '\\' ||
7225  (c == '#' &&
7226  p < pend &&
7228  (cc = rb_enc_codepoint(p,pend,enc),
7229  (cc == '$' || cc == '@' || cc == '{'))))) {
7230  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7231  str_buf_cat2(result, "\\");
7232  if (asciicompat || enc == resenc) {
7233  prev = p - n;
7234  continue;
7235  }
7236  }
7237  switch (c) {
7238  case '\n': cc = 'n'; break;
7239  case '\r': cc = 'r'; break;
7240  case '\t': cc = 't'; break;
7241  case '\f': cc = 'f'; break;
7242  case '\013': cc = 'v'; break;
7243  case '\010': cc = 'b'; break;
7244  case '\007': cc = 'a'; break;
7245  case 033: cc = 'e'; break;
7246  default: cc = 0; break;
7247  }
7248  if (cc) {
7249  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7250  buf[0] = '\\';
7251  buf[1] = (char)cc;
7252  str_buf_cat(result, buf, 2);
7253  prev = p;
7254  continue;
7255  }
7256  /* The special casing of 0x85 (NEXT_LINE) here is because
7257  * Oniguruma historically treats it as printable, but it
7258  * doesn't match the print POSIX bracket class or character
7259  * property in regexps.
7260  *
7261  * See Ruby Bug #16842 for details:
7262  * https://bugs.ruby-lang.org/issues/16842
7263  */
7264  if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7265  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7266  continue;
7267  }
7268  else {
7269  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7270  rb_str_buf_cat_escaped_char(result, c, unicode_p);
7271  prev = p;
7272  continue;
7273  }
7274  }
7275  if (p > prev) str_buf_cat(result, prev, p - prev);
7276  str_buf_cat2(result, "\"");
7277 
7278  return result;
7279 }
7280 
/* True when p is still before e and *p is '$', '@' or '{'.
 * Both arguments are evaluated more than once. */
#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7282 
7283 /*
7284  * call-seq:
7285  * dump -> string
7286  *
7287  * Returns a printable version of +self+, enclosed in double-quotes,
7288  * with special characters escaped, and with non-printing characters
7289  * replaced by hexadecimal notation:
7290  *
7291  * "hello \n ''".dump # => "\"hello \\n ''\""
7292  * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7293  *
7294  * Related: String#undump (inverse of String#dump).
7295  *
7296  */
7297 
7298 VALUE
7300 {
7301  int encidx = rb_enc_get_index(str);
7302  rb_encoding *enc = rb_enc_from_index(encidx);
7303  long len;
7304  const char *p, *pend;
7305  char *q, *qend;
7306  VALUE result;
7307  int u8 = (encidx == rb_utf8_encindex());
7308  static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7309 
7310  len = 2; /* "" */
7311  if (!rb_enc_asciicompat(enc)) {
7312  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7313  len += strlen(enc->name);
7314  }
7315 
7316  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7317  while (p < pend) {
7318  int clen;
7319  unsigned char c = *p++;
7320 
7321  switch (c) {
7322  case '"': case '\\':
7323  case '\n': case '\r':
7324  case '\t': case '\f':
7325  case '\013': case '\010': case '\007': case '\033':
7326  clen = 2;
7327  break;
7328 
7329  case '#':
7330  clen = IS_EVSTR(p, pend) ? 2 : 1;
7331  break;
7332 
7333  default:
7334  if (ISPRINT(c)) {
7335  clen = 1;
7336  }
7337  else {
7338  if (u8 && c > 0x7F) { /* \u notation */
7339  int n = rb_enc_precise_mbclen(p-1, pend, enc);
7340  if (MBCLEN_CHARFOUND_P(n)) {
7341  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7342  if (cc <= 0xFFFF)
7343  clen = 6; /* \uXXXX */
7344  else if (cc <= 0xFFFFF)
7345  clen = 9; /* \u{XXXXX} */
7346  else
7347  clen = 10; /* \u{XXXXXX} */
7348  p += MBCLEN_CHARFOUND_LEN(n)-1;
7349  break;
7350  }
7351  }
7352  clen = 4; /* \xNN */
7353  }
7354  break;
7355  }
7356 
7357  if (clen > LONG_MAX - len) {
7358  rb_raise(rb_eRuntimeError, "string size too big");
7359  }
7360  len += clen;
7361  }
7362 
7363  result = rb_str_new(0, len);
7364  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7365  q = RSTRING_PTR(result); qend = q + len + 1;
7366 
7367  *q++ = '"';
7368  while (p < pend) {
7369  unsigned char c = *p++;
7370 
7371  if (c == '"' || c == '\\') {
7372  *q++ = '\\';
7373  *q++ = c;
7374  }
7375  else if (c == '#') {
7376  if (IS_EVSTR(p, pend)) *q++ = '\\';
7377  *q++ = '#';
7378  }
7379  else if (c == '\n') {
7380  *q++ = '\\';
7381  *q++ = 'n';
7382  }
7383  else if (c == '\r') {
7384  *q++ = '\\';
7385  *q++ = 'r';
7386  }
7387  else if (c == '\t') {
7388  *q++ = '\\';
7389  *q++ = 't';
7390  }
7391  else if (c == '\f') {
7392  *q++ = '\\';
7393  *q++ = 'f';
7394  }
7395  else if (c == '\013') {
7396  *q++ = '\\';
7397  *q++ = 'v';
7398  }
7399  else if (c == '\010') {
7400  *q++ = '\\';
7401  *q++ = 'b';
7402  }
7403  else if (c == '\007') {
7404  *q++ = '\\';
7405  *q++ = 'a';
7406  }
7407  else if (c == '\033') {
7408  *q++ = '\\';
7409  *q++ = 'e';
7410  }
7411  else if (ISPRINT(c)) {
7412  *q++ = c;
7413  }
7414  else {
7415  *q++ = '\\';
7416  if (u8) {
7417  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7418  if (MBCLEN_CHARFOUND_P(n)) {
7419  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7420  p += n;
7421  if (cc <= 0xFFFF)
7422  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7423  else
7424  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7425  q += strlen(q);
7426  continue;
7427  }
7428  }
7429  snprintf(q, qend-q, "x%02X", c);
7430  q += 3;
7431  }
7432  }
7433  *q++ = '"';
7434  *q = '\0';
7435  if (!rb_enc_asciicompat(enc)) {
7436  snprintf(q, qend-q, nonascii_suffix, enc->name);
7437  encidx = rb_ascii8bit_encindex();
7438  }
7439  /* result from dump is ASCII */
7440  rb_enc_associate_index(result, encidx);
7442  return result;
7443 }
7444 
/*
 * Maps the letter of a single-character backslash escape (n, r, t, f, v,
 * b, a, e) to the control character it denotes. Any other input reaches
 * UNREACHABLE_RETURN(-1).
 */
static int
unescape_ascii(unsigned int c)
{
    static const struct {
        unsigned int letter;
        int control;
    } escapes[] = {
        {'n', '\n'},
        {'r', '\r'},
        {'t', '\t'},
        {'f', '\f'},
        {'v', '\13'},
        {'b', '\010'},
        {'a', '\007'},
        {'e', 033},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].letter == c) {
            return escapes[i].control;
        }
    }
    UNREACHABLE_RETURN(-1);
}
7468 
7469 static void
7470 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7471 {
7472  const char *s = *ss;
7473  unsigned int c;
7474  int codelen;
7475  size_t hexlen;
7476  unsigned char buf[6];
7477  static rb_encoding *enc_utf8 = NULL;
7478 
7479  switch (*s) {
7480  case '\\':
7481  case '"':
7482  case '#':
7483  rb_str_cat(undumped, s, 1); /* cat itself */
7484  s++;
7485  break;
7486  case 'n':
7487  case 'r':
7488  case 't':
7489  case 'f':
7490  case 'v':
7491  case 'b':
7492  case 'a':
7493  case 'e':
7494  *buf = unescape_ascii(*s);
7495  rb_str_cat(undumped, (char *)buf, 1);
7496  s++;
7497  break;
7498  case 'u':
7499  if (*binary) {
7500  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7501  }
7502  *utf8 = true;
7503  if (++s >= s_end) {
7504  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7505  }
7506  if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7507  if (*penc != enc_utf8) {
7508  *penc = enc_utf8;
7509  rb_enc_associate(undumped, enc_utf8);
7510  }
7511  if (*s == '{') { /* handle \u{...} form */
7512  s++;
7513  for (;;) {
7514  if (s >= s_end) {
7515  rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7516  }
7517  if (*s == '}') {
7518  s++;
7519  break;
7520  }
7521  if (ISSPACE(*s)) {
7522  s++;
7523  continue;
7524  }
7525  c = scan_hex(s, s_end-s, &hexlen);
7526  if (hexlen == 0 || hexlen > 6) {
7527  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7528  }
7529  if (c > 0x10ffff) {
7530  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7531  }
7532  if (0xd800 <= c && c <= 0xdfff) {
7533  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7534  }
7535  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7536  rb_str_cat(undumped, (char *)buf, codelen);
7537  s += hexlen;
7538  }
7539  }
7540  else { /* handle \uXXXX form */
7541  c = scan_hex(s, 4, &hexlen);
7542  if (hexlen != 4) {
7543  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7544  }
7545  if (0xd800 <= c && c <= 0xdfff) {
7546  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7547  }
7548  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7549  rb_str_cat(undumped, (char *)buf, codelen);
7550  s += hexlen;
7551  }
7552  break;
7553  case 'x':
7554  if (*utf8) {
7555  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7556  }
7557  *binary = true;
7558  if (++s >= s_end) {
7559  rb_raise(rb_eRuntimeError, "invalid hex escape");
7560  }
7561  *buf = scan_hex(s, 2, &hexlen);
7562  if (hexlen != 2) {
7563  rb_raise(rb_eRuntimeError, "invalid hex escape");
7564  }
7565  rb_str_cat(undumped, (char *)buf, 1);
7566  s += hexlen;
7567  break;
7568  default:
7569  rb_str_cat(undumped, s-1, 2);
7570  s++;
7571  }
7572 
7573  *ss = s;
7574 }
7575 
7576 static VALUE rb_str_is_ascii_only_p(VALUE str);
7577 
7578 /*
7579  * call-seq:
7580  * undump -> string
7581  *
7582  * Returns an unescaped version of +self+:
7583  *
7584  * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7585  * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7586  * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7587  * s_undumped == s_orig # => true
7588  *
7589  * Related: String#dump (inverse of String#undump).
7590  *
7591  */
7592 
7593 static VALUE
7594 str_undump(VALUE str)
7595 {
7596  const char *s = RSTRING_PTR(str);
7597  const char *s_end = RSTRING_END(str);
7598  rb_encoding *enc = rb_enc_get(str);
7599  VALUE undumped = rb_enc_str_new(s, 0L, enc);
7600  bool utf8 = false;
7601  bool binary = false;
7602  int w;
7603 
7604  rb_must_asciicompat(str);
7605  if (rb_str_is_ascii_only_p(str) == Qfalse) {
7606  rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7607  }
7608  if (!str_null_check(str, &w)) {
7609  rb_raise(rb_eRuntimeError, "string contains null byte");
7610  }
7611  if (RSTRING_LEN(str) < 2) goto invalid_format;
7612  if (*s != '"') goto invalid_format;
7613 
7614  /* strip '"' at the start */
7615  s++;
7616 
7617  for (;;) {
7618  if (s >= s_end) {
7619  rb_raise(rb_eRuntimeError, "unterminated dumped string");
7620  }
7621 
7622  if (*s == '"') {
7623  /* epilogue */
7624  s++;
7625  if (s == s_end) {
7626  /* ascii compatible dumped string */
7627  break;
7628  }
7629  else {
7630  static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7631  static const char dup_suffix[] = ".dup";
7632  const char *encname;
7633  int encidx;
7634  ptrdiff_t size;
7635 
7636  /* check separately for strings dumped by older versions */
7637  size = sizeof(dup_suffix) - 1;
7638  if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7639 
7640  size = sizeof(force_encoding_suffix) - 1;
7641  if (s_end - s <= size) goto invalid_format;
7642  if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7643  s += size;
7644 
7645  if (utf8) {
7646  rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7647  }
7648 
7649  encname = s;
7650  s = memchr(s, '"', s_end-s);
7651  size = s - encname;
7652  if (!s) goto invalid_format;
7653  if (s_end - s != 2) goto invalid_format;
7654  if (s[0] != '"' || s[1] != ')') goto invalid_format;
7655 
7656  encidx = rb_enc_find_index2(encname, (long)size);
7657  if (encidx < 0) {
7658  rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7659  }
7660  rb_enc_associate_index(undumped, encidx);
7661  }
7662  break;
7663  }
7664 
7665  if (*s == '\\') {
7666  s++;
7667  if (s >= s_end) {
7668  rb_raise(rb_eRuntimeError, "invalid escape");
7669  }
7670  undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7671  }
7672  else {
7673  rb_str_cat(undumped, s++, 1);
7674  }
7675  }
7676 
7677  RB_GC_GUARD(str);
7678 
7679  return undumped;
7680 invalid_format:
7681  rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7682 }
7683 
7684 static void
7685 rb_str_check_dummy_enc(rb_encoding *enc)
7686 {
7687  if (rb_enc_dummy_p(enc)) {
7688  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7689  rb_enc_name(enc));
7690  }
7691 }
7692 
7693 static rb_encoding *
7694 str_true_enc(VALUE str)
7695 {
7696  rb_encoding *enc = STR_ENC_GET(str);
7697  rb_str_check_dummy_enc(enc);
7698  return enc;
7699 }
7700 
7701 static OnigCaseFoldType
7702 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7703 {
7704  if (argc==0)
7705  return flags;
7706  if (argc>2)
7707  rb_raise(rb_eArgError, "too many options");
7708  if (argv[0]==sym_turkic) {
7709  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7710  if (argc==2) {
7711  if (argv[1]==sym_lithuanian)
7712  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7713  else
7714  rb_raise(rb_eArgError, "invalid second option");
7715  }
7716  }
7717  else if (argv[0]==sym_lithuanian) {
7718  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7719  if (argc==2) {
7720  if (argv[1]==sym_turkic)
7721  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7722  else
7723  rb_raise(rb_eArgError, "invalid second option");
7724  }
7725  }
7726  else if (argc>1)
7727  rb_raise(rb_eArgError, "too many options");
7728  else if (argv[0]==sym_ascii)
7729  flags |= ONIGENC_CASE_ASCII_ONLY;
7730  else if (argv[0]==sym_fold) {
7731  if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7732  flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7733  else
7734  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7735  }
7736  else
7737  rb_raise(rb_eArgError, "invalid option");
7738  return flags;
7739 }
7740 
7741 static inline bool
7742 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7743 {
7744  if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7745  return true;
7746  return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7747 }
7748 
/* 16 should be long enough to absorb any kind of single character length increase */
/* NOTE(review): the comment above says 16 but the value defined below is 20
 * -- confirm which one is intended.
 * The value is added to the capacity of every buffer allocated by
 * rb_str_casemap. */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* When non-zero, rb_str_casemap and rb_str_ascii_casemap print diagnostic
 * messages to stderr. May be predefined by the build; defaults to 0. */
#ifndef CASEMAP_DEBUG
# define CASEMAP_DEBUG 0
#endif
7754 
/* One node in the singly linked chain of output buffers that
 * rb_str_casemap fills while case-mapping a string. */
struct mapping_buffer;
typedef struct mapping_buffer {
    size_t capa;                    /* number of bytes available in space[] */
    size_t used;                    /* number of bytes of space[] written by the case mapper */
    struct mapping_buffer *next;    /* following node; NULL for the last one */
    OnigUChar space[FLEX_ARY_LEN];  /* mapped output; allocated with room for capa bytes */
} mapping_buffer;
7762 
7763 static void
7764 mapping_buffer_free(void *p)
7765 {
7766  mapping_buffer *previous_buffer;
7767  mapping_buffer *current_buffer = p;
7768  while (current_buffer) {
7769  previous_buffer = current_buffer;
7770  current_buffer = current_buffer->next;
7771  ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7772  }
7773 }
7774 
/* Typed-data descriptor for the object that anchors a mapping_buffer chain
 * in rb_str_casemap. */
static const rb_data_type_t mapping_buffer_type = {
    "mapping_buffer",               /* type name */
    {0, mapping_buffer_free,},      /* first slot empty; second slot frees the chain (presumably the mark/free pair of rb_data_type_t -- confirm) */
    0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
};
7780 
/*
 * Case-maps +source+ with the case_map function of +enc+ and returns the
 * mapped text as a new string carrying the encoding of +source+.
 *
 * +flags+ is passed to the mapper by pointer. An empty +source+ yields a
 * duplicate of it. Output is gathered in a chain of mapping_buffer nodes
 * hung off a typed-data object (buffer_anchor). A negative result from
 * the mapper frees the chain and raises ArgumentError.
 */
static VALUE
rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
{
    VALUE target;

    const OnigUChar *source_current, *source_end;
    int target_length = 0;              /* total bytes produced across all buffers */
    VALUE buffer_anchor;                /* typed-data object owning the buffer chain */
    mapping_buffer *current_buffer = 0;
    mapping_buffer **pre_buffer;        /* where to store the pointer to the next new buffer */
    size_t buffer_count = 0;
    int buffer_length_or_invalid;

    if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);

    source_current = (OnigUChar*)RSTRING_PTR(source);
    source_end = (OnigUChar*)RSTRING_END(source);

    buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
    /* the first buffer is stored directly in the anchor's data pointer */
    pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
    while (source_current < source_end) {
        /* increase multiplier using buffer count to converge quickly */
        size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
        if (CASEMAP_DEBUG) {
            fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
        }
        current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
        /* link the new buffer into the chain before it is used */
        *pre_buffer = current_buffer;
        pre_buffer = &current_buffer->next;
        current_buffer->next = NULL;
        current_buffer->capa = capa;
        /* source_current is passed by address, so the mapper can advance it */
        buffer_length_or_invalid = enc->case_map(flags,
                                                 &source_current, source_end,
                                                 current_buffer->space,
                                                 current_buffer->space+current_buffer->capa,
                                                 enc);
        if (buffer_length_or_invalid < 0) {
            /* detach the chain from the anchor and free it before raising */
            current_buffer = DATA_PTR(buffer_anchor);
            DATA_PTR(buffer_anchor) = 0;
            mapping_buffer_free(current_buffer);
            rb_raise(rb_eArgError, "input string invalid");
        }
        target_length += current_buffer->used = buffer_length_or_invalid;
    }
    if (CASEMAP_DEBUG) {
        fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
    }

    if (buffer_count==1) {
        /* single buffer: build the result directly from it */
        target = rb_str_new((const char*)current_buffer->space, target_length);
    }
    else {
        char *target_current;

        /* several buffers: concatenate their used parts in chain order */
        target = rb_str_new(0, target_length);
        target_current = RSTRING_PTR(target);
        current_buffer = DATA_PTR(buffer_anchor);
        while (current_buffer) {
            memcpy(target_current, current_buffer->space, current_buffer->used);
            target_current += current_buffer->used;
            current_buffer = current_buffer->next;
        }
    }
    /* detach the chain from the anchor and free it right away */
    current_buffer = DATA_PTR(buffer_anchor);
    DATA_PTR(buffer_anchor) = 0;
    mapping_buffer_free(current_buffer);

    RB_GC_GUARD(buffer_anchor);

    /* TODO: check about string terminator character */
    str_enc_copy_direct(target, source);
    /*ENC_CODERANGE_SET(mapped, cr);*/

    return target;
}
7856 
7857 static VALUE
7858 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7859 {
7860  const OnigUChar *source_current, *source_end;
7861  OnigUChar *target_current, *target_end;
7862  long old_length = RSTRING_LEN(source);
7863  int length_or_invalid;
7864 
7865  if (old_length == 0) return Qnil;
7866 
7867  source_current = (OnigUChar*)RSTRING_PTR(source);
7868  source_end = (OnigUChar*)RSTRING_END(source);
7869  if (source == target) {
7870  target_current = (OnigUChar*)source_current;
7871  target_end = (OnigUChar*)source_end;
7872  }
7873  else {
7874  target_current = (OnigUChar*)RSTRING_PTR(target);
7875  target_end = (OnigUChar*)RSTRING_END(target);
7876  }
7877 
7878  length_or_invalid = onigenc_ascii_only_case_map(flags,
7879  &source_current, source_end,
7880  target_current, target_end, enc);
7881  if (length_or_invalid < 0)
7882  rb_raise(rb_eArgError, "input string invalid");
7883  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7884  fprintf(stderr, "problem with rb_str_ascii_casemap"
7885  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7886  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7887  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7888  }
7889 
7890  str_enc_copy(target, source);
7891 
7892  return target;
7893 }
7894 
7895 static bool
7896 upcase_single(VALUE str)
7897 {
7898  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7899  bool modified = false;
7900 
7901  while (s < send) {
7902  unsigned int c = *(unsigned char*)s;
7903 
7904  if ('a' <= c && c <= 'z') {
7905  *s = 'A' + (c - 'a');
7906  modified = true;
7907  }
7908  s++;
7909  }
7910  return modified;
7911 }
7912 
7913 /*
7914  * call-seq:
7915  * upcase!(*options) -> self or nil
7916  *
7917  * Upcases the characters in +self+;
7918  * returns +self+ if any changes were made, +nil+ otherwise:
7919  *
7920  * s = 'Hello World!' # => "Hello World!"
7921  * s.upcase! # => "HELLO WORLD!"
7922  * s # => "HELLO WORLD!"
7923  * s.upcase! # => nil
7924  *
7925  * The casing may be affected by the given +options+;
7926  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7927  *
7928  * Related: String#upcase, String#downcase, String#downcase!.
7929  *
7930  */
7931 
7932 static VALUE
7933 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7934 {
7935  rb_encoding *enc;
7936  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7937 
7938  flags = check_case_options(argc, argv, flags);
7939  str_modify_keep_cr(str);
7940  enc = str_true_enc(str);
7941  if (case_option_single_p(flags, enc, str)) {
7942  if (upcase_single(str))
7943  flags |= ONIGENC_CASE_MODIFIED;
7944  }
7945  else if (flags&ONIGENC_CASE_ASCII_ONLY)
7946  rb_str_ascii_casemap(str, str, &flags, enc);
7947  else
7948  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7949 
7950  if (ONIGENC_CASE_MODIFIED&flags) return str;
7951  return Qnil;
7952 }
7953 
7954 
7955 /*
7956  * call-seq:
7957  * upcase(*options) -> string
7958  *
7959  * Returns a string containing the upcased characters in +self+:
7960  *
7961  * s = 'Hello World!' # => "Hello World!"
7962  * s.upcase # => "HELLO WORLD!"
7963  *
7964  * The casing may be affected by the given +options+;
7965  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7966  *
7967  * Related: String#upcase!, String#downcase, String#downcase!.
7968  *
7969  */
7970 
7971 static VALUE
7972 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7973 {
7974  rb_encoding *enc;
7975  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7976  VALUE ret;
7977 
7978  flags = check_case_options(argc, argv, flags);
7979  enc = str_true_enc(str);
7980  if (case_option_single_p(flags, enc, str)) {
7981  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7982  str_enc_copy_direct(ret, str);
7983  upcase_single(ret);
7984  }
7985  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7986  ret = rb_str_new(0, RSTRING_LEN(str));
7987  rb_str_ascii_casemap(str, ret, &flags, enc);
7988  }
7989  else {
7990  ret = rb_str_casemap(str, &flags, enc);
7991  }
7992 
7993  return ret;
7994 }
7995 
7996 static bool
7997 downcase_single(VALUE str)
7998 {
7999  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8000  bool modified = false;
8001 
8002  while (s < send) {
8003  unsigned int c = *(unsigned char*)s;
8004 
8005  if ('A' <= c && c <= 'Z') {
8006  *s = 'a' + (c - 'A');
8007  modified = true;
8008  }
8009  s++;
8010  }
8011 
8012  return modified;
8013 }
8014 
8015 /*
8016  * call-seq:
8017  * downcase!(*options) -> self or nil
8018  *
8019  * Downcases the characters in +self+;
8020  * returns +self+ if any changes were made, +nil+ otherwise:
8021  *
8022  * s = 'Hello World!' # => "Hello World!"
8023  * s.downcase! # => "hello world!"
8024  * s # => "hello world!"
8025  * s.downcase! # => nil
8026  *
8027  * The casing may be affected by the given +options+;
8028  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8029  *
8030  * Related: String#downcase, String#upcase, String#upcase!.
8031  *
8032  */
8033 
8034 static VALUE
8035 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8036 {
8037  rb_encoding *enc;
8038  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8039 
8040  flags = check_case_options(argc, argv, flags);
8041  str_modify_keep_cr(str);
8042  enc = str_true_enc(str);
8043  if (case_option_single_p(flags, enc, str)) {
8044  if (downcase_single(str))
8045  flags |= ONIGENC_CASE_MODIFIED;
8046  }
8047  else if (flags&ONIGENC_CASE_ASCII_ONLY)
8048  rb_str_ascii_casemap(str, str, &flags, enc);
8049  else
8050  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8051 
8052  if (ONIGENC_CASE_MODIFIED&flags) return str;
8053  return Qnil;
8054 }
8055 
8056 
8057 /*
8058  * call-seq:
8059  * downcase(*options) -> string
8060  *
8061  * Returns a string containing the downcased characters in +self+:
8062  *
8063  * s = 'Hello World!' # => "Hello World!"
8064  * s.downcase # => "hello world!"
8065  *
8066  * The casing may be affected by the given +options+;
8067  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8068  *
8069  * Related: String#downcase!, String#upcase, String#upcase!.
8070  *
8071  */
8072 
8073 static VALUE
8074 rb_str_downcase(int argc, VALUE *argv, VALUE str)
8075 {
8076  rb_encoding *enc;
8077  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8078  VALUE ret;
8079 
8080  flags = check_case_options(argc, argv, flags);
8081  enc = str_true_enc(str);
8082  if (case_option_single_p(flags, enc, str)) {
8083  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8084  str_enc_copy_direct(ret, str);
8085  downcase_single(ret);
8086  }
8087  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8088  ret = rb_str_new(0, RSTRING_LEN(str));
8089  rb_str_ascii_casemap(str, ret, &flags, enc);
8090  }
8091  else {
8092  ret = rb_str_casemap(str, &flags, enc);
8093  }
8094 
8095  return ret;
8096 }
8097 
8098 
8099 /*
8100  * call-seq:
8101  * capitalize!(*options) -> self or nil
8102  *
8103  * Upcases the first character in +self+;
8104  * downcases the remaining characters;
8105  * returns +self+ if any changes were made, +nil+ otherwise:
8106  *
8107  * s = 'hello World!' # => "hello World!"
8108  * s.capitalize! # => "Hello world!"
8109  * s # => "Hello world!"
8110  * s.capitalize! # => nil
8111  *
8112  * The casing may be affected by the given +options+;
8113  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8114  *
8115  * Related: String#capitalize.
8116  *
8117  */
8118 
8119 static VALUE
8120 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8121 {
8122  rb_encoding *enc;
8123  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8124 
8125  flags = check_case_options(argc, argv, flags);
8126  str_modify_keep_cr(str);
8127  enc = str_true_enc(str);
8128  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8129  if (flags&ONIGENC_CASE_ASCII_ONLY)
8130  rb_str_ascii_casemap(str, str, &flags, enc);
8131  else
8132  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8133 
8134  if (ONIGENC_CASE_MODIFIED&flags) return str;
8135  return Qnil;
8136 }
8137 
8138 
8139 /*
8140  * call-seq:
8141  * capitalize(*options) -> string
8142  *
8143  * Returns a string containing the characters in +self+;
8144  * the first character is upcased;
8145  * the remaining characters are downcased:
8146  *
8147  * s = 'hello World!' # => "hello World!"
8148  * s.capitalize # => "Hello world!"
8149  *
8150  * The casing may be affected by the given +options+;
8151  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8152  *
8153  * Related: String#capitalize!.
8154  *
8155  */
8156 
8157 static VALUE
8158 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8159 {
8160  rb_encoding *enc;
8161  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8162  VALUE ret;
8163 
8164  flags = check_case_options(argc, argv, flags);
8165  enc = str_true_enc(str);
8166  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8167  if (flags&ONIGENC_CASE_ASCII_ONLY) {
8168  ret = rb_str_new(0, RSTRING_LEN(str));
8169  rb_str_ascii_casemap(str, ret, &flags, enc);
8170  }
8171  else {
8172  ret = rb_str_casemap(str, &flags, enc);
8173  }
8174  return ret;
8175 }
8176 
8177 
8178 /*
8179  * call-seq:
8180  * swapcase!(*options) -> self or nil
8181  *
8182  * Upcases each lowercase character in +self+;
8183  * downcases uppercase character;
8184  * returns +self+ if any changes were made, +nil+ otherwise:
8185  *
8186  * s = 'Hello World!' # => "Hello World!"
8187  * s.swapcase! # => "hELLO wORLD!"
8188  * s # => "hELLO wORLD!"
8189  * ''.swapcase! # => nil
8190  *
8191  * The casing may be affected by the given +options+;
8192  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8193  *
8194  * Related: String#swapcase.
8195  *
8196  */
8197 
8198 static VALUE
8199 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8200 {
8201  rb_encoding *enc;
8202  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8203 
8204  flags = check_case_options(argc, argv, flags);
8205  str_modify_keep_cr(str);
8206  enc = str_true_enc(str);
8207  if (flags&ONIGENC_CASE_ASCII_ONLY)
8208  rb_str_ascii_casemap(str, str, &flags, enc);
8209  else
8210  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8211 
8212  if (ONIGENC_CASE_MODIFIED&flags) return str;
8213  return Qnil;
8214 }
8215 
8216 
8217 /*
8218  * call-seq:
8219  * swapcase(*options) -> string
8220  *
8221  * Returns a string containing the characters in +self+, with cases reversed;
8222  * each uppercase character is downcased;
8223  * each lowercase character is upcased:
8224  *
8225  * s = 'Hello World!' # => "Hello World!"
8226  * s.swapcase # => "hELLO wORLD!"
8227  *
8228  * The casing may be affected by the given +options+;
8229  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8230  *
8231  * Related: String#swapcase!.
8232  *
8233  */
8234 
8235 static VALUE
8236 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8237 {
8238  rb_encoding *enc;
8239  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8240  VALUE ret;
8241 
8242  flags = check_case_options(argc, argv, flags);
8243  enc = str_true_enc(str);
8244  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8245  if (flags&ONIGENC_CASE_ASCII_ONLY) {
8246  ret = rb_str_new(0, RSTRING_LEN(str));
8247  rb_str_ascii_casemap(str, ret, &flags, enc);
8248  }
8249  else {
8250  ret = rb_str_casemap(str, &flags, enc);
8251  }
8252  return ret;
8253 }
8254 
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character selector, advanced by trnext().
 */
struct tr {
    int gen;                    /* non-zero while trnext() is expanding a range */
    unsigned int now, max;      /* current code point; last code point of the range being expanded */
    char *p, *pend;             /* read position in the selector and its end */
};
8262 
8263 static unsigned int
8264 trnext(struct tr *t, rb_encoding *enc)
8265 {
8266  int n;
8267 
8268  for (;;) {
8269  nextpart:
8270  if (!t->gen) {
8271  if (t->p == t->pend) return -1;
8272  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8273  t->p += n;
8274  }
8275  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8276  t->p += n;
8277  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8278  t->p += n;
8279  if (t->p < t->pend) {
8280  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8281  t->p += n;
8282  if (t->now > c) {
8283  if (t->now < 0x80 && c < 0x80) {
8285  "invalid range \"%c-%c\" in string transliteration",
8286  t->now, c);
8287  }
8288  else {
8289  rb_raise(rb_eArgError, "invalid range in string transliteration");
8290  }
8291  continue; /* not reached */
8292  }
8293  else if (t->now < c) {
8294  t->gen = 1;
8295  t->max = c;
8296  }
8297  }
8298  }
8299  return t->now;
8300  }
8301  else {
8302  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8303  if (t->now == t->max) {
8304  t->gen = 0;
8305  goto nextpart;
8306  }
8307  }
8308  if (t->now < t->max) {
8309  return t->now;
8310  }
8311  else {
8312  t->gen = 0;
8313  return t->max;
8314  }
8315  }
8316  }
8317 }
8318 
8319 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8320 
/*
 * Shared in-place implementation behind rb_str_tr_bang, rb_str_tr,
 * rb_str_tr_s_bang and rb_str_tr_s.
 *
 * str   - string to modify in place
 * src   - selector of characters to translate (a leading '^' followed by
 *         at least one more character negates it)
 * repl  - replacement characters; when empty the call is forwarded to
 *         rb_str_delete_bang
 * sflag - non-zero: consecutive characters translated to the same
 *         replacement are emitted only once (tr_s behaviour)
 *
 * Returns +str+ when something was modified, Qnil otherwise.  Raises
 * ArgumentError on an invalid byte sequence in +str+ (multibyte paths).
 */
static VALUE
tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
{
    const unsigned int errc = -1;   /* sentinel: "no translation" / end of selector */
    unsigned int trans[256];        /* translation for code points below 256 */
    rb_encoding *enc, *e1, *e2;
    struct tr trsrc, trrepl;
    int cflag = 0;                  /* set when src is negated with '^' */
    unsigned int c, c0, last = 0;
    int modify = 0, i, l;
    unsigned char *s, *send;
    VALUE hash = 0;                 /* translations for code points >= 256, created lazily */
    int singlebyte = single_byte_optimizable(str);
    int termlen;
    int cr;

/* Downgrade the tracked code range from 7BIT to VALID once a non-ASCII
 * code point is kept or written. */
#define CHECK_IF_ASCII(c) \
    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
           (cr = ENC_CODERANGE_VALID) : 0)

    StringValue(src);
    StringValue(repl);
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    if (RSTRING_LEN(repl) == 0) {
        return rb_str_delete_bang(1, &src, str);
    }

    cr = ENC_CODERANGE(str);
    e1 = rb_enc_check(str, src);
    e2 = rb_enc_check(str, repl);
    if (e1 == e2) {
        enc = e1;
    }
    else {
        enc = rb_enc_check(src, repl);
    }
    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    /* A lone "^" is a literal caret; negation needs something after it. */
    if (RSTRING_LEN(src) > 1 &&
        rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
        trsrc.p + l < trsrc.pend) {
        cflag = 1;
        trsrc.p += l;
    }
    trrepl.p = RSTRING_PTR(repl);
    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
    trsrc.gen = trrepl.gen = 0;
    trsrc.now = trrepl.now = 0;
    trsrc.max = trrepl.max = 0;

    if (cflag) {
        /* Negated selector: start with every slot marked as "to translate"
         * (placeholder 1), then exempt the characters listed in src. */
        for (i=0; i<256; i++) {
            trans[i] = 1;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            if (c < 256) {
                trans[c] = errc;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), Qtrue);
            }
        }
        while ((c = trnext(&trrepl, enc)) != errc)
            /* retrieve last replacer */;
        last = trrepl.now;
        /* Every non-exempt slot maps to the last replacement character. */
        for (i=0; i<256; i++) {
            if (trans[i] != errc) {
                trans[i] = last;
            }
        }
    }
    else {
        unsigned int r;

        for (i=0; i<256; i++) {
            trans[i] = errc;
        }
        /* Pair selector characters with replacement characters positionally. */
        while ((c = trnext(&trsrc, enc)) != errc) {
            r = trnext(&trrepl, enc);
            /* repl exhausted: keep reusing its last character. */
            if (r == errc) r = trrepl.now;
            if (c < 256) {
                trans[c] = r;
                /* A multi-byte replacement rules out the in-place byte loop. */
                if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
            }
        }
    }

    /* Restart code range tracking at 7BIT; CHECK_IF_ASCII moves it back to
     * VALID as soon as a non-ASCII code point is seen in the output. */
    if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
        cr = ENC_CODERANGE_7BIT;
    str_modify_keep_cr(str);
    s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
    termlen = rb_enc_mbminlen(enc);
    if (sflag) {
        /* tr_s: rebuild the contents in a fresh buffer, squeezing runs. */
        int clen, tlen;
        long offset, max = RSTRING_LEN(str);
        unsigned int save = -1;     /* last replacement written, errc if the previous char was kept */
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
            if (!MBCLEN_CHARFOUND_P(r)) {
                xfree(buf);
                rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
            }
            clen = MBCLEN_CHARFOUND_LEN(r);
            c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);

            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            s += clen;
            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                /* NOTE(review): the non-squeezing branch below uses
                 * `cflag ? last : errc` at this point; here a negated
                 * selector leaves code points >= 256 untranslated when no
                 * hash exists.  Confirm whether this difference is intended. */
                c = errc;
            }
            if (c != (unsigned int)-1) {
                /* Same replacement as the previous character: squeeze it away. */
                if (save == c) {
                    CHECK_IF_ASCII(c);
                    continue;
                }
                save = c;
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                save = -1;
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* Grow the buffer so the pending character plus the unread rest fits. */
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (send - s);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            rb_enc_mbcput(c, t, enc);
            /* NOTE(review): s was already advanced past this character
             * above, so this compares against the bytes that follow it;
             * the non-squeezing branch compares before advancing.  Confirm. */
            if (may_modify && memcmp(s, t, tlen) != 0) {
                modify = 1;
            }
            CHECK_IF_ASCII(c);
            t += tlen;
        }
        /* Swap the rebuilt buffer in as the string's heap storage. */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        STR_SET_LEN(str, t - buf);
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }
    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
        /* Every character is one byte before and after: translate in place. */
        while (s < send) {
            c = (unsigned char)*s;
            if (trans[c] != errc) {
                if (!cflag) {
                    c = trans[c];
                    *s = c;
                    modify = 1;
                }
                else {
                    *s = last;
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s++;
        }
    }
    else {
        /* General multibyte tr: rebuild the contents in a fresh buffer. */
        int clen, tlen;
        long offset, max = (long)((send - s) * 1.2);
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
            if (!MBCLEN_CHARFOUND_P(r)) {
                xfree(buf);
                rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
            }
            clen = MBCLEN_CHARFOUND_LEN(r);
            c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);

            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = cflag ? last : errc;
            }
            if (c != errc) {
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* Untranslated: re-emit the original code point in enc. */
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (long)((send - s) * 1.2);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            if (s != t) {
                rb_enc_mbcput(c, t, enc);
                if (may_modify && memcmp(s, t, tlen) != 0) {
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s += clen;
            t += tlen;
        }
        /* Swap the rebuilt buffer in as the string's heap storage. */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        STR_SET_LEN(str, t - buf);
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }

    if (modify) {
        if (cr != ENC_CODERANGE_BROKEN)
            ENC_CODERANGE_SET(str, cr);
        rb_enc_associate(str, enc);
        return str;
    }
    return Qnil;
}
8581 
8582 
8583 /*
8584  * call-seq:
8585  * tr!(selector, replacements) -> self or nil
8586  *
8587  * Like String#tr, but modifies +self+ in place.
8588  * Returns +self+ if any changes were made, +nil+ otherwise.
8589  *
8590  */
8591 
8592 static VALUE
8593 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8594 {
8595  return tr_trans(str, src, repl, 0);
8596 }
8597 
8598 
8599 /*
8600  * call-seq:
8601  * tr(selector, replacements) -> new_string
8602  *
8603  * Returns a copy of +self+ with each character specified by string +selector+
8604  * translated to the corresponding character in string +replacements+.
8605  * The correspondence is _positional_:
8606  *
8607  * - Each occurrence of the first character specified by +selector+
8608  * is translated to the first character in +replacements+.
8609  * - Each occurrence of the second character specified by +selector+
8610  * is translated to the second character in +replacements+.
8611  * - And so on.
8612  *
8613  * Example:
8614  *
8615  * 'hello'.tr('el', 'ip') #=> "hippo"
8616  *
8617  * If +replacements+ is shorter than +selector+,
8618  * it is implicitly padded with its own last character:
8619  *
8620  * 'hello'.tr('aeiou', '-') # => "h-ll-"
8621  * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8622  *
8623  * Arguments +selector+ and +replacements+ must be valid character selectors
8624  * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8625  * and may use any of its valid forms, including negation, ranges, and escaping:
8626  *
8627  * # Negation.
8628  * 'hello'.tr('^aeiou', '-') # => "-e--o"
8629  * # Ranges.
8630  * 'ibm'.tr('b-z', 'a-z') # => "hal"
8631  * # Escapes.
8632  * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8633  * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8634  * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8635  *
8636  */
8637 
8638 static VALUE
8639 rb_str_tr(VALUE str, VALUE src, VALUE repl)
8640 {
8641  str = str_duplicate(rb_cString, str);
8642  tr_trans(str, src, repl, 0);
8643  return str;
8644 }
8645 
/* Number of code points covered by the flat table, plus one extra slot
 * (index TR_TABLE_MAX) that tr_find() consults for larger code points. */
#define TR_TABLE_MAX (UCHAR_MAX+1)
#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
/*
 * Folds one character selector +str+ into the lookup state used by
 * tr_find().
 *
 * stable  - flat table for code points below TR_TABLE_MAX; initialised on
 *           the first selector (+first+ non-zero) and AND-ed with each
 *           selector, so an entry stays set only if every selector
 *           selects that code point
 * tablep  - hash of selected code points >= TR_TABLE_MAX, written by
 *           non-negated selectors
 * ctablep - hash of code points >= TR_TABLE_MAX listed by negated ('^')
 *           selectors
 * enc     - encoding used to decode +str+
 */
static void
tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
               VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
{
    const unsigned int errc = -1;
    char buf[TR_TABLE_MAX];         /* this selector's own flags, merged into stable at the end */
    struct tr tr;
    unsigned int c;
    VALUE table = 0, ptable = 0;
    int i, l, cflag = 0;

    tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
    tr.gen = tr.now = tr.max = 0;

    /* A leading '^' negates the selector, but only if more characters follow. */
    if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
        cflag = 1;
        tr.p += l;
    }
    if (first) {
        for (i=0; i<TR_TABLE_MAX; i++) {
            stable[i] = 1;
        }
        stable[TR_TABLE_MAX] = cflag;
    }
    else if (stable[TR_TABLE_MAX] && !cflag) {
        /* The extra slot stays set only while every selector so far is negated. */
        stable[TR_TABLE_MAX] = 0;
    }
    /* Default for unlisted code points: selected iff the selector is negated. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        buf[i] = cflag;
    }

    while ((c = trnext(&tr, enc)) != errc) {
        if (c < TR_TABLE_MAX) {
            buf[(unsigned char)c] = !cflag;
        }
        else {
            VALUE key = UINT2NUM(c);

            /* Create or pick the hash for this selector on first use. */
            if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
                if (cflag) {
                    /* Negated selectors accumulate into one shared hash. */
                    ptable = *ctablep;
                    table = ptable ? ptable : rb_hash_new();
                    *ctablep = table;
                }
                else {
                    /* Non-negated selectors replace *tablep with a new hash;
                     * the previous one is kept in ptable for the check below. */
                    table = rb_hash_new();
                    ptable = *tablep;
                    *tablep = table;
                }
            }
            if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
                rb_hash_aset(table, key, Qtrue);
            }
        }
    }
    /* Intersect this selector's flags with the accumulated table. */
    for (i=0; i<TR_TABLE_MAX; i++) {
        stable[i] = stable[i] && buf[i];
    }
    /* A non-negated selector without any large code point selects none of them. */
    if (!table && !cflag) {
        *tablep = 0;
    }
}
8710 
8711 
8712 static int
8713 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8714 {
8715  if (c < TR_TABLE_MAX) {
8716  return table[c] != 0;
8717  }
8718  else {
8719  VALUE v = UINT2NUM(c);
8720 
8721  if (del) {
8722  if (!NIL_P(rb_hash_lookup(del, v)) &&
8723  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8724  return TRUE;
8725  }
8726  }
8727  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8728  return FALSE;
8729  }
8730  return table[TR_TABLE_MAX] ? TRUE : FALSE;
8731  }
8732 }
8733 
8734 /*
8735  * call-seq:
8736  * delete!(*selectors) -> self or nil
8737  *
8738  * Like String#delete, but modifies +self+ in place.
8739  * Returns +self+ if any changes were made, +nil+ otherwise.
8740  *
8741  */
8742 
8743 static VALUE
8744 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8745 {
8746  char squeez[TR_TABLE_SIZE];
8747  rb_encoding *enc = 0;
8748  char *s, *send, *t;
8749  VALUE del = 0, nodel = 0;
8750  int modify = 0;
8751  int i, ascompat, cr;
8752 
8753  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8755  for (i=0; i<argc; i++) {
8756  VALUE s = argv[i];
8757 
8758  StringValue(s);
8759  enc = rb_enc_check(str, s);
8760  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8761  }
8762 
8763  str_modify_keep_cr(str);
8764  ascompat = rb_enc_asciicompat(enc);
8765  s = t = RSTRING_PTR(str);
8766  send = RSTRING_END(str);
8767  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8768  while (s < send) {
8769  unsigned int c;
8770  int clen;
8771 
8772  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8773  if (squeez[c]) {
8774  modify = 1;
8775  }
8776  else {
8777  if (t != s) *t = c;
8778  t++;
8779  }
8780  s++;
8781  }
8782  else {
8783  c = rb_enc_codepoint_len(s, send, &clen, enc);
8784 
8785  if (tr_find(c, squeez, del, nodel)) {
8786  modify = 1;
8787  }
8788  else {
8789  if (t != s) rb_enc_mbcput(c, t, enc);
8790  t += clen;
8791  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8792  }
8793  s += clen;
8794  }
8795  }
8796  TERM_FILL(t, TERM_LEN(str));
8797  STR_SET_LEN(str, t - RSTRING_PTR(str));
8798  ENC_CODERANGE_SET(str, cr);
8799 
8800  if (modify) return str;
8801  return Qnil;
8802 }
8803 
8804 
8805 /*
8806  * call-seq:
8807  * delete(*selectors) -> new_string
8808  *
8809  * Returns a copy of +self+ with characters specified by +selectors+ removed
8810  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8811  *
8812  * "hello".delete "l","lo" #=> "heo"
8813  * "hello".delete "lo" #=> "he"
8814  * "hello".delete "aeiou", "^e" #=> "hell"
8815  * "hello".delete "ej-m" #=> "ho"
8816  *
8817  */
8818 
8819 static VALUE
8820 rb_str_delete(int argc, VALUE *argv, VALUE str)
8821 {
8822  str = str_duplicate(rb_cString, str);
8823  rb_str_delete_bang(argc, argv, str);
8824  return str;
8825 }
8826 
8827 
8828 /*
8829  * call-seq:
8830  * squeeze!(*selectors) -> self or nil
8831  *
8832  * Like String#squeeze, but modifies +self+ in place.
8833  * Returns +self+ if any changes were made, +nil+ otherwise.
8834  */
8835 
8836 static VALUE
8837 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8838 {
8839  char squeez[TR_TABLE_SIZE];
8840  rb_encoding *enc = 0;
8841  VALUE del = 0, nodel = 0;
8842  unsigned char *s, *send, *t;
8843  int i, modify = 0;
8844  int ascompat, singlebyte = single_byte_optimizable(str);
8845  unsigned int save;
8846 
8847  if (argc == 0) {
8848  enc = STR_ENC_GET(str);
8849  }
8850  else {
8851  for (i=0; i<argc; i++) {
8852  VALUE s = argv[i];
8853 
8854  StringValue(s);
8855  enc = rb_enc_check(str, s);
8856  if (singlebyte && !single_byte_optimizable(s))
8857  singlebyte = 0;
8858  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8859  }
8860  }
8861 
8862  str_modify_keep_cr(str);
8863  s = t = (unsigned char *)RSTRING_PTR(str);
8864  if (!s || RSTRING_LEN(str) == 0) return Qnil;
8865  send = (unsigned char *)RSTRING_END(str);
8866  save = -1;
8867  ascompat = rb_enc_asciicompat(enc);
8868 
8869  if (singlebyte) {
8870  while (s < send) {
8871  unsigned int c = *s++;
8872  if (c != save || (argc > 0 && !squeez[c])) {
8873  *t++ = save = c;
8874  }
8875  }
8876  }
8877  else {
8878  while (s < send) {
8879  unsigned int c;
8880  int clen;
8881 
8882  if (ascompat && (c = *s) < 0x80) {
8883  if (c != save || (argc > 0 && !squeez[c])) {
8884  *t++ = save = c;
8885  }
8886  s++;
8887  }
8888  else {
8889  c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8890 
8891  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8892  if (t != s) rb_enc_mbcput(c, t, enc);
8893  save = c;
8894  t += clen;
8895  }
8896  s += clen;
8897  }
8898  }
8899  }
8900 
8901  TERM_FILL((char *)t, TERM_LEN(str));
8902  if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8903  STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8904  modify = 1;
8905  }
8906 
8907  if (modify) return str;
8908  return Qnil;
8909 }
8910 
8911 
8912 /*
8913  * call-seq:
8914  * squeeze(*selectors) -> new_string
8915  *
8916  * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8917  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8918  *
8919  * "Squeezed" means that each multiple-character run of a selected character
8920  * is squeezed down to a single character;
8921  * with no arguments given, squeezes all characters:
8922  *
8923  * "yellow moon".squeeze #=> "yelow mon"
8924  * " now is the".squeeze(" ") #=> " now is the"
8925  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8926  *
8927  */
8928 
8929 static VALUE
8930 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8931 {
8932  str = str_duplicate(rb_cString, str);
8933  rb_str_squeeze_bang(argc, argv, str);
8934  return str;
8935 }
8936 
8937 
8938 /*
8939  * call-seq:
8940  * tr_s!(selector, replacements) -> self or nil
8941  *
8942  * Like String#tr_s, but modifies +self+ in place.
8943  * Returns +self+ if any changes were made, +nil+ otherwise.
8944  *
8945  * Related: String#squeeze!.
8946  */
8947 
8948 static VALUE
8949 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8950 {
8951  return tr_trans(str, src, repl, 1);
8952 }
8953 
8954 
8955 /*
8956  * call-seq:
8957  * tr_s(selector, replacements) -> string
8958  *
8959  * Like String#tr, but also squeezes the modified portions of the translated string;
8960  * returns a new string (translated and squeezed).
8961  *
8962  * 'hello'.tr_s('l', 'r') #=> "hero"
8963  * 'hello'.tr_s('el', '-') #=> "h-o"
8964  * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8965  *
8966  * Related: String#squeeze.
8967  *
8968  */
8969 
8970 static VALUE
8971 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8972 {
8973  str = str_duplicate(rb_cString, str);
8974  tr_trans(str, src, repl, 1);
8975  return str;
8976 }
8977 
8978 
8979 /*
8980  * call-seq:
8981  * count(*selectors) -> integer
8982  *
8983  * Returns the total number of characters in +self+
8984  * that are specified by the given +selectors+
8985  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8986  *
8987  * a = "hello world"
8988  * a.count "lo" #=> 5
8989  * a.count "lo", "o" #=> 2
8990  * a.count "hello", "^l" #=> 4
8991  * a.count "ej-m" #=> 4
8992  *
8993  * "hello^world".count "\\^aeiou" #=> 4
8994  * "hello-world".count "a\\-eo" #=> 4
8995  *
8996  * c = "hello world\\r\\n"
8997  * c.count "\\" #=> 2
8998  * c.count "\\A" #=> 0
8999  * c.count "X-\\w" #=> 3
9000  */
9001 
9002 static VALUE
9003 rb_str_count(int argc, VALUE *argv, VALUE str)
9004 {
9005  char table[TR_TABLE_SIZE];
9006  rb_encoding *enc = 0;
9007  VALUE del = 0, nodel = 0, tstr;
9008  char *s, *send;
9009  int i;
9010  int ascompat;
9011  size_t n = 0;
9012 
9014 
9015  tstr = argv[0];
9016  StringValue(tstr);
9017  enc = rb_enc_check(str, tstr);
9018  if (argc == 1) {
9019  const char *ptstr;
9020  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9021  (ptstr = RSTRING_PTR(tstr),
9022  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9023  !is_broken_string(str)) {
9024  int clen;
9025  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9026 
9027  s = RSTRING_PTR(str);
9028  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9029  send = RSTRING_END(str);
9030  while (s < send) {
9031  if (*(unsigned char*)s++ == c) n++;
9032  }
9033  return SIZET2NUM(n);
9034  }
9035  }
9036 
9037  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9038  for (i=1; i<argc; i++) {
9039  tstr = argv[i];
9040  StringValue(tstr);
9041  enc = rb_enc_check(str, tstr);
9042  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9043  }
9044 
9045  s = RSTRING_PTR(str);
9046  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9047  send = RSTRING_END(str);
9048  ascompat = rb_enc_asciicompat(enc);
9049  while (s < send) {
9050  unsigned int c;
9051 
9052  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9053  if (table[c]) {
9054  n++;
9055  }
9056  s++;
9057  }
9058  else {
9059  int clen;
9060  c = rb_enc_codepoint_len(s, send, &clen, enc);
9061  if (tr_find(c, table, del, nodel)) {
9062  n++;
9063  }
9064  s += clen;
9065  }
9066  }
9067 
9068  return SIZET2NUM(n);
9069 }
9070 
9071 static VALUE
9072 rb_fs_check(VALUE val)
9073 {
9074  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9075  val = rb_check_string_type(val);
9076  if (NIL_P(val)) return 0;
9077  }
9078  return val;
9079 }
9080 
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (HT, LF, VT, FF, CR and space), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9101 
9102 static long
9103 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9104 {
9105  if (empty_count >= 0 && len == 0) {
9106  return empty_count + 1;
9107  }
9108  if (empty_count > 0) {
9109  /* make different substrings */
9110  if (result) {
9111  do {
9112  rb_ary_push(result, str_new_empty_String(str));
9113  } while (--empty_count > 0);
9114  }
9115  else {
9116  do {
9117  rb_yield(str_new_empty_String(str));
9118  } while (--empty_count > 0);
9119  }
9120  }
9121  str = rb_str_subseq(str, beg, len);
9122  if (result) {
9123  rb_ary_push(result, str);
9124  }
9125  else {
9126  rb_yield(str);
9127  }
9128  return empty_count;
9129 }
9130 
/*
 * Splitting strategy selected by rb_str_split_m:
 *   SPLIT_TYPE_AWK    - split on runs of whitespace
 *   SPLIT_TYPE_STRING - split on a literal separator (rb_memsearch)
 *   SPLIT_TYPE_REGEXP - split on regexp matches (rb_reg_search)
 *   SPLIT_TYPE_CHARS  - split into individual characters
 */
typedef enum {
    SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
} split_type_t;
9134 
9135 static split_type_t
9136 literal_split_pattern(VALUE spat, split_type_t default_type)
9137 {
9138  rb_encoding *enc = STR_ENC_GET(spat);
9139  const char *ptr;
9140  long len;
9141  RSTRING_GETMEM(spat, ptr, len);
9142  if (len == 0) {
9143  /* Special case - split into chars */
9144  return SPLIT_TYPE_CHARS;
9145  }
9146  else if (rb_enc_asciicompat(enc)) {
9147  if (len == 1 && ptr[0] == ' ') {
9148  return SPLIT_TYPE_AWK;
9149  }
9150  }
9151  else {
9152  int l;
9153  if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9154  return SPLIT_TYPE_AWK;
9155  }
9156  }
9157  return default_type;
9158 }
9159 
9160 /*
9161  * call-seq:
9162  * split(field_sep = $;, limit = nil) -> array
9163  * split(field_sep = $;, limit = nil) {|substring| ... } -> self
9164  *
9165  * :include: doc/string/split.rdoc
9166  *
9167  */
9168 
9169 static VALUE
9170 rb_str_split_m(int argc, VALUE *argv, VALUE str)
9171 {
9172  rb_encoding *enc;
9173  VALUE spat;
9174  VALUE limit;
9175  split_type_t split_type;
9176  long beg, end, i = 0, empty_count = -1;
9177  int lim = 0;
9178  VALUE result, tmp;
9179 
9180  result = rb_block_given_p() ? Qfalse : Qnil;
9181  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9182  lim = NUM2INT(limit);
9183  if (lim <= 0) limit = Qnil;
9184  else if (lim == 1) {
9185  if (RSTRING_LEN(str) == 0)
9186  return result ? rb_ary_new2(0) : str;
9187  tmp = str_duplicate(rb_cString, str);
9188  if (!result) {
9189  rb_yield(tmp);
9190  return str;
9191  }
9192  return rb_ary_new3(1, tmp);
9193  }
9194  i = 1;
9195  }
9196  if (NIL_P(limit) && !lim) empty_count = 0;
9197 
9198  enc = STR_ENC_GET(str);
9199  split_type = SPLIT_TYPE_REGEXP;
9200  if (!NIL_P(spat)) {
9201  spat = get_pat_quoted(spat, 0);
9202  }
9203  else if (NIL_P(spat = rb_fs)) {
9204  split_type = SPLIT_TYPE_AWK;
9205  }
9206  else if (!(spat = rb_fs_check(spat))) {
9207  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9208  }
9209  else {
9210  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9211  }
9212  if (split_type != SPLIT_TYPE_AWK) {
9213  switch (BUILTIN_TYPE(spat)) {
9214  case T_REGEXP:
9215  rb_reg_options(spat); /* check if uninitialized */
9216  tmp = RREGEXP_SRC(spat);
9217  split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9218  if (split_type == SPLIT_TYPE_AWK) {
9219  spat = tmp;
9220  split_type = SPLIT_TYPE_STRING;
9221  }
9222  break;
9223 
9224  case T_STRING:
9225  mustnot_broken(spat);
9226  split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9227  break;
9228 
9229  default:
9231  }
9232  }
9233 
9234 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9235 
9236  beg = 0;
9237  char *ptr = RSTRING_PTR(str);
9238  char *eptr = RSTRING_END(str);
9239  if (split_type == SPLIT_TYPE_AWK) {
9240  char *bptr = ptr;
9241  int skip = 1;
9242  unsigned int c;
9243 
9244  if (result) result = rb_ary_new();
9245  end = beg;
9246  if (is_ascii_string(str)) {
9247  while (ptr < eptr) {
9248  c = (unsigned char)*ptr++;
9249  if (skip) {
9250  if (ascii_isspace(c)) {
9251  beg = ptr - bptr;
9252  }
9253  else {
9254  end = ptr - bptr;
9255  skip = 0;
9256  if (!NIL_P(limit) && lim <= i) break;
9257  }
9258  }
9259  else if (ascii_isspace(c)) {
9260  SPLIT_STR(beg, end-beg);
9261  skip = 1;
9262  beg = ptr - bptr;
9263  if (!NIL_P(limit)) ++i;
9264  }
9265  else {
9266  end = ptr - bptr;
9267  }
9268  }
9269  }
9270  else {
9271  while (ptr < eptr) {
9272  int n;
9273 
9274  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9275  ptr += n;
9276  if (skip) {
9277  if (rb_isspace(c)) {
9278  beg = ptr - bptr;
9279  }
9280  else {
9281  end = ptr - bptr;
9282  skip = 0;
9283  if (!NIL_P(limit) && lim <= i) break;
9284  }
9285  }
9286  else if (rb_isspace(c)) {
9287  SPLIT_STR(beg, end-beg);
9288  skip = 1;
9289  beg = ptr - bptr;
9290  if (!NIL_P(limit)) ++i;
9291  }
9292  else {
9293  end = ptr - bptr;
9294  }
9295  }
9296  }
9297  }
9298  else if (split_type == SPLIT_TYPE_STRING) {
9299  char *str_start = ptr;
9300  char *substr_start = ptr;
9301  char *sptr = RSTRING_PTR(spat);
9302  long slen = RSTRING_LEN(spat);
9303 
9304  if (result) result = rb_ary_new();
9305  mustnot_broken(str);
9306  enc = rb_enc_check(str, spat);
9307  while (ptr < eptr &&
9308  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9309  /* Check we are at the start of a char */
9310  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9311  if (t != ptr + end) {
9312  ptr = t;
9313  continue;
9314  }
9315  SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9316  ptr += end + slen;
9317  substr_start = ptr;
9318  if (!NIL_P(limit) && lim <= ++i) break;
9319  }
9320  beg = ptr - str_start;
9321  }
9322  else if (split_type == SPLIT_TYPE_CHARS) {
9323  char *str_start = ptr;
9324  int n;
9325 
9326  if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9327  mustnot_broken(str);
9328  enc = rb_enc_get(str);
9329  while (ptr < eptr &&
9330  (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9331  SPLIT_STR(ptr - str_start, n);
9332  ptr += n;
9333  if (!NIL_P(limit) && lim <= ++i) break;
9334  }
9335  beg = ptr - str_start;
9336  }
9337  else {
9338  if (result) result = rb_ary_new();
9339  long len = RSTRING_LEN(str);
9340  long start = beg;
9341  long idx;
9342  int last_null = 0;
9343  struct re_registers *regs;
9344  VALUE match = 0;
9345 
9346  for (; rb_reg_search(spat, str, start, 0) >= 0;
9347  (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9348  match = rb_backref_get();
9349  if (!result) rb_match_busy(match);
9350  regs = RMATCH_REGS(match);
9351  end = BEG(0);
9352  if (start == end && BEG(0) == END(0)) {
9353  if (!ptr) {
9354  SPLIT_STR(0, 0);
9355  break;
9356  }
9357  else if (last_null == 1) {
9358  SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9359  beg = start;
9360  }
9361  else {
9362  if (start == len)
9363  start++;
9364  else
9365  start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9366  last_null = 1;
9367  continue;
9368  }
9369  }
9370  else {
9371  SPLIT_STR(beg, end-beg);
9372  beg = start = END(0);
9373  }
9374  last_null = 0;
9375 
9376  for (idx=1; idx < regs->num_regs; idx++) {
9377  if (BEG(idx) == -1) continue;
9378  SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9379  }
9380  if (!NIL_P(limit) && lim <= ++i) break;
9381  }
9382  if (match) rb_match_unbusy(match);
9383  }
9384  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9385  SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9386  }
9387 
9388  return result ? result : str;
9389 }
9390 
9391 VALUE
9392 rb_str_split(VALUE str, const char *sep0)
9393 {
9394  VALUE sep;
9395 
9396  StringValue(str);
9397  sep = rb_str_new_cstr(sep0);
9398  return rb_str_split_m(1, &sep, str);
9399 }
9400 
9401 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9402 
9403 static inline int
9404 enumerator_element(VALUE ary, VALUE e)
9405 {
9406  if (ary) {
9407  rb_ary_push(ary, e);
9408  return 0;
9409  }
9410  else {
9411  rb_yield(e);
9412  return 1;
9413  }
9414 }
9415 
9416 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9417 
9418 static const char *
9419 chomp_newline(const char *p, const char *e, rb_encoding *enc)
9420 {
9421  const char *prev = rb_enc_prev_char(p, e, e, enc);
9422  if (rb_enc_is_newline(prev, e, enc)) {
9423  e = prev;
9424  prev = rb_enc_prev_char(p, e, e, enc);
9425  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9426  e = prev;
9427  }
9428  return e;
9429 }
9430 
9431 static VALUE
9432 get_rs(void)
9433 {
9434  VALUE rs = rb_rs;
9435  if (!NIL_P(rs) &&
9436  (!RB_TYPE_P(rs, T_STRING) ||
9437  RSTRING_LEN(rs) != 1 ||
9438  RSTRING_PTR(rs)[0] != '\n')) {
9439  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9440  }
9441  return rs;
9442 }
9443 
9444 #define rb_rs get_rs()
9445 
/*
 * Common worker for the line enumeration entry points in this file
 * (rb_str_each_line calls it with ary == 0, rb_str_lines with the
 * WANTARRAY result).
 *
 * argc/argv: optional separator, optionally followed by keywords; only
 *            the "chomp" keyword is read.  When no separator argument
 *            is given, rb_rs is used.
 * str:       the receiver.
 * ary:       destination array, or 0 to yield each line (see ENUM_ELEM).
 *
 * Returns ary when it is non-zero, otherwise the original str.
 */
9446 static VALUE
9447 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9448 {
9449  rb_encoding *enc;
9450  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9451  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9452  long pos, len, rslen;
9453  int rsnewline = 0;
9454 
9455  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9456  rs = rb_rs;
9457  if (!NIL_P(opts)) {
9458  static ID keywords[1];
9459  if (!keywords[0]) {
9460  keywords[0] = rb_intern_const("chomp");
9461  }
9462  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
 /* collapse to a C truth value: an absent or falsy keyword means no chomping */
9463  chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9464  }
9465 
 /* nil separator: the whole string is delivered as the only element */
9466  if (NIL_P(rs)) {
9467  if (!ENUM_ELEM(ary, str)) {
9468  return ary;
9469  }
9470  else {
9471  return orig;
9472  }
9473  }
9474 
 /* empty receiver: there is nothing to enumerate */
9475  if (!RSTRING_LEN(str)) goto end;
 /* from here on str is the result of rb_str_new_frozen; orig still names the receiver */
9476  str = rb_str_new_frozen(str);
9477  ptr = subptr = RSTRING_PTR(str);
9478  pend = RSTRING_END(str);
9479  len = RSTRING_LEN(str);
9480  StringValue(rs);
9481  rslen = RSTRING_LEN(rs);
9482 
 /* the default separator skips the rb_enc_check compatibility check against rs */
9483  if (rs == rb_default_rs)
9484  enc = rb_enc_get(str);
9485  else
9486  enc = rb_enc_check(str, rs);
9487 
9488  if (rslen == 0) {
9489  /* paragraph mode */
 /*
  * Inside this branch rslen is reused as the byte length of the
  * character at subend (including a directly preceding '\r').
  * subptr is NULL while no line start has been seen yet; eol marks
  * the position just past the most recently consumed newline.
  */
9490  int n;
9491  const char *eol = NULL;
9492  subend = subptr;
9493  while (subend < pend) {
9494  long chomp_rslen = 0;
9495  do {
9496  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9497  n = 0;
9498  rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9499  if (rb_enc_is_newline(subend + n, pend, enc)) {
 /* a newline right after the previous one ends the current line */
9500  if (eol == subend) break;
9501  subend += rslen;
9502  if (subptr) {
9503  eol = subend;
9504  chomp_rslen = -rslen;
9505  }
9506  }
9507  else {
9508  if (!subptr) subptr = subend;
9509  subend += rslen;
9510  }
9511  rslen = 0;
9512  } while (subend < pend);
 /* no line start was found in the remaining text: nothing more to emit */
9513  if (!subptr) break;
9514  if (rslen == 0) chomp_rslen = 0;
9515  line = rb_str_subseq(str, subptr - ptr,
9516  subend - subptr + (chomp ? chomp_rslen : rslen));
 /* the element was yielded (ENUM_ELEM returned 1): run str_mod_check afterwards */
9517  if (ENUM_ELEM(ary, line)) {
9518  str_mod_check(str, ptr, len);
9519  }
9520  subptr = eol = NULL;
9521  }
9522  goto end;
9523  }
9524  else {
9525  rsptr = RSTRING_PTR(rs);
 /* rsnewline: the separator is one minimum-length character that is a newline in enc */
9526  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9527  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9528  rsnewline = 1;
9529  }
9530  }
9531 
 /* default separator with a non-ASCII-compatible enc: re-encode the separator into enc */
9532  if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9533  rs = rb_str_new(rsptr, rslen);
9534  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9535  rsptr = RSTRING_PTR(rs);
9536  rslen = RSTRING_LEN(rs);
9537  }
9538 
9539  while (subptr < pend) {
9540  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9541  if (pos < 0) break;
9542  hit = subptr + pos;
 /* the byte match does not start on a character head: continue from the adjusted position */
9543  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9544  if (hit != adjusted) {
9545  subptr = adjusted;
9546  continue;
9547  }
 /* hit now points just past the separator; subend is the end of the line to emit */
9548  subend = hit += rslen;
9549  if (chomp) {
9550  if (rsnewline) {
9551  subend = chomp_newline(subptr, subend, enc);
9552  }
9553  else {
9554  subend -= rslen;
9555  }
9556  }
9557  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9558  if (ENUM_ELEM(ary, line)) {
9559  str_mod_check(str, ptr, len);
9560  }
9561  subptr = hit;
9562  }
9563 
 /* text left over after the last separator found */
9564  if (subptr != pend) {
9565  if (chomp) {
9566  if (rsnewline) {
9567  pend = chomp_newline(subptr, pend, enc);
9568  }
9569  else if (pend - subptr >= rslen &&
9570  memcmp(pend - rslen, rsptr, rslen) == 0) {
9571  pend -= rslen;
9572  }
9573  }
9574  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9575  ENUM_ELEM(ary, line);
9576  RB_GC_GUARD(str);
9577  }
9578 
9579  end:
9580  if (ary)
9581  return ary;
9582  else
9583  return orig;
9584 }
9585 
9586 /*
9587  * call-seq:
9588  * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9589  * each_line(line_sep = $/, chomp: false) -> enumerator
9590  *
9591  * :include: doc/string/each_line.rdoc
9592  *
9593  */
9594 
9595 static VALUE
9596 rb_str_each_line(int argc, VALUE *argv, VALUE str)
9597 {
9598  RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9599  return rb_str_enumerate_lines(argc, argv, str, 0);
9600 }
9601 
9602 /*
9603  * call-seq:
9604  * lines(line_sep = $/, chomp: false) -> array_of_strings
9605  *
9606  * Forms substrings ("lines") of +self+ according to the given arguments
9607  * (see String#each_line for details); returns the lines in an array.
9608  *
9609  */
9610 
9611 static VALUE
9612 rb_str_lines(int argc, VALUE *argv, VALUE str)
9613 {
9614  VALUE ary = WANTARRAY("lines", 0);
9615  return rb_str_enumerate_lines(argc, argv, str, ary);
9616 }
9617 
9618 static VALUE
9619 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9620 {
9621  return LONG2FIX(RSTRING_LEN(str));
9622 }
9623 
9624 static VALUE
9625 rb_str_enumerate_bytes(VALUE str, VALUE ary)
9626 {
9627  long i;
9628 
9629  for (i=0; i<RSTRING_LEN(str); i++) {
9630  ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9631  }
9632  if (ary)
9633  return ary;
9634  else
9635  return str;
9636 }
9637 
9638 /*
9639  * call-seq:
9640  * each_byte {|byte| ... } -> self
9641  * each_byte -> enumerator
9642  *
9643  * :include: doc/string/each_byte.rdoc
9644  *
9645  */
9646 
9647 static VALUE
9648 rb_str_each_byte(VALUE str)
9649 {
9650  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9651  return rb_str_enumerate_bytes(str, 0);
9652 }
9653 
9654 /*
9655  * call-seq:
9656  * bytes -> array_of_bytes
9657  *
9658  * :include: doc/string/bytes.rdoc
9659  *
9660  */
9661 
9662 static VALUE
9663 rb_str_bytes(VALUE str)
9664 {
9665  VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9666  return rb_str_enumerate_bytes(str, ary);
9667 }
9668 
9669 static VALUE
9670 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9671 {
9672  return rb_str_length(str);
9673 }
9674 
9675 static VALUE
9676 rb_str_enumerate_chars(VALUE str, VALUE ary)
9677 {
9678  VALUE orig = str;
9679  long i, len, n;
9680  const char *ptr;
9681  rb_encoding *enc;
9682 
9683  str = rb_str_new_frozen(str);
9684  ptr = RSTRING_PTR(str);
9685  len = RSTRING_LEN(str);
9686  enc = rb_enc_get(str);
9687 
9689  for (i = 0; i < len; i += n) {
9690  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9691  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9692  }
9693  }
9694  else {
9695  for (i = 0; i < len; i += n) {
9696  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9697  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9698  }
9699  }
9700  RB_GC_GUARD(str);
9701  if (ary)
9702  return ary;
9703  else
9704  return orig;
9705 }
9706 
9707 /*
9708  * call-seq:
9709  * each_char {|c| ... } -> self
9710  * each_char -> enumerator
9711  *
9712  * :include: doc/string/each_char.rdoc
9713  *
9714  */
9715 
9716 static VALUE
9717 rb_str_each_char(VALUE str)
9718 {
9719  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9720  return rb_str_enumerate_chars(str, 0);
9721 }
9722 
9723 /*
9724  * call-seq:
9725  * chars -> array_of_characters
9726  *
9727  * :include: doc/string/chars.rdoc
9728  *
9729  */
9730 
9731 static VALUE
9732 rb_str_chars(VALUE str)
9733 {
9734  VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9735  return rb_str_enumerate_chars(str, ary);
9736 }
9737 
9738 static VALUE
9739 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9740 {
9741  VALUE orig = str;
9742  int n;
9743  unsigned int c;
9744  const char *ptr, *end;
9745  rb_encoding *enc;
9746 
9747  if (single_byte_optimizable(str))
9748  return rb_str_enumerate_bytes(str, ary);
9749 
9750  str = rb_str_new_frozen(str);
9751  ptr = RSTRING_PTR(str);
9752  end = RSTRING_END(str);
9753  enc = STR_ENC_GET(str);
9754 
9755  while (ptr < end) {
9756  c = rb_enc_codepoint_len(ptr, end, &n, enc);
9757  ENUM_ELEM(ary, UINT2NUM(c));
9758  ptr += n;
9759  }
9760  RB_GC_GUARD(str);
9761  if (ary)
9762  return ary;
9763  else
9764  return orig;
9765 }
9766 
9767 /*
9768  * call-seq:
9769  * each_codepoint {|integer| ... } -> self
9770  * each_codepoint -> enumerator
9771  *
9772  * :include: doc/string/each_codepoint.rdoc
9773  *
9774  */
9775 
9776 static VALUE
9777 rb_str_each_codepoint(VALUE str)
9778 {
9779  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9780  return rb_str_enumerate_codepoints(str, 0);
9781 }
9782 
9783 /*
9784  * call-seq:
9785  * codepoints -> array_of_integers
9786  *
9787  * :include: doc/string/codepoints.rdoc
9788  *
9789  */
9790 
9791 static VALUE
9792 rb_str_codepoints(VALUE str)
9793 {
9794  VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9795  return rb_str_enumerate_codepoints(str, ary);
9796 }
9797 
9798 static regex_t *
9799 get_reg_grapheme_cluster(rb_encoding *enc)
9800 {
9801  int encidx = rb_enc_to_index(enc);
9802 
9803  const OnigUChar source_ascii[] = "\\X";
9804  const OnigUChar *source = source_ascii;
9805  size_t source_len = sizeof(source_ascii) - 1;
9806 
9807  switch (encidx) {
9808 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9809 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9810 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9811 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9812 #define CASE_UTF(e) \
9813  case ENCINDEX_UTF_##e: { \
9814  static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9815  source = source_UTF_##e; \
9816  source_len = sizeof(source_UTF_##e); \
9817  break; \
9818  }
9819  CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9820 #undef CASE_UTF
9821 #undef CHARS_16BE
9822 #undef CHARS_16LE
9823 #undef CHARS_32BE
9824 #undef CHARS_32LE
9825  }
9826 
9827  regex_t *reg_grapheme_cluster;
9828  OnigErrorInfo einfo;
9829  int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9830  ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9831  if (r) {
9832  UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9833  onig_error_code_to_str(message, r, &einfo);
9834  rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9835  }
9836 
9837  return reg_grapheme_cluster;
9838 }
9839 
9840 static regex_t *
9841 get_cached_reg_grapheme_cluster(rb_encoding *enc)
9842 {
9843  int encidx = rb_enc_to_index(enc);
9844  static regex_t *reg_grapheme_cluster_utf8 = NULL;
9845 
9846  if (encidx == rb_utf8_encindex()) {
9847  if (!reg_grapheme_cluster_utf8) {
9848  reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9849  }
9850 
9851  return reg_grapheme_cluster_utf8;
9852  }
9853 
9854  return NULL;
9855 }
9856 
9857 static VALUE
9858 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9859 {
9860  size_t grapheme_cluster_count = 0;
9861  rb_encoding *enc = get_encoding(str);
9862  const char *ptr, *end;
9863 
9864  if (!rb_enc_unicode_p(enc)) {
9865  return rb_str_length(str);
9866  }
9867 
9868  bool cached_reg_grapheme_cluster = true;
9869  regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9870  if (!reg_grapheme_cluster) {
9871  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9872  cached_reg_grapheme_cluster = false;
9873  }
9874 
9875  ptr = RSTRING_PTR(str);
9876  end = RSTRING_END(str);
9877 
9878  while (ptr < end) {
9879  OnigPosition len = onig_match(reg_grapheme_cluster,
9880  (const OnigUChar *)ptr, (const OnigUChar *)end,
9881  (const OnigUChar *)ptr, NULL, 0);
9882  if (len <= 0) break;
9883  grapheme_cluster_count++;
9884  ptr += len;
9885  }
9886 
9887  if (!cached_reg_grapheme_cluster) {
9888  onig_free(reg_grapheme_cluster);
9889  }
9890 
9891  return SIZET2NUM(grapheme_cluster_count);
9892 }
9893 
9894 static VALUE
9895 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9896 {
9897  VALUE orig = str;
9898  rb_encoding *enc = get_encoding(str);
9899  const char *ptr0, *ptr, *end;
9900 
9901  if (!rb_enc_unicode_p(enc)) {
9902  return rb_str_enumerate_chars(str, ary);
9903  }
9904 
9905  if (!ary) str = rb_str_new_frozen(str);
9906 
9907  bool cached_reg_grapheme_cluster = true;
9908  regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9909  if (!reg_grapheme_cluster) {
9910  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9911  cached_reg_grapheme_cluster = false;
9912  }
9913 
9914  ptr0 = ptr = RSTRING_PTR(str);
9915  end = RSTRING_END(str);
9916 
9917  while (ptr < end) {
9918  OnigPosition len = onig_match(reg_grapheme_cluster,
9919  (const OnigUChar *)ptr, (const OnigUChar *)end,
9920  (const OnigUChar *)ptr, NULL, 0);
9921  if (len <= 0) break;
9922  ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9923  ptr += len;
9924  }
9925 
9926  if (!cached_reg_grapheme_cluster) {
9927  onig_free(reg_grapheme_cluster);
9928  }
9929 
9930  RB_GC_GUARD(str);
9931  if (ary)
9932  return ary;
9933  else
9934  return orig;
9935 }
9936 
9937 /*
9938  * call-seq:
9939  * each_grapheme_cluster {|gc| ... } -> self
9940  * each_grapheme_cluster -> enumerator
9941  *
9942  * :include: doc/string/each_grapheme_cluster.rdoc
9943  *
9944  */
9945 
9946 static VALUE
9947 rb_str_each_grapheme_cluster(VALUE str)
9948 {
9949  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9950  return rb_str_enumerate_grapheme_clusters(str, 0);
9951 }
9952 
9953 /*
9954  * call-seq:
9955  * grapheme_clusters -> array_of_grapheme_clusters
9956  *
9957  * :include: doc/string/grapheme_clusters.rdoc
9958  *
9959  */
9960 
9961 static VALUE
9962 rb_str_grapheme_clusters(VALUE str)
9963 {
9964  VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9965  return rb_str_enumerate_grapheme_clusters(str, ary);
9966 }
9967 
9968 static long
9969 chopped_length(VALUE str)
9970 {
9971  rb_encoding *enc = STR_ENC_GET(str);
9972  const char *p, *p2, *beg, *end;
9973 
9974  beg = RSTRING_PTR(str);
9975  end = beg + RSTRING_LEN(str);
9976  if (beg >= end) return 0;
9977  p = rb_enc_prev_char(beg, end, end, enc);
9978  if (!p) return 0;
9979  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9980  p2 = rb_enc_prev_char(beg, p, end, enc);
9981  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9982  }
9983  return p - beg;
9984 }
9985 
9986 /*
9987  * call-seq:
9988  * chop! -> self or nil
9989  *
9990  * Like String#chop, but modifies +self+ in place;
9991  * returns +nil+ if +self+ is empty, +self+ otherwise.
9992  *
9993  * Related: String#chomp!.
9994  */
9995 
9996 static VALUE
9997 rb_str_chop_bang(VALUE str)
9998 {
9999  str_modify_keep_cr(str);
10000  if (RSTRING_LEN(str) > 0) {
10001  long len;
10002  len = chopped_length(str);
10003  STR_SET_LEN(str, len);
10004  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10005  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10006  ENC_CODERANGE_CLEAR(str);
10007  }
10008  return str;
10009  }
10010  return Qnil;
10011 }
10012 
10013 
10014 /*
10015  * call-seq:
10016  * chop -> new_string
10017  *
10018  * :include: doc/string/chop.rdoc
10019  *
10020  */
10021 
10022 static VALUE
10023 rb_str_chop(VALUE str)
10024 {
10025  return rb_str_subseq(str, 0, chopped_length(str));
10026 }
10027 
10028 static long
10029 smart_chomp(VALUE str, const char *e, const char *p)
10030 {
10031  rb_encoding *enc = rb_enc_get(str);
10032  if (rb_enc_mbminlen(enc) > 1) {
10033  const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10034  if (rb_enc_is_newline(pp, e, enc)) {
10035  e = pp;
10036  }
10037  pp = e - rb_enc_mbminlen(enc);
10038  if (pp >= p) {
10039  pp = rb_enc_left_char_head(p, pp, e, enc);
10040  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10041  e = pp;
10042  }
10043  }
10044  }
10045  else {
10046  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10047  case '\n':
10048  if (--e > p && *(e-1) == '\r') {
10049  --e;
10050  }
10051  break;
10052  case '\r':
10053  --e;
10054  break;
10055  }
10056  }
10057  return e - p;
10058 }
10059 
10060 static long
10061 chompped_length(VALUE str, VALUE rs)
10062 {
10063  rb_encoding *enc;
10064  int newline;
10065  char *pp, *e, *rsptr;
10066  long rslen;
10067  char *const p = RSTRING_PTR(str);
10068  long len = RSTRING_LEN(str);
10069 
10070  if (len == 0) return 0;
10071  e = p + len;
10072  if (rs == rb_default_rs) {
10073  return smart_chomp(str, e, p);
10074  }
10075 
10076  enc = rb_enc_get(str);
10077  RSTRING_GETMEM(rs, rsptr, rslen);
10078  if (rslen == 0) {
10079  if (rb_enc_mbminlen(enc) > 1) {
10080  while (e > p) {
10081  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10082  if (!rb_enc_is_newline(pp, e, enc)) break;
10083  e = pp;
10084  pp -= rb_enc_mbminlen(enc);
10085  if (pp >= p) {
10086  pp = rb_enc_left_char_head(p, pp, e, enc);
10087  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10088  e = pp;
10089  }
10090  }
10091  }
10092  }
10093  else {
10094  while (e > p && *(e-1) == '\n') {
10095  --e;
10096  if (e > p && *(e-1) == '\r')
10097  --e;
10098  }
10099  }
10100  return e - p;
10101  }
10102  if (rslen > len) return len;
10103 
10104  enc = rb_enc_get(rs);
10105  newline = rsptr[rslen-1];
10106  if (rslen == rb_enc_mbminlen(enc)) {
10107  if (rslen == 1) {
10108  if (newline == '\n')
10109  return smart_chomp(str, e, p);
10110  }
10111  else {
10112  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10113  return smart_chomp(str, e, p);
10114  }
10115  }
10116 
10117  enc = rb_enc_check(str, rs);
10118  if (is_broken_string(rs)) {
10119  return len;
10120  }
10121  pp = e - rslen;
10122  if (p[len-1] == newline &&
10123  (rslen <= 1 ||
10124  memcmp(rsptr, pp, rslen) == 0)) {
10125  if (at_char_boundary(p, pp, e, enc))
10126  return len - rslen;
10127  RB_GC_GUARD(rs);
10128  }
10129  return len;
10130 }
10131 
10137 static VALUE
10138 chomp_rs(int argc, const VALUE *argv)
10139 {
10140  rb_check_arity(argc, 0, 1);
10141  if (argc > 0) {
10142  VALUE rs = argv[0];
10143  if (!NIL_P(rs)) StringValue(rs);
10144  return rs;
10145  }
10146  else {
10147  return rb_rs;
10148  }
10149 }
10150 
10151 VALUE
10152 rb_str_chomp_string(VALUE str, VALUE rs)
10153 {
10154  long olen = RSTRING_LEN(str);
10155  long len = chompped_length(str, rs);
10156  if (len >= olen) return Qnil;
10157  str_modify_keep_cr(str);
10158  STR_SET_LEN(str, len);
10159  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10160  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10161  ENC_CODERANGE_CLEAR(str);
10162  }
10163  return str;
10164 }
10165 
10166 /*
10167  * call-seq:
10168  * chomp!(line_sep = $/) -> self or nil
10169  *
10170  * Like String#chomp, but modifies +self+ in place;
10171  * returns +nil+ if no modification made, +self+ otherwise.
10172  *
10173  */
10174 
10175 static VALUE
10176 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10177 {
10178  VALUE rs;
10179  str_modifiable(str);
10180  if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10181  rs = chomp_rs(argc, argv);
10182  if (NIL_P(rs)) return Qnil;
10183  return rb_str_chomp_string(str, rs);
10184 }
10185 
10186 
10187 /*
10188  * call-seq:
10189  * chomp(line_sep = $/) -> new_string
10190  *
10191  * :include: doc/string/chomp.rdoc
10192  *
10193  */
10194 
10195 static VALUE
10196 rb_str_chomp(int argc, VALUE *argv, VALUE str)
10197 {
10198  VALUE rs = chomp_rs(argc, argv);
10199  if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10200  return rb_str_subseq(str, 0, chompped_length(str, rs));
10201 }
10202 
10203 static long
10204 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10205 {
10206  const char *const start = s;
10207 
10208  if (!s || s >= e) return 0;
10209 
10210  /* remove spaces at head */
10211  if (single_byte_optimizable(str)) {
10212  while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10213  }
10214  else {
10215  while (s < e) {
10216  int n;
10217  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10218 
10219  if (cc && !rb_isspace(cc)) break;
10220  s += n;
10221  }
10222  }
10223  return s - start;
10224 }
10225 
10226 /*
10227  * call-seq:
10228  * lstrip! -> self or nil
10229  *
10230  * Like String#lstrip, except that any modifications are made in +self+;
10231  * returns +self+ if any modifications are made, +nil+ otherwise.
10232  *
10233  * Related: String#rstrip!, String#strip!.
10234  */
10235 
10236 static VALUE
10237 rb_str_lstrip_bang(VALUE str)
10238 {
10239  rb_encoding *enc;
10240  char *start, *s;
10241  long olen, loffset;
10242 
10243  str_modify_keep_cr(str);
10244  enc = STR_ENC_GET(str);
10245  RSTRING_GETMEM(str, start, olen);
10246  loffset = lstrip_offset(str, start, start+olen, enc);
10247  if (loffset > 0) {
10248  long len = olen-loffset;
10249  s = start + loffset;
10250  memmove(start, s, len);
10251  STR_SET_LEN(str, len);
10252  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10253  return str;
10254  }
10255  return Qnil;
10256 }
10257 
10258 
10259 /*
10260  * call-seq:
10261  * lstrip -> new_string
10262  *
10263  * Returns a copy of +self+ with leading whitespace removed;
10264  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10265  *
10266  * whitespace = "\x00\t\n\v\f\r "
10267  * s = whitespace + 'abc' + whitespace
10268  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10269  * s.lstrip # => "abc\u0000\t\n\v\f\r "
10270  *
10271  * Related: String#rstrip, String#strip.
10272  */
10273 
10274 static VALUE
10275 rb_str_lstrip(VALUE str)
10276 {
10277  char *start;
10278  long len, loffset;
10279  RSTRING_GETMEM(str, start, len);
10280  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10281  if (loffset <= 0) return str_duplicate(rb_cString, str);
10282  return rb_str_subseq(str, loffset, len - loffset);
10283 }
10284 
10285 static long
10286 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10287 {
10288  const char *t;
10289 
10290  rb_str_check_dummy_enc(enc);
10292  rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10293  }
10294  if (!s || s >= e) return 0;
10295  t = e;
10296 
10297  /* remove trailing spaces or '\0's */
10298  if (single_byte_optimizable(str)) {
10299  unsigned char c;
10300  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10301  }
10302  else {
10303  char *tp;
10304 
10305  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10306  unsigned int c = rb_enc_codepoint(tp, e, enc);
10307  if (c && !rb_isspace(c)) break;
10308  t = tp;
10309  }
10310  }
10311  return e - t;
10312 }
10313 
10314 /*
10315  * call-seq:
10316  * rstrip! -> self or nil
10317  *
10318  * Like String#rstrip, except that any modifications are made in +self+;
10319  * returns +self+ if any modifications are made, +nil+ otherwise.
10320  *
10321  * Related: String#lstrip!, String#strip!.
10322  */
10323 
10324 static VALUE
10325 rb_str_rstrip_bang(VALUE str)
10326 {
10327  rb_encoding *enc;
10328  char *start;
10329  long olen, roffset;
10330 
10331  str_modify_keep_cr(str);
10332  enc = STR_ENC_GET(str);
10333  RSTRING_GETMEM(str, start, olen);
10334  roffset = rstrip_offset(str, start, start+olen, enc);
10335  if (roffset > 0) {
10336  long len = olen - roffset;
10337 
10338  STR_SET_LEN(str, len);
10339  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10340  return str;
10341  }
10342  return Qnil;
10343 }
10344 
10345 
10346 /*
10347  * call-seq:
10348  * rstrip -> new_string
10349  *
10350  * Returns a copy of the receiver with trailing whitespace removed;
10351  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10352  *
10353  * whitespace = "\x00\t\n\v\f\r "
10354  * s = whitespace + 'abc' + whitespace
10355  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10356  * s.rstrip # => "\u0000\t\n\v\f\r abc"
10357  *
10358  * Related: String#lstrip, String#strip.
10359  */
10360 
10361 static VALUE
10362 rb_str_rstrip(VALUE str)
10363 {
10364  rb_encoding *enc;
10365  char *start;
10366  long olen, roffset;
10367 
10368  enc = STR_ENC_GET(str);
10369  RSTRING_GETMEM(str, start, olen);
10370  roffset = rstrip_offset(str, start, start+olen, enc);
10371 
10372  if (roffset <= 0) return str_duplicate(rb_cString, str);
10373  return rb_str_subseq(str, 0, olen-roffset);
10374 }
10375 
10376 
10377 /*
10378  * call-seq:
10379  * strip! -> self or nil
10380  *
10381  * Like String#strip, except that any modifications are made in +self+;
10382  * returns +self+ if any modifications are made, +nil+ otherwise.
10383  *
10384  * Related: String#lstrip!, String#rstrip!.
10385  */
10386 
10387 static VALUE
10388 rb_str_strip_bang(VALUE str)
10389 {
10390  char *start;
10391  long olen, loffset, roffset;
10392  rb_encoding *enc;
10393 
10394  str_modify_keep_cr(str);
10395  enc = STR_ENC_GET(str);
10396  RSTRING_GETMEM(str, start, olen);
10397  loffset = lstrip_offset(str, start, start+olen, enc);
10398  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10399 
10400  if (loffset > 0 || roffset > 0) {
10401  long len = olen-roffset;
10402  if (loffset > 0) {
10403  len -= loffset;
10404  memmove(start, start + loffset, len);
10405  }
10406  STR_SET_LEN(str, len);
10407  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10408  return str;
10409  }
10410  return Qnil;
10411 }
10412 
10413 
10414 /*
10415  * call-seq:
10416  * strip -> new_string
10417  *
10418  * Returns a copy of the receiver with leading and trailing whitespace removed;
10419  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10420  *
10421  * whitespace = "\x00\t\n\v\f\r "
10422  * s = whitespace + 'abc' + whitespace
10423  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10424  * s.strip # => "abc"
10425  *
10426  * Related: String#lstrip, String#rstrip.
10427  */
10428 
10429 static VALUE
10430 rb_str_strip(VALUE str)
10431 {
10432  char *start;
10433  long olen, loffset, roffset;
10434  rb_encoding *enc = STR_ENC_GET(str);
10435 
10436  RSTRING_GETMEM(str, start, olen);
10437  loffset = lstrip_offset(str, start, start+olen, enc);
10438  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10439 
10440  if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10441  return rb_str_subseq(str, loffset, olen-loffset-roffset);
10442 }
10443 
10444 static VALUE
10445 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10446 {
10447  VALUE result = Qnil;
10448  long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10449  if (pos >= 0) {
10450  VALUE match;
10451  struct re_registers *regs;
10452  if (BUILTIN_TYPE(pat) == T_STRING) {
10453  regs = NULL;
10454  end = pos + RSTRING_LEN(pat);
10455  }
10456  else {
10457  match = rb_backref_get();
10458  regs = RMATCH_REGS(match);
10459  pos = BEG(0);
10460  end = END(0);
10461  }
10462 
10463  if (pos == end) {
10464  rb_encoding *enc = STR_ENC_GET(str);
10465  /*
10466  * Always consume at least one character of the input string
10467  */
10468  if (RSTRING_LEN(str) > end)
10469  *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10470  RSTRING_END(str), enc);
10471  else
10472  *start = end + 1;
10473  }
10474  else {
10475  *start = end;
10476  }
10477 
10478  if (!regs || regs->num_regs == 1) {
10479  result = rb_str_subseq(str, pos, end - pos);
10480  return result;
10481  }
10482  else {
10483  result = rb_ary_new2(regs->num_regs);
10484  for (int i = 1; i < regs->num_regs; i++) {
10485  VALUE s = Qnil;
10486  if (BEG(i) >= 0) {
10487  s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10488  }
10489 
10490  rb_ary_push(result, s);
10491  }
10492  }
10493 
10494  RB_GC_GUARD(match);
10495  }
10496 
10497  return result;
10498 }
10499 
10500 
10501 /*
10502  * call-seq:
10503  * scan(string_or_regexp) -> array
10504  * scan(string_or_regexp) {|matches| ... } -> self
10505  *
10506  * Matches a pattern against +self+; the pattern is:
10507  *
10508  * - +string_or_regexp+ itself, if it is a Regexp.
10509  * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10510  *
10511  * Iterates through +self+, generating a collection of matching results:
10512  *
10513  * - If the pattern contains no groups, each result is the
10514  * matched string, <code>$&</code>.
10515  * - If the pattern contains groups, each result is an array
10516  * containing one entry per group.
10517  *
10518  * With no block given, returns an array of the results:
10519  *
10520  * s = 'cruel world'
10521  * s.scan(/\w+/) # => ["cruel", "world"]
10522  * s.scan(/.../) # => ["cru", "el ", "wor"]
10523  * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10524  * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10525  *
10526  * With a block given, calls the block with each result; returns +self+:
10527  *
10528  * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10529  * print "\n"
10530  * s.scan(/(.)(.)/) {|x,y| print y, x }
10531  * print "\n"
10532  *
10533  * Output:
10534  *
10535  * <<cruel>> <<world>>
10536  * rceu lowlr
10537  *
10538  */
10539 
10540 static VALUE
10541 rb_str_scan(VALUE str, VALUE pat)
10542 {
10543  VALUE result;
10544  long start = 0;
10545  long last = -1, prev = 0;
10546  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10547 
10548  pat = get_pat_quoted(pat, 1);
10549  mustnot_broken(str);
10550  if (!rb_block_given_p()) {
10551  VALUE ary = rb_ary_new();
10552 
10553  while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10554  last = prev;
10555  prev = start;
10556  rb_ary_push(ary, result);
10557  }
10558  if (last >= 0) rb_pat_search(pat, str, last, 1);
10559  else rb_backref_set(Qnil);
10560  return ary;
10561  }
10562 
10563  while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10564  last = prev;
10565  prev = start;
10566  rb_yield(result);
10567  str_mod_check(str, p, len);
10568  }
10569  if (last >= 0) rb_pat_search(pat, str, last, 1);
10570  return str;
10571 }
10572 
10573 
10574 /*
10575  * call-seq:
10576  * hex -> integer
10577  *
10578  * Interprets the leading substring of +self+ as a string of hexadecimal digits
10579  * (with an optional sign and an optional <code>0x</code>) and returns the
10580  * corresponding number;
10581  * returns zero if there is no such leading substring:
10582  *
10583  * '0x0a'.hex # => 10
10584  * '-1234'.hex # => -4660
10585  * '0'.hex # => 0
10586  * 'non-numeric'.hex # => 0
10587  *
10588  * Related: String#oct.
10589  *
10590  */
10591 
10592 static VALUE
10593 rb_str_hex(VALUE str)
10594 {
10595  return rb_str_to_inum(str, 16, FALSE);
10596 }
10597 
10598 
10599 /*
10600  * call-seq:
10601  * oct -> integer
10602  *
10603  * Interprets the leading substring of +self+ as a string of octal digits
10604  * (with an optional sign) and returns the corresponding number;
10605  * returns zero if there is no such leading substring:
10606  *
10607  * '123'.oct # => 83
10608  * '-377'.oct # => -255
10609  * '0377non-numeric'.oct # => 255
10610  * 'non-numeric'.oct # => 0
10611  *
10612  * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10613  * see Kernel#Integer.
10614  *
10615  * Related: String#hex.
10616  *
10617  */
10618 
10619 static VALUE
10620 rb_str_oct(VALUE str)
10621 {
10622  return rb_str_to_inum(str, -8, FALSE);
10623 }
10624 
/*
 * Compiled only when HAVE_CRYPT_R is not defined: a native lock wrapped
 * in a struct and initialized statically with PTHREAD_MUTEX_INITIALIZER.
 */
10625 #ifndef HAVE_CRYPT_R
10626 # include "ruby/thread_native.h"
10627 # include "ruby/atomic.h"
10628 
10629 static struct {
10630  rb_nativethread_lock_t lock;
10631 } crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10632 
/*
 * Empty on purpose as far as this file shows: the lock above is already
 * initialized statically, so there is nothing to do at run time.
 */
10633 static void
10634 crypt_mutex_initialize(void)
10635 {
10636 }
10637 #endif
10638 
10639 /*
10640  * call-seq:
10641  * crypt(salt_str) -> new_string
10642  *
10643  * Returns the string generated by calling <code>crypt(3)</code>
10644  * standard library function with <code>str</code> and
10645  * <code>salt_str</code>, in this order, as its arguments. Please do
10646  * not use this method any longer. It is legacy; provided only for
10647  * backward compatibility with ruby scripts in earlier days. It is
10648  * bad to use in contemporary programs for several reasons:
10649  *
10650  * * Behaviour of C's <code>crypt(3)</code> depends on the OS on which
10651  * it is run. The generated string lacks data portability.
10652  *
10653  * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10654  * (i.e. silently ends up in unexpected results).
10655  *
10656  * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10657  * thread safe.
10658  *
10659  * * So-called "traditional" usage of <code>crypt(3)</code> is very
10660  * very very weak. According to its manpage, Linux's traditional
10661  * <code>crypt(3)</code> output has only 2**56 variations; too
10662  * easy to brute force today. And this is the default behaviour.
10663  *
10664  * * In order to make things robust some OSes implement so-called
10665  * "modular" usage. To go through, you have to do a complex
10666  * build-up of the <code>salt_str</code> parameter, by hand.
10667  * Failure in generation of a proper salt string tends not to
10668  * yield any errors; typos in parameters are normally not
10669  * detectable.
10670  *
10671  * * For instance, in the following example, the second invocation
10672  * of String#crypt is wrong; it has a typo in "round=" (lacks
10673  * "s"). However the call does not fail and something unexpected
10674  * is generated.
10675  *
10676  * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10677  * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10678  *
10679  * * Even in the "modular" mode, some hash functions are considered
10680  * archaic and no longer recommended at all; for instance module
10681  * <code>$1$</code> is officially abandoned by its author: see
10682  * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10683  * instance module <code>$3$</code> is considered completely
10684  * broken: see the manpage of FreeBSD.
10685  *
10686  * * On some OS such as Mac OS, there is no modular mode. Yet, as
10687  * written above, <code>crypt(3)</code> on Mac OS never fails.
10688  * This means even if you build up a proper salt string it
10689  * generates a traditional DES hash anyways, and there is no way
10690  * for you to be aware of.
10691  *
10692  * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10693  *
10694  * If for some reason you cannot migrate to other secure contemporary
10695  * password hashing algorithms, install the string-crypt gem and
10696  * <code>require 'string/crypt'</code> to continue using it.
10697  */
10698 
10699 static VALUE
10700 rb_str_crypt(VALUE str, VALUE salt)
10701 {
10702 #ifdef HAVE_CRYPT_R
10703  VALUE databuf;
10704  struct crypt_data *data;
10705 # define CRYPT_END() ALLOCV_END(databuf)
10706 #else
10707  extern char *crypt(const char *, const char *);
10708 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10709 #endif
10710  VALUE result;
10711  const char *s, *saltp;
10712  char *res;
10713 #ifdef BROKEN_CRYPT
10714  char salt_8bit_clean[3];
10715 #endif
10716 
10717  StringValue(salt);
10718  mustnot_wchar(str);
10719  mustnot_wchar(salt);
10720  s = StringValueCStr(str);
10721  saltp = RSTRING_PTR(salt);
10722  if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10723  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10724  }
10725 
10726 #ifdef BROKEN_CRYPT
10727  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10728  salt_8bit_clean[0] = saltp[0] & 0x7f;
10729  salt_8bit_clean[1] = saltp[1] & 0x7f;
10730  salt_8bit_clean[2] = '\0';
10731  saltp = salt_8bit_clean;
10732  }
10733 #endif
10734 #ifdef HAVE_CRYPT_R
10735  data = ALLOCV(databuf, sizeof(struct crypt_data));
10736 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10737  data->initialized = 0;
10738 # endif
10739  res = crypt_r(s, saltp, data);
10740 #else
10741  crypt_mutex_initialize();
10742  rb_nativethread_lock_lock(&crypt_mutex.lock);
10743  res = crypt(s, saltp);
10744 #endif
10745  if (!res) {
10746  int err = errno;
10747  CRYPT_END();
10748  rb_syserr_fail(err, "crypt");
10749  }
10750  result = rb_str_new_cstr(res);
10751  CRYPT_END();
10752  return result;
10753 }
10754 
10755 
10756 /*
10757  * call-seq:
10758  * ord -> integer
10759  *
10760  * :include: doc/string/ord.rdoc
10761  *
10762  */
10763 
10764 static VALUE
10765 rb_str_ord(VALUE s)
10766 {
10767  unsigned int c;
10768 
10769  c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10770  return UINT2NUM(c);
10771 }
10772 /*
10773  * call-seq:
10774  * sum(n = 16) -> integer
10775  *
10776  * :include: doc/string/sum.rdoc
10777  *
10778  */
10779 
10780 static VALUE
10781 rb_str_sum(int argc, VALUE *argv, VALUE str)
10782 {
10783  int bits = 16;
10784  char *ptr, *p, *pend;
10785  long len;
10786  VALUE sum = INT2FIX(0);
10787  unsigned long sum0 = 0;
10788 
10789  if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10790  bits = 0;
10791  }
10792  ptr = p = RSTRING_PTR(str);
10793  len = RSTRING_LEN(str);
10794  pend = p + len;
10795 
10796  while (p < pend) {
10797  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10798  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10799  str_mod_check(str, ptr, len);
10800  sum0 = 0;
10801  }
10802  sum0 += (unsigned char)*p;
10803  p++;
10804  }
10805 
10806  if (bits == 0) {
10807  if (sum0) {
10808  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10809  }
10810  }
10811  else {
10812  if (sum == INT2FIX(0)) {
10813  if (bits < (int)sizeof(long)*CHAR_BIT) {
10814  sum0 &= (((unsigned long)1)<<bits)-1;
10815  }
10816  sum = LONG2FIX(sum0);
10817  }
10818  else {
10819  VALUE mod;
10820 
10821  if (sum0) {
10822  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10823  }
10824 
10825  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10826  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10827  sum = rb_funcall(sum, '&', 1, mod);
10828  }
10829  }
10830  return sum;
10831 }
10832 
10833 static VALUE
10834 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10835 {
10836  rb_encoding *enc;
10837  VALUE w;
10838  long width, len, flen = 1, fclen = 1;
10839  VALUE res;
10840  char *p;
10841  const char *f = " ";
10842  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10843  VALUE pad;
10844  int singlebyte = 1, cr;
10845  int termlen;
10846 
10847  rb_scan_args(argc, argv, "11", &w, &pad);
10848  enc = STR_ENC_GET(str);
10849  termlen = rb_enc_mbminlen(enc);
10850  width = NUM2LONG(w);
10851  if (argc == 2) {
10852  StringValue(pad);
10853  enc = rb_enc_check(str, pad);
10854  f = RSTRING_PTR(pad);
10855  flen = RSTRING_LEN(pad);
10856  fclen = str_strlen(pad, enc); /* rb_enc_check */
10857  singlebyte = single_byte_optimizable(pad);
10858  if (flen == 0 || fclen == 0) {
10859  rb_raise(rb_eArgError, "zero width padding");
10860  }
10861  }
10862  len = str_strlen(str, enc); /* rb_enc_check */
10863  if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10864  n = width - len;
10865  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10866  rlen = n - llen;
10867  cr = ENC_CODERANGE(str);
10868  if (flen > 1) {
10869  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10870  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10871  }
10872  size = RSTRING_LEN(str);
10873  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10874  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10875  (len += llen2 + rlen2) >= LONG_MAX - size) {
10876  rb_raise(rb_eArgError, "argument too big");
10877  }
10878  len += size;
10879  res = str_new0(rb_cString, 0, len, termlen);
10880  p = RSTRING_PTR(res);
10881  if (flen <= 1) {
10882  memset(p, *f, llen);
10883  p += llen;
10884  }
10885  else {
10886  while (llen >= fclen) {
10887  memcpy(p,f,flen);
10888  p += flen;
10889  llen -= fclen;
10890  }
10891  if (llen > 0) {
10892  memcpy(p, f, llen2);
10893  p += llen2;
10894  }
10895  }
10896  memcpy(p, RSTRING_PTR(str), size);
10897  p += size;
10898  if (flen <= 1) {
10899  memset(p, *f, rlen);
10900  p += rlen;
10901  }
10902  else {
10903  while (rlen >= fclen) {
10904  memcpy(p,f,flen);
10905  p += flen;
10906  rlen -= fclen;
10907  }
10908  if (rlen > 0) {
10909  memcpy(p, f, rlen2);
10910  p += rlen2;
10911  }
10912  }
10913  TERM_FILL(p, termlen);
10914  STR_SET_LEN(res, p-RSTRING_PTR(res));
10915  rb_enc_associate(res, enc);
10916  if (argc == 2)
10917  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10918  if (cr != ENC_CODERANGE_BROKEN)
10919  ENC_CODERANGE_SET(res, cr);
10920 
10921  RB_GC_GUARD(pad);
10922  return res;
10923 }
10924 
10925 
10926 /*
10927  * call-seq:
10928  * ljust(size, pad_string = ' ') -> new_string
10929  *
10930  * :include: doc/string/ljust.rdoc
10931  *
10932  * Related: String#rjust, String#center.
10933  *
10934  */
10935 
10936 static VALUE
10937 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10938 {
10939  return rb_str_justify(argc, argv, str, 'l');
10940 }
10941 
10942 /*
10943  * call-seq:
10944  * rjust(size, pad_string = ' ') -> new_string
10945  *
10946  * :include: doc/string/rjust.rdoc
10947  *
10948  * Related: String#ljust, String#center.
10949  *
10950  */
10951 
10952 static VALUE
10953 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10954 {
10955  return rb_str_justify(argc, argv, str, 'r');
10956 }
10957 
10958 
10959 /*
10960  * call-seq:
10961  * center(size, pad_string = ' ') -> new_string
10962  *
10963  * :include: doc/string/center.rdoc
10964  *
10965  * Related: String#ljust, String#rjust.
10966  *
10967  */
10968 
10969 static VALUE
10970 rb_str_center(int argc, VALUE *argv, VALUE str)
10971 {
10972  return rb_str_justify(argc, argv, str, 'c');
10973 }
10974 
10975 /*
10976  * call-seq:
10977  * partition(string_or_regexp) -> [head, match, tail]
10978  *
10979  * :include: doc/string/partition.rdoc
10980  *
10981  */
10982 
10983 static VALUE
10984 rb_str_partition(VALUE str, VALUE sep)
10985 {
10986  long pos;
10987 
10988  sep = get_pat_quoted(sep, 0);
10989  if (RB_TYPE_P(sep, T_REGEXP)) {
10990  if (rb_reg_search(sep, str, 0, 0) < 0) {
10991  goto failed;
10992  }
10993  VALUE match = rb_backref_get();
10994  struct re_registers *regs = RMATCH_REGS(match);
10995 
10996  pos = BEG(0);
10997  sep = rb_str_subseq(str, pos, END(0) - pos);
10998  }
10999  else {
11000  pos = rb_str_index(str, sep, 0);
11001  if (pos < 0) goto failed;
11002  }
11003  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11004  sep,
11005  rb_str_subseq(str, pos+RSTRING_LEN(sep),
11006  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11007 
11008  failed:
11009  return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11010 }
11011 
11012 /*
11013  * call-seq:
11014  * rpartition(sep) -> [head, match, tail]
11015  *
11016  * :include: doc/string/rpartition.rdoc
11017  *
11018  */
11019 
11020 static VALUE
11021 rb_str_rpartition(VALUE str, VALUE sep)
11022 {
11023  long pos = RSTRING_LEN(str);
11024 
11025  sep = get_pat_quoted(sep, 0);
11026  if (RB_TYPE_P(sep, T_REGEXP)) {
11027  if (rb_reg_search(sep, str, pos, 1) < 0) {
11028  goto failed;
11029  }
11030  VALUE match = rb_backref_get();
11031  struct re_registers *regs = RMATCH_REGS(match);
11032 
11033  pos = BEG(0);
11034  sep = rb_str_subseq(str, pos, END(0) - pos);
11035  }
11036  else {
11037  pos = rb_str_sublen(str, pos);
11038  pos = rb_str_rindex(str, sep, pos);
11039  if (pos < 0) {
11040  goto failed;
11041  }
11042  }
11043 
11044  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11045  sep,
11046  rb_str_subseq(str, pos+RSTRING_LEN(sep),
11047  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11048  failed:
11049  return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11050 }
11051 
11052 /*
11053  * call-seq:
11054  * start_with?(*string_or_regexp) -> true or false
11055  *
11056  * :include: doc/string/start_with_p.rdoc
11057  *
11058  */
11059 
11060 static VALUE
11061 rb_str_start_with(int argc, VALUE *argv, VALUE str)
11062 {
11063  int i;
11064 
11065  for (i=0; i<argc; i++) {
11066  VALUE tmp = argv[i];
11067  if (RB_TYPE_P(tmp, T_REGEXP)) {
11068  if (rb_reg_start_with_p(tmp, str))
11069  return Qtrue;
11070  }
11071  else {
11072  const char *p, *s, *e;
11073  long slen, tlen;
11074  rb_encoding *enc;
11075 
11076  StringValue(tmp);
11077  enc = rb_enc_check(str, tmp);
11078  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11079  if ((slen = RSTRING_LEN(str)) < tlen) continue;
11080  p = RSTRING_PTR(str);
11081  e = p + slen;
11082  s = p + tlen;
11083  if (!at_char_right_boundary(p, s, e, enc))
11084  continue;
11085  if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11086  return Qtrue;
11087  }
11088  }
11089  return Qfalse;
11090 }
11091 
11092 /*
11093  * call-seq:
11094  * end_with?(*strings) -> true or false
11095  *
11096  * :include: doc/string/end_with_p.rdoc
11097  *
11098  */
11099 
11100 static VALUE
11101 rb_str_end_with(int argc, VALUE *argv, VALUE str)
11102 {
11103  int i;
11104 
11105  for (i=0; i<argc; i++) {
11106  VALUE tmp = argv[i];
11107  const char *p, *s, *e;
11108  long slen, tlen;
11109  rb_encoding *enc;
11110 
11111  StringValue(tmp);
11112  enc = rb_enc_check(str, tmp);
11113  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11114  if ((slen = RSTRING_LEN(str)) < tlen) continue;
11115  p = RSTRING_PTR(str);
11116  e = p + slen;
11117  s = e - tlen;
11118  if (!at_char_boundary(p, s, e, enc))
11119  continue;
11120  if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11121  return Qtrue;
11122  }
11123  return Qfalse;
11124 }
11125 
11135 static long
11136 deleted_prefix_length(VALUE str, VALUE prefix)
11137 {
11138  const char *strptr, *prefixptr;
11139  long olen, prefixlen;
11140  rb_encoding *enc = rb_enc_get(str);
11141 
11142  StringValue(prefix);
11143 
11144  if (!is_broken_string(prefix) ||
11145  !rb_enc_asciicompat(enc) ||
11146  !rb_enc_asciicompat(rb_enc_get(prefix))) {
11147  enc = rb_enc_check(str, prefix);
11148  }
11149 
11150  /* return 0 if not start with prefix */
11151  prefixlen = RSTRING_LEN(prefix);
11152  if (prefixlen <= 0) return 0;
11153  olen = RSTRING_LEN(str);
11154  if (olen < prefixlen) return 0;
11155  strptr = RSTRING_PTR(str);
11156  prefixptr = RSTRING_PTR(prefix);
11157  if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11158  if (is_broken_string(prefix)) {
11159  if (!is_broken_string(str)) {
11160  /* prefix in a valid string cannot be broken */
11161  return 0;
11162  }
11163  const char *strend = strptr + olen;
11164  const char *after_prefix = strptr + prefixlen;
11165  if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11166  /* prefix does not end at char-boundary */
11167  return 0;
11168  }
11169  }
11170  /* prefix part in `str` also should be valid. */
11171 
11172  return prefixlen;
11173 }
11174 
11175 /*
11176  * call-seq:
11177  * delete_prefix!(prefix) -> self or nil
11178  *
11179  * Like String#delete_prefix, except that +self+ is modified in place.
11180  * Returns +self+ if the prefix is removed, +nil+ otherwise.
11181  *
11182  */
11183 
11184 static VALUE
11185 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11186 {
11187  long prefixlen;
11188  str_modify_keep_cr(str);
11189 
11190  prefixlen = deleted_prefix_length(str, prefix);
11191  if (prefixlen <= 0) return Qnil;
11192 
11193  return rb_str_drop_bytes(str, prefixlen);
11194 }
11195 
11196 /*
11197  * call-seq:
11198  * delete_prefix(prefix) -> new_string
11199  *
11200  * :include: doc/string/delete_prefix.rdoc
11201  *
11202  */
11203 
11204 static VALUE
11205 rb_str_delete_prefix(VALUE str, VALUE prefix)
11206 {
11207  long prefixlen;
11208 
11209  prefixlen = deleted_prefix_length(str, prefix);
11210  if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11211 
11212  return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11213 }
11214 
11224 static long
11225 deleted_suffix_length(VALUE str, VALUE suffix)
11226 {
11227  const char *strptr, *suffixptr;
11228  long olen, suffixlen;
11229  rb_encoding *enc;
11230 
11231  StringValue(suffix);
11232  if (is_broken_string(suffix)) return 0;
11233  enc = rb_enc_check(str, suffix);
11234 
11235  /* return 0 if not start with suffix */
11236  suffixlen = RSTRING_LEN(suffix);
11237  if (suffixlen <= 0) return 0;
11238  olen = RSTRING_LEN(str);
11239  if (olen < suffixlen) return 0;
11240  strptr = RSTRING_PTR(str);
11241  suffixptr = RSTRING_PTR(suffix);
11242  const char *strend = strptr + olen;
11243  const char *before_suffix = strend - suffixlen;
11244  if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11245  if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11246 
11247  return suffixlen;
11248 }
11249 
11250 /*
11251  * call-seq:
11252  * delete_suffix!(suffix) -> self or nil
11253  *
11254  * Like String#delete_suffix, except that +self+ is modified in place.
11255  * Returns +self+ if the suffix is removed, +nil+ otherwise.
11256  *
11257  */
11258 
11259 static VALUE
11260 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11261 {
11262  long olen, suffixlen, len;
11263  str_modifiable(str);
11264 
11265  suffixlen = deleted_suffix_length(str, suffix);
11266  if (suffixlen <= 0) return Qnil;
11267 
11268  olen = RSTRING_LEN(str);
11269  str_modify_keep_cr(str);
11270  len = olen - suffixlen;
11271  STR_SET_LEN(str, len);
11272  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11273  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11274  ENC_CODERANGE_CLEAR(str);
11275  }
11276  return str;
11277 }
11278 
11279 /*
11280  * call-seq:
11281  * delete_suffix(suffix) -> new_string
11282  *
11283  * :include: doc/string/delete_suffix.rdoc
11284  *
11285  */
11286 
11287 static VALUE
11288 rb_str_delete_suffix(VALUE str, VALUE suffix)
11289 {
11290  long suffixlen;
11291 
11292  suffixlen = deleted_suffix_length(str, suffix);
11293  if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11294 
11295  return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11296 }
11297 
11298 void
11299 rb_str_setter(VALUE val, ID id, VALUE *var)
11300 {
11301  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11302  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11303  }
11304  *var = val;
11305 }
11306 
11307 static void
11308 rb_fs_setter(VALUE val, ID id, VALUE *var)
11309 {
11310  val = rb_fs_check(val);
11311  if (!val) {
11313  "value of %"PRIsVALUE" must be String or Regexp",
11314  rb_id2str(id));
11315  }
11316  if (!NIL_P(val)) {
11317  rb_warn_deprecated("'$;'", NULL);
11318  }
11319  *var = val;
11320 }
11321 
11322 
11323 /*
11324  * call-seq:
11325  * force_encoding(encoding) -> self
11326  *
11327  * :include: doc/string/force_encoding.rdoc
11328  *
11329  */
11330 
11331 static VALUE
11332 rb_str_force_encoding(VALUE str, VALUE enc)
11333 {
11334  str_modifiable(str);
11335 
11336  rb_encoding *encoding = rb_to_encoding(enc);
11337  int idx = rb_enc_to_index(encoding);
11338 
11339  // If the encoding is unchanged, we do nothing.
11340  if (ENCODING_GET(str) == idx) {
11341  return str;
11342  }
11343 
11344  rb_enc_associate_index(str, idx);
11345 
11346  // If the coderange was 7bit and the new encoding is ASCII-compatible
11347  // we can keep the coderange.
11348  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11349  return str;
11350  }
11351 
11352  ENC_CODERANGE_CLEAR(str);
11353  return str;
11354 }
11355 
11356 /*
11357  * call-seq:
11358  * b -> string
11359  *
11360  * :include: doc/string/b.rdoc
11361  *
11362  */
11363 
11364 static VALUE
11365 rb_str_b(VALUE str)
11366 {
11367  VALUE str2;
11368  if (STR_EMBED_P(str)) {
11369  str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11370  }
11371  else {
11372  str2 = str_alloc_heap(rb_cString);
11373  }
11374  str_replace_shared_without_enc(str2, str);
11375 
11376  if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11377  // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11378  // If we know the receiver's code range then we know the result's code range.
11379  int cr = ENC_CODERANGE(str);
11380  switch (cr) {
11381  case ENC_CODERANGE_7BIT:
11383  break;
11384  case ENC_CODERANGE_BROKEN:
11385  case ENC_CODERANGE_VALID:
11387  break;
11388  default:
11389  ENC_CODERANGE_CLEAR(str2);
11390  break;
11391  }
11392  }
11393 
11394  return str2;
11395 }
11396 
11397 /*
11398  * call-seq:
11399  * valid_encoding? -> true or false
11400  *
11401  * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11402  *
11403  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11404  * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11405  * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11406  */
11407 
11408 static VALUE
11409 rb_str_valid_encoding_p(VALUE str)
11410 {
11411  int cr = rb_enc_str_coderange(str);
11412 
11413  return RBOOL(cr != ENC_CODERANGE_BROKEN);
11414 }
11415 
11416 /*
11417  * call-seq:
11418  * ascii_only? -> true or false
11419  *
11420  * Returns +true+ if +self+ contains only ASCII characters,
11421  * +false+ otherwise:
11422  *
11423  * 'abc'.ascii_only? # => true
11424  * "abc\u{6666}".ascii_only? # => false
11425  *
11426  */
11427 
11428 static VALUE
11429 rb_str_is_ascii_only_p(VALUE str)
11430 {
11431  int cr = rb_enc_str_coderange(str);
11432 
11433  return RBOOL(cr == ENC_CODERANGE_7BIT);
11434 }
11435 
11436 VALUE
11438 {
11439  static const char ellipsis[] = "...";
11440  const long ellipsislen = sizeof(ellipsis) - 1;
11441  rb_encoding *const enc = rb_enc_get(str);
11442  const long blen = RSTRING_LEN(str);
11443  const char *const p = RSTRING_PTR(str), *e = p + blen;
11444  VALUE estr, ret = 0;
11445 
11446  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11447  if (len * rb_enc_mbminlen(enc) >= blen ||
11448  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11449  ret = str;
11450  }
11451  else if (len <= ellipsislen ||
11452  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11453  if (rb_enc_asciicompat(enc)) {
11454  ret = rb_str_new(ellipsis, len);
11455  rb_enc_associate(ret, enc);
11456  }
11457  else {
11458  estr = rb_usascii_str_new(ellipsis, len);
11459  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11460  }
11461  }
11462  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11463  rb_str_cat(ret, ellipsis, ellipsislen);
11464  }
11465  else {
11466  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11467  rb_enc_from_encoding(enc), 0, Qnil);
11468  rb_str_append(ret, estr);
11469  }
11470  return ret;
11471 }
11472 
11473 static VALUE
11474 str_compat_and_valid(VALUE str, rb_encoding *enc)
11475 {
11476  int cr;
11477  str = StringValue(str);
11478  cr = rb_enc_str_coderange(str);
11479  if (cr == ENC_CODERANGE_BROKEN) {
11480  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11481  }
11482  else {
11483  rb_encoding *e = STR_ENC_GET(str);
11484  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11485  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11486  rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11487  }
11488  }
11489  return str;
11490 }
11491 
11492 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11493 
11494 VALUE
11496 {
11497  rb_encoding *enc = STR_ENC_GET(str);
11498  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11499 }
11500 
11501 VALUE
11502 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11503 {
11504  int cr = ENC_CODERANGE_UNKNOWN;
11505  if (enc == STR_ENC_GET(str)) {
11506  /* cached coderange makes sense only when enc equals the
11507  * actual encoding of str */
11508  cr = ENC_CODERANGE(str);
11509  }
11510  return enc_str_scrub(enc, str, repl, cr);
11511 }
11512 
11513 static VALUE
11514 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11515 {
11516  int encidx;
11517  VALUE buf = Qnil;
11518  const char *rep, *p, *e, *p1, *sp;
11519  long replen = -1;
11520  long slen;
11521 
11522  if (rb_block_given_p()) {
11523  if (!NIL_P(repl))
11524  rb_raise(rb_eArgError, "both of block and replacement given");
11525  replen = 0;
11526  }
11527 
11528  if (ENC_CODERANGE_CLEAN_P(cr))
11529  return Qnil;
11530 
11531  if (!NIL_P(repl)) {
11532  repl = str_compat_and_valid(repl, enc);
11533  }
11534 
11535  if (rb_enc_dummy_p(enc)) {
11536  return Qnil;
11537  }
11538  encidx = rb_enc_to_index(enc);
11539 
11540 #define DEFAULT_REPLACE_CHAR(str) do { \
11541  static const char replace[sizeof(str)-1] = str; \
11542  rep = replace; replen = (int)sizeof(replace); \
11543  } while (0)
11544 
11545  slen = RSTRING_LEN(str);
11546  p = RSTRING_PTR(str);
11547  e = RSTRING_END(str);
11548  p1 = p;
11549  sp = p;
11550 
11551  if (rb_enc_asciicompat(enc)) {
11552  int rep7bit_p;
11553  if (!replen) {
11554  rep = NULL;
11555  rep7bit_p = FALSE;
11556  }
11557  else if (!NIL_P(repl)) {
11558  rep = RSTRING_PTR(repl);
11559  replen = RSTRING_LEN(repl);
11560  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11561  }
11562  else if (encidx == rb_utf8_encindex()) {
11563  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11564  rep7bit_p = FALSE;
11565  }
11566  else {
11567  DEFAULT_REPLACE_CHAR("?");
11568  rep7bit_p = TRUE;
11569  }
11570  cr = ENC_CODERANGE_7BIT;
11571 
11572  p = search_nonascii(p, e);
11573  if (!p) {
11574  p = e;
11575  }
11576  while (p < e) {
11577  int ret = rb_enc_precise_mbclen(p, e, enc);
11578  if (MBCLEN_NEEDMORE_P(ret)) {
11579  break;
11580  }
11581  else if (MBCLEN_CHARFOUND_P(ret)) {
11582  cr = ENC_CODERANGE_VALID;
11583  p += MBCLEN_CHARFOUND_LEN(ret);
11584  }
11585  else if (MBCLEN_INVALID_P(ret)) {
11586  /*
11587  * p1~p: valid ascii/multibyte chars
11588  * p ~e: invalid bytes + unknown bytes
11589  */
11590  long clen = rb_enc_mbmaxlen(enc);
11591  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11592  if (p > p1) {
11593  rb_str_buf_cat(buf, p1, p - p1);
11594  }
11595 
11596  if (e - p < clen) clen = e - p;
11597  if (clen <= 2) {
11598  clen = 1;
11599  }
11600  else {
11601  const char *q = p;
11602  clen--;
11603  for (; clen > 1; clen--) {
11604  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11605  if (MBCLEN_NEEDMORE_P(ret)) break;
11606  if (MBCLEN_INVALID_P(ret)) continue;
11607  UNREACHABLE;
11608  }
11609  }
11610  if (rep) {
11611  rb_str_buf_cat(buf, rep, replen);
11612  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11613  }
11614  else {
11615  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11616  str_mod_check(str, sp, slen);
11617  repl = str_compat_and_valid(repl, enc);
11618  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11619  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11620  cr = ENC_CODERANGE_VALID;
11621  }
11622  p += clen;
11623  p1 = p;
11624  p = search_nonascii(p, e);
11625  if (!p) {
11626  p = e;
11627  break;
11628  }
11629  }
11630  else {
11631  UNREACHABLE;
11632  }
11633  }
11634  if (NIL_P(buf)) {
11635  if (p == e) {
11636  ENC_CODERANGE_SET(str, cr);
11637  return Qnil;
11638  }
11639  buf = rb_str_buf_new(RSTRING_LEN(str));
11640  }
11641  if (p1 < p) {
11642  rb_str_buf_cat(buf, p1, p - p1);
11643  }
11644  if (p < e) {
11645  if (rep) {
11646  rb_str_buf_cat(buf, rep, replen);
11647  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11648  }
11649  else {
11650  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11651  str_mod_check(str, sp, slen);
11652  repl = str_compat_and_valid(repl, enc);
11653  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11654  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11655  cr = ENC_CODERANGE_VALID;
11656  }
11657  }
11658  }
11659  else {
11660  /* ASCII incompatible */
11661  long mbminlen = rb_enc_mbminlen(enc);
11662  if (!replen) {
11663  rep = NULL;
11664  }
11665  else if (!NIL_P(repl)) {
11666  rep = RSTRING_PTR(repl);
11667  replen = RSTRING_LEN(repl);
11668  }
11669  else if (encidx == ENCINDEX_UTF_16BE) {
11670  DEFAULT_REPLACE_CHAR("\xFF\xFD");
11671  }
11672  else if (encidx == ENCINDEX_UTF_16LE) {
11673  DEFAULT_REPLACE_CHAR("\xFD\xFF");
11674  }
11675  else if (encidx == ENCINDEX_UTF_32BE) {
11676  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11677  }
11678  else if (encidx == ENCINDEX_UTF_32LE) {
11679  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11680  }
11681  else {
11682  DEFAULT_REPLACE_CHAR("?");
11683  }
11684 
11685  while (p < e) {
11686  int ret = rb_enc_precise_mbclen(p, e, enc);
11687  if (MBCLEN_NEEDMORE_P(ret)) {
11688  break;
11689  }
11690  else if (MBCLEN_CHARFOUND_P(ret)) {
11691  p += MBCLEN_CHARFOUND_LEN(ret);
11692  }
11693  else if (MBCLEN_INVALID_P(ret)) {
11694  const char *q = p;
11695  long clen = rb_enc_mbmaxlen(enc);
11696  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11697  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11698 
11699  if (e - p < clen) clen = e - p;
11700  if (clen <= mbminlen * 2) {
11701  clen = mbminlen;
11702  }
11703  else {
11704  clen -= mbminlen;
11705  for (; clen > mbminlen; clen-=mbminlen) {
11706  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11707  if (MBCLEN_NEEDMORE_P(ret)) break;
11708  if (MBCLEN_INVALID_P(ret)) continue;
11709  UNREACHABLE;
11710  }
11711  }
11712  if (rep) {
11713  rb_str_buf_cat(buf, rep, replen);
11714  }
11715  else {
11716  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11717  str_mod_check(str, sp, slen);
11718  repl = str_compat_and_valid(repl, enc);
11719  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11720  }
11721  p += clen;
11722  p1 = p;
11723  }
11724  else {
11725  UNREACHABLE;
11726  }
11727  }
11728  if (NIL_P(buf)) {
11729  if (p == e) {
11731  return Qnil;
11732  }
11733  buf = rb_str_buf_new(RSTRING_LEN(str));
11734  }
11735  if (p1 < p) {
11736  rb_str_buf_cat(buf, p1, p - p1);
11737  }
11738  if (p < e) {
11739  if (rep) {
11740  rb_str_buf_cat(buf, rep, replen);
11741  }
11742  else {
11743  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11744  str_mod_check(str, sp, slen);
11745  repl = str_compat_and_valid(repl, enc);
11746  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11747  }
11748  }
11749  cr = ENC_CODERANGE_VALID;
11750  }
11751  ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11752  return buf;
11753 }
11754 
11755 /*
11756  * call-seq:
11757  * scrub(replacement_string = default_replacement) -> new_string
11758  * scrub{|bytes| ... } -> new_string
11759  *
11760  * :include: doc/string/scrub.rdoc
11761  *
11762  */
11763 static VALUE
11764 str_scrub(int argc, VALUE *argv, VALUE str)
11765 {
11766  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11767  VALUE new = rb_str_scrub(str, repl);
11768  return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11769 }
11770 
11771 /*
11772  * call-seq:
11773  * scrub! -> self
11774  * scrub!(replacement_string = default_replacement) -> self
11775  * scrub!{|bytes| ... } -> self
11776  *
11777  * Like String#scrub, except that any replacements are made in +self+.
11778  *
11779  */
11780 static VALUE
11781 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11782 {
11783  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11784  VALUE new = rb_str_scrub(str, repl);
11785  if (!NIL_P(new)) rb_str_replace(str, new);
11786  return str;
11787 }
11788 
11789 static ID id_normalize;
11790 static ID id_normalized_p;
11791 static VALUE mUnicodeNormalize;
11792 
11793 static VALUE
11794 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11795 {
11796  static int UnicodeNormalizeRequired = 0;
11797  VALUE argv2[2];
11798 
11799  if (!UnicodeNormalizeRequired) {
11800  rb_require("unicode_normalize/normalize.rb");
11801  UnicodeNormalizeRequired = 1;
11802  }
11803  argv2[0] = str;
11804  if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11805  return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11806 }
11807 
11808 /*
11809  * call-seq:
11810  * unicode_normalize(form = :nfc) -> string
11811  *
11812  * Returns a copy of +self+ with
11813  * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11814  *
11815  * Argument +form+ must be one of the following symbols
11816  * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11817  *
11818  * - +:nfc+: Canonical decomposition, followed by canonical composition.
11819  * - +:nfd+: Canonical decomposition.
11820  * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11821  * - +:nfkd+: Compatibility decomposition.
11822  *
11823  * The encoding of +self+ must be one of:
11824  *
11825  * - Encoding::UTF_8
11826  * - Encoding::UTF_16BE
11827  * - Encoding::UTF_16LE
11828  * - Encoding::UTF_32BE
11829  * - Encoding::UTF_32LE
11830  * - Encoding::GB18030
11831  * - Encoding::UCS_2BE
11832  * - Encoding::UCS_4BE
11833  *
11834  * Examples:
11835  *
11836  * "a\u0300".unicode_normalize # => "\u00E0"
11837  * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11838  *
11839  * Related: String#unicode_normalize!, String#unicode_normalized?.
11840  */
11841 static VALUE
11842 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11843 {
11844  return unicode_normalize_common(argc, argv, str, id_normalize);
11845 }
11846 
11847 /*
11848  * call-seq:
11849  * unicode_normalize!(form = :nfc) -> self
11850  *
11851  * Like String#unicode_normalize, except that the normalization
11852  * is performed on +self+.
11853  *
11854  * Related: String#unicode_normalized?.
11855  *
11856  */
11857 static VALUE
11858 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11859 {
11860  return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11861 }
11862 
11863 /* call-seq:
11864  * unicode_normalized?(form = :nfc) -> true or false
11865  *
11866  * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11867  * +false+ otherwise.
11868  * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11869  *
11870  * Examples:
11871  *
11872  * "a\u0300".unicode_normalized? # => false
11873  * "a\u0300".unicode_normalized?(:nfd) # => true
11874  * "\u00E0".unicode_normalized? # => true
11875  * "\u00E0".unicode_normalized?(:nfd) # => false
11876  *
11877  *
11878  * Raises an exception if +self+ is not in a Unicode encoding:
11879  *
11880  * s = "\xE0".force_encoding('ISO-8859-1')
11881  * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11882  *
11883  * Related: String#unicode_normalize, String#unicode_normalize!.
11884  *
11885  */
11886 static VALUE
11887 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11888 {
11889  return unicode_normalize_common(argc, argv, str, id_normalized_p);
11890 }
11891 
11892 /**********************************************************************
11893  * Document-class: Symbol
11894  *
11895  * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11896  *
11897  * You can create a +Symbol+ object explicitly with:
11898  *
11899  * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11900  *
11901  * The same +Symbol+ object will be
11902  * created for a given name or string for the duration of a program's
11903  * execution, regardless of the context or meaning of that name. Thus
11904  * if <code>Fred</code> is a constant in one context, a method in
11905  * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11906  * will be the same object in all three contexts.
11907  *
11908  * module One
11909  * class Fred
11910  * end
11911  * $f1 = :Fred
11912  * end
11913  * module Two
11914  * Fred = 1
11915  * $f2 = :Fred
11916  * end
11917  * def Fred()
11918  * end
11919  * $f3 = :Fred
11920  * $f1.object_id #=> 2514190
11921  * $f2.object_id #=> 2514190
11922  * $f3.object_id #=> 2514190
11923  *
11924  * Constant, method, and variable names are returned as symbols:
11925  *
11926  * module One
11927  * Two = 2
11928  * def three; 3 end
11929  * @four = 4
11930  * @@five = 5
11931  * $six = 6
11932  * end
11933  * seven = 7
11934  *
11935  * One.constants
11936  * # => [:Two]
11937  * One.instance_methods(true)
11938  * # => [:three]
11939  * One.instance_variables
11940  * # => [:@four]
11941  * One.class_variables
11942  * # => [:@@five]
11943  * global_variables.grep(/six/)
11944  * # => [:$six]
11945  * local_variables
11946  * # => [:seven]
11947  *
11948  * A +Symbol+ object differs from a String object in that
11949  * a +Symbol+ object represents an identifier, while a String object
11950  * represents text or data.
11951  *
11952  * == What's Here
11953  *
11954  * First, what's elsewhere. \Class +Symbol+:
11955  *
11956  * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11957  * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11958  *
11959  * Here, class +Symbol+ provides methods that are useful for:
11960  *
11961  * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11962  * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11963  * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11964  *
11965  * === Methods for Querying
11966  *
11967  * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11968  * - #=~: Returns the index of the first substring in symbol that matches a
11969  * given Regexp or other object; returns +nil+ if no match is found.
11970  * - #[], #slice : Returns a substring of symbol
11971  * determined by a given index, start/length, or range, or string.
11972  * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11973  * - #encoding: Returns the Encoding object that represents the encoding
11974  * of symbol.
11975  * - #end_with?: Returns +true+ if symbol ends with
11976  * any of the given strings.
11977  * - #match: Returns a MatchData object if symbol
11978  * matches a given Regexp; +nil+ otherwise.
11979  * - #match?: Returns +true+ if symbol
11980  * matches a given Regexp; +false+ otherwise.
11981  * - #length, #size: Returns the number of characters in symbol.
11982  * - #start_with?: Returns +true+ if symbol starts with
11983  * any of the given strings.
11984  *
11985  * === Methods for Comparing
11986  *
11987  * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11988  * or larger than symbol.
11989  * - #==, #===: Returns +true+ if a given symbol has the same content and
11990  * encoding.
11991  * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11992  * symbol is smaller than, equal to, or larger than symbol.
11993  * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11994  * after Unicode case folding; +false+ otherwise.
11995  *
11996  * === Methods for Converting
11997  *
11998  * - #capitalize: Returns symbol with the first character upcased
11999  * and all other characters downcased.
12000  * - #downcase: Returns symbol with all characters downcased.
12001  * - #inspect: Returns the string representation of +self+ as a symbol literal.
12002  * - #name: Returns the frozen string corresponding to symbol.
12003  * - #succ, #next: Returns the symbol that is the successor to symbol.
12004  * - #swapcase: Returns symbol with all upcase characters downcased
12005  * and all downcase characters upcased.
12006  * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12007  * - #to_s, #id2name: Returns the string corresponding to +self+.
12008  * - #to_sym, #intern: Returns +self+.
12009  * - #upcase: Returns symbol with all characters upcased.
12010  *
12011  */
12012 
12013 
12014 /*
12015  * call-seq:
12016  * symbol == object -> true or false
12017  *
12018  * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12019  */
12020 
12021 #define sym_equal rb_obj_equal
12022 
12023 static int
12024 sym_printable(const char *s, const char *send, rb_encoding *enc)
12025 {
12026  while (s < send) {
12027  int n;
12028  int c = rb_enc_precise_mbclen(s, send, enc);
12029 
12030  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12031  n = MBCLEN_CHARFOUND_LEN(c);
12032  c = rb_enc_mbc_to_codepoint(s, send, enc);
12033  if (!rb_enc_isprint(c, enc)) return FALSE;
12034  s += n;
12035  }
12036  return TRUE;
12037 }
12038 
12039 int
12040 rb_str_symname_p(VALUE sym)
12041 {
12042  rb_encoding *enc;
12043  const char *ptr;
12044  long len;
12046 
12047  if (resenc == NULL) resenc = rb_default_external_encoding();
12048  enc = STR_ENC_GET(sym);
12049  ptr = RSTRING_PTR(sym);
12050  len = RSTRING_LEN(sym);
12051  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12052  !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12053  return FALSE;
12054  }
12055  return TRUE;
12056 }
12057 
12058 VALUE
12059 rb_str_quote_unprintable(VALUE str)
12060 {
12061  rb_encoding *enc;
12062  const char *ptr;
12063  long len;
12064  rb_encoding *resenc;
12065 
12066  Check_Type(str, T_STRING);
12067  resenc = rb_default_internal_encoding();
12068  if (resenc == NULL) resenc = rb_default_external_encoding();
12069  enc = STR_ENC_GET(str);
12070  ptr = RSTRING_PTR(str);
12071  len = RSTRING_LEN(str);
12072  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12073  !sym_printable(ptr, ptr + len, enc)) {
12074  return rb_str_escape(str);
12075  }
12076  return str;
12077 }
12078 
12079 VALUE
12080 rb_id_quote_unprintable(ID id)
12081 {
12082  VALUE str = rb_id2str(id);
12083  if (!rb_str_symname_p(str)) {
12084  return rb_str_escape(str);
12085  }
12086  return str;
12087 }
12088 
12089 /*
12090  * call-seq:
12091  * inspect -> string
12092  *
12093  * Returns a string representation of +self+ (including the leading colon):
12094  *
12095  * :foo.inspect # => ":foo"
12096  *
12097  * Related: Symbol#to_s, Symbol#name.
12098  *
12099  */
12100 
12101 static VALUE
12102 sym_inspect(VALUE sym)
12103 {
12104  VALUE str = rb_sym2str(sym);
12105  const char *ptr;
12106  long len;
12107  char *dest;
12108 
12109  if (!rb_str_symname_p(str)) {
12110  str = rb_str_inspect(str);
12111  len = RSTRING_LEN(str);
12112  rb_str_resize(str, len + 1);
12113  dest = RSTRING_PTR(str);
12114  memmove(dest + 1, dest, len);
12115  }
12116  else {
12117  rb_encoding *enc = STR_ENC_GET(str);
12118  VALUE orig_str = str;
12119 
12120  len = RSTRING_LEN(orig_str);
12121  str = rb_enc_str_new(0, len + 1, enc);
12122 
12123  // Get data pointer after allocation
12124  ptr = RSTRING_PTR(orig_str);
12125  dest = RSTRING_PTR(str);
12126  memcpy(dest + 1, ptr, len);
12127 
12128  RB_GC_GUARD(orig_str);
12129  }
12130  dest[0] = ':';
12131 
12133 
12134  return str;
12135 }
12136 
/*
 * Returns a new String that shares the contents of the symbol's string
 * (str_new_shared over rb_sym2str(sym)).
 *
 * NOTE(review): the declarator line is missing from this listing (the
 * embedded numbering skips 12138). The body reads a parameter named `sym`;
 * presumably this is rb_sym_to_s(VALUE sym) -- confirm against upstream.
 */
12137 VALUE
12139 {
12140  return str_new_shared(rb_cString, rb_sym2str(sym));
12141 }
12142 
12143 VALUE
12144 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12145 {
12146  VALUE obj;
12147 
12148  if (argc < 1) {
12149  rb_raise(rb_eArgError, "no receiver given");
12150  }
12151  obj = argv[0];
12152  return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12153 }
12154 
12155 /*
12156  * call-seq:
12157  * succ
12158  *
12159  * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12160  *
12161  * :foo.succ # => :fop
12162  *
12163  * Related: String#succ.
12164  */
12165 
12166 static VALUE
12167 sym_succ(VALUE sym)
12168 {
12169  return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12170 }
12171 
12172 /*
12173  * call-seq:
12174  * symbol <=> object -> -1, 0, +1, or nil
12175  *
12176  * If +object+ is a symbol,
12177  * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12178  *
12179  * :bar <=> :foo # => -1
12180  * :foo <=> :foo # => 0
12181  * :foo <=> :bar # => 1
12182  *
12183  * Otherwise, returns +nil+:
12184  *
12185  * :foo <=> 'bar' # => nil
12186  *
12187  * Related: String#<=>.
12188  */
12189 
12190 static VALUE
12191 sym_cmp(VALUE sym, VALUE other)
12192 {
12193  if (!SYMBOL_P(other)) {
12194  return Qnil;
12195  }
12196  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12197 }
12198 
12199 /*
12200  * call-seq:
12201  * casecmp(object) -> -1, 0, 1, or nil
12202  *
12203  * :include: doc/symbol/casecmp.rdoc
12204  *
12205  */
12206 
12207 static VALUE
12208 sym_casecmp(VALUE sym, VALUE other)
12209 {
12210  if (!SYMBOL_P(other)) {
12211  return Qnil;
12212  }
12213  return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12214 }
12215 
12216 /*
12217  * call-seq:
12218  * casecmp?(object) -> true, false, or nil
12219  *
12220  * :include: doc/symbol/casecmp_p.rdoc
12221  *
12222  */
12223 
12224 static VALUE
12225 sym_casecmp_p(VALUE sym, VALUE other)
12226 {
12227  if (!SYMBOL_P(other)) {
12228  return Qnil;
12229  }
12230  return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12231 }
12232 
12233 /*
12234  * call-seq:
12235  * symbol =~ object -> integer or nil
12236  *
12237  * Equivalent to <tt>symbol.to_s =~ object</tt>,
12238  * including possible updates to global variables;
12239  * see String#=~.
12240  *
12241  */
12242 
12243 static VALUE
12244 sym_match(VALUE sym, VALUE other)
12245 {
12246  return rb_str_match(rb_sym2str(sym), other);
12247 }
12248 
12249 /*
12250  * call-seq:
12251  * match(pattern, offset = 0) -> matchdata or nil
12252  * match(pattern, offset = 0) {|matchdata| } -> object
12253  *
12254  * Equivalent to <tt>self.to_s.match</tt>,
12255  * including possible updates to global variables;
12256  * see String#match.
12257  *
12258  */
12259 
12260 static VALUE
12261 sym_match_m(int argc, VALUE *argv, VALUE sym)
12262 {
12263  return rb_str_match_m(argc, argv, rb_sym2str(sym));
12264 }
12265 
12266 /*
12267  * call-seq:
12268  * match?(pattern, offset = 0) -> true or false
12269  *
12270  * Equivalent to <tt>self.to_s.match?</tt>;
12271  * see String#match?.
12272  *
12273  */
12274 
12275 static VALUE
12276 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12277 {
12278  return rb_str_match_m_p(argc, argv, sym);
12279 }
12280 
12281 /*
12282  * call-seq:
12283  * symbol[index] -> string or nil
12284  * symbol[start, length] -> string or nil
12285  * symbol[range] -> string or nil
12286  * symbol[regexp, capture = 0] -> string or nil
12287  * symbol[substring] -> string or nil
12288  *
12289  * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12290  *
12291  */
12292 
12293 static VALUE
12294 sym_aref(int argc, VALUE *argv, VALUE sym)
12295 {
12296  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12297 }
12298 
12299 /*
12300  * call-seq:
12301  * length -> integer
12302  *
12303  * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12304  */
12305 
12306 static VALUE
12307 sym_length(VALUE sym)
12308 {
12309  return rb_str_length(rb_sym2str(sym));
12310 }
12311 
12312 /*
12313  * call-seq:
12314  * empty? -> true or false
12315  *
12316  * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12317  *
12318  */
12319 
12320 static VALUE
12321 sym_empty(VALUE sym)
12322 {
12323  return rb_str_empty(rb_sym2str(sym));
12324 }
12325 
12326 /*
12327  * call-seq:
12328  * upcase(*options) -> symbol
12329  *
12330  * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12331  *
12332  * See String#upcase.
12333  *
12334  */
12335 
12336 static VALUE
12337 sym_upcase(int argc, VALUE *argv, VALUE sym)
12338 {
12339  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12340 }
12341 
12342 /*
12343  * call-seq:
12344  * downcase(*options) -> symbol
12345  *
12346  * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12347  *
12348  * See String#downcase.
12349  *
12350  * Related: Symbol#upcase.
12351  *
12352  */
12353 
12354 static VALUE
12355 sym_downcase(int argc, VALUE *argv, VALUE sym)
12356 {
12357  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12358 }
12359 
12360 /*
12361  * call-seq:
12362  * capitalize(*options) -> symbol
12363  *
12364  * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12365  *
12366  * See String#capitalize.
12367  *
12368  */
12369 
12370 static VALUE
12371 sym_capitalize(int argc, VALUE *argv, VALUE sym)
12372 {
12373  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12374 }
12375 
12376 /*
12377  * call-seq:
12378  * swapcase(*options) -> symbol
12379  *
12380  * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12381  *
12382  * See String#swapcase.
12383  *
12384  */
12385 
12386 static VALUE
12387 sym_swapcase(int argc, VALUE *argv, VALUE sym)
12388 {
12389  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12390 }
12391 
12392 /*
12393  * call-seq:
12394  * start_with?(*string_or_regexp) -> true or false
12395  *
12396  * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12397  *
12398  */
12399 
12400 static VALUE
12401 sym_start_with(int argc, VALUE *argv, VALUE sym)
12402 {
12403  return rb_str_start_with(argc, argv, rb_sym2str(sym));
12404 }
12405 
12406 /*
12407  * call-seq:
12408  * end_with?(*strings) -> true or false
12409  *
12410  *
12411  * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12412  *
12413  */
12414 
12415 static VALUE
12416 sym_end_with(int argc, VALUE *argv, VALUE sym)
12417 {
12418  return rb_str_end_with(argc, argv, rb_sym2str(sym));
12419 }
12420 
12421 /*
12422  * call-seq:
12423  * encoding -> encoding
12424  *
12425  * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12426  *
12427  */
12428 
12429 static VALUE
12430 sym_encoding(VALUE sym)
12431 {
12432  return rb_obj_encoding(rb_sym2str(sym));
12433 }
12434 
12435 static VALUE
12436 string_for_symbol(VALUE name)
12437 {
12438  if (!RB_TYPE_P(name, T_STRING)) {
12439  VALUE tmp = rb_check_string_type(name);
12440  if (NIL_P(tmp)) {
12441  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12442  name);
12443  }
12444  name = tmp;
12445  }
12446  return name;
12447 }
12448 
/*
 * Converts +name+ to an ID: a Symbol is converted with SYM2ID; anything
 * else goes through string_for_symbol and is then interned with
 * rb_intern_str.
 *
 * NOTE(review): the declarator line is missing from this listing (the
 * embedded numbering skips 12450). The body reads a parameter named `name`;
 * presumably this is rb_to_id(VALUE name) -- confirm against upstream.
 */
12449 ID
12451 {
12452  if (SYMBOL_P(name)) {
12453  return SYM2ID(name);
12454  }
12455  name = string_for_symbol(name);
12456  return rb_intern_str(name);
12457 }
12458 
/*
 * Converts +name+ to a Symbol: a Symbol is returned as is; anything else
 * goes through string_for_symbol and is then interned with rb_str_intern.
 *
 * NOTE(review): the declarator line is missing from this listing (the
 * embedded numbering skips 12460). The body reads a parameter named `name`;
 * presumably this is rb_to_symbol(VALUE name) -- confirm against upstream.
 */
12459 VALUE
12461 {
12462  if (SYMBOL_P(name)) {
12463  return name;
12464  }
12465  name = string_for_symbol(name);
12466  return rb_str_intern(name);
12467 }
12468 
12469 /*
12470  * call-seq:
12471  * Symbol.all_symbols -> array_of_symbols
12472  *
12473  * Returns an array of all symbols currently in Ruby's symbol table:
12474  *
12475  * Symbol.all_symbols.size # => 9334
12476  * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12477  *
12478  */
12479 
12480 static VALUE
12481 sym_all_symbols(VALUE _)
12482 {
12483  return rb_sym_all_symbols();
12484 }
12485 
/*
 * Returns rb_fstring(str).
 *
 * NOTE(review): the declarator line is missing from this listing (the
 * embedded numbering skips 12487). The body reads a parameter named `str`;
 * presumably this is rb_str_to_interned_str(VALUE str) -- confirm against
 * upstream.
 */
12486 VALUE
12488 {
12489  return rb_fstring(str);
12490 }
12491 
12492 VALUE
12493 rb_interned_str(const char *ptr, long len)
12494 {
12495  struct RString fake_str;
12496  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12497 }
12498 
/*
 * Calls rb_interned_str with the length computed by strlen(ptr), so +ptr+
 * must be NUL-terminated.
 *
 * NOTE(review): the declarator line is missing from this listing (the
 * embedded numbering skips 12500). The body reads a parameter named `ptr`;
 * presumably this is rb_interned_str_cstr(const char *ptr) -- confirm
 * against upstream.
 */
12498 VALUE
12501 {
12502  return rb_interned_str(ptr, strlen(ptr));
12503 }
12504 
12505 VALUE
12506 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12507 {
12508  if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12509  rb_enc_autoload(enc);
12510  }
12511 
12512  struct RString fake_str;
12513  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12514 }
12515 
12516 VALUE
12517 rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12518 {
12519  if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12520  rb_enc_autoload(enc);
12521  }
12522 
12523  struct RString fake_str;
12524  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12525 }
12526 
/*
 * Calls rb_enc_interned_str with the length computed by strlen(ptr), so
 * +ptr+ must be NUL-terminated.
 *
 * NOTE(review): the declarator line is missing from this listing (the
 * embedded numbering skips 12528). The body reads parameters named `ptr`
 * and `enc`; presumably this is
 * rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc) -- confirm
 * against upstream.
 */
12527 VALUE
12529 {
12530  return rb_enc_interned_str(ptr, strlen(ptr), enc);
12531 }
12532 
12533 #if USE_YJIT
/*
 * Appends +codepoint+ to +str+. Codes in the range 0..0xfe take a fast path
 * that appends a single byte; everything else (including 0xff, because the
 * comparison is strictly less-than) falls through to rb_str_concat.
 *
 * NOTE(review): the embedded numbering skips 12537 and the closing brace on
 * 12544 has no visible opener, so a condition enclosing the fast path
 * appears to be missing from this listing -- confirm against upstream
 * before editing.
 */
12534 void
12535 rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
12536 {
12538  ssize_t code = RB_NUM2SSIZE(codepoint);
12539 
12540  if (RB_LIKELY(code >= 0 && code < 0xff)) {
12541  rb_str_buf_cat_byte(str, (char) code);
12542  return;
12543  }
12544  }
12545 
12546  rb_str_concat(str, codepoint);
12547 }
12548 #endif
12549 
/*
 * Defines the String and Symbol classes and registers their methods.
 *
 * The numeric last argument of rb_define_method is the arity; -1 is used
 * for functions with the (int argc, VALUE *argv, VALUE self) signature.
 *
 * NOTE(review): the embedded numbering skips several lines inside this
 * function (for example 12556, 12563-12564, 12569-12570, 12576, 12582,
 * 12584, 12611, 12641, 12721 and 12724-12726). Whatever those lines
 * register is not visible in this listing.
 */
12550 void
12551 Init_String(void)
12552 {
12553  rb_cString = rb_define_class("String", rb_cObject);
12554  RUBY_ASSERT(rb_vm_fstring_table());
      /* Run fstring_set_class_i over every entry of the fstring table, passing rb_cString. */
12555  st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12557  rb_define_alloc_func(rb_cString, empty_str_alloc);
12558  rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12559  rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12560  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12561  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12562  rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12565  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12566  rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12567  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12568  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12571  rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12572  rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12573  rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12574  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12575  rb_define_method(rb_cString, "length", rb_str_length, 0);
12577  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12578  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12579  rb_define_method(rb_cString, "=~", rb_str_match, 1);
12580  rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12581  rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12583  rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12585  rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12586  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12587  rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12588  rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12589  rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12590  rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12591  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12592  rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12593  rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12594  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12595  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12596  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12597  rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12598  rb_define_method(rb_cString, "scrub", str_scrub, -1);
12599  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12600  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12601  rb_define_method(rb_cString, "+@", str_uplus, 0);
12602  rb_define_method(rb_cString, "-@", str_uminus, 0);
12603  rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12604  rb_define_alias(rb_cString, "dedup", "-@");
12605 
12606  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12607  rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12608  rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12609  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12610  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12612  rb_define_method(rb_cString, "undump", str_undump, 0);
12613 
      /* Store the :ascii, :turkic, :lithuanian and :fold symbols in file-level variables. */
12614  sym_ascii = ID2SYM(rb_intern_const("ascii"));
12615  sym_turkic = ID2SYM(rb_intern_const("turkic"));
12616  sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12617  sym_fold = ID2SYM(rb_intern_const("fold"));
12618 
12619  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12620  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12621  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12622  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12623 
12624  rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12625  rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12626  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12627  rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12628 
12629  rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12630  rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12631  rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12632  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12633  rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12634  rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12635  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12636  rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12637  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12638  rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12639  rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12640  rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12642  rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12643  rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12644  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12645  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12646  rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12647 
12648  rb_define_method(rb_cString, "include?", rb_str_include, 1);
12649  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12650  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12651 
12652  rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12653 
12654  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12655  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12656  rb_define_method(rb_cString, "center", rb_str_center, -1);
12657 
12658  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12659  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12660  rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12661  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12662  rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12663  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12664  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12665  rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12666  rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12667 
12668  rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12669  rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12670  rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12671  rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12672  rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12673  rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12674  rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12675  rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12676  rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12677 
12678  rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12679  rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12680  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12681  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12682  rb_define_method(rb_cString, "count", rb_str_count, -1);
12683 
12684  rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12685  rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12686  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12687  rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12688 
12689  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12690  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12691  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12692  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12693  rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12694 
12695  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12696 
12697  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12698  rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12699 
12700  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12701  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12702 
12703  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12704  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12705  rb_define_method(rb_cString, "b", rb_str_b, 0);
12706  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12707  rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12708 
12709  /* define UnicodeNormalize module here so that we don't have to look it up */
12710  mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12711  id_normalize = rb_intern_const("normalize");
12712  id_normalized_p = rb_intern_const("normalized?");
12713 
12714  rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12715  rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12716  rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12717 
      /* $; and $-F are both backed by rb_fs, with rb_fs_setter as the setter hook. */
12718  rb_fs = Qnil;
12719  rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12720  rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12722 
      /* Symbol class and its methods. */
12723  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12727  rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12728 
12729  rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12730  rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12731  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12732  rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12733  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12734  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12735  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12736 
12737  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12738  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12739  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12740  rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12741 
12742  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12743  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12744  rb_define_method(rb_cSymbol, "length", sym_length, 0);
12745  rb_define_method(rb_cSymbol, "size", sym_length, 0);
12746  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12747  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12748  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12749 
12750  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12751  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12752  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12753  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12754 
12755  rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12756  rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12757 
12758  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12759 }
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition: assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition: assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition: assert.h:219
Atomic operations.
#define RB_LIKELY(x)
Asserts that the given Boolean expression likely holds.
Definition: assume.h:43
#define RB_UNLIKELY(x)
Asserts that the given Boolean expression likely doesn't hold.
Definition: assume.h:50
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition: coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition: coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition: ctype.h:395
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition: ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition: ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition: ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition: ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition: fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition: fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition: fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition: class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition: class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition: class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition: class.c:2635
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:2142
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:916
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition: class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition: value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition: value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition: fl_type.h:134
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition: fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition: memory.h:399
#define ISSPACE
Old name of rb_isspace.
Definition: ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition: coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition: coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:137
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition: assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition: value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition: value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition: symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition: coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition: size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition: fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition: encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition: long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition: coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition: memory.h:396
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:394
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition: fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition: fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition: array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition: long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition: fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition: ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition: encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition: st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition: encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition: fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition: long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition: util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition: memory.h:400
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition: fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition: double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition: ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition: encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition: fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition: fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition: fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition: int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition: encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition: coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition: fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition: encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition: error.c:475
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3627
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:676
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition: error.c:3739
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:1088
VALUE rb_eRangeError
RangeError exception.
Definition: error.c:1407
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1403
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition: error.c:3678
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition: error.c:1410
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1401
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1404
VALUE rb_eIndexError
IndexError exception.
Definition: error.c:1405
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:1045
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition: error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition: object.c:667
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition: object.c:2091
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize method.
Definition: object.c:2109
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition: object.c:1270
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition: object.c:3477
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition: object.c:574
VALUE rb_cSymbol
Symbol class.
Definition: string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition: object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition: object.c:1258
VALUE rb_mComparable
Comparable module.
Definition: compar.c:19
VALUE rb_cString
String class.
Definition: string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:3186
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition: gc.h:603
Encoding-related APIs.
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1508
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1574
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:197
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1176
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:920
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1007
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1472
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes required to represent the passed code point using the passed encoding.
Definition: encoding.c:1226
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:683
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1134
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1466
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1158
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1454
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1125
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1212
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:638
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1661
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:191
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:704
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:323
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:971
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1047
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:182
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:768
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1460
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:662
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:571
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:643
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:402
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:447
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1164
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object.
Definition: encoding.c:979
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1013
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition: encoding.h:99
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:591
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:417
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:726
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:432
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:619
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1188
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1478
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1522
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1270
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition: string.c:2921
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:880
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition: string.c:1135
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition: string.c:1154
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition: string.c:12506
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition: re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition: string.c:2251
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition: string.c:3597
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1082
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:1042
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition: string.c:1375
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition: string.c:1276
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:899
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1148
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition: string.c:12528
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:764
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition: symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2914
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2651
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition: vm_eval.c:1099
VALUE rb_funcallv(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcall(), except it takes the method arguments as a C array.
Definition: vm_eval.c:1058
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition: vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition: gc.h:479
void rb_gc_register_address(VALUE *valptr)
Inform the garbage collector that the global or static variable pointed by valptr stores a live Ruby ...
Definition: gc.c:2795
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:1014
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:747
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should allocate.
Definition: array.c:741
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
Definition: array.c:1384
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
Definition: array.c:648
VALUE rb_ary_new_from_args(long n,...)
Constructs an array from the passed objects.
Definition: array.c:753
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Identical to rb_cstr2inum(), except it takes Ruby's strings instead of C's.
Definition: bignum.c:4308
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition: enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition: enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition: error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:284
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1864
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2073
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2893
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Identical to rb_hash_aref(), except it always returns RUBY_Qnil for misses.
Definition: hash.c:2099
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1475
VALUE rb_rs
The record separator character for inputs, or the $/.
Definition: io.c:205
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition: string.c:649
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition: io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition: vm.c:1817
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current process.
Definition: symbol.c:1042
void rb_backref_set(VALUE md)
Updates $~.
Definition: vm.c:1823
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition: range.c:1842
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition: re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition: re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition: re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition: re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition: re.c:1905
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby string instead of a C string.
Definition: string.c:12487
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition: string.c:1661
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition: string.c:1440
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition: string.c:2402
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition: string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition: string.h:939
VALUE rb_utf8_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "UTF-8" encoding.
Definition: string.c:1034
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition: string.c:3662
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition: string.c:1351
VALUE rb_utf8_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "UTF-8" encoding.
Definition: string.c:1074
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition: string.c:12138
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition: string.c:2474
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition: string.c:1327
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1655
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition: string.c:2949
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition: string.c:5254
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition: string.c:4031
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition: string.c:3046
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:11437
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition: random.c:1752
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1708
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition: string.c:1117
VALUE rb_str_buf_cat(VALUE, const char *, long)
Just another name of rb_str_cat.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:934
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1446
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1911
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2642
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition: string.c:4017
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition: string.c:3430
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition: string.c:2340
VALUE rb_str_resurrect(VALUE str)
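I guess there is no use case of this function in extension libraries, but this is a routine identical to rb_str_dup(), except it always creates an instance of rb_cString regardless of the given object's class.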
Definition: string.c:1929
VALUE rb_usascii_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "US ASCII" encoding.
Definition: string.c:1026
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition: string.c:6464
VALUE rb_usascii_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "US ASCII" encoding.
Definition: string.c:1066
VALUE rb_str_buf_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition: string.c:3054
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition: string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:12500
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1357
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition: string.c:3628
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition: string.c:2996
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition: string.c:4133
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3254
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition: string.c:7185
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition: string.c:2694
VALUE rb_str_buf_new_cstr(const char *ptr)
This is a rb_str_buf_new() + rb_str_buf_cat() combo.
Definition: string.c:1643
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition: string.c:12493
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition: string.c:4087
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition: string.c:3904
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition: string.c:4062
#define rb_strlen_lit(str)
Length of a string literal.
Definition: string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string is a NUL-terminated ...
Definition: string.c:3604
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition: string.c:3163
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition: string.c:5766
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:1020
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition: string.c:11495
VALUE rb_str_dup_frozen(VALUE)
Just another name of rb_str_new_frozen.
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition: string.c:1611
VALUE rb_locale_str_new_cstr(const char *ptr)
Identical to rb_locale_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1345
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition: string.c:2845
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition: string.c:3141
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition: string.c:3237
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1054
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3302
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition: string.c:1129
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2650
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:7299
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition: string.c:1339
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1627
VALUE rb_external_str_new_cstr(const char *ptr)
Identical to rb_external_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1333
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition: string.c:2354
VALUE rb_str_cat_cstr(VALUE dst, const char *src)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:3440
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition: string.c:5684
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition: string.c:9392
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition: string.c:1123
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver is an instance of RString.
Definition: symbol.c:878
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition: string.c:1770
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1859
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition: variable.c:1876
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2955
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition: vm_method.c:1286
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:823
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:970
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition: string.c:12460
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: string.c:12450
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: symbol.c:829
VALUE rb_id2str(ID id)
Identical to rb_id2name(), except it returns a Ruby String instead of a C string.
Definition: symbol.c:986
void rb_define_hooked_variable(const char *name, VALUE *var, rb_gvar_getter_t *getter, rb_gvar_setter_t *setter)
Identical to rb_define_virtual_variable(), but can also specify a storage.
Definition: variable.c:707
int capa
Designed capacity of the buffer.
Definition: io.h:11
char * ptr
Pointer to the underlying memory region, of at least capa bytes.
Definition: io.h:2
int off
Offset inside of ptr.
Definition: io.h:5
int len
Length of the buffer.
Definition: io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition: re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition: re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition: re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition: sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition: vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:367
#define ALLOCA_N(type, n)
Definition: memory.h:287
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition: memory.h:355
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:162
VALUE type(ANYARGS)
ANYARGS-ed function type.
Definition: cxxanyargs.hpp:56
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except that the return type differs.
Definition: rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition: rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition: rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition: rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition: rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition: rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition: string.c:1369
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:442
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:416
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except that the return type differs.
Definition: rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition: rstring.h:488
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition: string.c:2717
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition: string.c:2822
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition: string.c:2706
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:367
#define RSTRING(obj)
Convenient casting macro.
Definition: rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition: string.c:1363
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition: string.c:1699
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition: rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition: load.c:1400
#define errno
Ractor-aware version of errno.
Definition: ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition: size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition: stdarg.h:35
VALUE flags
Per-object flags.
Definition: rbasic.h:75
Ruby's String.
Definition: rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition: rstring.h:199
union RString::@48 as
String's specific fields.
long len
Length of the string, not including terminating NUL character.
Definition: rstring.h:206
struct RString::@48::@50 embed
Embedded contents.
struct RString::@48::@49 heap
Strings that use separated memory region for contents use this pattern.
VALUE shared
Parent of the string.
Definition: rstring.h:240
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:200
Definition: st.h:79
Definition: string.c:8257
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition: thread.c:298
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition: value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition: value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:376
ruby_value_type
C-level type of an object.
Definition: value_type.h:113
void ruby_xfree(void *ptr)
Deallocates a storage instance.
Definition: gc.c:4264