Ruby  3.4.0dev (2024-11-22 revision 37a72b0150ec36b4ea27175039afc28c62207b0c)
string.c (37a72b0150ec36b4ea27175039afc28c62207b0c)
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author$
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/internal/config.h"
15 
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
19 
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
23 
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "id.h"
27 #include "internal.h"
28 #include "internal/array.h"
29 #include "internal/compar.h"
30 #include "internal/compilers.h"
31 #include "internal/encoding.h"
32 #include "internal/error.h"
33 #include "internal/gc.h"
34 #include "internal/numeric.h"
35 #include "internal/object.h"
36 #include "internal/proc.h"
37 #include "internal/re.h"
38 #include "internal/sanitizers.h"
39 #include "internal/string.h"
40 #include "internal/transcode.h"
41 #include "probes.h"
42 #include "ruby/encoding.h"
43 #include "ruby/re.h"
44 #include "ruby/util.h"
45 #include "ruby_assert.h"
46 #include "vm_sync.h"
47 
48 #if defined HAVE_CRYPT_R
49 # if defined HAVE_CRYPT_H
50 # include <crypt.h>
51 # endif
52 #elif !defined HAVE_CRYPT
53 # include "missing/crypt.h"
54 # define HAVE_CRYPT_R 1
55 #endif
56 
57 #define BEG(no) (regs->beg[(no)])
58 #define END(no) (regs->end[(no)])
59 
60 #undef rb_str_new
61 #undef rb_usascii_str_new
62 #undef rb_utf8_str_new
63 #undef rb_enc_str_new
64 #undef rb_str_new_cstr
65 #undef rb_usascii_str_new_cstr
66 #undef rb_utf8_str_new_cstr
67 #undef rb_enc_str_new_cstr
68 #undef rb_external_str_new_cstr
69 #undef rb_locale_str_new_cstr
70 #undef rb_str_dup_frozen
71 #undef rb_str_buf_new_cstr
72 #undef rb_str_buf_cat
73 #undef rb_str_buf_cat2
74 #undef rb_str_cat2
75 #undef rb_str_cat_cstr
76 #undef rb_fstring_cstr
77 
80 
81 /* Flags of RString
82  *
83  * 0: STR_SHARED (equal to ELTS_SHARED)
84  * The string is shared. The buffer this string points to is owned by
85  * another string (the shared root).
86  * 1: RSTRING_NOEMBED
87  * The string is not embedded. When a string is embedded, the contents
88  * follow the header. When a string is not embedded, the contents are
89  * in a separately allocated buffer.
90  * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
91  * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
92  * It emits a deprecation warning when mutated for the first time.
93  * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
94  * The string was allocated by the `Symbol#to_s` method.
95  * It emits a deprecation warning when mutated for the first time.
96  * 4: STR_PRECOMPUTED_HASH
97  * The string is embedded and has its precomputed hashcode stored
98  * after the terminator.
99  * 5: STR_SHARED_ROOT
100  * Other strings may point to the contents of this string. When this
101  * flag is set, STR_SHARED must not be set.
102  * 6: STR_BORROWED
103  * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
104  * to be unshared by rb_str_tmp_frozen_release.
105  * 7: STR_TMPLOCK
106  * The pointer to the buffer is passed to a system call such as
107  * read(2). Any modification and realloc is prohibited.
108  * 8-9: ENC_CODERANGE
109  * Stores the coderange of the string.
110  * 10-16: ENCODING
111  * Stores the encoding of the string.
112  * 17: RSTRING_FSTR
113  * The string is a fstring. The string is deduplicated in the fstring
114  * table.
115  * 18: STR_NOFREE
116  * Do not free this string's buffer when the string is reclaimed
117  * by the garbage collector. Used for when the string buffer is a C
118  * string literal.
119  * 19: STR_FAKESTR
120  * The string is not allocated or managed by the garbage collector.
121  * Typically, the string object header (struct RString) is temporarily
122  * allocated on C stack.
123  */
124 
125 #define RUBY_MAX_CHAR_LEN 16
126 #define STR_PRECOMPUTED_HASH FL_USER4
127 #define STR_SHARED_ROOT FL_USER5
128 #define STR_BORROWED FL_USER6
129 #define STR_TMPLOCK FL_USER7
130 #define STR_NOFREE FL_USER18
131 #define STR_FAKESTR FL_USER19
132 
133 #define STR_SET_NOEMBED(str) do {\
134  FL_SET((str), STR_NOEMBED);\
135  FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
136 } while (0)
137 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
138 
139 #define STR_SET_LEN(str, n) do { \
140  RSTRING(str)->len = (n); \
141 } while (0)
142 
143 static inline bool
144 str_encindex_fastpath(int encindex)
145 {
146  // The overwhelming majority of strings are in one of these 3 encodings.
147  switch (encindex) {
148  case ENCINDEX_ASCII_8BIT:
149  case ENCINDEX_UTF_8:
150  case ENCINDEX_US_ASCII:
151  return true;
152  default:
153  return false;
154  }
155 }
156 
157 static inline bool
158 str_enc_fastpath(VALUE str)
159 {
160  return str_encindex_fastpath(ENCODING_GET_INLINED(str));
161 }
162 
/* Terminator length of str: 1 for fast-path encodings, otherwise the
 * minimum character length of its encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Zero the terminator at ptr.  At least one byte is always written, even
 * when termlen is not greater than 1. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
    else {\
        *term_fill_ptr = '\0';\
    }\
} while (0)
171 
/* Resize the buffer of str to `capacity` bytes plus the terminator of its
 * current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Same as RESIZE_CAPA, with the terminator length given by the caller.
 * An embedded string is moved to a newly allocated heap buffer only when
 * its slot cannot hold capacity + termlen bytes; a heap string is always
 * reallocated and must not be shared.  Every use of the macro arguments
 * is parenthesized so that expression arguments keep their meaning.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
195 
/*
 * Make str share the buffer of shared_str: records the root in str,
 * flags str as STR_SHARED and the root as STR_SHARED_ROOT.  Fake strings
 * are left untouched.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
207 
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Allocated size of the heap buffer: capacity plus terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif

/* Unless middle substrings are sharable, only a substring that runs up to
 * the end of the original qualifies. */
#if SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
# define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
222 
223 
224 static inline long
225 str_embed_capa(VALUE str)
226 {
227  return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
228 }
229 
230 bool
231 rb_str_reembeddable_p(VALUE str)
232 {
233  return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
234 }
235 
236 static inline size_t
237 rb_str_embed_size(long capa)
238 {
239  return offsetof(struct RString, as.embed.ary) + capa;
240 }
241 
242 size_t
243 rb_str_size_as_embedded(VALUE str)
244 {
245  size_t real_size;
246  if (STR_EMBED_P(str)) {
247  real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
248  }
249  /* if the string is not currently embedded, but it can be embedded, how
250  * much space would it require */
251  else if (rb_str_reembeddable_p(str)) {
252  real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
253  }
254  else {
255  real_size = sizeof(struct RString);
256  }
257 
258  if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
259  real_size += sizeof(st_index_t);
260  }
261 
262  return real_size;
263 }
264 
/* Whether +len+ bytes of contents plus a +termlen+ byte terminator fit in
 * an object the GC is able to allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
270 
271 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
272 static VALUE str_new_frozen(VALUE klass, VALUE orig);
273 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
274 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
275 static VALUE str_new(VALUE klass, const char *ptr, long len);
276 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
277 static inline void str_modifiable(VALUE str);
278 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
279 static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
280 
281 static inline void
282 str_make_independent(VALUE str)
283 {
284  long len = RSTRING_LEN(str);
285  int termlen = TERM_LEN(str);
286  str_make_independent_expand((str), len, 0L, termlen);
287 }
288 
289 static inline int str_dependent_p(VALUE str);
290 
291 void
292 rb_str_make_independent(VALUE str)
293 {
294  if (str_dependent_p(str)) {
295  str_make_independent(str);
296  }
297 }
298 
299 void
300 rb_str_make_embedded(VALUE str)
301 {
302  RUBY_ASSERT(rb_str_reembeddable_p(str));
303  RUBY_ASSERT(!STR_EMBED_P(str));
304 
305  char *buf = RSTRING(str)->as.heap.ptr;
306  long len = RSTRING(str)->len;
307 
308  STR_SET_EMBED(str);
309  STR_SET_LEN(str, len);
310 
311  if (len > 0) {
312  memcpy(RSTRING_PTR(str), buf, len);
313  ruby_xfree(buf);
314  }
315 
316  TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
317 }
318 
/* Prints a diagnostic to stderr naming +func+, the function about to
 * return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
328 
329 /* symbols for [up|down|swap]case/capitalize options */
330 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
331 
332 static rb_encoding *
333 get_encoding(VALUE str)
334 {
335  return rb_enc_from_index(ENCODING_GET(str));
336 }
337 
338 static void
339 mustnot_broken(VALUE str)
340 {
341  if (is_broken_string(str)) {
342  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
343  }
344 }
345 
346 static void
347 mustnot_wchar(VALUE str)
348 {
349  rb_encoding *enc = STR_ENC_GET(str);
350  if (rb_enc_mbminlen(enc) > 1) {
351  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
352  }
353 }
354 
355 static int fstring_cmp(VALUE a, VALUE b);
356 
357 static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
358 
359 #if SIZEOF_LONG == SIZEOF_VOIDP
360 #define PRECOMPUTED_FAKESTR_HASH 1
361 #else
362 #endif
363 
364 #ifdef PRECOMPUTED_FAKESTR_HASH
365 static st_index_t
366 fstring_hash(VALUE str)
367 {
368  if (FL_TEST_RAW(str, STR_FAKESTR)) {
369  // register_fstring precomputes the hash and stores it in capa for fake strings
370  return (st_index_t)RSTRING(str)->as.heap.aux.capa;
371  }
372  else {
373  return rb_str_hash(str);
374  }
375 }
376 #else
377 #define fstring_hash rb_str_hash
378 #endif
379 
380 const struct st_hash_type rb_fstring_hash_type = {
381  fstring_cmp,
382  fstring_hash,
383 };
384 
385 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
386 
387 static inline st_index_t
388 str_do_hash(VALUE str)
389 {
390  st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
391  int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
392  if (e && !is_ascii_string(str)) {
393  h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
394  }
395  return h;
396 }
397 
398 static VALUE
399 str_store_precomputed_hash(VALUE str, st_index_t hash)
400 {
401  RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
402  RUBY_ASSERT(STR_EMBED_P(str));
403 
404 #if RUBY_DEBUG
405  size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
406  size_t free_bytes = str_embed_capa(str) - used_bytes;
407  RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
408 #endif
409 
410  memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
411 
412  FL_SET(str, STR_PRECOMPUTED_HASH);
413 
414  return str;
415 }
416 
418  VALUE fstr;
419  bool copy;
420  bool force_precompute_hash;
421 };
422 
423 static int
424 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
425 {
426  struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
427  VALUE str = (VALUE)*key;
428 
429  if (existing) {
430  /* because of lazy sweep, str may be unmarked already and swept
431  * at next time */
432 
433  if (rb_objspace_garbage_object_p(str)) {
434  arg->fstr = Qundef;
435  return ST_DELETE;
436  }
437 
438  arg->fstr = str;
439  return ST_STOP;
440  }
441  else {
442  // Unless the string is empty or binary, its coderange has been precomputed.
443  int coderange = ENC_CODERANGE(str);
444 
445  if (FL_TEST_RAW(str, STR_FAKESTR)) {
446  if (arg->copy) {
447  VALUE new_str;
448  long len = RSTRING_LEN(str);
449  long capa = len + sizeof(st_index_t);
450  int term_len = TERM_LEN(str);
451 
452  if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
453  new_str = str_alloc_embed(rb_cString, capa + term_len);
454  memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
455  STR_SET_LEN(new_str, RSTRING_LEN(str));
456  TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
457  rb_enc_copy(new_str, str);
458  str_store_precomputed_hash(new_str, fstring_hash(str));
459  }
460  else {
461  new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
462  rb_enc_copy(new_str, str);
463 #ifdef PRECOMPUTED_FAKESTR_HASH
464  if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
465  str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
466  }
467 #endif
468  }
469  str = new_str;
470  }
471  else {
472  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
473  RSTRING(str)->len,
474  ENCODING_GET(str));
475  }
476  OBJ_FREEZE(str);
477  }
478  else {
479  if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
480  str = str_new_frozen(rb_cString, str);
481  }
482  if (STR_SHARED_P(str)) { /* str should not be shared */
483  /* shared substring */
484  str_make_independent(str);
485  RUBY_ASSERT(OBJ_FROZEN(str));
486  }
487  if (!BARE_STRING_P(str)) {
488  str = str_new_frozen(rb_cString, str);
489  }
490  }
491 
492  ENC_CODERANGE_SET(str, coderange);
493  RBASIC(str)->flags |= RSTRING_FSTR;
494 
495  *key = *value = arg->fstr = str;
496  return ST_CONTINUE;
497  }
498 }
499 
500 VALUE
501 rb_fstring(VALUE str)
502 {
503  VALUE fstr;
504  int bare;
505 
506  Check_Type(str, T_STRING);
507 
508  if (FL_TEST(str, RSTRING_FSTR))
509  return str;
510 
511  bare = BARE_STRING_P(str);
512  if (!bare) {
513  if (STR_EMBED_P(str)) {
514  OBJ_FREEZE(str);
515  return str;
516  }
517 
518  if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
519  RUBY_ASSERT(OBJ_FROZEN(str));
520  return str;
521  }
522  }
523 
524  if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
525  rb_str_resize(str, RSTRING_LEN(str));
526 
527  fstr = register_fstring(str, false, false);
528 
529  if (!bare) {
530  str_replace_shared_without_enc(str, fstr);
531  OBJ_FREEZE(str);
532  return str;
533  }
534  return fstr;
535 }
536 
537 static VALUE
538 register_fstring(VALUE str, bool copy, bool force_precompute_hash)
539 {
540  struct fstr_update_arg args = {
541  .copy = copy,
542  .force_precompute_hash = force_precompute_hash
543  };
544 
545 #if SIZEOF_VOIDP == SIZEOF_LONG
546  if (FL_TEST_RAW(str, STR_FAKESTR)) {
547  // if the string hasn't been interned, we'll need the hash twice, so we
548  // compute it once and store it in capa
549  RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
550  }
551 #endif
552 
553  RB_VM_LOCK_ENTER();
554  {
555  st_table *frozen_strings = rb_vm_fstring_table();
556  do {
557  args.fstr = str;
558  st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
559  } while (UNDEF_P(args.fstr));
560  }
561  RB_VM_LOCK_LEAVE();
562 
563  RUBY_ASSERT(OBJ_FROZEN(args.fstr));
564  RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
565  RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
566  RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
567 
568  return args.fstr;
569 }
570 
571 static VALUE
572 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
573 {
574  fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
575 
576  if (!name) {
577  RUBY_ASSERT_ALWAYS(len == 0);
578  name = "";
579  }
580 
581  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
582 
583  RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
584  fake_str->len = len;
585  fake_str->as.heap.ptr = (char *)name;
586  fake_str->as.heap.aux.capa = len;
587  return (VALUE)fake_str;
588 }
589 
590 /*
591  * set up a fake string which refers a static string literal.
592  */
593 VALUE
594 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
595 {
596  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
597 }
598 
599 /*
600  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
601  * shared string which refers a static string literal. `ptr` must
602  * point a constant string.
603  */
604 VALUE
605 rb_fstring_new(const char *ptr, long len)
606 {
607  struct RString fake_str;
608  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
609 }
610 
611 VALUE
612 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
613 {
614  struct RString fake_str;
615  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
616 }
617 
618 VALUE
619 rb_fstring_cstr(const char *ptr)
620 {
621  return rb_fstring_new(ptr, strlen(ptr));
622 }
623 
624 static int
625 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
626 {
627  RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
628  return ST_CONTINUE;
629 }
630 
631 static int
632 fstring_cmp(VALUE a, VALUE b)
633 {
634  long alen, blen;
635  const char *aptr, *bptr;
636  RSTRING_GETMEM(a, aptr, alen);
637  RSTRING_GETMEM(b, bptr, blen);
638  return (alen != blen ||
639  ENCODING_GET(a) != ENCODING_GET(b) ||
640  memcmp(aptr, bptr, alen) != 0);
641 }
642 
643 static inline bool
644 single_byte_optimizable(VALUE str)
645 {
646  int encindex = ENCODING_GET(str);
647  switch (encindex) {
648  case ENCINDEX_ASCII_8BIT:
649  case ENCINDEX_US_ASCII:
650  return true;
651  case ENCINDEX_UTF_8:
652  // For UTF-8 it's worth scanning the string coderange when unknown.
654  }
655  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
656  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
657  return true;
658  }
659 
660  if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
661  return true;
662  }
663 
664  /* Conservative. Possibly single byte.
665  * "\xa1" in Shift_JIS for example. */
666  return false;
667 }
668 
670 
671 static inline const char *
672 search_nonascii(const char *p, const char *e)
673 {
674  const uintptr_t *s, *t;
675 
676 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
677 # if SIZEOF_UINTPTR_T == 8
678 # define NONASCII_MASK UINT64_C(0x8080808080808080)
679 # elif SIZEOF_UINTPTR_T == 4
680 # define NONASCII_MASK UINT32_C(0x80808080)
681 # else
682 # error "don't know what to do."
683 # endif
684 #else
685 # if SIZEOF_UINTPTR_T == 8
686 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
687 # elif SIZEOF_UINTPTR_T == 4
688 # define NONASCII_MASK 0x80808080UL /* or...? */
689 # else
690 # error "don't know what to do."
691 # endif
692 #endif
693 
694  if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
695 #if !UNALIGNED_WORD_ACCESS
696  if ((uintptr_t)p % SIZEOF_VOIDP) {
697  int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
698  p += l;
699  switch (l) {
700  default: UNREACHABLE;
701 #if SIZEOF_VOIDP > 4
702  case 7: if (p[-7]&0x80) return p-7;
703  case 6: if (p[-6]&0x80) return p-6;
704  case 5: if (p[-5]&0x80) return p-5;
705  case 4: if (p[-4]&0x80) return p-4;
706 #endif
707  case 3: if (p[-3]&0x80) return p-3;
708  case 2: if (p[-2]&0x80) return p-2;
709  case 1: if (p[-1]&0x80) return p-1;
710  case 0: break;
711  }
712  }
713 #endif
714 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
715 #define aligned_ptr(value) \
716  __builtin_assume_aligned((value), sizeof(uintptr_t))
717 #else
718 #define aligned_ptr(value) (uintptr_t *)(value)
719 #endif
720  s = aligned_ptr(p);
721  t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
722 #undef aligned_ptr
723  for (;s < t; s++) {
724  if (*s & NONASCII_MASK) {
725 #ifdef WORDS_BIGENDIAN
726  return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
727 #else
728  return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
729 #endif
730  }
731  }
732  p = (const char *)s;
733  }
734 
735  switch (e - p) {
736  default: UNREACHABLE;
737 #if SIZEOF_VOIDP > 4
738  case 7: if (e[-7]&0x80) return e-7;
739  case 6: if (e[-6]&0x80) return e-6;
740  case 5: if (e[-5]&0x80) return e-5;
741  case 4: if (e[-4]&0x80) return e-4;
742 #endif
743  case 3: if (e[-3]&0x80) return e-3;
744  case 2: if (e[-2]&0x80) return e-2;
745  case 1: if (e[-1]&0x80) return e-1;
746  case 0: return NULL;
747  }
748 }
749 
750 static int
751 coderange_scan(const char *p, long len, rb_encoding *enc)
752 {
753  const char *e = p + len;
754 
755  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
756  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
757  p = search_nonascii(p, e);
759  }
760 
761  if (rb_enc_asciicompat(enc)) {
762  p = search_nonascii(p, e);
763  if (!p) return ENC_CODERANGE_7BIT;
764  for (;;) {
765  int ret = rb_enc_precise_mbclen(p, e, enc);
766  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
767  p += MBCLEN_CHARFOUND_LEN(ret);
768  if (p == e) break;
769  p = search_nonascii(p, e);
770  if (!p) break;
771  }
772  }
773  else {
774  while (p < e) {
775  int ret = rb_enc_precise_mbclen(p, e, enc);
776  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
777  p += MBCLEN_CHARFOUND_LEN(ret);
778  }
779  }
780  return ENC_CODERANGE_VALID;
781 }
782 
783 long
784 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
785 {
786  const char *p = s;
787 
788  if (*cr == ENC_CODERANGE_BROKEN)
789  return e - s;
790 
791  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
792  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
793  if (*cr == ENC_CODERANGE_VALID) return e - s;
794  p = search_nonascii(p, e);
796  return e - s;
797  }
798  else if (rb_enc_asciicompat(enc)) {
799  p = search_nonascii(p, e);
800  if (!p) {
801  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
802  return e - s;
803  }
804  for (;;) {
805  int ret = rb_enc_precise_mbclen(p, e, enc);
806  if (!MBCLEN_CHARFOUND_P(ret)) {
808  return p - s;
809  }
810  p += MBCLEN_CHARFOUND_LEN(ret);
811  if (p == e) break;
812  p = search_nonascii(p, e);
813  if (!p) break;
814  }
815  }
816  else {
817  while (p < e) {
818  int ret = rb_enc_precise_mbclen(p, e, enc);
819  if (!MBCLEN_CHARFOUND_P(ret)) {
821  return p - s;
822  }
823  p += MBCLEN_CHARFOUND_LEN(ret);
824  }
825  }
826  *cr = ENC_CODERANGE_VALID;
827  return e - s;
828 }
829 
830 static inline void
831 str_enc_copy(VALUE str1, VALUE str2)
832 {
833  rb_enc_set_index(str1, ENCODING_GET(str2));
834 }
835 
836 /* Like str_enc_copy, but does not check frozen status of str1.
837  * You should use this only if you're certain that str1 is not frozen. */
838 static inline void
839 str_enc_copy_direct(VALUE str1, VALUE str2)
840 {
841  int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
842  if (inlined_encoding == ENCODING_INLINE_MAX) {
843  rb_enc_set_index(str1, rb_enc_get_index(str2));
844  }
845  else {
846  ENCODING_SET_INLINED(str1, inlined_encoding);
847  }
848 }
849 
850 static void
851 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
852 {
853  /* this function is designed for copying encoding and coderange
854  * from src to new string "dest" which is made from the part of src.
855  */
856  str_enc_copy(dest, src);
857  if (RSTRING_LEN(dest) == 0) {
858  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
860  else
862  return;
863  }
864  switch (ENC_CODERANGE(src)) {
865  case ENC_CODERANGE_7BIT:
867  break;
868  case ENC_CODERANGE_VALID:
869  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
870  search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
872  else
874  break;
875  default:
876  break;
877  }
878 }
879 
880 static void
881 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
882 {
883  str_enc_copy(dest, src);
884  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
885 }
886 
887 static int
888 enc_coderange_scan(VALUE str, rb_encoding *enc)
889 {
890  return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
891 }
892 
893 int
894 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
895 {
896  return enc_coderange_scan(str, enc);
897 }
898 
899 int
901 {
902  int cr = ENC_CODERANGE(str);
903 
904  if (cr == ENC_CODERANGE_UNKNOWN) {
905  cr = enc_coderange_scan(str, get_encoding(str));
906  ENC_CODERANGE_SET(str, cr);
907  }
908  return cr;
909 }
910 
911 static inline bool
912 rb_enc_str_asciicompat(VALUE str)
913 {
914  int encindex = ENCODING_GET_INLINED(str);
915  return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
916 }
917 
918 int
920 {
921  switch(ENC_CODERANGE(str)) {
923  return rb_enc_str_asciicompat(str) && is_ascii_string(str);
924  case ENC_CODERANGE_7BIT:
925  return true;
926  default:
927  return false;
928  }
929 }
930 
931 static inline void
932 str_mod_check(VALUE s, const char *p, long len)
933 {
934  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
935  rb_raise(rb_eRuntimeError, "string modified");
936  }
937 }
938 
939 static size_t
940 str_capacity(VALUE str, const int termlen)
941 {
942  if (STR_EMBED_P(str)) {
943  return str_embed_capa(str) - termlen;
944  }
945  else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
946  return RSTRING(str)->len;
947  }
948  else {
949  return RSTRING(str)->as.heap.aux.capa;
950  }
951 }
952 
953 size_t
955 {
956  return str_capacity(str, TERM_LEN(str));
957 }
958 
959 static inline void
960 must_not_null(const char *ptr)
961 {
962  if (!ptr) {
963  rb_raise(rb_eArgError, "NULL pointer given");
964  }
965 }
966 
967 static inline VALUE
968 str_alloc_embed(VALUE klass, size_t capa)
969 {
970  size_t size = rb_str_embed_size(capa);
971  RUBY_ASSERT(size > 0);
972  RUBY_ASSERT(rb_gc_size_allocatable_p(size));
973 
974  NEWOBJ_OF(str, struct RString, klass,
976 
977  return (VALUE)str;
978 }
979 
980 static inline VALUE
981 str_alloc_heap(VALUE klass)
982 {
983  NEWOBJ_OF(str, struct RString, klass,
984  T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
985 
986  return (VALUE)str;
987 }
988 
989 static inline VALUE
990 empty_str_alloc(VALUE klass)
991 {
992  RUBY_DTRACE_CREATE_HOOK(STRING, 0);
993  VALUE str = str_alloc_embed(klass, 0);
994  memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
996  return str;
997 }
998 
999 static VALUE
1000 str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1001 {
1002  VALUE str;
1003 
1004  if (len < 0) {
1005  rb_raise(rb_eArgError, "negative string size (or size too big)");
1006  }
1007 
1008  if (enc == NULL) {
1009  enc = rb_ascii8bit_encoding();
1010  }
1011 
1012  RUBY_DTRACE_CREATE_HOOK(STRING, len);
1013 
1014  int termlen = rb_enc_mbminlen(enc);
1015 
1016  if (STR_EMBEDDABLE_P(len, termlen)) {
1017  str = str_alloc_embed(klass, len + termlen);
1018  if (len == 0) {
1020  }
1021  }
1022  else {
1023  str = str_alloc_heap(klass);
1024  RSTRING(str)->as.heap.aux.capa = len;
1025  /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1026  * integer overflow. If we can STATIC_ASSERT that, the following
1027  * mul_add_mul can be reverted to a simple ALLOC_N. */
1028  RSTRING(str)->as.heap.ptr =
1029  rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1030  }
1031 
1032  rb_enc_raw_set(str, enc);
1033 
1034  if (ptr) {
1035  memcpy(RSTRING_PTR(str), ptr, len);
1036  }
1037 
1038  STR_SET_LEN(str, len);
1039  TERM_FILL(RSTRING_PTR(str) + len, termlen);
1040  return str;
1041 }
1042 
1043 static VALUE
1044 str_new(VALUE klass, const char *ptr, long len)
1045 {
1046  return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1047 }
1048 
1049 VALUE
1050 rb_str_new(const char *ptr, long len)
1051 {
1052  return str_new(rb_cString, ptr, len);
1053 }
1054 
1055 VALUE
1056 rb_usascii_str_new(const char *ptr, long len)
1057 {
1058  return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1059 }
1060 
1061 VALUE
1062 rb_utf8_str_new(const char *ptr, long len)
1063 {
1064  return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1065 }
1066 
1067 VALUE
1068 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1069 {
1070  return str_enc_new(rb_cString, ptr, len, enc);
1071 }
1072 
1073 VALUE
1074 rb_str_new_cstr(const char *ptr)
1075 {
1076  must_not_null(ptr);
1077  /* rb_str_new_cstr() can take pointer from non-malloc-generated
1078  * memory regions, and that cannot be detected by the MSAN. Just
1079  * trust the programmer that the argument passed here is a sane C
1080  * string. */
1081  __msan_unpoison_string(ptr);
1082  return rb_str_new(ptr, strlen(ptr));
1083 }
1084 
1085 VALUE
1087 {
1089 }
1090 
1091 VALUE
1093 {
1095 }
1096 
1097 VALUE
1099 {
1100  must_not_null(ptr);
1101  if (rb_enc_mbminlen(enc) != 1) {
1102  rb_raise(rb_eArgError, "wchar encoding given");
1103  }
1104  return rb_enc_str_new(ptr, strlen(ptr), enc);
1105 }
1106 
1107 static VALUE
1108 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1109 {
1110  VALUE str;
1111 
1112  if (len < 0) {
1113  rb_raise(rb_eArgError, "negative string size (or size too big)");
1114  }
1115 
1116  if (!ptr) {
1117  str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1118  }
1119  else {
1120  RUBY_DTRACE_CREATE_HOOK(STRING, len);
1121  str = str_alloc_heap(klass);
1122  RSTRING(str)->len = len;
1123  RSTRING(str)->as.heap.ptr = (char *)ptr;
1124  RSTRING(str)->as.heap.aux.capa = len;
1125  RBASIC(str)->flags |= STR_NOFREE;
1126  rb_enc_associate_index(str, encindex);
1127  }
1128  return str;
1129 }
1130 
1131 VALUE
1132 rb_str_new_static(const char *ptr, long len)
1133 {
1134  return str_new_static(rb_cString, ptr, len, 0);
1135 }
1136 
1137 VALUE
1139 {
1140  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1141 }
1142 
1143 VALUE
1144 rb_utf8_str_new_static(const char *ptr, long len)
1145 {
1146  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1147 }
1148 
1149 VALUE
1150 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1151 {
1152  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1153 }
1154 
1155 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1156  rb_encoding *from, rb_encoding *to,
1157  int ecflags, VALUE ecopts);
1158 
1159 static inline bool
1160 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1161 {
1162  int encidx = rb_enc_to_index(enc);
1163  if (rb_enc_get_index(str) == encidx)
1164  return is_ascii_string(str);
1165  return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1166 }
1167 
1168 VALUE
1169 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1170 {
1171  long len;
1172  const char *ptr;
1173  VALUE newstr;
1174 
1175  if (!to) return str;
1176  if (!from) from = rb_enc_get(str);
1177  if (from == to) return str;
1178  if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1179  rb_is_ascii8bit_enc(to)) {
1180  if (STR_ENC_GET(str) != to) {
1181  str = rb_str_dup(str);
1182  rb_enc_associate(str, to);
1183  }
1184  return str;
1185  }
1186 
1187  RSTRING_GETMEM(str, ptr, len);
1188  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1189  from, to, ecflags, ecopts);
1190  if (NIL_P(newstr)) {
1191  /* some error, return original */
1192  return str;
1193  }
1194  return newstr;
1195 }
1196 
1197 VALUE
1198 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1199  rb_encoding *from, int ecflags, VALUE ecopts)
1200 {
1201  long olen;
1202 
1203  olen = RSTRING_LEN(newstr);
1204  if (ofs < -olen || olen < ofs)
1205  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1206  if (ofs < 0) ofs += olen;
1207  if (!from) {
1208  STR_SET_LEN(newstr, ofs);
1209  return rb_str_cat(newstr, ptr, len);
1210  }
1211 
1212  rb_str_modify(newstr);
1213  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1214  rb_enc_get(newstr),
1215  ecflags, ecopts);
1216 }
1217 
1218 VALUE
1219 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1220 {
1221  STR_SET_LEN(str, 0);
1222  rb_enc_associate(str, enc);
1223  rb_str_cat(str, ptr, len);
1224  return str;
1225 }
1226 
1227 static VALUE
1228 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1229  rb_encoding *from, rb_encoding *to,
1230  int ecflags, VALUE ecopts)
1231 {
1232  rb_econv_t *ec;
1233  rb_econv_result_t ret;
1234  long olen;
1235  VALUE econv_wrapper;
1236  const unsigned char *start, *sp;
1237  unsigned char *dest, *dp;
1238  size_t converted_output = (size_t)ofs;
1239 
1240  olen = rb_str_capacity(newstr);
1241 
1242  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1243  RBASIC_CLEAR_CLASS(econv_wrapper);
1244  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1245  if (!ec) return Qnil;
1246  DATA_PTR(econv_wrapper) = ec;
1247 
1248  sp = (unsigned char*)ptr;
1249  start = sp;
1250  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1251  (dp = dest + converted_output),
1252  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1254  /* destination buffer short */
1255  size_t converted_input = sp - start;
1256  size_t rest = len - converted_input;
1257  converted_output = dp - dest;
1258  rb_str_set_len(newstr, converted_output);
1259  if (converted_input && converted_output &&
1260  rest < (LONG_MAX / converted_output)) {
1261  rest = (rest * converted_output) / converted_input;
1262  }
1263  else {
1264  rest = olen;
1265  }
1266  olen += rest < 2 ? 2 : rest;
1267  rb_str_resize(newstr, olen);
1268  }
1269  DATA_PTR(econv_wrapper) = 0;
1270  RB_GC_GUARD(econv_wrapper);
1271  rb_econv_close(ec);
1272  switch (ret) {
1273  case econv_finished:
1274  len = dp - (unsigned char*)RSTRING_PTR(newstr);
1275  rb_str_set_len(newstr, len);
1276  rb_enc_associate(newstr, to);
1277  return newstr;
1278 
1279  default:
1280  return Qnil;
1281  }
1282 }
1283 
1284 VALUE
1286 {
1287  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1288 }
1289 
1290 VALUE
1292 {
1293  rb_encoding *ienc;
1294  VALUE str;
1295  const int eidx = rb_enc_to_index(eenc);
1296 
1297  if (!ptr) {
1298  return rb_enc_str_new(ptr, len, eenc);
1299  }
1300 
1301  /* ASCII-8BIT case, no conversion */
1302  if ((eidx == rb_ascii8bit_encindex()) ||
1303  (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1304  return rb_str_new(ptr, len);
1305  }
1306  /* no default_internal or same encoding, no conversion */
1308  if (!ienc || eenc == ienc) {
1309  return rb_enc_str_new(ptr, len, eenc);
1310  }
1311  /* ASCII compatible, and ASCII only string, no conversion in
1312  * default_internal */
1313  if ((eidx == rb_ascii8bit_encindex()) ||
1314  (eidx == rb_usascii_encindex()) ||
1315  (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1316  return rb_enc_str_new(ptr, len, ienc);
1317  }
1318  /* convert from the given encoding to default_internal */
1319  str = rb_enc_str_new(NULL, 0, ienc);
1320  /* when the conversion failed for some reason, just ignore the
1321  * default_internal and result in the given encoding as-is. */
1322  if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1323  rb_str_initialize(str, ptr, len, eenc);
1324  }
1325  return str;
1326 }
1327 
1328 VALUE
1329 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1330 {
1331  int eidx = rb_enc_to_index(eenc);
1332  if (eidx == rb_usascii_encindex() &&
1333  !is_ascii_string(str)) {
1335  return str;
1336  }
1337  rb_enc_associate_index(str, eidx);
1338  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1339 }
1340 
1341 VALUE
1342 rb_external_str_new(const char *ptr, long len)
1343 {
1345 }
1346 
1347 VALUE
1349 {
1351 }
1352 
1353 VALUE
1354 rb_locale_str_new(const char *ptr, long len)
1355 {
1357 }
1358 
1359 VALUE
1361 {
1363 }
1364 
1365 VALUE
1366 rb_filesystem_str_new(const char *ptr, long len)
1367 {
1369 }
1370 
1371 VALUE
1373 {
1375 }
1376 
1377 VALUE
1379 {
1381 }
1382 
1383 VALUE
1385 {
1387 }
1388 
1389 VALUE
1391 {
1392  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1393 }
1394 
1395 static VALUE
1396 str_replace_shared_without_enc(VALUE str2, VALUE str)
1397 {
1398  const int termlen = TERM_LEN(str);
1399  char *ptr;
1400  long len;
1401 
1402  RSTRING_GETMEM(str, ptr, len);
1403  if (str_embed_capa(str2) >= len + termlen) {
1404  char *ptr2 = RSTRING(str2)->as.embed.ary;
1405  STR_SET_EMBED(str2);
1406  memcpy(ptr2, RSTRING_PTR(str), len);
1407  TERM_FILL(ptr2+len, termlen);
1408  }
1409  else {
1410  VALUE root;
1411  if (STR_SHARED_P(str)) {
1412  root = RSTRING(str)->as.heap.aux.shared;
1413  RSTRING_GETMEM(str, ptr, len);
1414  }
1415  else {
1416  root = rb_str_new_frozen(str);
1417  RSTRING_GETMEM(root, ptr, len);
1418  }
1419  RUBY_ASSERT(OBJ_FROZEN(root));
1420 
1421  if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1422  if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1423  rb_fatal("about to free a possible shared root");
1424  }
1425  char *ptr2 = STR_HEAP_PTR(str2);
1426  if (ptr2 != ptr) {
1427  ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1428  }
1429  }
1430  FL_SET(str2, STR_NOEMBED);
1431  RSTRING(str2)->as.heap.ptr = ptr;
1432  STR_SET_SHARED(str2, root);
1433  }
1434 
1435  STR_SET_LEN(str2, len);
1436 
1437  return str2;
1438 }
1439 
1440 static VALUE
1441 str_replace_shared(VALUE str2, VALUE str)
1442 {
1443  str_replace_shared_without_enc(str2, str);
1444  rb_enc_cr_str_exact_copy(str2, str);
1445  return str2;
1446 }
1447 
1448 static VALUE
1449 str_new_shared(VALUE klass, VALUE str)
1450 {
1451  return str_replace_shared(str_alloc_heap(klass), str);
1452 }
1453 
1454 VALUE
1456 {
1457  return str_new_shared(rb_obj_class(str), str);
1458 }
1459 
1460 VALUE
1462 {
1463  if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1464  return str_new_frozen(rb_obj_class(orig), orig);
1465 }
1466 
1467 static VALUE
1468 rb_str_new_frozen_String(VALUE orig)
1469 {
1470  if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1471  return str_new_frozen(rb_cString, orig);
1472 }
1473 
1474 VALUE
1475 rb_str_tmp_frozen_acquire(VALUE orig)
1476 {
1477  if (OBJ_FROZEN_RAW(orig)) return orig;
1478  return str_new_frozen_buffer(0, orig, FALSE);
1479 }
1480 
1481 VALUE
1482 rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1483 {
1484  if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1485  if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1486 
1487  VALUE str = str_alloc_heap(0);
1488  OBJ_FREEZE(str);
1489  /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1490  FL_SET(str, STR_SHARED_ROOT);
1491 
1492  size_t capa = str_capacity(orig, TERM_LEN(orig));
1493 
1494  /* If the string is embedded then we want to create a copy that is heap
1495  * allocated. If the string is shared then the shared root must be
1496  * embedded, so we want to create a copy. If the string is a shared root
1497  * then it must be embedded, so we want to create a copy. */
1498  if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1499  RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1500  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1501  }
1502  else {
1503  /* orig must be heap allocated and not shared, so we can safely transfer
1504  * the pointer to str. */
1505  RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1506  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1507  RBASIC(orig)->flags &= ~STR_NOFREE;
1508  STR_SET_SHARED(orig, str);
1509  }
1510 
1511  RSTRING(str)->len = RSTRING(orig)->len;
1512  RSTRING(str)->as.heap.aux.capa = capa;
1513 
1514  return str;
1515 }
1516 
1517 void
1518 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1519 {
1520  if (RBASIC_CLASS(tmp) != 0)
1521  return;
1522 
1523  if (STR_EMBED_P(tmp)) {
1525  }
1526  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1527  !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1528  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1529 
1530  if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1531  RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1532  RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1533 
1534  /* Unshare orig since the root (tmp) only has this one child. */
1535  FL_UNSET_RAW(orig, STR_SHARED);
1536  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1537  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1539 
1540  /* Make tmp embedded and empty so it is safe for sweeping. */
1541  STR_SET_EMBED(tmp);
1542  STR_SET_LEN(tmp, 0);
1543  }
1544  }
1545 }
1546 
1547 static VALUE
1548 str_new_frozen(VALUE klass, VALUE orig)
1549 {
1550  return str_new_frozen_buffer(klass, orig, TRUE);
1551 }
1552 
1553 static VALUE
1554 heap_str_make_shared(VALUE klass, VALUE orig)
1555 {
1556  RUBY_ASSERT(!STR_EMBED_P(orig));
1557  RUBY_ASSERT(!STR_SHARED_P(orig));
1558 
1559  VALUE str = str_alloc_heap(klass);
1560  STR_SET_LEN(str, RSTRING_LEN(orig));
1561  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1562  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1563  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1564  RBASIC(orig)->flags &= ~STR_NOFREE;
1565  STR_SET_SHARED(orig, str);
1566  if (klass == 0)
1567  FL_UNSET_RAW(str, STR_BORROWED);
1568  return str;
1569 }
1570 
1571 static VALUE
1572 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1573 {
1574  VALUE str;
1575 
1576  long len = RSTRING_LEN(orig);
1577  rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1578  int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1579 
1580  if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1581  str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1582  RUBY_ASSERT(STR_EMBED_P(str));
1583  }
1584  else {
1585  if (FL_TEST_RAW(orig, STR_SHARED)) {
1586  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1587  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1588  long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1589  RUBY_ASSERT(ofs >= 0);
1590  RUBY_ASSERT(rest >= 0);
1591  RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1593 
1594  if ((ofs > 0) || (rest > 0) ||
1595  (klass != RBASIC(shared)->klass) ||
1596  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1597  str = str_new_shared(klass, shared);
1598  RUBY_ASSERT(!STR_EMBED_P(str));
1599  RSTRING(str)->as.heap.ptr += ofs;
1600  STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1601  }
1602  else {
1603  if (RBASIC_CLASS(shared) == 0)
1604  FL_SET_RAW(shared, STR_BORROWED);
1605  return shared;
1606  }
1607  }
1608  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1609  str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1610  STR_SET_EMBED(str);
1611  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1612  STR_SET_LEN(str, RSTRING_LEN(orig));
1613  ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1614  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1615  }
1616  else {
1617  str = heap_str_make_shared(klass, orig);
1618  }
1619  }
1620 
1621  if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1622  OBJ_FREEZE(str);
1623  return str;
1624 }
1625 
1626 VALUE
1627 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1628 {
1629  return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1630 }
1631 
1632 static VALUE
1633 str_new_empty_String(VALUE str)
1634 {
1635  VALUE v = rb_str_new(0, 0);
1636  rb_enc_copy(v, str);
1637  return v;
1638 }
1639 
1640 #define STR_BUF_MIN_SIZE 63
1641 
1642 VALUE
1644 {
1645  if (STR_EMBEDDABLE_P(capa, 1)) {
1646  return str_alloc_embed(rb_cString, capa + 1);
1647  }
1648 
1649  VALUE str = str_alloc_heap(rb_cString);
1650 
1651  RSTRING(str)->as.heap.aux.capa = capa;
1652  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1653  RSTRING(str)->as.heap.ptr[0] = '\0';
1654 
1655  return str;
1656 }
1657 
1658 VALUE
1660 {
1661  VALUE str;
1662  long len = strlen(ptr);
1663 
1664  str = rb_str_buf_new(len);
1665  rb_str_buf_cat(str, ptr, len);
1666 
1667  return str;
1668 }
1669 
1670 VALUE
1672 {
1673  return str_new(0, 0, len);
1674 }
1675 
1676 void
1678 {
1679  if (FL_TEST(str, RSTRING_FSTR)) {
1680  st_data_t fstr = (st_data_t)str;
1681 
1682  RB_VM_LOCK_ENTER();
1683  {
1684  st_delete(rb_vm_fstring_table(), &fstr, NULL);
1685  RB_DEBUG_COUNTER_INC(obj_str_fstr);
1686  }
1687  RB_VM_LOCK_LEAVE();
1688  }
1689 
1690  if (STR_EMBED_P(str)) {
1691  RB_DEBUG_COUNTER_INC(obj_str_embed);
1692  }
1693  else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1694  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1695  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1696  }
1697  else {
1698  RB_DEBUG_COUNTER_INC(obj_str_ptr);
1699  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1700  }
1701 }
1702 
1703 size_t
1704 rb_str_memsize(VALUE str)
1705 {
1706  if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1707  return STR_HEAP_SIZE(str);
1708  }
1709  else {
1710  return 0;
1711  }
1712 }
1713 
1714 VALUE
1716 {
1717  return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1718 }
1719 
1720 static inline void str_discard(VALUE str);
1721 static void str_shared_replace(VALUE str, VALUE str2);
1722 
1723 void
1725 {
1726  if (str != str2) str_shared_replace(str, str2);
1727 }
1728 
1729 static void
1730 str_shared_replace(VALUE str, VALUE str2)
1731 {
1732  rb_encoding *enc;
1733  int cr;
1734  int termlen;
1735 
1736  RUBY_ASSERT(str2 != str);
1737  enc = STR_ENC_GET(str2);
1738  cr = ENC_CODERANGE(str2);
1739  str_discard(str);
1740  termlen = rb_enc_mbminlen(enc);
1741 
1742  STR_SET_LEN(str, RSTRING_LEN(str2));
1743 
1744  if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1745  STR_SET_EMBED(str);
1746  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1747  rb_enc_associate(str, enc);
1748  ENC_CODERANGE_SET(str, cr);
1749  }
1750  else {
1751  if (STR_EMBED_P(str2)) {
1752  RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1753  long len = RSTRING_LEN(str2);
1754  RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1755 
1756  char *new_ptr = ALLOC_N(char, len + termlen);
1757  memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1758  RSTRING(str2)->as.heap.ptr = new_ptr;
1759  STR_SET_LEN(str2, len);
1760  RSTRING(str2)->as.heap.aux.capa = len;
1761  STR_SET_NOEMBED(str2);
1762  }
1763 
1764  STR_SET_NOEMBED(str);
1765  FL_UNSET(str, STR_SHARED);
1766  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1767 
1768  if (FL_TEST(str2, STR_SHARED)) {
1769  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1770  STR_SET_SHARED(str, shared);
1771  }
1772  else {
1773  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1774  }
1775 
1776  /* abandon str2 */
1777  STR_SET_EMBED(str2);
1778  RSTRING_PTR(str2)[0] = 0;
1779  STR_SET_LEN(str2, 0);
1780  rb_enc_associate(str, enc);
1781  ENC_CODERANGE_SET(str, cr);
1782  }
1783 }
1784 
1785 VALUE
1787 {
1788  VALUE str;
1789 
1790  if (RB_TYPE_P(obj, T_STRING)) {
1791  return obj;
1792  }
1793  str = rb_funcall(obj, idTo_s, 0);
1794  return rb_obj_as_string_result(str, obj);
1795 }
1796 
1797 VALUE
1798 rb_obj_as_string_result(VALUE str, VALUE obj)
1799 {
1800  if (!RB_TYPE_P(str, T_STRING))
1801  return rb_any_to_s(obj);
1802  return str;
1803 }
1804 
1805 static VALUE
1806 str_replace(VALUE str, VALUE str2)
1807 {
1808  long len;
1809 
1810  len = RSTRING_LEN(str2);
1811  if (STR_SHARED_P(str2)) {
1812  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1814  STR_SET_NOEMBED(str);
1815  STR_SET_LEN(str, len);
1816  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1817  STR_SET_SHARED(str, shared);
1818  rb_enc_cr_str_exact_copy(str, str2);
1819  }
1820  else {
1821  str_replace_shared(str, str2);
1822  }
1823 
1824  return str;
1825 }
1826 
1827 static inline VALUE
1828 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1829 {
1830  size_t size = rb_str_embed_size(capa);
1831  RUBY_ASSERT(size > 0);
1832  RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1833 
1834  NEWOBJ_OF(str, struct RString, klass,
1836 
1837  return (VALUE)str;
1838 }
1839 
1840 static inline VALUE
1841 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1842 {
1843  NEWOBJ_OF(str, struct RString, klass,
1844  T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1845 
1846  return (VALUE)str;
1847 }
1848 
1849 static inline VALUE
1850 str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1851 {
1852  int encidx = 0;
1853  if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1854  encidx = rb_enc_get_index(str);
1855  flags &= ~ENCODING_MASK;
1856  }
1857  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1858  if (encidx) rb_enc_associate_index(dup, encidx);
1859  return dup;
1860 }
1861 
1862 static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1863 
1864 static inline VALUE
1865 str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1866 {
1867  VALUE flags = FL_TEST_RAW(str, flag_mask);
1868  long len = RSTRING_LEN(str);
1869 
1870  RUBY_ASSERT(STR_EMBED_P(dup));
1871  RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1872  MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1873  STR_SET_LEN(dup, RSTRING_LEN(str));
1874  return str_duplicate_setup_encoding(str, dup, flags);
1875 }
1876 
1877 static inline VALUE
1878 str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1879 {
1880  VALUE flags = FL_TEST_RAW(str, flag_mask);
1881  VALUE root = str;
1882  if (FL_TEST_RAW(str, STR_SHARED)) {
1883  root = RSTRING(str)->as.heap.aux.shared;
1884  }
1885  else if (UNLIKELY(!(flags & FL_FREEZE))) {
1886  root = str = str_new_frozen(klass, str);
1887  flags = FL_TEST_RAW(str, flag_mask);
1888  }
1889  RUBY_ASSERT(!STR_SHARED_P(root));
1891 
1892  RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1893  FL_SET(root, STR_SHARED_ROOT);
1894  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1895  flags |= RSTRING_NOEMBED | STR_SHARED;
1896 
1897  STR_SET_LEN(dup, RSTRING_LEN(str));
1898  return str_duplicate_setup_encoding(str, dup, flags);
1899 }
1900 
1901 static inline VALUE
1902 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1903 {
1904  if (STR_EMBED_P(str)) {
1905  return str_duplicate_setup_embed(klass, str, dup);
1906  }
1907  else {
1908  return str_duplicate_setup_heap(klass, str, dup);
1909  }
1910 }
1911 
1912 static inline VALUE
1913 str_duplicate(VALUE klass, VALUE str)
1914 {
1915  VALUE dup;
1916  if (STR_EMBED_P(str)) {
1917  dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1918  }
1919  else {
1920  dup = str_alloc_heap(klass);
1921  }
1922 
1923  return str_duplicate_setup(klass, str, dup);
1924 }
1925 
1926 VALUE
1928 {
1929  return str_duplicate(rb_obj_class(str), str);
1930 }
1931 
1932 /* :nodoc: */
1933 VALUE
1934 rb_str_dup_m(VALUE str)
1935 {
1936  if (LIKELY(BARE_STRING_P(str))) {
1937  return str_duplicate(rb_obj_class(str), str);
1938  }
1939  else {
1940  return rb_obj_dup(str);
1941  }
1942 }
1943 
1944 VALUE
1946 {
1947  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1948  return str_duplicate(rb_cString, str);
1949 }
1950 
1951 VALUE
1952 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1953 {
1954  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1955  VALUE new_str, klass = rb_cString;
1956 
1957  if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1958  new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1959  str_duplicate_setup_embed(klass, str, new_str);
1960  }
1961  else {
1962  new_str = ec_str_alloc_heap(ec, klass);
1963  str_duplicate_setup_heap(klass, str, new_str);
1964  }
1965  if (chilled) {
1966  FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1967  }
1968  return new_str;
1969 }
1970 
1971 VALUE
1972 rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1973 {
1974  VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1975  if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1976  rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1977  FL_SET_RAW(str, STR_CHILLED_LITERAL);
1978  return rb_str_freeze(str);
1979 }
1980 
1981 /*
1982  *
1983  * call-seq:
1984  * String.new(string = '', **opts) -> new_string
1985  *
1986  * :include: doc/string/new.rdoc
1987  *
1988  */
1989 
1990 static VALUE
1991 rb_str_init(int argc, VALUE *argv, VALUE str)
1992 {
1993  static ID keyword_ids[2];
1994  VALUE orig, opt, venc, vcapa;
1995  VALUE kwargs[2];
1996  rb_encoding *enc = 0;
1997  int n;
1998 
1999  if (!keyword_ids[0]) {
2000  keyword_ids[0] = rb_id_encoding();
2001  CONST_ID(keyword_ids[1], "capacity");
2002  }
2003 
2004  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2005  if (!NIL_P(opt)) {
2006  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2007  venc = kwargs[0];
2008  vcapa = kwargs[1];
2009  if (!UNDEF_P(venc) && !NIL_P(venc)) {
2010  enc = rb_to_encoding(venc);
2011  }
2012  if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2013  long capa = NUM2LONG(vcapa);
2014  long len = 0;
2015  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2016 
2017  if (capa < STR_BUF_MIN_SIZE) {
2018  capa = STR_BUF_MIN_SIZE;
2019  }
2020  if (n == 1) {
2021  StringValue(orig);
2022  len = RSTRING_LEN(orig);
2023  if (capa < len) {
2024  capa = len;
2025  }
2026  if (orig == str) n = 0;
2027  }
2028  str_modifiable(str);
2029  if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2030  /* make noembed always */
2031  const size_t size = (size_t)capa + termlen;
2032  const char *const old_ptr = RSTRING_PTR(str);
2033  const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2034  char *new_ptr = ALLOC_N(char, size);
2035  if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2036  memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2037  FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2038  RSTRING(str)->as.heap.ptr = new_ptr;
2039  }
2040  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2041  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2042  (size_t)capa + termlen, STR_HEAP_SIZE(str));
2043  }
2044  STR_SET_LEN(str, len);
2045  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2046  if (n == 1) {
2047  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2048  rb_enc_cr_str_exact_copy(str, orig);
2049  }
2050  FL_SET(str, STR_NOEMBED);
2051  RSTRING(str)->as.heap.aux.capa = capa;
2052  }
2053  else if (n == 1) {
2054  rb_str_replace(str, orig);
2055  }
2056  if (enc) {
2057  rb_enc_associate(str, enc);
2058  ENC_CODERANGE_CLEAR(str);
2059  }
2060  }
2061  else if (n == 1) {
2062  rb_str_replace(str, orig);
2063  }
2064  return str;
2065 }
2066 
2067 /* :nodoc: */
2068 static VALUE
2069 rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2070 {
2071  if (klass != rb_cString) {
2072  return rb_class_new_instance_pass_kw(argc, argv, klass);
2073  }
2074 
2075  static ID keyword_ids[2];
2076  VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2077  VALUE kwargs[2];
2078  rb_encoding *enc = NULL;
2079 
2080  int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2081  if (NIL_P(opt)) {
2082  return rb_class_new_instance_pass_kw(argc, argv, klass);
2083  }
2084 
2085  keyword_ids[0] = rb_id_encoding();
2086  CONST_ID(keyword_ids[1], "capacity");
2087  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2088  encoding = kwargs[0];
2089  capacity = kwargs[1];
2090 
2091  if (n == 1) {
2092  orig = StringValue(orig);
2093  }
2094  else {
2095  orig = Qnil;
2096  }
2097 
2098  if (UNDEF_P(encoding)) {
2099  if (!NIL_P(orig)) {
2100  encoding = rb_obj_encoding(orig);
2101  }
2102  }
2103 
2104  if (!UNDEF_P(encoding)) {
2105  enc = rb_to_encoding(encoding);
2106  }
2107 
2108  // If capacity is nil, we're basically just duping `orig`.
2109  if (UNDEF_P(capacity)) {
2110  if (NIL_P(orig)) {
2111  VALUE empty_str = str_new(klass, "", 0);
2112  if (enc) {
2113  rb_enc_associate(empty_str, enc);
2114  }
2115  return empty_str;
2116  }
2117  VALUE copy = str_duplicate(klass, orig);
2118  rb_enc_associate(copy, enc);
2119  ENC_CODERANGE_CLEAR(copy);
2120  return copy;
2121  }
2122 
2123  long capa = 0;
2124  capa = NUM2LONG(capacity);
2125  if (capa < 0) {
2126  capa = 0;
2127  }
2128 
2129  if (!NIL_P(orig)) {
2130  long orig_capa = rb_str_capacity(orig);
2131  if (orig_capa > capa) {
2132  capa = orig_capa;
2133  }
2134  }
2135 
2136  VALUE str = str_enc_new(klass, NULL, capa, enc);
2137  STR_SET_LEN(str, 0);
2138  TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2139 
2140  if (!NIL_P(orig)) {
2141  rb_str_buf_append(str, orig);
2142  }
2143 
2144  return str;
2145 }
2146 
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Count the UTF-8 leading bytes contained in one machine word.
 *
 * A byte is a leading byte when it looks like 0xxxxxxx or 11xxxxxx, i.e.
 * when bit 6 is set or bit 7 is clear (see
 * https://en.wikipedia.org/wiki/UTF-8).  The expression below evaluates
 * "bit6 | ~bit7" into the lowest bit of every byte of the word at once;
 * the per-byte results are then added together.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t bits = *s;

    /* After this, the lowest bit of each byte says "leading byte". */
    bits = (bits>>6) | (~bits>>7);
    bits &= NONASCII_MASK >> 7;

    /* Sum the per-byte bits. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* only taken when the POPCNT instruction is available */
    return rb_popcount_intptr(bits);
#else
    bits += (bits>>8);
    bits += (bits>>16);
# if SIZEOF_VOIDP == 8
    bits += (bits>>32);
# endif
    return (bits&0xF);
#endif
}
#endif
2186 
2187 static inline long
2188 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2189 {
2190  long c;
2191  const char *q;
2192 
2193  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2194  long diff = (long)(e - p);
2195  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2196  }
2197 #ifdef NONASCII_MASK
2198  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2199  uintptr_t len = 0;
2200  if ((int)sizeof(uintptr_t) * 2 < e - p) {
2201  const uintptr_t *s, *t;
2202  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2203  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2204  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2205  while (p < (const char *)s) {
2206  if (is_utf8_lead_byte(*p)) len++;
2207  p++;
2208  }
2209  while (s < t) {
2210  len += count_utf8_lead_bytes_with_word(s);
2211  s++;
2212  }
2213  p = (const char *)s;
2214  }
2215  while (p < e) {
2216  if (is_utf8_lead_byte(*p)) len++;
2217  p++;
2218  }
2219  return (long)len;
2220  }
2221 #endif
2222  else if (rb_enc_asciicompat(enc)) {
2223  c = 0;
2224  if (ENC_CODERANGE_CLEAN_P(cr)) {
2225  while (p < e) {
2226  if (ISASCII(*p)) {
2227  q = search_nonascii(p, e);
2228  if (!q)
2229  return c + (e - p);
2230  c += q - p;
2231  p = q;
2232  }
2233  p += rb_enc_fast_mbclen(p, e, enc);
2234  c++;
2235  }
2236  }
2237  else {
2238  while (p < e) {
2239  if (ISASCII(*p)) {
2240  q = search_nonascii(p, e);
2241  if (!q)
2242  return c + (e - p);
2243  c += q - p;
2244  p = q;
2245  }
2246  p += rb_enc_mbclen(p, e, enc);
2247  c++;
2248  }
2249  }
2250  return c;
2251  }
2252 
2253  for (c=0; p<e; c++) {
2254  p += rb_enc_mbclen(p, e, enc);
2255  }
2256  return c;
2257 }
2258 
2259 long
2260 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2261 {
2262  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2263 }
2264 
2265 /* To get strlen with cr
2266  * Note that given cr is not used.
2267  */
2268 long
2269 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2270 {
2271  long c;
2272  const char *q;
2273  int ret;
2274 
2275  *cr = 0;
2276  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2277  long diff = (long)(e - p);
2278  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2279  }
2280  else if (rb_enc_asciicompat(enc)) {
2281  c = 0;
2282  while (p < e) {
2283  if (ISASCII(*p)) {
2284  q = search_nonascii(p, e);
2285  if (!q) {
2286  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2287  return c + (e - p);
2288  }
2289  c += q - p;
2290  p = q;
2291  }
2292  ret = rb_enc_precise_mbclen(p, e, enc);
2293  if (MBCLEN_CHARFOUND_P(ret)) {
2294  *cr |= ENC_CODERANGE_VALID;
2295  p += MBCLEN_CHARFOUND_LEN(ret);
2296  }
2297  else {
2298  *cr = ENC_CODERANGE_BROKEN;
2299  p++;
2300  }
2301  c++;
2302  }
2303  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2304  return c;
2305  }
2306 
2307  for (c=0; p<e; c++) {
2308  ret = rb_enc_precise_mbclen(p, e, enc);
2309  if (MBCLEN_CHARFOUND_P(ret)) {
2310  *cr |= ENC_CODERANGE_VALID;
2311  p += MBCLEN_CHARFOUND_LEN(ret);
2312  }
2313  else {
2314  *cr = ENC_CODERANGE_BROKEN;
2315  if (p + rb_enc_mbminlen(enc) <= e)
2316  p += rb_enc_mbminlen(enc);
2317  else
2318  p = e;
2319  }
2320  }
2321  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2322  return c;
2323 }
2324 
2325 /* enc must be str's enc or rb_enc_check(str, str2) */
2326 static long
2327 str_strlen(VALUE str, rb_encoding *enc)
2328 {
2329  const char *p, *e;
2330  int cr;
2331 
2332  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2333  if (!enc) enc = STR_ENC_GET(str);
2334  p = RSTRING_PTR(str);
2335  e = RSTRING_END(str);
2336  cr = ENC_CODERANGE(str);
2337 
2338  if (cr == ENC_CODERANGE_UNKNOWN) {
2339  long n = rb_enc_strlen_cr(p, e, enc, &cr);
2340  if (cr) ENC_CODERANGE_SET(str, cr);
2341  return n;
2342  }
2343  else {
2344  return enc_strlen(p, e, enc, cr);
2345  }
2346 }
2347 
2348 long
2350 {
2351  return str_strlen(str, NULL);
2352 }
2353 
2354 /*
2355  * call-seq:
2356  * length -> integer
2357  *
2358  * :include: doc/string/length.rdoc
2359  *
2360  */
2361 
2362 VALUE
2364 {
2365  return LONG2NUM(str_strlen(str, NULL));
2366 }
2367 
2368 /*
2369  * call-seq:
2370  * bytesize -> integer
2371  *
2372  * :include: doc/string/bytesize.rdoc
2373  *
2374  */
2375 
2376 VALUE
2377 rb_str_bytesize(VALUE str)
2378 {
2379  return LONG2NUM(RSTRING_LEN(str));
2380 }
2381 
2382 /*
2383  * call-seq:
2384  * empty? -> true or false
2385  *
2386  * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2387  *
2388  * "hello".empty? # => false
2389  * " ".empty? # => false
2390  * "".empty? # => true
2391  *
2392  */
2393 
2394 static VALUE
2395 rb_str_empty(VALUE str)
2396 {
2397  return RBOOL(RSTRING_LEN(str) == 0);
2398 }
2399 
2400 /*
2401  * call-seq:
2402  * string + other_string -> new_string
2403  *
2404  * Returns a new +String+ containing +other_string+ concatenated to +self+:
2405  *
2406  * "Hello from " + self.to_s # => "Hello from main"
2407  *
2408  */
2409 
2410 VALUE
2412 {
2413  VALUE str3;
2414  rb_encoding *enc;
2415  char *ptr1, *ptr2, *ptr3;
2416  long len1, len2;
2417  int termlen;
2418 
2419  StringValue(str2);
2420  enc = rb_enc_check_str(str1, str2);
2421  RSTRING_GETMEM(str1, ptr1, len1);
2422  RSTRING_GETMEM(str2, ptr2, len2);
2423  termlen = rb_enc_mbminlen(enc);
2424  if (len1 > LONG_MAX - len2) {
2425  rb_raise(rb_eArgError, "string size too big");
2426  }
2427  str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2428  ptr3 = RSTRING_PTR(str3);
2429  memcpy(ptr3, ptr1, len1);
2430  memcpy(ptr3+len1, ptr2, len2);
2431  TERM_FILL(&ptr3[len1+len2], termlen);
2432 
2435  RB_GC_GUARD(str1);
2436  RB_GC_GUARD(str2);
2437  return str3;
2438 }
2439 
2440 /* A variant of rb_str_plus that does not raise but return Qundef instead. */
2441 VALUE
2442 rb_str_opt_plus(VALUE str1, VALUE str2)
2443 {
2446  long len1, len2;
2447  MAYBE_UNUSED(char) *ptr1, *ptr2;
2448  RSTRING_GETMEM(str1, ptr1, len1);
2449  RSTRING_GETMEM(str2, ptr2, len2);
2450  int enc1 = rb_enc_get_index(str1);
2451  int enc2 = rb_enc_get_index(str2);
2452 
2453  if (enc1 < 0) {
2454  return Qundef;
2455  }
2456  else if (enc2 < 0) {
2457  return Qundef;
2458  }
2459  else if (enc1 != enc2) {
2460  return Qundef;
2461  }
2462  else if (len1 > LONG_MAX - len2) {
2463  return Qundef;
2464  }
2465  else {
2466  return rb_str_plus(str1, str2);
2467  }
2468 
2469 }
2470 
2471 /*
2472  * call-seq:
2473  * string * integer -> new_string
2474  *
2475  * Returns a new +String+ containing +integer+ copies of +self+:
2476  *
2477  * "Ho! " * 3 # => "Ho! Ho! Ho! "
2478  * "Ho! " * 0 # => ""
2479  *
2480  */
2481 
2482 VALUE
2484 {
2485  VALUE str2;
2486  long n, len;
2487  char *ptr2;
2488  int termlen;
2489 
2490  if (times == INT2FIX(1)) {
2491  return str_duplicate(rb_cString, str);
2492  }
2493  if (times == INT2FIX(0)) {
2494  str2 = str_alloc_embed(rb_cString, 0);
2495  rb_enc_copy(str2, str);
2496  return str2;
2497  }
2498  len = NUM2LONG(times);
2499  if (len < 0) {
2500  rb_raise(rb_eArgError, "negative argument");
2501  }
2502  if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2503  if (STR_EMBEDDABLE_P(len, 1)) {
2504  str2 = str_alloc_embed(rb_cString, len + 1);
2505  memset(RSTRING_PTR(str2), 0, len + 1);
2506  }
2507  else {
2508  str2 = str_alloc_heap(rb_cString);
2509  RSTRING(str2)->as.heap.aux.capa = len;
2510  RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2511  }
2512  STR_SET_LEN(str2, len);
2513  rb_enc_copy(str2, str);
2514  return str2;
2515  }
2516  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2517  rb_raise(rb_eArgError, "argument too big");
2518  }
2519 
2520  len *= RSTRING_LEN(str);
2521  termlen = TERM_LEN(str);
2522  str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2523  ptr2 = RSTRING_PTR(str2);
2524  if (len) {
2525  n = RSTRING_LEN(str);
2526  memcpy(ptr2, RSTRING_PTR(str), n);
2527  while (n <= len/2) {
2528  memcpy(ptr2 + n, ptr2, n);
2529  n *= 2;
2530  }
2531  memcpy(ptr2 + n, ptr2, len-n);
2532  }
2533  STR_SET_LEN(str2, len);
2534  TERM_FILL(&ptr2[len], termlen);
2535  rb_enc_cr_str_copy_for_substr(str2, str);
2536 
2537  return str2;
2538 }
2539 
2540 /*
2541  * call-seq:
2542  * string % object -> new_string
2543  *
2544  * Returns the result of formatting +object+ into the format specification +self+
2545  * (see Kernel#sprintf for formatting details):
2546  *
2547  * "%05d" % 123 # => "00123"
2548  *
2549  * If +self+ contains multiple substitutions, +object+ must be
2550  * an Array or Hash containing the values to be substituted:
2551  *
2552  * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2553  * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2554  * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2555  *
2556  */
2557 
2558 static VALUE
2559 rb_str_format_m(VALUE str, VALUE arg)
2560 {
2561  VALUE tmp = rb_check_array_type(arg);
2562 
2563  if (!NIL_P(tmp)) {
2564  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2565  }
2566  return rb_str_format(1, &arg, str);
2567 }
2568 
2569 static inline void
2570 rb_check_lockedtmp(VALUE str)
2571 {
2572  if (FL_TEST(str, STR_TMPLOCK)) {
2573  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2574  }
2575 }
2576 
2577 // If none of these flags are set, we know we have an modifiable string.
2578 // If any is set, we need to do more detailed checks.
2579 #define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2580 static inline void
2581 str_modifiable(VALUE str)
2582 {
2583  if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2584  if (CHILLED_STRING_P(str)) {
2585  CHILLED_STRING_MUTATED(str);
2586  }
2587  rb_check_lockedtmp(str);
2588  rb_check_frozen(str);
2589  }
2590 }
2591 
2592 static inline int
2593 str_dependent_p(VALUE str)
2594 {
2595  if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2596  return FALSE;
2597  }
2598  else {
2599  return TRUE;
2600  }
2601 }
2602 
2603 // If none of these flags are set, we know we have an independent string.
2604 // If any is set, we need to do more detailed checks.
2605 #define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2606 static inline int
2607 str_independent(VALUE str)
2608 {
2609  if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2610  str_modifiable(str);
2611  return !str_dependent_p(str);
2612  }
2613  return TRUE;
2614 }
2615 
/*
 * Give str a buffer of its own with room for len + expand bytes plus a
 * termlen-byte terminator.  At most the first len bytes are kept (fewer
 * when len + expand is smaller than len).  If str is a heap string and the
 * embedded area is large enough, the bytes are moved into the embedded
 * area; otherwise a new heap buffer is allocated and the STR_SHARED and
 * STR_NOFREE flags are dropped.
 */
2616 static void
2617 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2618 {
2619  char *ptr;
2620  char *oldptr;
2621  long capa = len + expand;
2622 
 /* a negative expand truncates */
2623  if (len > capa) len = capa;
2624 
2625  if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
 /* read the heap pointer before the flag switch to embedded */
2626  ptr = RSTRING(str)->as.heap.ptr;
2627  STR_SET_EMBED(str);
2628  memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2629  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2630  STR_SET_LEN(str, len);
2631  return;
2632  }
2633 
2634  ptr = ALLOC_N(char, (size_t)capa + termlen);
2635  oldptr = RSTRING_PTR(str);
2636  if (oldptr) {
2637  memcpy(ptr, oldptr, len);
2638  }
 /* free the old buffer only for a heap string that is neither shared nor no-free */
2639  if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2640  xfree(oldptr);
2641  }
2642  STR_SET_NOEMBED(str);
2643  FL_UNSET(str, STR_SHARED|STR_NOFREE);
2644  TERM_FILL(ptr + len, termlen);
2645  RSTRING(str)->as.heap.ptr = ptr;
2646  STR_SET_LEN(str, len);
2647  RSTRING(str)->as.heap.aux.capa = capa;
2648 }
2649 
2650 void
2652 {
2653  if (!str_independent(str))
2654  str_make_independent(str);
2655  ENC_CODERANGE_CLEAR(str);
2656 }
2657 
2658 void
2659 rb_str_modify_expand(VALUE str, long expand)
2660 {
2661  int termlen = TERM_LEN(str);
2662  long len = RSTRING_LEN(str);
2663 
2664  if (expand < 0) {
2665  rb_raise(rb_eArgError, "negative expanding string size");
2666  }
2667  if (expand >= LONG_MAX - len) {
2668  rb_raise(rb_eArgError, "string size too big");
2669  }
2670 
2671  if (!str_independent(str)) {
2672  str_make_independent_expand(str, len, expand, termlen);
2673  }
2674  else if (expand > 0) {
2675  RESIZE_CAPA_TERM(str, len + expand, termlen);
2676  }
2677  ENC_CODERANGE_CLEAR(str);
2678 }
2679 
2680 /* As rb_str_modify(), but don't clear coderange */
2681 static void
2682 str_modify_keep_cr(VALUE str)
2683 {
2684  if (!str_independent(str))
2685  str_make_independent(str);
2686  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2687  /* Force re-scan later */
2688  ENC_CODERANGE_CLEAR(str);
2689 }
2690 
2691 static inline void
2692 str_discard(VALUE str)
2693 {
2694  str_modifiable(str);
2695  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2696  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2697  RSTRING(str)->as.heap.ptr = 0;
2698  STR_SET_LEN(str, 0);
2699  }
2700 }
2701 
2702 void
2704 {
2705  rb_encoding *enc = rb_enc_get(str);
2706  if (!enc) {
2707  rb_raise(rb_eTypeError, "not encoding capable object");
2708  }
2709  if (!rb_enc_asciicompat(enc)) {
2710  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2711  }
2712 }
2713 
2714 VALUE
2716 {
2717  VALUE s = *ptr;
2718  if (!RB_TYPE_P(s, T_STRING)) {
2719  s = rb_str_to_str(s);
2720  *ptr = s;
2721  }
2722  return s;
2723 }
2724 
2725 char *
2727 {
2728  VALUE str = rb_string_value(ptr);
2729  return RSTRING_PTR(str);
2730 }
2731 
/* Returns 1 if the first n bytes at s are all zero (vacuously for n <= 0), else 0. */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2740 
2741 static const char *
2742 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2743 {
2744  const char *e = s + len;
2745 
2746  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2747  if (zero_filled(s, minlen)) return s;
2748  }
2749  return 0;
2750 }
2751 
2752 static char *
2753 str_fill_term(VALUE str, char *s, long len, int termlen)
2754 {
2755  /* This function assumes that (capa + termlen) bytes of memory
2756  * is allocated, like many other functions in this file.
2757  */
2758  if (str_dependent_p(str)) {
2759  if (!zero_filled(s + len, termlen))
2760  str_make_independent_expand(str, len, 0L, termlen);
2761  }
2762  else {
2763  TERM_FILL(s + len, termlen);
2764  return s;
2765  }
2766  return RSTRING_PTR(str);
2767 }
2768 
/*
 * Adjust str so that it can carry a terminator of termlen bytes where it
 * previously carried one of oldtermlen bytes.  The buffer is reallocated
 * only when the bytes available after the content are too few, or when a
 * dependent string needs a longer terminator.
 */
2769 void
2770 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2771 {
 /* capa here counts the bytes of the old terminator as well */
2772  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2773  long len = RSTRING_LEN(str);
2774 
2775  RUBY_ASSERT(capa >= len);
2776  if (capa - len < termlen) {
 /* not enough room behind the content: a new buffer is required */
2777  rb_check_lockedtmp(str);
2778  str_make_independent_expand(str, len, 0L, termlen);
2779  }
2780  else if (str_dependent_p(str)) {
2781  if (termlen > oldtermlen)
2782  str_make_independent_expand(str, len, 0L, termlen);
2783  }
2784  else {
2785  if (!STR_EMBED_P(str)) {
2786  /* modify capa instead of realloc */
2787  RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2788  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2789  }
2790  if (termlen > oldtermlen) {
2791  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2792  }
2793  }
2794 
2795  return;
2796 }
2797 
2798 static char *
2799 str_null_check(VALUE str, int *w)
2800 {
2801  char *s = RSTRING_PTR(str);
2802  long len = RSTRING_LEN(str);
2803  rb_encoding *enc = rb_enc_get(str);
2804  const int minlen = rb_enc_mbminlen(enc);
2805 
2806  if (minlen > 1) {
2807  *w = 1;
2808  if (str_null_char(s, len, minlen, enc)) {
2809  return NULL;
2810  }
2811  return str_fill_term(str, s, len, minlen);
2812  }
2813  *w = 0;
2814  if (!s || memchr(s, 0, len)) {
2815  return NULL;
2816  }
2817  if (s[len]) {
2818  s = str_fill_term(str, s, len, minlen);
2819  }
2820  return s;
2821 }
2822 
2823 char *
2824 rb_str_to_cstr(VALUE str)
2825 {
2826  int w;
2827  return str_null_check(str, &w);
2828 }
2829 
2830 char *
2832 {
2833  VALUE str = rb_string_value(ptr);
2834  int w;
2835  char *s = str_null_check(str, &w);
2836  if (!s) {
2837  if (w) {
2838  rb_raise(rb_eArgError, "string contains null char");
2839  }
2840  rb_raise(rb_eArgError, "string contains null byte");
2841  }
2842  return s;
2843 }
2844 
2845 char *
2846 rb_str_fill_terminator(VALUE str, const int newminlen)
2847 {
2848  char *s = RSTRING_PTR(str);
2849  long len = RSTRING_LEN(str);
2850  return str_fill_term(str, s, len, newminlen);
2851 }
2852 
2853 VALUE
2855 {
2856  str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2857  return str;
2858 }
2859 
2860 /*
2861  * call-seq:
2862  * String.try_convert(object) -> object, new_string, or nil
2863  *
2864  * If +object+ is a +String+ object, returns +object+.
2865  *
2866  * Otherwise if +object+ responds to <tt>:to_str</tt>,
2867  * calls <tt>object.to_str</tt> and returns the result.
2868  *
2869  * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2870  *
2871  * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2872  */
2873 static VALUE
2874 rb_str_s_try_convert(VALUE dummy, VALUE str)
2875 {
2876  return rb_check_string_type(str);
2877 }
2878 
/*
 * Advance from p by *nthp characters of encoding enc without going past e
 * and return the resulting position.
 *
 * In the ASCII-compatible variable-width path *nthp receives the number of
 * characters that could not be consumed.  The two fixed-width paths store
 * the requested count back unchanged and clamp the result to e.
 * NOTE(review): in the final generic loop the counter is post-decremented
 * in the loop condition, so the value stored in *nthp there is not simply
 * "characters left" - confirm before relying on it.
 */
2879 static char*
2880 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2881 {
2882  long nth = *nthp;
2883  if (rb_enc_mbmaxlen(enc) == 1) {
2884  p += nth;
2885  }
2886  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2887  p += nth * rb_enc_mbmaxlen(enc);
2888  }
2889  else if (rb_enc_asciicompat(enc)) {
2890  const char *p2, *e2;
2891  int n;
2892 
2893  while (p < e && 0 < nth) {
 /* nth characters need at least nth bytes */
2894  e2 = p + nth;
2895  if (e < e2) {
2896  *nthp = nth;
2897  return (char *)e;
2898  }
2899  if (ISASCII(*p)) {
 /* consume an ASCII run, looking no further than e2 */
2900  p2 = search_nonascii(p, e2);
2901  if (!p2) {
2902  nth -= e2 - p;
2903  *nthp = nth;
2904  return (char *)e2;
2905  }
2906  nth -= p2 - p;
2907  p = p2;
2908  }
2909  n = rb_enc_mbclen(p, e, enc);
2910  p += n;
2911  nth--;
2912  }
2913  *nthp = nth;
2914  if (nth != 0) {
2915  return (char *)e;
2916  }
2917  return (char *)p;
2918  }
2919  else {
2920  while (p < e && nth--) {
2921  p += rb_enc_mbclen(p, e, enc);
2922  }
2923  }
2924  if (p > e) p = e;
2925  *nthp = nth;
2926  return (char*)p;
2927 }
2928 
2929 char*
2930 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2931 {
2932  return str_nth_len(p, e, &nth, enc);
2933 }
2934 
2935 static char*
2936 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2937 {
2938  if (singlebyte)
2939  p += nth;
2940  else {
2941  p = str_nth_len(p, e, &nth, enc);
2942  }
2943  if (!p) return 0;
2944  if (p > e) p = e;
2945  return (char *)p;
2946 }
2947 
2948 /* char offset to byte offset */
2949 static long
2950 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2951 {
2952  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2953  if (!pp) return e - p;
2954  return pp - p;
2955 }
2956 
2957 long
2958 rb_str_offset(VALUE str, long pos)
2959 {
2960  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2961  STR_ENC_GET(str), single_byte_optimizable(str));
2962 }
2963 
2964 #ifdef NONASCII_MASK
/*
 * Find the position of the (*nthp)-th character in [p, e) by counting
 * UTF-8 lead bytes; *nthp receives the count still outstanding on return.
 * NOTE(review): the byte classification trusts is_utf8_lead_byte(), so the
 * input is presumably expected to be valid UTF-8 - confirm with callers.
 */
2965 static char *
2966 str_utf8_nth(const char *p, const char *e, long *nthp)
2967 {
2968  long nth = *nthp;
 /* word-at-a-time path, taken only when both the range and nth are large enough */
2969  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2970  const uintptr_t *s, *t;
2971  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
 /* s: p rounded up to a word boundary, t: e rounded down to one */
2972  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2973  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2974  while (p < (const char *)s) {
2975  if (is_utf8_lead_byte(*p)) nth--;
2976  p++;
2977  }
2978  do {
2979  nth -= count_utf8_lead_bytes_with_word(s);
2980  s++;
2981  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2982  p = (char *)s;
2983  }
 /* byte-at-a-time: stop on the lead byte reached with nth == 0 */
2984  while (p < e) {
2985  if (is_utf8_lead_byte(*p)) {
2986  if (nth == 0) break;
2987  nth--;
2988  }
2989  p++;
2990  }
2991  *nthp = nth;
2992  return (char *)p;
2993 }
2994 
/* Byte distance from p to the position str_utf8_nth() finds for nth. */
2995 static long
2996 str_utf8_offset(const char *p, const char *e, long nth)
2997 {
2998  const char *pp = str_utf8_nth(p, e, &nth);
2999  return pp - p;
3000 }
3001 #endif
3002 
3003 /* byte offset to char offset */
3004 long
3005 rb_str_sublen(VALUE str, long pos)
3006 {
3007  if (single_byte_optimizable(str) || pos < 0)
3008  return pos;
3009  else {
3010  char *p = RSTRING_PTR(str);
3011  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3012  }
3013 }
3014 
/*
 * Build a new String from the bytes [beg, beg + len) of str.
 * The bytes are copied when SHARABLE_SUBSTRING_P() rejects the range or
 * when the result fits into the embedded area; otherwise the new string
 * shares str's buffer (str_replace_shared) with its pointer moved forward
 * by beg.  beg and len are byte counts and must lie inside str.
 */
3015 static VALUE
3016 str_subseq(VALUE str, long beg, long len)
3017 {
3018  VALUE str2;
3019 
3020  RUBY_ASSERT(beg >= 0);
3021  RUBY_ASSERT(len >= 0);
3022  RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3023 
3024  const int termlen = TERM_LEN(str);
3025  if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3026  str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3027  RB_GC_GUARD(str);
3028  return str2;
3029  }
3030 
3031  str2 = str_alloc_heap(rb_cString);
3032  if (str_embed_capa(str2) >= len + termlen) {
 /* small enough: copy into the embedded area instead of sharing */
3033  char *ptr2 = RSTRING(str2)->as.embed.ary;
3034  STR_SET_EMBED(str2);
3035  memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3036  TERM_FILL(ptr2+len, termlen);
3037 
3038  STR_SET_LEN(str2, len);
3039  RB_GC_GUARD(str);
3040  }
3041  else {
3042  str_replace_shared(str2, str);
3043  RUBY_ASSERT(!STR_EMBED_P(str2));
3044  ENC_CODERANGE_CLEAR(str2);
 /* narrow the shared view to the requested range */
3045  RSTRING(str2)->as.heap.ptr += beg;
3046  if (RSTRING_LEN(str2) > len) {
3047  STR_SET_LEN(str2, len);
3048  }
3049  }
3050 
3051  return str2;
3052 }
3053 
3054 VALUE
3055 rb_str_subseq(VALUE str, long beg, long len)
3056 {
3057  VALUE str2 = str_subseq(str, beg, len);
3058  rb_enc_cr_str_copy_for_substr(str2, str);
3059  return str2;
3060 }
3061 
/*
 * Resolve a character-based range (beg, *lenp) of str into bytes.
 * Returns the byte pointer where the range starts and stores the range's
 * byte length in *lenp; returns 0 (without touching *lenp) when the range
 * lies outside the string or *lenp is negative.  A negative beg counts
 * from the end of the string.
 */
3062 char *
3063 rb_str_subpos(VALUE str, long beg, long *lenp)
3064 {
3065  long len = *lenp;
3066  long slen = -1L;
3067  long blen = RSTRING_LEN(str);
3068  rb_encoding *enc = STR_ENC_GET(str);
3069  char *p, *s = RSTRING_PTR(str), *e = s + blen;
3070 
3071  if (len < 0) return 0;
3072  if (!blen) {
3073  len = 0;
3074  }
 /* one byte per character: plain index arithmetic */
3075  if (single_byte_optimizable(str)) {
3076  if (beg > blen) return 0;
3077  if (beg < 0) {
3078  beg += blen;
3079  if (beg < 0) return 0;
3080  }
3081  if (len > blen - beg)
3082  len = blen - beg;
3083  if (len < 0) return 0;
3084  p = s + beg;
3085  goto end;
3086  }
3087  if (beg < 0) {
3088  if (len > -beg) len = -beg;
 /* near the end of a long string: walk backwards instead of counting the whole string */
3089  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
3090  beg = -beg;
3091  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3092  p = e;
3093  if (!p) return 0;
3094  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3095  if (!p) return 0;
3096  len = e - p;
3097  goto end;
3098  }
3099  else {
 /* turn the negative index into a non-negative character index */
3100  slen = str_strlen(str, enc);
3101  beg += slen;
3102  if (beg < 0) return 0;
3103  p = s + beg;
3104  if (len == 0) goto end;
3105  }
3106  }
3107  else if (beg > 0 && beg > RSTRING_LEN(str)) {
 /* more characters requested than there are bytes */
3108  return 0;
3109  }
3110  if (len == 0) {
3111  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3112  p = s + beg;
3113  }
3114 #ifdef NONASCII_MASK
3115  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3116  enc == rb_utf8_encoding()) {
3117  p = str_utf8_nth(s, e, &beg);
3118  if (beg > 0) return 0;
3119  len = str_utf8_offset(p, e, len);
3120  }
3121 #endif
3122  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
 /* fixed-width encoding: scale character counts to bytes */
3123  int char_sz = rb_enc_mbmaxlen(enc);
3124 
3125  p = s + beg * char_sz;
3126  if (p > e) {
3127  return 0;
3128  }
3129  else if (len * char_sz > e - p)
3130  len = e - p;
3131  else
3132  len *= char_sz;
3133  }
3134  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
 /* start landed on the end: valid only if every requested character was consumed */
3135  if (beg > 0) return 0;
3136  len = 0;
3137  }
3138  else {
3139  len = str_offset(p, e, len, enc, 0);
3140  }
3141  end:
3142  *lenp = len;
3143  RB_GC_GUARD(str);
3144  return p;
3145 }
3146 
3147 static VALUE str_substr(VALUE str, long beg, long len, int empty);
3148 
3149 VALUE
3150 rb_str_substr(VALUE str, long beg, long len)
3151 {
3152  return str_substr(str, beg, len, TRUE);
3153 }
3154 
3155 VALUE
3156 rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3157 {
3158  return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3159 }
3160 
3161 static VALUE
3162 str_substr(VALUE str, long beg, long len, int empty)
3163 {
3164  char *p = rb_str_subpos(str, beg, &len);
3165 
3166  if (!p) return Qnil;
3167  if (!len && !empty) return Qnil;
3168 
3169  beg = p - RSTRING_PTR(str);
3170 
3171  VALUE str2 = str_subseq(str, beg, len);
3172  rb_enc_cr_str_copy_for_substr(str2, str);
3173  return str2;
3174 }
3175 
3176 /* :nodoc: */
3177 VALUE
3179 {
3180  if (CHILLED_STRING_P(str)) {
3181  FL_UNSET_RAW(str, STR_CHILLED);
3182  }
3183 
3184  if (OBJ_FROZEN(str)) return str;
3185  rb_str_resize(str, RSTRING_LEN(str));
3186  return rb_obj_freeze(str);
3187 }
3188 
3189 /*
3190  * call-seq:
3191  * +string -> new_string or self
3192  *
3193  * Returns +self+ if +self+ is not frozen.
3194  *
3195  * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3196  */
3197 static VALUE
3198 str_uplus(VALUE str)
3199 {
3200  if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3201  return rb_str_dup(str);
3202  }
3203  else {
3204  return str;
3205  }
3206 }
3207 
3208 /*
3209  * call-seq:
3210  * -string -> frozen_string
3211  * dedup -> frozen_string
3212  *
3213  * Returns a frozen, possibly pre-existing copy of the string.
3214  *
3215  * The returned +String+ will be deduplicated as long as it does not have
3216  * any instance variables set on it and is not a String subclass.
3217  *
3218  * Note that <tt>-string</tt> variant is more convenient for defining
3219  * constants:
3220  *
3221  * FILENAME = -'config/database.yml'
3222  *
3223  * while +dedup+ is better suitable for using the method in chains
3224  * of calculations:
3225  *
3226  * @url_list.concat(urls.map(&:dedup))
3227  *
3228  */
3229 static VALUE
3230 str_uminus(VALUE str)
3231 {
3232  if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3233  str = rb_str_dup(str);
3234  }
3235  return rb_fstring(str);
3236 }
3237 
/* NOTE(review): presumably emits rb_str_dup_frozen() as an alias that
 * forwards to rb_str_new_frozen(str) - confirm against the definition of
 * RUBY_ALIAS_FUNCTION.  The #define then maps the name rb_str_dup_frozen
 * to rb_str_new_frozen for the rest of this file. */
3238 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3239 #define rb_str_dup_frozen rb_str_new_frozen
3240 
3241 VALUE
3242 rb_str_locktmp(VALUE str)
3243 {
3244  if (FL_TEST(str, STR_TMPLOCK)) {
3245  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3246  }
3247  FL_SET(str, STR_TMPLOCK);
3248  return str;
3249 }
3250 
3251 VALUE
3253 {
3254  if (!FL_TEST(str, STR_TMPLOCK)) {
3255  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3256  }
3257  FL_UNSET(str, STR_TMPLOCK);
3258  return str;
3259 }
3260 
3261 VALUE
3262 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3263 {
3264  rb_str_locktmp(str);
3265  return rb_ensure(func, arg, rb_str_unlocktmp, str);
3266 }
3267 
3268 void
3270 {
3271  long capa;
3272  const int termlen = TERM_LEN(str);
3273 
3274  str_modifiable(str);
3275  if (STR_SHARED_P(str)) {
3276  rb_raise(rb_eRuntimeError, "can't set length of shared string");
3277  }
3278  if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3279  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3280  }
3281 
3282  int cr = ENC_CODERANGE(str);
3283  if (len == 0) {
3284  /* Empty string does not contain non-ASCII */
3286  }
3287  else if (cr == ENC_CODERANGE_UNKNOWN) {
3288  /* Leave unknown. */
3289  }
3290  else if (len > RSTRING_LEN(str)) {
3291  if (ENC_CODERANGE_CLEAN_P(cr)) {
3292  /* Update the coderange regarding the extended part. */
3293  const char *const prev_end = RSTRING_END(str);
3294  const char *const new_end = RSTRING_PTR(str) + len;
3295  rb_encoding *enc = rb_enc_get(str);
3296  rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3297  ENC_CODERANGE_SET(str, cr);
3298  }
3299  else if (cr == ENC_CODERANGE_BROKEN) {
3300  /* May be valid now, by appended part. */
3302  }
3303  }
3304  else if (len < RSTRING_LEN(str)) {
3305  if (cr != ENC_CODERANGE_7BIT) {
3306  /* ASCII-only string is keeping after truncated. Valid
3307  * and broken may be invalid or valid, leave unknown. */
3309  }
3310  }
3311 
3312  STR_SET_LEN(str, len);
3313  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3314 }
3315 
3316 VALUE
3318 {
3319  if (len < 0) {
3320  rb_raise(rb_eArgError, "negative string size (or size too big)");
3321  }
3322 
3323  int independent = str_independent(str);
3324  long slen = RSTRING_LEN(str);
3325  const int termlen = TERM_LEN(str);
3326 
3327  if (slen > len || (termlen != 1 && slen < len)) {
3328  ENC_CODERANGE_CLEAR(str);
3329  }
3330 
3331  {
3332  long capa;
3333  if (STR_EMBED_P(str)) {
3334  if (len == slen) return str;
3335  if (str_embed_capa(str) >= len + termlen) {
3336  STR_SET_LEN(str, len);
3337  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3338  return str;
3339  }
3340  str_make_independent_expand(str, slen, len - slen, termlen);
3341  }
3342  else if (str_embed_capa(str) >= len + termlen) {
3343  char *ptr = STR_HEAP_PTR(str);
3344  STR_SET_EMBED(str);
3345  if (slen > len) slen = len;
3346  if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3347  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3348  STR_SET_LEN(str, len);
3349  if (independent) ruby_xfree(ptr);
3350  return str;
3351  }
3352  else if (!independent) {
3353  if (len == slen) return str;
3354  str_make_independent_expand(str, slen, len - slen, termlen);
3355  }
3356  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3357  (capa - len) > (len < 1024 ? len : 1024)) {
3358  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3359  (size_t)len + termlen, STR_HEAP_SIZE(str));
3360  RSTRING(str)->as.heap.aux.capa = len;
3361  }
3362  else if (len == slen) return str;
3363  STR_SET_LEN(str, len);
3364  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3365  }
3366  return str;
3367 }
3368 
3369 static void
3370 str_ensure_available_capa(VALUE str, long len)
3371 {
3372  str_modify_keep_cr(str);
3373 
3374  const int termlen = TERM_LEN(str);
3375  long olen = RSTRING_LEN(str);
3376 
3377  if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3378  rb_raise(rb_eArgError, "string sizes too big");
3379  }
3380 
3381  long total = olen + len;
3382  long capa = str_capacity(str, termlen);
3383 
3384  if (capa < total) {
3385  if (total >= LONG_MAX / 2) {
3386  capa = total;
3387  }
3388  while (total > capa) {
3389  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3390  }
3391  RESIZE_CAPA_TERM(str, capa, termlen);
3392  }
3393 }
3394 
3395 static VALUE
3396 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3397 {
3398  if (keep_cr) {
3399  str_modify_keep_cr(str);
3400  }
3401  else {
3402  rb_str_modify(str);
3403  }
3404  if (len == 0) return 0;
3405 
3406  long total, olen, off = -1;
3407  char *sptr;
3408  const int termlen = TERM_LEN(str);
3409 
3410  RSTRING_GETMEM(str, sptr, olen);
3411  if (ptr >= sptr && ptr <= sptr + olen) {
3412  off = ptr - sptr;
3413  }
3414 
3415  long capa = str_capacity(str, termlen);
3416 
3417  if (olen > LONG_MAX - len) {
3418  rb_raise(rb_eArgError, "string sizes too big");
3419  }
3420  total = olen + len;
3421  if (capa < total) {
3422  if (total >= LONG_MAX / 2) {
3423  capa = total;
3424  }
3425  while (total > capa) {
3426  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3427  }
3428  RESIZE_CAPA_TERM(str, capa, termlen);
3429  sptr = RSTRING_PTR(str);
3430  }
3431  if (off != -1) {
3432  ptr = sptr + off;
3433  }
3434  memcpy(sptr + olen, ptr, len);
3435  STR_SET_LEN(str, total);
3436  TERM_FILL(sptr + total, termlen); /* sentinel */
3437 
3438  return str;
3439 }
3440 
/* Shorthands for str_buf_cat4() with keep_cr == false; str_buf_cat2 takes
 * the byte count from rb_strlen_lit(ptr). */
3441 #define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
3442 #define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3443 
3444 VALUE
3445 rb_str_cat(VALUE str, const char *ptr, long len)
3446 {
3447  if (len == 0) return str;
3448  if (len < 0) {
3449  rb_raise(rb_eArgError, "negative string size (or size too big)");
3450  }
3451  return str_buf_cat(str, ptr, len);
3452 }
3453 
3454 VALUE
3455 rb_str_cat_cstr(VALUE str, const char *ptr)
3456 {
3457  must_not_null(ptr);
3458  return rb_str_buf_cat(str, ptr, strlen(ptr));
3459 }
3460 
3461 static void
3462 rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3463 {
3464  RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3465 
3466  // We can't write directly to shared strings without impacting others, so we must make the string independent.
3467  if (UNLIKELY(!str_independent(str))) {
3468  str_make_independent(str);
3469  }
3470 
3471  long string_length = -1;
3472  const int null_terminator_length = 1;
3473  char *sptr;
3474  RSTRING_GETMEM(str, sptr, string_length);
3475 
3476  // Ensure the resulting string wouldn't be too long.
3477  if (UNLIKELY(string_length > LONG_MAX - 1)) {
3478  rb_raise(rb_eArgError, "string sizes too big");
3479  }
3480 
3481  long string_capacity = str_capacity(str, null_terminator_length);
3482 
3483  // Get the code range before any modifications since those might clear the code range.
3484  int cr = ENC_CODERANGE(str);
3485 
3486  // Check if the string has spare string_capacity to write the new byte.
3487  if (LIKELY(string_capacity >= string_length + 1)) {
3488  // In fast path we can write the new byte and note the string's new length.
3489  sptr[string_length] = byte;
3490  STR_SET_LEN(str, string_length + 1);
3491  TERM_FILL(sptr + string_length + 1, null_terminator_length);
3492  }
3493  else {
3494  // If there's not enough string_capacity, make a call into the general string concatenation function.
3495  str_buf_cat(str, (char *)&byte, 1);
3496  }
3497 
3498  // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3499  // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3500  // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3501  // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3502  if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3503  if (ISASCII(byte)) {
3505  }
3506  else {
3508 
3509  // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3510  if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3511  rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3512  }
3513  }
3514  }
3515 }
3516 
/* NOTE(review): presumably emit rb_str_buf_cat(), rb_str_buf_cat2() and
 * rb_str_cat2() as aliases forwarding to rb_str_cat() / rb_str_cat_cstr()
 * with the argument lists given last - confirm against the definition of
 * RUBY_ALIAS_FUNCTION. */
3517 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3518 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3519 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3520 
3521 static VALUE
3522 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3523  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3524 {
3525  int str_encindex = ENCODING_GET(str);
3526  int res_encindex;
3527  int str_cr, res_cr;
3528  rb_encoding *str_enc, *ptr_enc;
3529 
3530  str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3531 
3532  if (str_encindex == ptr_encindex) {
3533  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3534  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3535  }
3536  }
3537  else {
3538  str_enc = rb_enc_from_index(str_encindex);
3539  ptr_enc = rb_enc_from_index(ptr_encindex);
3540  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3541  if (len == 0)
3542  return str;
3543  if (RSTRING_LEN(str) == 0) {
3544  rb_str_buf_cat(str, ptr, len);
3545  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3546  rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3547  return str;
3548  }
3549  goto incompatible;
3550  }
3551  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3552  ptr_cr = coderange_scan(ptr, len, ptr_enc);
3553  }
3554  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3555  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3556  str_cr = rb_enc_str_coderange(str);
3557  }
3558  }
3559  }
3560  if (ptr_cr_ret)
3561  *ptr_cr_ret = ptr_cr;
3562 
3563  if (str_encindex != ptr_encindex &&
3564  str_cr != ENC_CODERANGE_7BIT &&
3565  ptr_cr != ENC_CODERANGE_7BIT) {
3566  str_enc = rb_enc_from_index(str_encindex);
3567  ptr_enc = rb_enc_from_index(ptr_encindex);
3568  goto incompatible;
3569  }
3570 
3571  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3572  res_encindex = str_encindex;
3573  res_cr = ENC_CODERANGE_UNKNOWN;
3574  }
3575  else if (str_cr == ENC_CODERANGE_7BIT) {
3576  if (ptr_cr == ENC_CODERANGE_7BIT) {
3577  res_encindex = str_encindex;
3578  res_cr = ENC_CODERANGE_7BIT;
3579  }
3580  else {
3581  res_encindex = ptr_encindex;
3582  res_cr = ptr_cr;
3583  }
3584  }
3585  else if (str_cr == ENC_CODERANGE_VALID) {
3586  res_encindex = str_encindex;
3587  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3588  res_cr = str_cr;
3589  else
3590  res_cr = ptr_cr;
3591  }
3592  else { /* str_cr == ENC_CODERANGE_BROKEN */
3593  res_encindex = str_encindex;
3594  res_cr = str_cr;
3595  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3596  }
3597 
3598  if (len < 0) {
3599  rb_raise(rb_eArgError, "negative string size (or size too big)");
3600  }
3601  str_buf_cat(str, ptr, len);
3602  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3603  return str;
3604 
3605  incompatible:
3606  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3607  rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3609 }
3610 
3611 VALUE
3612 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3613 {
3614  return rb_enc_cr_str_buf_cat(str, ptr, len,
3615  rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3616 }
3617 
3618 VALUE
3619 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3620 {
3621  /* ptr must reference NUL terminated ASCII string. */
3622  int encindex = ENCODING_GET(str);
3623  rb_encoding *enc = rb_enc_from_index(encindex);
3624  if (rb_enc_asciicompat(enc)) {
3625  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3626  encindex, ENC_CODERANGE_7BIT, 0);
3627  }
3628  else {
3629  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3630  while (*ptr) {
3631  unsigned int c = (unsigned char)*ptr;
3632  int len = rb_enc_codelen(c, enc);
3633  rb_enc_mbcput(c, buf, enc);
3634  rb_enc_cr_str_buf_cat(str, buf, len,
3635  encindex, ENC_CODERANGE_VALID, 0);
3636  ptr++;
3637  }
3638  return str;
3639  }
3640 }
3641 
3642 VALUE
3644 {
3645  int str2_cr = rb_enc_str_coderange(str2);
3646 
3647  if (str_enc_fastpath(str)) {
3648  switch (str2_cr) {
3649  case ENC_CODERANGE_7BIT:
3650  // If RHS is 7bit we can do simple concatenation
3651  str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3652  RB_GC_GUARD(str2);
3653  return str;
3654  case ENC_CODERANGE_VALID:
3655  // If RHS is valid, we can do simple concatenation if encodings are the same
3656  if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3657  str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3658  int str_cr = ENC_CODERANGE(str);
3659  if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3660  ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3661  }
3662  RB_GC_GUARD(str2);
3663  return str;
3664  }
3665  }
3666  }
3667 
3668  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3669  ENCODING_GET(str2), str2_cr, &str2_cr);
3670 
3671  ENC_CODERANGE_SET(str2, str2_cr);
3672 
3673  return str;
3674 }
3675 
3676 VALUE
3678 {
3679  StringValue(str2);
3680  return rb_str_buf_append(str, str2);
3681 }
3682 
3683 VALUE
3684 rb_str_concat_literals(size_t num, const VALUE *strary)
3685 {
3686  VALUE str;
3687  size_t i, s = 0;
3688  unsigned long len = 1;
3689 
3690  if (UNLIKELY(!num)) return rb_str_new(0, 0);
3691  if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3692 
3693  for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3694  str = rb_str_buf_new(len);
3695  str_enc_copy_direct(str, strary[0]);
3696 
3697  for (i = s; i < num; ++i) {
3698  const VALUE v = strary[i];
3699  int encidx = ENCODING_GET(v);
3700 
3701  rb_str_buf_append(str, v);
3702  if (encidx != ENCINDEX_US_ASCII) {
3703  if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3704  rb_enc_set_index(str, encidx);
3705  }
3706  }
3707  return str;
3708 }
3709 
3710 /*
3711  * call-seq:
3712  * concat(*objects) -> string
3713  *
3714  * Concatenates each object in +objects+ to +self+ and returns +self+:
3715  *
3716  * s = 'foo'
3717  * s.concat('bar', 'baz') # => "foobarbaz"
3718  * s # => "foobarbaz"
3719  *
3720  * For each given object +object+ that is an Integer,
3721  * the value is considered a codepoint and converted to a character before concatenation:
3722  *
3723  * s = 'foo'
3724  * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3725  *
3726  * Related: String#<<, which takes a single argument.
3727  */
3728 static VALUE
3729 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3730 {
3731  str_modifiable(str);
3732 
3733  if (argc == 1) {
3734  return rb_str_concat(str, argv[0]);
3735  }
3736  else if (argc > 1) {
3737  int i;
3738  VALUE arg_str = rb_str_tmp_new(0);
3739  rb_enc_copy(arg_str, str);
3740  for (i = 0; i < argc; i++) {
3741  rb_str_concat(arg_str, argv[i]);
3742  }
3743  rb_str_buf_append(str, arg_str);
3744  }
3745 
3746  return str;
3747 }
3748 
3749 /*
3750  * call-seq:
3751  * append_as_bytes(*objects) -> string
3752  *
3753  * Concatenates each object in +objects+ into +self+ without any encoding
3754  * validation or conversion and returns +self+:
3755  *
3756  * s = 'foo'
3757  * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3758  * s.valid_encoding? # => false
3759  * s.append_as_bytes("\xAC 12")
3760  * s.valid_encoding? # => true
3761  *
3762  * For each given object +object+ that is an Integer,
3763  * the value is considered a Byte. If the Integer is bigger
3764  * than one byte, only the lower byte is considered, similar to String#setbyte:
3765  *
3766  * s = ""
3767  * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3768  *
3769  * Related: String#<<, String#concat, which do an encoding aware concatenation.
3770  */
3771 
3772 VALUE
3773 rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3774 {
3775  long needed_capacity = 0;
3776  volatile VALUE t0;
3777  enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3778 
3779  for (int index = 0; index < argc; index++) {
3780  VALUE obj = argv[index];
3781  enum ruby_value_type type = types[index] = rb_type(obj);
3782  switch (type) {
3783  case T_FIXNUM:
3784  case T_BIGNUM:
3785  needed_capacity++;
3786  break;
3787  case T_STRING:
3788  needed_capacity += RSTRING_LEN(obj);
3789  break;
3790  default:
3791  rb_raise(
3792  rb_eTypeError,
3793  "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3794  rb_obj_class(obj)
3795  );
3796  break;
3797  }
3798  }
3799 
3800  str_ensure_available_capa(str, needed_capacity);
3801  char *sptr = RSTRING_END(str);
3802 
3803  for (int index = 0; index < argc; index++) {
3804  VALUE obj = argv[index];
3805  enum ruby_value_type type = types[index];
3806  switch (type) {
3807  case T_FIXNUM:
3808  case T_BIGNUM: {
3809  argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3810  char byte = (char)(NUM2INT(obj) & 0xFF);
3811  *sptr = byte;
3812  sptr++;
3813  break;
3814  }
3815  case T_STRING: {
3816  const char *ptr;
3817  long len;
3818  RSTRING_GETMEM(obj, ptr, len);
3819  memcpy(sptr, ptr, len);
3820  sptr += len;
3821  break;
3822  }
3823  default:
3824  rb_bug("append_as_bytes arguments should have been validated");
3825  }
3826  }
3827 
3828  STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3829  TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3830 
3831  int cr = ENC_CODERANGE(str);
3832  switch (cr) {
3833  case ENC_CODERANGE_7BIT: {
3834  for (int index = 0; index < argc; index++) {
3835  VALUE obj = argv[index];
3836  enum ruby_value_type type = types[index];
3837  switch (type) {
3838  case T_FIXNUM:
3839  case T_BIGNUM: {
3840  if (!ISASCII(NUM2INT(obj))) {
3841  goto clear_cr;
3842  }
3843  break;
3844  }
3845  case T_STRING: {
3846  if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3847  goto clear_cr;
3848  }
3849  break;
3850  }
3851  default:
3852  rb_bug("append_as_bytes arguments should have been validated");
3853  }
3854  }
3855  break;
3856  }
3857  case ENC_CODERANGE_VALID:
3858  if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3859  goto keep_cr;
3860  }
3861  else {
3862  goto clear_cr;
3863  }
3864  break;
3865  default:
3866  goto clear_cr;
3867  break;
3868  }
3869 
3870  RB_GC_GUARD(t0);
3871 
3872  clear_cr:
3873  // If no fast path was hit, we clear the coderange.
3874  // append_as_bytes is predominently meant to be used in
3875  // buffering situation, hence it's likely the coderange
3876  // will never be scanned, so it's not worth spending time
3877  // precomputing the coderange except for simple and common
3878  // situations.
3879  ENC_CODERANGE_CLEAR(str);
3880  keep_cr:
3881  return str;
3882 }
3883 
3884 /*
3885  * call-seq:
3886  * string << object -> string
3887  *
3888  * Concatenates +object+ to +self+ and returns +self+:
3889  *
3890  * s = 'foo'
3891  * s << 'bar' # => "foobar"
3892  * s # => "foobar"
3893  *
3894  * If +object+ is an Integer,
3895  * the value is considered a codepoint and converted to a character before concatenation:
3896  *
3897  * s = 'foo'
3898  * s << 33 # => "foo!"
3899  *
3900  * If that codepoint is not representable in the encoding of
3901  * _string_, RangeError is raised.
3902  *
3903  * s = 'foo'
3904  * s.encoding # => <Encoding:UTF-8>
3905  * s << 0x00110000 # 1114112 out of char range (RangeError)
3906  * s = 'foo'.encode('EUC-JP')
3907  * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3908  *
3909  * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3910  * is automatically promoted to ASCII-8BIT.
3911  *
3912  * s = 'foo'.encode('US-ASCII')
3913  * s << 0xff
3914  * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3915  *
3916  * Related: String#concat, which takes multiple arguments.
3917  */
3918 VALUE
3920 {
3921  unsigned int code;
3922  rb_encoding *enc = STR_ENC_GET(str1);
3923  int encidx;
3924 
3925  if (RB_INTEGER_TYPE_P(str2)) {
3926  if (rb_num_to_uint(str2, &code) == 0) {
3927  }
3928  else if (FIXNUM_P(str2)) {
3929  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3930  }
3931  else {
3932  rb_raise(rb_eRangeError, "bignum out of char range");
3933  }
3934  }
3935  else {
3936  return rb_str_append(str1, str2);
3937  }
3938 
3939  encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3940 
3941  if (encidx >= 0) {
3942  rb_str_buf_cat_byte(str1, (unsigned char)code);
3943  }
3944  else {
3945  long pos = RSTRING_LEN(str1);
3946  int cr = ENC_CODERANGE(str1);
3947  int len;
3948  char *buf;
3949 
3950  switch (len = rb_enc_codelen(code, enc)) {
3951  case ONIGERR_INVALID_CODE_POINT_VALUE:
3952  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3953  break;
3954  case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3955  case 0:
3956  rb_raise(rb_eRangeError, "%u out of char range", code);
3957  break;
3958  }
3959  buf = ALLOCA_N(char, len + 1);
3960  rb_enc_mbcput(code, buf, enc);
3961  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3962  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3963  }
3964  rb_str_resize(str1, pos+len);
3965  memcpy(RSTRING_PTR(str1) + pos, buf, len);
3966  if (cr == ENC_CODERANGE_7BIT && code > 127) {
3967  cr = ENC_CODERANGE_VALID;
3968  }
3969  else if (cr == ENC_CODERANGE_BROKEN) {
3970  cr = ENC_CODERANGE_UNKNOWN;
3971  }
3972  ENC_CODERANGE_SET(str1, cr);
3973  }
3974  return str1;
3975 }
3976 
3977 int
3978 rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3979 {
3980  int encidx = rb_enc_to_index(enc);
3981 
3982  if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3983  /* US-ASCII automatically extended to ASCII-8BIT */
3984  if (code > 0xFF) {
3985  rb_raise(rb_eRangeError, "%u out of char range", code);
3986  }
3987  if (encidx == ENCINDEX_US_ASCII && code > 127) {
3988  return ENCINDEX_ASCII_8BIT;
3989  }
3990  return encidx;
3991  }
3992  else {
3993  return -1;
3994  }
3995 }
3996 
3997 /*
3998  * call-seq:
3999  * prepend(*other_strings) -> string
4000  *
4001  * Prepends each string in +other_strings+ to +self+ and returns +self+:
4002  *
4003  * s = 'foo'
4004  * s.prepend('bar', 'baz') # => "barbazfoo"
4005  * s # => "barbazfoo"
4006  *
4007  * Related: String#concat.
4008  */
4009 
4010 static VALUE
4011 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4012 {
4013  str_modifiable(str);
4014 
4015  if (argc == 1) {
4016  rb_str_update(str, 0L, 0L, argv[0]);
4017  }
4018  else if (argc > 1) {
4019  int i;
4020  VALUE arg_str = rb_str_tmp_new(0);
4021  rb_enc_copy(arg_str, str);
4022  for (i = 0; i < argc; i++) {
4023  rb_str_append(arg_str, argv[i]);
4024  }
4025  rb_str_update(str, 0L, 0L, arg_str);
4026  }
4027 
4028  return str;
4029 }
4030 
4031 st_index_t
4033 {
4034  if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4035  st_index_t precomputed_hash;
4036  memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4037 
4038  RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4039  return precomputed_hash;
4040  }
4041 
4042  return str_do_hash(str);
4043 }
4044 
4045 int
4047 {
4048  long len1, len2;
4049  const char *ptr1, *ptr2;
4050  RSTRING_GETMEM(str1, ptr1, len1);
4051  RSTRING_GETMEM(str2, ptr2, len2);
4052  return (len1 != len2 ||
4053  !rb_str_comparable(str1, str2) ||
4054  memcmp(ptr1, ptr2, len1) != 0);
4055 }
4056 
4057 /*
4058  * call-seq:
4059  * hash -> integer
4060  *
4061  * Returns the integer hash value for +self+.
4062  * The value is based on the length, content and encoding of +self+.
4063  *
4064  * Related: Object#hash.
4065  */
4066 
4067 static VALUE
4068 rb_str_hash_m(VALUE str)
4069 {
4070  st_index_t hval = rb_str_hash(str);
4071  return ST2FIX(hval);
4072 }
4073 
/* Smaller of a and b; yields a when they compare equal.  Each argument may
 * be evaluated twice, so pass expressions without side effects. */
#define lesser(a,b) (((b)<(a))?(b):(a))
4075 
4076 int
4078 {
4079  int idx1, idx2;
4080  int rc1, rc2;
4081 
4082  if (RSTRING_LEN(str1) == 0) return TRUE;
4083  if (RSTRING_LEN(str2) == 0) return TRUE;
4084  idx1 = ENCODING_GET(str1);
4085  idx2 = ENCODING_GET(str2);
4086  if (idx1 == idx2) return TRUE;
4087  rc1 = rb_enc_str_coderange(str1);
4088  rc2 = rb_enc_str_coderange(str2);
4089  if (rc1 == ENC_CODERANGE_7BIT) {
4090  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4092  return TRUE;
4093  }
4094  if (rc2 == ENC_CODERANGE_7BIT) {
4096  return TRUE;
4097  }
4098  return FALSE;
4099 }
4100 
4101 int
4103 {
4104  long len1, len2;
4105  const char *ptr1, *ptr2;
4106  int retval;
4107 
4108  if (str1 == str2) return 0;
4109  RSTRING_GETMEM(str1, ptr1, len1);
4110  RSTRING_GETMEM(str2, ptr2, len2);
4111  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4112  if (len1 == len2) {
4113  if (!rb_str_comparable(str1, str2)) {
4114  if (ENCODING_GET(str1) > ENCODING_GET(str2))
4115  return 1;
4116  return -1;
4117  }
4118  return 0;
4119  }
4120  if (len1 > len2) return 1;
4121  return -1;
4122  }
4123  if (retval > 0) return 1;
4124  return -1;
4125 }
4126 
4127 /*
4128  * call-seq:
4129  * string == object -> true or false
4130  * string === object -> true or false
4131  *
4132  * Returns +true+ if +object+ has the same length and content
4133  * as +self+; +false+ otherwise:
4134  *
4135  * s = 'foo'
4136  * s == 'foo' # => true
4137  * s == 'food' # => false
4138  * s == 'FOO' # => false
4139  *
4140  * Returns +false+ if the two strings' encodings are not compatible:
4141  * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4142  *
4143  * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4144  * two strings are compared using <code>object.==</code>.
4145  */
4146 
4147 VALUE
4149 {
4150  if (str1 == str2) return Qtrue;
4151  if (!RB_TYPE_P(str2, T_STRING)) {
4152  if (!rb_respond_to(str2, idTo_str)) {
4153  return Qfalse;
4154  }
4155  return rb_equal(str2, str1);
4156  }
4157  return rb_str_eql_internal(str1, str2);
4158 }
4159 
4160 /*
4161  * call-seq:
4162  * eql?(object) -> true or false
4163  *
4164  * Returns +true+ if +object+ has the same length and content
4165  * as +self+; +false+ otherwise:
4166  *
4167  * s = 'foo'
4168  * s.eql?('foo') # => true
4169  * s.eql?('food') # => false
4170  * s.eql?('FOO') # => false
4171  *
4172  * Returns +false+ if the two strings' encodings are not compatible:
4173  *
4174  * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4175  *
4176  */
4177 
4178 VALUE
4179 rb_str_eql(VALUE str1, VALUE str2)
4180 {
4181  if (str1 == str2) return Qtrue;
4182  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4183  return rb_str_eql_internal(str1, str2);
4184 }
4185 
4186 /*
4187  * call-seq:
4188  * string <=> other_string -> -1, 0, 1, or nil
4189  *
4190  * Compares +self+ and +other_string+, returning:
4191  *
4192  * - -1 if +other_string+ is larger.
4193  * - 0 if the two are equal.
4194  * - 1 if +other_string+ is smaller.
4195  * - +nil+ if the two are incomparable.
4196  *
4197  * Examples:
4198  *
4199  * 'foo' <=> 'foo' # => 0
4200  * 'foo' <=> 'food' # => -1
4201  * 'food' <=> 'foo' # => 1
4202  * 'FOO' <=> 'foo' # => -1
4203  * 'foo' <=> 'FOO' # => 1
4204  * 'foo' <=> 1 # => nil
4205  *
4206  */
4207 
4208 static VALUE
4209 rb_str_cmp_m(VALUE str1, VALUE str2)
4210 {
4211  int result;
4212  VALUE s = rb_check_string_type(str2);
4213  if (NIL_P(s)) {
4214  return rb_invcmp(str1, str2);
4215  }
4216  result = rb_str_cmp(str1, s);
4217  return INT2FIX(result);
4218 }
4219 
/* Forward declarations: both helpers are defined further down in this file,
 * after the method entry points (rb_str_casecmp, rb_str_casecmp_p) that
 * call them. */
4220 static VALUE str_casecmp(VALUE str1, VALUE str2);
4221 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4222 
4223 /*
4224  * call-seq:
4225  * casecmp(other_string) -> -1, 0, 1, or nil
4226  *
4227  * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4228  *
4229  * - -1 if <tt>other_string.downcase</tt> is larger.
4230  * - 0 if the two are equal.
4231  * - 1 if <tt>other_string.downcase</tt> is smaller.
4232  * - +nil+ if the two are incomparable.
4233  *
4234  * Examples:
4235  *
4236  * 'foo'.casecmp('foo') # => 0
4237  * 'foo'.casecmp('food') # => -1
4238  * 'food'.casecmp('foo') # => 1
4239  * 'FOO'.casecmp('foo') # => 0
4240  * 'foo'.casecmp('FOO') # => 0
4241  * 'foo'.casecmp(1) # => nil
4242  *
4243  * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4244  *
4245  * Related: String#casecmp?.
4246  *
4247  */
4248 
4249 static VALUE
4250 rb_str_casecmp(VALUE str1, VALUE str2)
4251 {
4252  VALUE s = rb_check_string_type(str2);
4253  if (NIL_P(s)) {
4254  return Qnil;
4255  }
4256  return str_casecmp(str1, s);
4257 }
4258 
4259 static VALUE
4260 str_casecmp(VALUE str1, VALUE str2)
4261 {
4262  long len;
4263  rb_encoding *enc;
4264  const char *p1, *p1end, *p2, *p2end;
4265 
4266  enc = rb_enc_compatible(str1, str2);
4267  if (!enc) {
4268  return Qnil;
4269  }
4270 
4271  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4272  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4273  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4274  while (p1 < p1end && p2 < p2end) {
4275  if (*p1 != *p2) {
4276  unsigned int c1 = TOLOWER(*p1 & 0xff);
4277  unsigned int c2 = TOLOWER(*p2 & 0xff);
4278  if (c1 != c2)
4279  return INT2FIX(c1 < c2 ? -1 : 1);
4280  }
4281  p1++;
4282  p2++;
4283  }
4284  }
4285  else {
4286  while (p1 < p1end && p2 < p2end) {
4287  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4288  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4289 
4290  if (0 <= c1 && 0 <= c2) {
4291  c1 = TOLOWER(c1);
4292  c2 = TOLOWER(c2);
4293  if (c1 != c2)
4294  return INT2FIX(c1 < c2 ? -1 : 1);
4295  }
4296  else {
4297  int r;
4298  l1 = rb_enc_mbclen(p1, p1end, enc);
4299  l2 = rb_enc_mbclen(p2, p2end, enc);
4300  len = l1 < l2 ? l1 : l2;
4301  r = memcmp(p1, p2, len);
4302  if (r != 0)
4303  return INT2FIX(r < 0 ? -1 : 1);
4304  if (l1 != l2)
4305  return INT2FIX(l1 < l2 ? -1 : 1);
4306  }
4307  p1 += l1;
4308  p2 += l2;
4309  }
4310  }
4311  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4312  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4313  return INT2FIX(-1);
4314 }
4315 
4316 /*
4317  * call-seq:
4318  * casecmp?(other_string) -> true, false, or nil
4319  *
4320  * Returns +true+ if +self+ and +other_string+ are equal after
4321  * Unicode case folding, otherwise +false+:
4322  *
4323  * 'foo'.casecmp?('foo') # => true
4324  * 'foo'.casecmp?('food') # => false
4325  * 'food'.casecmp?('foo') # => false
4326  * 'FOO'.casecmp?('foo') # => true
4327  * 'foo'.casecmp?('FOO') # => true
4328  *
4329  * Returns +nil+ if the two values are incomparable:
4330  *
4331  * 'foo'.casecmp?(1) # => nil
4332  *
4333  * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4334  *
4335  * Related: String#casecmp.
4336  *
4337  */
4338 
4339 static VALUE
4340 rb_str_casecmp_p(VALUE str1, VALUE str2)
4341 {
4342  VALUE s = rb_check_string_type(str2);
4343  if (NIL_P(s)) {
4344  return Qnil;
4345  }
4346  return str_casecmp_p(str1, s);
4347 }
4348 
4349 static VALUE
4350 str_casecmp_p(VALUE str1, VALUE str2)
4351 {
4352  rb_encoding *enc;
4353  VALUE folded_str1, folded_str2;
4354  VALUE fold_opt = sym_fold;
4355 
4356  enc = rb_enc_compatible(str1, str2);
4357  if (!enc) {
4358  return Qnil;
4359  }
4360 
4361  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4362  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4363 
4364  return rb_str_eql(folded_str1, folded_str2);
4365 }
4366 
4367 static long
4368 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4369  const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4370 {
4371  const char *search_start = str_ptr;
4372  long pos, search_len = str_len - offset;
4373 
4374  for (;;) {
4375  const char *t;
4376  pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4377  if (pos < 0) return pos;
4378  t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4379  if (t == search_start + pos) break;
4380  search_len -= t - search_start;
4381  if (search_len <= 0) return -1;
4382  offset += t - search_start;
4383  search_start = t;
4384  }
4385  return pos + offset;
4386 }
4387 
4388 /* found index in byte: both wrappers call rb_strseq_index, which returns
 * a byte index (or a negative value when nothing is found).  The last
 * argument is its in_byte flag: 0 for rb_str_index, where offset counts
 * characters, and 1 for rb_str_byteindex, where offset counts bytes. */
4389 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4390 #define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4391 
4392 static long
4393 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4394 {
4395  const char *str_ptr, *str_ptr_end, *sub_ptr;
4396  long str_len, sub_len;
4397  rb_encoding *enc;
4398 
4399  enc = rb_enc_check(str, sub);
4400  if (is_broken_string(sub)) return -1;
4401 
4402  str_ptr = RSTRING_PTR(str);
4403  str_ptr_end = RSTRING_END(str);
4404  str_len = RSTRING_LEN(str);
4405  sub_ptr = RSTRING_PTR(sub);
4406  sub_len = RSTRING_LEN(sub);
4407 
4408  if (str_len < sub_len) return -1;
4409 
4410  if (offset != 0) {
4411  long str_len_char, sub_len_char;
4412  int single_byte = single_byte_optimizable(str);
4413  str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4414  sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4415  if (offset < 0) {
4416  offset += str_len_char;
4417  if (offset < 0) return -1;
4418  }
4419  if (str_len_char - offset < sub_len_char) return -1;
4420  if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4421  str_ptr += offset;
4422  }
4423  if (sub_len == 0) return offset;
4424 
4425  /* need proceed one character at a time */
4426  return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4427 }
4428 
4429 
4430 /*
4431  * call-seq:
4432  * index(substring, offset = 0) -> integer or nil
4433  * index(regexp, offset = 0) -> integer or nil
4434  *
4435  * :include: doc/string/index.rdoc
4436  *
4437  */
4438 
4439 static VALUE
4440 rb_str_index_m(int argc, VALUE *argv, VALUE str)
4441 {
4442  VALUE sub;
4443  VALUE initpos;
4444  rb_encoding *enc = STR_ENC_GET(str);
4445  long pos;
4446 
4447  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4448  long slen = str_strlen(str, enc); /* str's enc */
4449  pos = NUM2LONG(initpos);
4450  if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4451  if (RB_TYPE_P(sub, T_REGEXP)) {
4453  }
4454  return Qnil;
4455  }
4456  }
4457  else {
4458  pos = 0;
4459  }
4460 
4461  if (RB_TYPE_P(sub, T_REGEXP)) {
4462  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4463  enc, single_byte_optimizable(str));
4464 
4465  if (rb_reg_search(sub, str, pos, 0) >= 0) {
4466  VALUE match = rb_backref_get();
4467  struct re_registers *regs = RMATCH_REGS(match);
4468  pos = rb_str_sublen(str, BEG(0));
4469  return LONG2NUM(pos);
4470  }
4471  }
4472  else {
4473  StringValue(sub);
4474  pos = rb_str_index(str, sub, pos);
4475  if (pos >= 0) {
4476  pos = rb_str_sublen(str, pos);
4477  return LONG2NUM(pos);
4478  }
4479  }
4480  return Qnil;
4481 }
4482 
4483 /* Ensure that the given pos is a valid character boundary.
4484  * Note that in this function, "character" means a code point
4485  * (Unicode scalar value), not a grapheme cluster.
4486  */
4487 static void
4488 str_ensure_byte_pos(VALUE str, long pos)
4489 {
4490  if (!single_byte_optimizable(str)) {
4491  const char *s = RSTRING_PTR(str);
4492  const char *e = RSTRING_END(str);
4493  const char *p = s + pos;
4494  if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4496  "offset %ld does not land on character boundary", pos);
4497  }
4498  }
4499 }
4500 
4501 /*
4502  * call-seq:
4503  * byteindex(substring, offset = 0) -> integer or nil
4504  * byteindex(regexp, offset = 0) -> integer or nil
4505  *
4506  * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4507  * or +nil+ if none found:
4508  *
4509  * 'foo'.byteindex('f') # => 0
4510  * 'foo'.byteindex('o') # => 1
4511  * 'foo'.byteindex('oo') # => 1
4512  * 'foo'.byteindex('ooo') # => nil
4513  *
4514  * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4515  * or +nil+ if none found:
4516  *
4517  * 'foo'.byteindex(/f/) # => 0
4518  * 'foo'.byteindex(/o/) # => 1
4519  * 'foo'.byteindex(/oo/) # => 1
4520  * 'foo'.byteindex(/ooo/) # => nil
4521  *
4522  * Integer argument +offset+, if given, specifies the byte-based position in the
4523  * string to begin the search:
4524  *
4525  * 'foo'.byteindex('o', 1) # => 1
4526  * 'foo'.byteindex('o', 2) # => 2
4527  * 'foo'.byteindex('o', 3) # => nil
4528  *
4529  * If +offset+ is negative, counts backward from the end of +self+:
4530  *
4531  * 'foo'.byteindex('o', -1) # => 2
4532  * 'foo'.byteindex('o', -2) # => 1
4533  * 'foo'.byteindex('o', -3) # => 1
4534  * 'foo'.byteindex('o', -4) # => nil
4535  *
4536  * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4537  * raised.
4538  *
4539  * Related: String#index, String#byterindex.
4540  */
4541 
4542 static VALUE
4543 rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4544 {
4545  VALUE sub;
4546  VALUE initpos;
4547  long pos;
4548 
4549  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4550  long slen = RSTRING_LEN(str);
4551  pos = NUM2LONG(initpos);
4552  if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4553  if (RB_TYPE_P(sub, T_REGEXP)) {
4555  }
4556  return Qnil;
4557  }
4558  }
4559  else {
4560  pos = 0;
4561  }
4562 
4563  str_ensure_byte_pos(str, pos);
4564 
4565  if (RB_TYPE_P(sub, T_REGEXP)) {
4566  if (rb_reg_search(sub, str, pos, 0) >= 0) {
4567  VALUE match = rb_backref_get();
4568  struct re_registers *regs = RMATCH_REGS(match);
4569  pos = BEG(0);
4570  return LONG2NUM(pos);
4571  }
4572  }
4573  else {
4574  StringValue(sub);
4575  pos = rb_str_byteindex(str, sub, pos);
4576  if (pos >= 0) return LONG2NUM(pos);
4577  }
4578  return Qnil;
4579 }
4580 
#ifndef HAVE_MEMRCHR
/*
 * Fallback for platforms without memrchr: returns a pointer to the last
 * byte equal to chr within the first search_len bytes of search_str, or a
 * null pointer when there is none (including search_len <= 0).
 *
 * As with the library function, chr is converted to unsigned char before
 * comparing, so a negative or out-of-range int selects the byte with the
 * same low eight bits instead of never matching.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    const unsigned char wanted = (unsigned char)chr;
    const char *ptr = search_str + search_len;

    while (ptr > search_str) {
        if ((unsigned char)*(--ptr) == wanted) return (void *)ptr;
    }

    return NULL;
}
#endif
4593 
4594 static long
4595 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4596 {
4597  char *hit, *adjusted;
4598  int c;
4599  long slen, searchlen;
4600  char *sbeg, *e, *t;
4601 
4602  sbeg = RSTRING_PTR(str);
4603  slen = RSTRING_LEN(sub);
4604  if (slen == 0) return s - sbeg;
4605  e = RSTRING_END(str);
4606  t = RSTRING_PTR(sub);
4607  c = *t & 0xff;
4608  searchlen = s - sbeg + 1;
4609 
4610  if (memcmp(s, t, slen) == 0) {
4611  return s - sbeg;
4612  }
4613 
4614  do {
4615  hit = memrchr(sbeg, c, searchlen);
4616  if (!hit) break;
4617  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4618  if (hit != adjusted) {
4619  searchlen = adjusted - sbeg;
4620  continue;
4621  }
4622  if (memcmp(hit, t, slen) == 0)
4623  return hit - sbeg;
4624  searchlen = adjusted - sbeg;
4625  } while (searchlen > 0);
4626 
4627  return -1;
4628 }
4629 
4630 /* found index in byte */
4631 static long
4632 rb_str_rindex(VALUE str, VALUE sub, long pos)
4633 {
4634  long len, slen;
4635  char *sbeg, *s;
4636  rb_encoding *enc;
4637  int singlebyte;
4638 
4639  enc = rb_enc_check(str, sub);
4640  if (is_broken_string(sub)) return -1;
4641  singlebyte = single_byte_optimizable(str);
4642  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4643  slen = str_strlen(sub, enc); /* rb_enc_check */
4644 
4645  /* substring longer than string */
4646  if (len < slen) return -1;
4647  if (len - pos < slen) pos = len - slen;
4648  if (len == 0) return pos;
4649 
4650  sbeg = RSTRING_PTR(str);
4651 
4652  if (pos == 0) {
4653  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4654  return 0;
4655  else
4656  return -1;
4657  }
4658 
4659  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4660  return str_rindex(str, sub, s, enc);
4661 }
4662 
4663 /*
4664  * call-seq:
4665  * rindex(substring, offset = self.length) -> integer or nil
4666  * rindex(regexp, offset = self.length) -> integer or nil
4667  *
4668  * Returns the Integer index of the _last_ occurrence of the given +substring+,
4669  * or +nil+ if none found:
4670  *
4671  * 'foo'.rindex('f') # => 0
4672  * 'foo'.rindex('o') # => 2
4673  * 'foo'.rindex('oo') # => 1
4674  * 'foo'.rindex('ooo') # => nil
4675  *
4676  * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4677  * or +nil+ if none found:
4678  *
4679  * 'foo'.rindex(/f/) # => 0
4680  * 'foo'.rindex(/o/) # => 2
4681  * 'foo'.rindex(/oo/) # => 1
4682  * 'foo'.rindex(/ooo/) # => nil
4683  *
4684  * The _last_ match means starting at the possible last position, not
4685  * the last of longest matches.
4686  *
4687  * 'foo'.rindex(/o+/) # => 2
4688  * $~ #=> #<MatchData "o">
4689  *
4690  * To get the last longest match, combine the pattern with a negative
4691  * lookbehind.
4692  *
4693  * 'foo'.rindex(/(?<!o)o+/) # => 1
4694  * $~ #=> #<MatchData "oo">
4695  *
4696  * Or use String#index with a negative lookahead.
4697  *
4698  * 'foo'.index(/o+(?!.*o)/) # => 1
4699  * $~ #=> #<MatchData "oo">
4700  *
4701  * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4702  * string to _end_ the search:
4703  *
4704  * 'foo'.rindex('o', 0) # => nil
4705  * 'foo'.rindex('o', 1) # => 1
4706  * 'foo'.rindex('o', 2) # => 2
4707  * 'foo'.rindex('o', 3) # => 2
4708  *
4709  * If +offset+ is a negative Integer, the maximum starting position in the
4710  * string to _end_ the search is the sum of the string's length and +offset+:
4711  *
4712  * 'foo'.rindex('o', -1) # => 2
4713  * 'foo'.rindex('o', -2) # => 1
4714  * 'foo'.rindex('o', -3) # => nil
4715  * 'foo'.rindex('o', -4) # => nil
4716  *
4717  * Related: String#index.
4718  */
4719 
4720 static VALUE
4721 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4722 {
4723  VALUE sub;
4724  VALUE initpos;
4725  rb_encoding *enc = STR_ENC_GET(str);
4726  long pos, len = str_strlen(str, enc); /* str's enc */
4727 
4728  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4729  pos = NUM2LONG(initpos);
4730  if (pos < 0 && (pos += len) < 0) {
4731  if (RB_TYPE_P(sub, T_REGEXP)) {
4733  }
4734  return Qnil;
4735  }
4736  if (pos > len) pos = len;
4737  }
4738  else {
4739  pos = len;
4740  }
4741 
4742  if (RB_TYPE_P(sub, T_REGEXP)) {
4743  /* enc = rb_enc_check(str, sub); */
4744  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4745  enc, single_byte_optimizable(str));
4746 
4747  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4748  VALUE match = rb_backref_get();
4749  struct re_registers *regs = RMATCH_REGS(match);
4750  pos = rb_str_sublen(str, BEG(0));
4751  return LONG2NUM(pos);
4752  }
4753  }
4754  else {
4755  StringValue(sub);
4756  pos = rb_str_rindex(str, sub, pos);
4757  if (pos >= 0) {
4758  pos = rb_str_sublen(str, pos);
4759  return LONG2NUM(pos);
4760  }
4761  }
4762  return Qnil;
4763 }
4764 
4765 static long
4766 rb_str_byterindex(VALUE str, VALUE sub, long pos)
4767 {
4768  long len, slen;
4769  char *sbeg, *s;
4770  rb_encoding *enc;
4771 
4772  enc = rb_enc_check(str, sub);
4773  if (is_broken_string(sub)) return -1;
4774  len = RSTRING_LEN(str);
4775  slen = RSTRING_LEN(sub);
4776 
4777  /* substring longer than string */
4778  if (len < slen) return -1;
4779  if (len - pos < slen) pos = len - slen;
4780  if (len == 0) return pos;
4781 
4782  sbeg = RSTRING_PTR(str);
4783 
4784  if (pos == 0) {
4785  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4786  return 0;
4787  else
4788  return -1;
4789  }
4790 
4791  s = sbeg + pos;
4792  return str_rindex(str, sub, s, enc);
4793 }
4794 
4795 
4796 /*
4797  * call-seq:
4798  * byterindex(substring, offset = self.bytesize) -> integer or nil
4799  * byterindex(regexp, offset = self.bytesize) -> integer or nil
4800  *
4801  * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4802  * or +nil+ if none found:
4803  *
4804  * 'foo'.byterindex('f') # => 0
4805  * 'foo'.byterindex('o') # => 2
4806  * 'foo'.byterindex('oo') # => 1
4807  * 'foo'.byterindex('ooo') # => nil
4808  *
4809  * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4810  * or +nil+ if none found:
4811  *
4812  * 'foo'.byterindex(/f/) # => 0
4813  * 'foo'.byterindex(/o/) # => 2
4814  * 'foo'.byterindex(/oo/) # => 1
4815  * 'foo'.byterindex(/ooo/) # => nil
4816  *
4817  * The _last_ match means starting at the possible last position, not
4818  * the last of longest matches.
4819  *
4820  * 'foo'.byterindex(/o+/) # => 2
4821  * $~ #=> #<MatchData "o">
4822  *
4823  * To get the last longest match, combine the pattern with a negative
4824  * lookbehind.
4825  *
4826  * 'foo'.byterindex(/(?<!o)o+/) # => 1
4827  * $~ #=> #<MatchData "oo">
4828  *
4829  * Or use String#byteindex with a negative lookahead.
4830  *
4831  * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4832  * $~ #=> #<MatchData "oo">
4833  *
4834  * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4835  * string to _end_ the search:
4836  *
4837  * 'foo'.byterindex('o', 0) # => nil
4838  * 'foo'.byterindex('o', 1) # => 1
4839  * 'foo'.byterindex('o', 2) # => 2
4840  * 'foo'.byterindex('o', 3) # => 2
4841  *
4842  * If +offset+ is a negative Integer, the maximum starting byte-based position in the
4843  * string to _end_ the search is the sum of the string's bytesize and +offset+:
4844  *
4845  * 'foo'.byterindex('o', -1) # => 2
4846  * 'foo'.byterindex('o', -2) # => 1
4847  * 'foo'.byterindex('o', -3) # => nil
4848  * 'foo'.byterindex('o', -4) # => nil
4849  *
4850  * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4851  * raised.
4852  *
4853  * Related: String#byteindex.
4854  */
4855 
4856 static VALUE
4857 rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4858 {
4859  VALUE sub;
4860  VALUE initpos;
4861  long pos, len = RSTRING_LEN(str);
4862 
4863  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4864  pos = NUM2LONG(initpos);
4865  if (pos < 0 && (pos += len) < 0) {
4866  if (RB_TYPE_P(sub, T_REGEXP)) {
4868  }
4869  return Qnil;
4870  }
4871  if (pos > len) pos = len;
4872  }
4873  else {
4874  pos = len;
4875  }
4876 
4877  str_ensure_byte_pos(str, pos);
4878 
4879  if (RB_TYPE_P(sub, T_REGEXP)) {
4880  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4881  VALUE match = rb_backref_get();
4882  struct re_registers *regs = RMATCH_REGS(match);
4883  pos = BEG(0);
4884  return LONG2NUM(pos);
4885  }
4886  }
4887  else {
4888  StringValue(sub);
4889  pos = rb_str_byterindex(str, sub, pos);
4890  if (pos >= 0) return LONG2NUM(pos);
4891  }
4892  return Qnil;
4893 }
4894 
4895 /*
4896  * call-seq:
4897  * string =~ regexp -> integer or nil
4898  * string =~ object -> integer or nil
4899  *
4900  * Returns the Integer index of the first substring that matches
4901  * the given +regexp+, or +nil+ if no match found:
4902  *
4903  * 'foo' =~ /f/ # => 0
4904  * 'foo' =~ /o/ # => 1
4905  * 'foo' =~ /x/ # => nil
4906  *
4907  * Note: also updates Regexp@Global+Variables.
4908  *
4909  * If the given +object+ is not a Regexp, returns the value
4910  * returned by <tt>object =~ self</tt>.
4911  *
4912  * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4913  * (see Regexp#=~):
4914  *
4915  * number= nil
4916  * "no. 9" =~ /(?<number>\d+)/
4917  * number # => nil (not assigned)
4918  * /(?<number>\d+)/ =~ "no. 9"
4919  * number #=> "9"
4920  *
4921  */
4922 
4923 static VALUE
4924 rb_str_match(VALUE x, VALUE y)
4925 {
4926  switch (OBJ_BUILTIN_TYPE(y)) {
4927  case T_STRING:
4928  rb_raise(rb_eTypeError, "type mismatch: String given");
4929 
4930  case T_REGEXP:
4931  return rb_reg_match(y, x);
4932 
4933  default:
4934  return rb_funcall(y, idEqTilde, 1, x);
4935  }
4936 }
4937 
4938 
4939 static VALUE get_pat(VALUE);
4940 
4941 
4942 /*
4943  * call-seq:
4944  * match(pattern, offset = 0) -> matchdata or nil
4945  * match(pattern, offset = 0) {|matchdata| ... } -> object
4946  *
4947  * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4948  *
4949  * Note: also updates Regexp@Global+Variables.
4950  *
4951  * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4952  * regexp = Regexp.new(pattern)
4953  * - Computes +matchdata+, which will be either a MatchData object or +nil+
4954  * (see Regexp#match):
4955  * matchdata = regexp.match(self)
4956  *
4957  * With no block given, returns the computed +matchdata+:
4958  *
4959  * 'foo'.match('f') # => #<MatchData "f">
4960  * 'foo'.match('o') # => #<MatchData "o">
4961  * 'foo'.match('x') # => nil
4962  *
4963  * If Integer argument +offset+ is given, the search begins at index +offset+:
4964  *
4965  * 'foo'.match('f', 1) # => nil
4966  * 'foo'.match('o', 1) # => #<MatchData "o">
4967  *
4968  * With a block given, calls the block with the computed +matchdata+
4969  * and returns the block's return value:
4970  *
4971  * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4972  * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4973  * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4974  *
4975  */
4976 
4977 static VALUE
4978 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4979 {
4980  VALUE re, result;
4981  if (argc < 1)
4982  rb_check_arity(argc, 1, 2);
4983  re = argv[0];
4984  argv[0] = str;
4985  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4986  if (!NIL_P(result) && rb_block_given_p()) {
4987  return rb_yield(result);
4988  }
4989  return result;
4990 }
4991 
4992 /*
4993  * call-seq:
4994  * match?(pattern, offset = 0) -> true or false
4995  *
4996  * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4997  *
4998  * Note: does not update Regexp@Global+Variables.
4999  *
5000  * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
5001  * regexp = Regexp.new(pattern)
5002  *
5003  * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5004  * +false+ otherwise:
5005  *
5006  * 'foo'.match?(/o/) # => true
5007  * 'foo'.match?('o') # => true
5008  * 'foo'.match?(/x/) # => false
5009  *
5010  * If Integer argument +offset+ is given, the search begins at index +offset+:
5011  * 'foo'.match?('f', 1) # => false
5012  * 'foo'.match?('o', 1) # => true
5013  *
5014  */
5015 
5016 static VALUE
5017 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5018 {
5019  VALUE re;
5020  rb_check_arity(argc, 1, 2);
5021  re = get_pat(argv[0]);
5022  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5023 }
5024 
/* Outcome reported by the helpers that step one character to its neighbour
 * (enc_succ_char, enc_pred_char, enc_succ_alnum_char). */
5025 enum neighbor_char {
5026  NEIGHBOR_NOT_CHAR, /* the bytes are not a valid character, or no neighbour could be formed */
5027  NEIGHBOR_FOUND, /* the character was overwritten in place with its neighbour */
5028  NEIGHBOR_WRAPPED /* stepping ran off the end of the range and wrapped around */
5029 };
5030 
/*
 * Overwrites the len bytes at p with the next character in enc.
 *
 * Returns NEIGHBOR_FOUND when p now holds a valid character of the same
 * byte length, NEIGHBOR_WRAPPED when the step cannot stay within len bytes
 * (every byte was 0xff, or the incremented code point needs a different
 * length), and NEIGHBOR_NOT_CHAR when the bytes do not form a character.
 */
5031 static enum neighbor_char
5032 enc_succ_char(char *p, long len, rb_encoding *enc)
5033 {
5034  long i;
5035  int l;
5036 
5037  if (rb_enc_mbminlen(enc) > 1) {
5038  /* wchar, trivial case */
  /* work on the code point: decode, add one, and write it back */
5039  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5040  if (!MBCLEN_CHARFOUND_P(r)) {
5041  return NEIGHBOR_NOT_CHAR;
5042  }
5043  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5044  l = rb_enc_code_to_mbclen(c, enc);
5045  if (!l) return NEIGHBOR_NOT_CHAR;
5046  if (l != len) return NEIGHBOR_WRAPPED; /* successor does not fit in the same number of bytes */
5047  rb_enc_mbcput(c, p, enc);
  /* re-validate what was just written */
5048  r = rb_enc_precise_mbclen(p, p + len, enc);
5049  if (!MBCLEN_CHARFOUND_P(r)) {
5050  return NEIGHBOR_NOT_CHAR;
5051  }
5052  return NEIGHBOR_FOUND;
5053  }
  /* byte-wise path: treat the len bytes as a counter and keep
   * incrementing until they form a valid character of full length */
5054  while (1) {
  /* carry: trailing 0xff bytes roll over to 0; i stops at the first byte that can be bumped */
5055  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5056  p[i] = '\0';
5057  if (i < 0)
5058  return NEIGHBOR_WRAPPED; /* every byte was 0xff; the buffer is now all zero */
5059  ++((unsigned char*)p)[i];
5060  l = rb_enc_precise_mbclen(p, p+len, enc);
5061  if (MBCLEN_CHARFOUND_P(l)) {
5062  l = MBCLEN_CHARFOUND_LEN(l);
5063  if (l == len) {
5064  return NEIGHBOR_FOUND;
5065  }
5066  else {
  /* a valid but shorter character: saturate the remaining bytes so
   * that the next round carries into the bytes before them */
5067  memset(p+l, 0xff, len-l);
5068  }
5069  }
5070  if (MBCLEN_INVALID_P(l) && i < len-1) {
  /* invalid sequence and the bumped byte was not the last one: find the
   * longest prefix that is not invalid and saturate everything after
   * index len2, presumably to skip the sequences sharing the bad prefix */
5071  long len2;
5072  int l2;
5073  for (len2 = len-1; 0 < len2; len2--) {
5074  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5075  if (!MBCLEN_INVALID_P(l2))
5076  break;
5077  }
5078  memset(p+len2+1, 0xff, len-(len2+1));
5079  }
5080  }
5081 }
5082 
/*
 * Overwrites the len bytes at p with the previous character in enc; the
 * mirror image of enc_succ_char.
 *
 * Returns NEIGHBOR_FOUND when p now holds a valid character of the same
 * byte length, NEIGHBOR_WRAPPED when the step cannot stay within len bytes
 * (every byte was 0, or the decremented code point needs a different
 * length), and NEIGHBOR_NOT_CHAR when the bytes do not form a character or
 * the code point is already 0.
 */
5083 static enum neighbor_char
5084 enc_pred_char(char *p, long len, rb_encoding *enc)
5085 {
5086  long i;
5087  int l;
5088  if (rb_enc_mbminlen(enc) > 1) {
5089  /* wchar, trivial case */
  /* work on the code point: decode, subtract one, and write it back */
5090  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5091  if (!MBCLEN_CHARFOUND_P(r)) {
5092  return NEIGHBOR_NOT_CHAR;
5093  }
5094  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5095  if (!c) return NEIGHBOR_NOT_CHAR; /* code point 0 has no predecessor */
5096  --c;
5097  l = rb_enc_code_to_mbclen(c, enc);
5098  if (!l) return NEIGHBOR_NOT_CHAR;
5099  if (l != len) return NEIGHBOR_WRAPPED; /* predecessor does not fit in the same number of bytes */
5100  rb_enc_mbcput(c, p, enc);
  /* re-validate what was just written */
5101  r = rb_enc_precise_mbclen(p, p + len, enc);
5102  if (!MBCLEN_CHARFOUND_P(r)) {
5103  return NEIGHBOR_NOT_CHAR;
5104  }
5105  return NEIGHBOR_FOUND;
5106  }
  /* byte-wise path: treat the len bytes as a counter and keep
   * decrementing until they form a valid character of full length */
5107  while (1) {
  /* borrow: trailing 0 bytes roll under to 0xff; i stops at the first byte that can be lowered */
5108  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5109  p[i] = '\xff';
5110  if (i < 0)
5111  return NEIGHBOR_WRAPPED; /* every byte was 0; the buffer is now all 0xff */
5112  --((unsigned char*)p)[i];
5113  l = rb_enc_precise_mbclen(p, p+len, enc);
5114  if (MBCLEN_CHARFOUND_P(l)) {
5115  l = MBCLEN_CHARFOUND_LEN(l);
5116  if (l == len) {
5117  return NEIGHBOR_FOUND;
5118  }
5119  else {
  /* a valid but shorter character: zero the remaining bytes so that
   * the next round borrows from the bytes before them */
5120  memset(p+l, 0, len-l);
5121  }
5122  }
5123  if (MBCLEN_INVALID_P(l) && i < len-1) {
  /* invalid sequence and the lowered byte was not the last one: find the
   * longest prefix that is not invalid and zero everything after index
   * len2, presumably to skip the sequences sharing the bad prefix */
5124  long len2;
5125  int l2;
5126  for (len2 = len-1; 0 < len2; len2--) {
5127  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5128  if (!MBCLEN_INVALID_P(l2))
5129  break;
5130  }
5131  memset(p+len2+1, 0, len-(len2+1));
5132  }
5133  }
5134 }
5135 
5136 /*
5137  Overwrites +p+ with the succeeding letter in +enc+ and returns
5138  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5139  When NEIGHBOR_WRAPPED is returned, the carried-out letter is stored into +carry+.
5140  Assumes that each range is contiguous and that mbclen
5141  never changes within a range.
5142  NEIGHBOR_NOT_CHAR is returned if the character is invalid or the range has only one
5143  character.
5144  */
5145 static enum neighbor_char
5146 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5147 {
5148  enum neighbor_char ret;
5149  unsigned int c;
5150  int ctype;
5151  int range;
5152  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5153 
5154  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5155  int try;
5156  const int max_gaps = 1;
5157 
5158  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5159  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5160  ctype = ONIGENC_CTYPE_DIGIT;
5161  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5162  ctype = ONIGENC_CTYPE_ALPHA;
5163  else
5164  return NEIGHBOR_NOT_CHAR;
5165 
5166  MEMCPY(save, p, char, len);
5167  for (try = 0; try <= max_gaps; ++try) {
5168  ret = enc_succ_char(p, len, enc);
5169  if (ret == NEIGHBOR_FOUND) {
5170  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5171  if (rb_enc_isctype(c, ctype, enc))
5172  return NEIGHBOR_FOUND;
5173  }
5174  }
5175  MEMCPY(p, save, char, len);
5176  range = 1;
5177  while (1) {
5178  MEMCPY(save, p, char, len);
5179  ret = enc_pred_char(p, len, enc);
5180  if (ret == NEIGHBOR_FOUND) {
5181  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5182  if (!rb_enc_isctype(c, ctype, enc)) {
5183  MEMCPY(p, save, char, len);
5184  break;
5185  }
5186  }
5187  else {
5188  MEMCPY(p, save, char, len);
5189  break;
5190  }
5191  range++;
5192  }
5193  if (range == 1) {
5194  return NEIGHBOR_NOT_CHAR;
5195  }
5196 
5197  if (ctype != ONIGENC_CTYPE_DIGIT) {
5198  MEMCPY(carry, p, char, len);
5199  return NEIGHBOR_WRAPPED;
5200  }
5201 
5202  MEMCPY(carry, p, char, len);
5203  enc_succ_char(carry, len, enc);
5204  return NEIGHBOR_WRAPPED;
5205 }
5206 
5207 
5208 static VALUE str_succ(VALUE str);
5209 
5210 /*
5211  * call-seq:
5212  * succ -> new_str
5213  *
5214  * Returns the successor to +self+. The successor is calculated by
5215  * incrementing characters.
5216  *
5217  * The first character to be incremented is the rightmost alphanumeric,
5218  * or, if there are no alphanumerics, the rightmost character:
5219  *
5220  * 'THX1138'.succ # => "THX1139"
5221  * '<<koala>>'.succ # => "<<koalb>>"
5222  * '***'.succ # => '**+'
5223  *
5224  * The successor to a digit is another digit, "carrying" to the next-left
5225  * character for a "rollover" from 9 to 0, and prepending another digit
5226  * if necessary:
5227  *
5228  * '00'.succ # => "01"
5229  * '09'.succ # => "10"
5230  * '99'.succ # => "100"
5231  *
5232  * The successor to a letter is another letter of the same case,
5233  * carrying to the next-left character for a rollover,
5234  * and prepending another same-case letter if necessary:
5235  *
5236  * 'aa'.succ # => "ab"
5237  * 'az'.succ # => "ba"
5238  * 'zz'.succ # => "aaa"
5239  * 'AA'.succ # => "AB"
5240  * 'AZ'.succ # => "BA"
5241  * 'ZZ'.succ # => "AAA"
5242  *
5243  * The successor to a non-alphanumeric character is the next character
5244  * in the underlying character set's collating sequence,
5245  * carrying to the next-left character for a rollover,
5246  * and prepending another character if necessary:
5247  *
5248  * s = 0.chr * 3
5249  * s # => "\x00\x00\x00"
5250  * s.succ # => "\x00\x00\x01"
5251  * s = 255.chr * 3
5252  * s # => "\xFF\xFF\xFF"
5253  * s.succ # => "\x01\x00\x00\x00"
5254  *
5255  * Carrying can occur between and among mixtures of alphanumeric characters:
5256  *
5257  * s = 'zz99zz99'
5258  * s.succ # => "aaa00aa00"
5259  * s = '99zz99zz'
5260  * s.succ # => "100aa00aa"
5261  *
5262  * The successor to an empty +String+ is a new empty +String+:
5263  *
5264  * ''.succ # => ""
5265  *
5266  */
5267 
5268 VALUE
5270 {
5271  VALUE str;
5272  str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5273  rb_enc_cr_str_copy_for_substr(str, orig);
5274  return str_succ(str);
5275 }
5276 
5277 static VALUE
5278 str_succ(VALUE str)
5279 {
5280  rb_encoding *enc;
5281  char *sbeg, *s, *e, *last_alnum = 0;
5282  int found_alnum = 0;
5283  long l, slen;
5284  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5285  long carry_pos = 0, carry_len = 1;
5286  enum neighbor_char neighbor = NEIGHBOR_FOUND;
5287 
5288  slen = RSTRING_LEN(str);
5289  if (slen == 0) return str;
5290 
5291  enc = STR_ENC_GET(str);
5292  sbeg = RSTRING_PTR(str);
5293  s = e = sbeg + slen;
5294 
5295  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5296  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5297  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5298  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5299  break;
5300  }
5301  }
5302  l = rb_enc_precise_mbclen(s, e, enc);
5303  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5304  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5305  neighbor = enc_succ_alnum_char(s, l, enc, carry);
5306  switch (neighbor) {
5307  case NEIGHBOR_NOT_CHAR:
5308  continue;
5309  case NEIGHBOR_FOUND:
5310  return str;
5311  case NEIGHBOR_WRAPPED:
5312  last_alnum = s;
5313  break;
5314  }
5315  found_alnum = 1;
5316  carry_pos = s - sbeg;
5317  carry_len = l;
5318  }
5319  if (!found_alnum) { /* str contains no alnum */
5320  s = e;
5321  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5322  enum neighbor_char neighbor;
5323  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5324  l = rb_enc_precise_mbclen(s, e, enc);
5325  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5326  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5327  MEMCPY(tmp, s, char, l);
5328  neighbor = enc_succ_char(tmp, l, enc);
5329  switch (neighbor) {
5330  case NEIGHBOR_FOUND:
5331  MEMCPY(s, tmp, char, l);
5332  return str;
5333  break;
5334  case NEIGHBOR_WRAPPED:
5335  MEMCPY(s, tmp, char, l);
5336  break;
5337  case NEIGHBOR_NOT_CHAR:
5338  break;
5339  }
5340  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5341  /* wrapped to \0...\0. search next valid char. */
5342  enc_succ_char(s, l, enc);
5343  }
5344  if (!rb_enc_asciicompat(enc)) {
5345  MEMCPY(carry, s, char, l);
5346  carry_len = l;
5347  }
5348  carry_pos = s - sbeg;
5349  }
5351  }
5352  RESIZE_CAPA(str, slen + carry_len);
5353  sbeg = RSTRING_PTR(str);
5354  s = sbeg + carry_pos;
5355  memmove(s + carry_len, s, slen - carry_pos);
5356  memmove(s, carry, carry_len);
5357  slen += carry_len;
5358  STR_SET_LEN(str, slen);
5359  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5360  rb_enc_str_coderange(str);
5361  return str;
5362 }
5363 
5364 
5365 /*
5366  * call-seq:
5367  * succ! -> self
5368  *
5369  * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5370  */
5371 
5372 static VALUE
5373 rb_str_succ_bang(VALUE str)
5374 {
5375  rb_str_modify(str);
5376  str_succ(str);
5377  return str;
5378 }
5379 
/* Returns 1 when each of the len bytes at s satisfies ISDIGIT (also for
 * len <= 0), otherwise 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
5389 
5390 static int
5391 str_upto_i(VALUE str, VALUE arg)
5392 {
5393  rb_yield(str);
5394  return 0;
5395 }
5396 
5397 /*
5398  * call-seq:
5399  * upto(other_string, exclusive = false) {|string| ... } -> self
5400  * upto(other_string, exclusive = false) -> new_enumerator
5401  *
5402  * With a block given, calls the block with each +String+ value
5403  * returned by successive calls to String#succ;
5404  * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5405  * the sequence terminates when value +other_string+ is reached;
5406  * returns +self+:
5407  *
5408  * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5409  * Output:
5410  *
5411  * a8 a9 b0 b1 b2 b3 b4 b5 b6
5412  *
5413  * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5414  *
5415  * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5416  *
5417  * Output:
5418  *
5419  * a8 a9 b0 b1 b2 b3 b4 b5
5420  *
5421  * If +other_string+ would not be reached, does not call the block:
5422  *
5423  * '25'.upto('5') {|s| fail s }
5424  * 'aa'.upto('a') {|s| fail s }
5425  *
5426  * With no block given, returns a new Enumerator:
5427  *
5428  * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5429  *
5430  */
5431 
5432 static VALUE
5433 rb_str_upto(int argc, VALUE *argv, VALUE beg)
5434 {
5435  VALUE end, exclusive;
5436 
5437  rb_scan_args(argc, argv, "11", &end, &exclusive);
5438  RETURN_ENUMERATOR(beg, argc, argv);
5439  return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5440 }
5441 
5442 VALUE
5443 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5444 {
5445  VALUE current, after_end;
5446  ID succ;
5447  int n, ascii;
5448  rb_encoding *enc;
5449 
5450  CONST_ID(succ, "succ");
5451  StringValue(end);
5452  enc = rb_enc_check(beg, end);
5453  ascii = (is_ascii_string(beg) && is_ascii_string(end));
5454  /* single character */
5455  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5456  char c = RSTRING_PTR(beg)[0];
5457  char e = RSTRING_PTR(end)[0];
5458 
5459  if (c > e || (excl && c == e)) return beg;
5460  for (;;) {
5461  VALUE str = rb_enc_str_new(&c, 1, enc);
5463  if ((*each)(str, arg)) break;
5464  if (!excl && c == e) break;
5465  c++;
5466  if (excl && c == e) break;
5467  }
5468  return beg;
5469  }
5470  /* both edges are all digits */
5471  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5472  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5473  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5474  VALUE b, e;
5475  int width;
5476 
5477  width = RSTRING_LENINT(beg);
5478  b = rb_str_to_inum(beg, 10, FALSE);
5479  e = rb_str_to_inum(end, 10, FALSE);
5480  if (FIXNUM_P(b) && FIXNUM_P(e)) {
5481  long bi = FIX2LONG(b);
5482  long ei = FIX2LONG(e);
5483  rb_encoding *usascii = rb_usascii_encoding();
5484 
5485  while (bi <= ei) {
5486  if (excl && bi == ei) break;
5487  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5488  bi++;
5489  }
5490  }
5491  else {
5492  ID op = excl ? '<' : idLE;
5493  VALUE args[2], fmt = rb_fstring_lit("%.*d");
5494 
5495  args[0] = INT2FIX(width);
5496  while (rb_funcall(b, op, 1, e)) {
5497  args[1] = b;
5498  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5499  b = rb_funcallv(b, succ, 0, 0);
5500  }
5501  }
5502  return beg;
5503  }
5504  /* normal case */
5505  n = rb_str_cmp(beg, end);
5506  if (n > 0 || (excl && n == 0)) return beg;
5507 
5508  after_end = rb_funcallv(end, succ, 0, 0);
5509  current = str_duplicate(rb_cString, beg);
5510  while (!rb_str_equal(current, after_end)) {
5511  VALUE next = Qnil;
5512  if (excl || !rb_str_equal(current, end))
5513  next = rb_funcallv(current, succ, 0, 0);
5514  if ((*each)(current, arg)) break;
5515  if (NIL_P(next)) break;
5516  current = next;
5517  StringValue(current);
5518  if (excl && rb_str_equal(current, end)) break;
5519  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5520  break;
5521  }
5522 
5523  return beg;
5524 }
5525 
5526 VALUE
5527 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5528 {
5529  VALUE current;
5530  ID succ;
5531 
5532  CONST_ID(succ, "succ");
5533  /* both edges are all digits */
5534  if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5535  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5536  VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5537  int width = RSTRING_LENINT(beg);
5538  b = rb_str_to_inum(beg, 10, FALSE);
5539  if (FIXNUM_P(b)) {
5540  long bi = FIX2LONG(b);
5541  rb_encoding *usascii = rb_usascii_encoding();
5542 
5543  while (FIXABLE(bi)) {
5544  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5545  bi++;
5546  }
5547  b = LONG2NUM(bi);
5548  }
5549  args[0] = INT2FIX(width);
5550  while (1) {
5551  args[1] = b;
5552  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5553  b = rb_funcallv(b, succ, 0, 0);
5554  }
5555  }
5556  /* normal case */
5557  current = str_duplicate(rb_cString, beg);
5558  while (1) {
5559  VALUE next = rb_funcallv(current, succ, 0, 0);
5560  if ((*each)(current, arg)) break;
5561  current = next;
5562  StringValue(current);
5563  if (RSTRING_LEN(current) == 0)
5564  break;
5565  }
5566 
5567  return beg;
5568 }
5569 
5570 static int
5571 include_range_i(VALUE str, VALUE arg)
5572 {
5573  VALUE *argp = (VALUE *)arg;
5574  if (!rb_equal(str, *argp)) return 0;
5575  *argp = Qnil;
5576  return 1;
5577 }
5578 
5579 VALUE
5580 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5581 {
5582  beg = rb_str_new_frozen(beg);
5583  StringValue(end);
5584  end = rb_str_new_frozen(end);
5585  if (NIL_P(val)) return Qfalse;
5586  val = rb_check_string_type(val);
5587  if (NIL_P(val)) return Qfalse;
5588  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5589  rb_enc_asciicompat(STR_ENC_GET(end)) &&
5590  rb_enc_asciicompat(STR_ENC_GET(val))) {
5591  const char *bp = RSTRING_PTR(beg);
5592  const char *ep = RSTRING_PTR(end);
5593  const char *vp = RSTRING_PTR(val);
5594  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5595  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5596  return Qfalse;
5597  else {
5598  char b = *bp;
5599  char e = *ep;
5600  char v = *vp;
5601 
5602  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5603  if (b <= v && v < e) return Qtrue;
5604  return RBOOL(!RTEST(exclusive) && v == e);
5605  }
5606  }
5607  }
5608 #if 0
5609  /* both edges are all digits */
5610  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5611  all_digits_p(bp, RSTRING_LEN(beg)) &&
5612  all_digits_p(ep, RSTRING_LEN(end))) {
5613  /* TODO */
5614  }
5615 #endif
5616  }
5617  rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5618 
5619  return RBOOL(NIL_P(val));
5620 }
5621 
5622 static VALUE
5623 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5624 {
5625  if (rb_reg_search(re, str, 0, 0) >= 0) {
5626  VALUE match = rb_backref_get();
5627  int nth = rb_reg_backref_number(match, backref);
5628  return rb_reg_nth_match(nth, match);
5629  }
5630  return Qnil;
5631 }
5632 
5633 static VALUE
5634 rb_str_aref(VALUE str, VALUE indx)
5635 {
5636  long idx;
5637 
5638  if (FIXNUM_P(indx)) {
5639  idx = FIX2LONG(indx);
5640  }
5641  else if (RB_TYPE_P(indx, T_REGEXP)) {
5642  return rb_str_subpat(str, indx, INT2FIX(0));
5643  }
5644  else if (RB_TYPE_P(indx, T_STRING)) {
5645  if (rb_str_index(str, indx, 0) != -1)
5646  return str_duplicate(rb_cString, indx);
5647  return Qnil;
5648  }
5649  else {
5650  /* check if indx is Range */
5651  long beg, len = str_strlen(str, NULL);
5652  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5653  case Qfalse:
5654  break;
5655  case Qnil:
5656  return Qnil;
5657  default:
5658  return rb_str_substr(str, beg, len);
5659  }
5660  idx = NUM2LONG(indx);
5661  }
5662 
5663  return str_substr(str, idx, 1, FALSE);
5664 }
5665 
5666 
5667 /*
5668  * call-seq:
5669  * string[index] -> new_string or nil
5670  * string[start, length] -> new_string or nil
5671  * string[range] -> new_string or nil
5672  * string[regexp, capture = 0] -> new_string or nil
5673  * string[substring] -> new_string or nil
5674  *
5675  * Returns the substring of +self+ specified by the arguments.
5676  * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5677  *
5678  *
5679  */
5680 
5681 static VALUE
5682 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5683 {
5684  if (argc == 2) {
5685  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5686  return rb_str_subpat(str, argv[0], argv[1]);
5687  }
5688  else {
5689  return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5690  }
5691  }
5692  rb_check_arity(argc, 1, 2);
5693  return rb_str_aref(str, argv[0]);
5694 }
5695 
5696 VALUE
5698 {
5699  char *ptr = RSTRING_PTR(str);
5700  long olen = RSTRING_LEN(str), nlen;
5701 
5702  str_modifiable(str);
5703  if (len > olen) len = olen;
5704  nlen = olen - len;
5705  if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5706  char *oldptr = ptr;
5707  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5708  STR_SET_EMBED(str);
5709  ptr = RSTRING(str)->as.embed.ary;
5710  memmove(ptr, oldptr + len, nlen);
5711  if (fl == STR_NOEMBED) xfree(oldptr);
5712  }
5713  else {
5714  if (!STR_SHARED_P(str)) {
5715  VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5716  rb_enc_cr_str_exact_copy(shared, str);
5717  OBJ_FREEZE(shared);
5718  }
5719  ptr = RSTRING(str)->as.heap.ptr += len;
5720  }
5721  STR_SET_LEN(str, nlen);
5722 
5723  if (!SHARABLE_MIDDLE_SUBSTRING) {
5724  TERM_FILL(ptr + nlen, TERM_LEN(str));
5725  }
5726  ENC_CODERANGE_CLEAR(str);
5727  return str;
5728 }
5729 
/*
 * Replaces the len bytes of str starting at byte offset beg with the vlen
 * bytes of val starting at byte offset vbeg.  All positions and lengths
 * are in bytes.  str is modified in place; its length, terminator and
 * code range are updated.
 */
5730 static void
5731 rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5732 {
5733  char *sptr;
5734  long slen;
5735  int cr;
5736 
  /* deleting a prefix with nothing to insert is delegated to rb_str_drop_bytes */
5737  if (beg == 0 && vlen == 0) {
5738  rb_str_drop_bytes(str, len);
5739  return;
5740  }
5741 
5742  str_modify_keep_cr(str);
5743  RSTRING_GETMEM(str, sptr, slen);
5744  if (len < vlen) {
5745  /* expand string */
5746  RESIZE_CAPA(str, slen + vlen - len);
5747  sptr = RSTRING_PTR(str); /* re-read the pointer after resizing */
5748  }
5749 
  /* a 7-bit receiver takes over the code range of val; anything else is left unknown */
5750  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5751  cr = rb_enc_str_coderange(val);
5752  else
5753  cr = ENC_CODERANGE_UNKNOWN;
5754 
  /* shift the tail that follows the replaced span to its new position */
5755  if (vlen != len) {
5756  memmove(sptr + beg + vlen,
5757  sptr + beg + len,
5758  slen - (beg + len));
5759  }
  /* NOTE(review): only reachable with a negative len; the callers visible
   * in this part of the file pass len >= 0 -- confirm whether this is still needed */
5760  if (vlen < beg && len < 0) {
5761  MEMZERO(sptr + slen, char, -len);
5762  }
  /* memmove rather than memcpy: presumably val may share memory with str -- TODO confirm */
5763  if (vlen > 0) {
5764  memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5765  }
5766  slen += vlen - len;
5767  STR_SET_LEN(str, slen);
5768  TERM_FILL(&sptr[slen], TERM_LEN(str));
5769  ENC_CODERANGE_SET(str, cr);
5770 }
5771 
5772 static inline void
5773 rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5774 {
5775  rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5776 }
5777 
5778 void
5779 rb_str_update(VALUE str, long beg, long len, VALUE val)
5780 {
5781  long slen;
5782  char *p, *e;
5783  rb_encoding *enc;
5784  int singlebyte = single_byte_optimizable(str);
5785  int cr;
5786 
5787  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5788 
5789  StringValue(val);
5790  enc = rb_enc_check(str, val);
5791  slen = str_strlen(str, enc); /* rb_enc_check */
5792 
5793  if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5794  rb_raise(rb_eIndexError, "index %ld out of string", beg);
5795  }
5796  if (beg < 0) {
5797  beg += slen;
5798  }
5799  RUBY_ASSERT(beg >= 0);
5800  RUBY_ASSERT(beg <= slen);
5801 
5802  if (len > slen - beg) {
5803  len = slen - beg;
5804  }
5805  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5806  if (!p) p = RSTRING_END(str);
5807  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5808  if (!e) e = RSTRING_END(str);
5809  /* error check */
5810  beg = p - RSTRING_PTR(str); /* physical position */
5811  len = e - p; /* physical length */
5812  rb_str_update_0(str, beg, len, val);
5813  rb_enc_associate(str, enc);
5815  if (cr != ENC_CODERANGE_BROKEN)
5816  ENC_CODERANGE_SET(str, cr);
5817 }
5818 
5819 static void
5820 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5821 {
5822  int nth;
5823  VALUE match;
5824  long start, end, len;
5825  rb_encoding *enc;
5826  struct re_registers *regs;
5827 
5828  if (rb_reg_search(re, str, 0, 0) < 0) {
5829  rb_raise(rb_eIndexError, "regexp not matched");
5830  }
5831  match = rb_backref_get();
5832  nth = rb_reg_backref_number(match, backref);
5833  regs = RMATCH_REGS(match);
5834  if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5835  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5836  }
5837  if (nth < 0) {
5838  nth += regs->num_regs;
5839  }
5840 
5841  start = BEG(nth);
5842  if (start == -1) {
5843  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5844  }
5845  end = END(nth);
5846  len = end - start;
5847  StringValue(val);
5848  enc = rb_enc_check_str(str, val);
5849  rb_str_update_0(str, start, len, val);
5850  rb_enc_associate(str, enc);
5851 }
5852 
5853 static VALUE
5854 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5855 {
5856  long idx, beg;
5857 
5858  switch (TYPE(indx)) {
5859  case T_REGEXP:
5860  rb_str_subpat_set(str, indx, INT2FIX(0), val);
5861  return val;
5862 
5863  case T_STRING:
5864  beg = rb_str_index(str, indx, 0);
5865  if (beg < 0) {
5866  rb_raise(rb_eIndexError, "string not matched");
5867  }
5868  beg = rb_str_sublen(str, beg);
5869  rb_str_update(str, beg, str_strlen(indx, NULL), val);
5870  return val;
5871 
5872  default:
5873  /* check if indx is Range */
5874  {
5875  long beg, len;
5876  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5877  rb_str_update(str, beg, len, val);
5878  return val;
5879  }
5880  }
5881  /* FALLTHROUGH */
5882 
5883  case T_FIXNUM:
5884  idx = NUM2LONG(indx);
5885  rb_str_update(str, idx, 1, val);
5886  return val;
5887  }
5888 }
5889 
5890 /*
5891  * call-seq:
5892  * string[index] = new_string
5893  * string[start, length] = new_string
5894  * string[range] = new_string
5895  * string[regexp, capture = 0] = new_string
5896  * string[substring] = new_string
5897  *
5898  * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5899  * See {String Slices}[rdoc-ref:String@String+Slices].
5900  *
5901  * A few examples:
5902  *
5903  * s = 'foo'
5904  * s[2] = 'rtune' # => "rtune"
5905  * s # => "fortune"
5906  * s[1, 5] = 'init' # => "init"
5907  * s # => "finite"
5908  * s[3..4] = 'al' # => "al"
5909  * s # => "finale"
5910  * s[/e$/] = 'ly' # => "ly"
5911  * s # => "finally"
5912  * s['lly'] = 'ncial' # => "ncial"
5913  * s # => "financial"
5914  *
5915  */
5916 
5917 static VALUE
5918 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5919 {
5920  if (argc == 3) {
5921  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5922  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5923  }
5924  else {
5925  rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5926  }
5927  return argv[2];
5928  }
5929  rb_check_arity(argc, 2, 3);
5930  return rb_str_aset(str, argv[0], argv[1]);
5931 }
5932 
5933 /*
5934  * call-seq:
5935  * insert(index, other_string) -> self
5936  *
5937  * Inserts the given +other_string+ into +self+; returns +self+.
5938  *
5939  * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5940  *
5941  * 'foo'.insert(1, 'bar') # => "fbaroo"
5942  *
5943  * If the Integer +index+ is negative, counts backward from the end of +self+
5944  * and inserts +other_string+ at offset <tt>index+1</tt>
5945  * (that is, _after_ <tt>self[index]</tt>):
5946  *
5947  * 'foo'.insert(-2, 'bar') # => "fobaro"
5948  *
5949  */
5950 
5951 static VALUE
5952 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5953 {
5954  long pos = NUM2LONG(idx);
5955 
5956  if (pos == -1) {
5957  return rb_str_append(str, str2);
5958  }
5959  else if (pos < 0) {
5960  pos++;
5961  }
5962  rb_str_update(str, pos, 0, str2);
5963  return str;
5964 }
5965 
5966 
5967 /*
5968  * call-seq:
5969  * slice!(index) -> new_string or nil
5970  * slice!(start, length) -> new_string or nil
5971  * slice!(range) -> new_string or nil
5972  * slice!(regexp, capture = 0) -> new_string or nil
5973  * slice!(substring) -> new_string or nil
5974  *
5975  * Removes and returns the substring of +self+ specified by the arguments.
5976  * See {String Slices}[rdoc-ref:String@String+Slices].
5977  *
5978  * A few examples:
5979  *
5980  * string = "This is a string"
5981  * string.slice!(2) #=> "i"
5982  * string.slice!(3..6) #=> " is "
5983  * string.slice!(/s.*t/) #=> "sa st"
5984  * string.slice!("r") #=> "r"
5985  * string #=> "Thing"
5986  *
5987  */
5988 
5989 static VALUE
5990 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5991 {
5992  VALUE result = Qnil;
5993  VALUE indx;
5994  long beg, len = 1;
5995  char *p;
5996 
5997  rb_check_arity(argc, 1, 2);
5998  str_modify_keep_cr(str);
5999  indx = argv[0];
6000  if (RB_TYPE_P(indx, T_REGEXP)) {
6001  if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
6002  VALUE match = rb_backref_get();
6003  struct re_registers *regs = RMATCH_REGS(match);
6004  int nth = 0;
6005  if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
6006  if ((nth += regs->num_regs) <= 0) return Qnil;
6007  }
6008  else if (nth >= regs->num_regs) return Qnil;
6009  beg = BEG(nth);
6010  len = END(nth) - beg;
6011  goto subseq;
6012  }
6013  else if (argc == 2) {
6014  beg = NUM2LONG(indx);
6015  len = NUM2LONG(argv[1]);
6016  goto num_index;
6017  }
6018  else if (FIXNUM_P(indx)) {
6019  beg = FIX2LONG(indx);
6020  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6021  if (!len) return Qnil;
6022  beg = p - RSTRING_PTR(str);
6023  goto subseq;
6024  }
6025  else if (RB_TYPE_P(indx, T_STRING)) {
6026  beg = rb_str_index(str, indx, 0);
6027  if (beg == -1) return Qnil;
6028  len = RSTRING_LEN(indx);
6029  result = str_duplicate(rb_cString, indx);
6030  goto squash;
6031  }
6032  else {
6033  switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
6034  case Qnil:
6035  return Qnil;
6036  case Qfalse:
6037  beg = NUM2LONG(indx);
6038  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6039  if (!len) return Qnil;
6040  beg = p - RSTRING_PTR(str);
6041  goto subseq;
6042  default:
6043  goto num_index;
6044  }
6045  }
6046 
6047  num_index:
6048  if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
6049  beg = p - RSTRING_PTR(str);
6050 
6051  subseq:
6052  result = rb_str_new(RSTRING_PTR(str)+beg, len);
6053  rb_enc_cr_str_copy_for_substr(result, str);
6054 
6055  squash:
6056  if (len > 0) {
6057  if (beg == 0) {
6058  rb_str_drop_bytes(str, len);
6059  }
6060  else {
6061  char *sptr = RSTRING_PTR(str);
6062  long slen = RSTRING_LEN(str);
6063  if (beg + len > slen) /* pathological check */
6064  len = slen - beg;
6065  memmove(sptr + beg,
6066  sptr + beg + len,
6067  slen - (beg + len));
6068  slen -= len;
6069  STR_SET_LEN(str, slen);
6070  TERM_FILL(&sptr[slen], TERM_LEN(str));
6071  }
6072  }
6073  return result;
6074 }
6075 
6076 static VALUE
6077 get_pat(VALUE pat)
6078 {
6079  VALUE val;
6080 
6081  switch (OBJ_BUILTIN_TYPE(pat)) {
6082  case T_REGEXP:
6083  return pat;
6084 
6085  case T_STRING:
6086  break;
6087 
6088  default:
6089  val = rb_check_string_type(pat);
6090  if (NIL_P(val)) {
6091  Check_Type(pat, T_REGEXP);
6092  }
6093  pat = val;
6094  }
6095 
6096  return rb_reg_regcomp(pat);
6097 }
6098 
6099 static VALUE
6100 get_pat_quoted(VALUE pat, int check)
6101 {
6102  VALUE val;
6103 
6104  switch (OBJ_BUILTIN_TYPE(pat)) {
6105  case T_REGEXP:
6106  return pat;
6107 
6108  case T_STRING:
6109  break;
6110 
6111  default:
6112  val = rb_check_string_type(pat);
6113  if (NIL_P(val)) {
6114  Check_Type(pat, T_REGEXP);
6115  }
6116  pat = val;
6117  }
6118  if (check && is_broken_string(pat)) {
6119  rb_exc_raise(rb_reg_check_preprocess(pat));
6120  }
6121  return pat;
6122 }
6123 
6124 static long
6125 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6126 {
6127  if (BUILTIN_TYPE(pat) == T_STRING) {
6128  pos = rb_str_byteindex(str, pat, pos);
6129  if (set_backref_str) {
6130  if (pos >= 0) {
6131  str = rb_str_new_frozen_String(str);
6132  rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6133  }
6134  else {
6136  }
6137  }
6138  return pos;
6139  }
6140  else {
6141  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6142  }
6143 }
6144 
6145 
6146 /*
6147  * call-seq:
6148  * sub!(pattern, replacement) -> self or nil
6149  * sub!(pattern) {|match| ... } -> self or nil
6150  *
6151  * Replaces the first occurrence (not all occurrences) of the given +pattern+
6152  * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6153  *
6154  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6155  *
6156  * Related: String#sub, String#gsub, String#gsub!.
6157  *
6158  */
6159 
6160 static VALUE
6161 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
6162 {
6163  VALUE pat, repl, hash = Qnil;
6164  int iter = 0;
6165  long plen;
6166  int min_arity = rb_block_given_p() ? 1 : 2;
6167  long beg;
6168 
6169  rb_check_arity(argc, min_arity, 2);
6170  if (argc == 1) {
6171  iter = 1;
6172  }
6173  else {
6174  repl = argv[1];
6175  hash = rb_check_hash_type(argv[1]);
6176  if (NIL_P(hash)) {
6177  StringValue(repl);
6178  }
6179  }
6180 
6181  pat = get_pat_quoted(argv[0], 1);
6182 
6183  str_modifiable(str);
6184  beg = rb_pat_search(pat, str, 0, 1);
6185  if (beg >= 0) {
6186  rb_encoding *enc;
6187  int cr = ENC_CODERANGE(str);
6188  long beg0, end0;
6189  VALUE match, match0 = Qnil;
6190  struct re_registers *regs;
6191  char *p, *rp;
6192  long len, rlen;
6193 
6194  match = rb_backref_get();
6195  regs = RMATCH_REGS(match);
6196  if (RB_TYPE_P(pat, T_STRING)) {
6197  beg0 = beg;
6198  end0 = beg0 + RSTRING_LEN(pat);
6199  match0 = pat;
6200  }
6201  else {
6202  beg0 = BEG(0);
6203  end0 = END(0);
6204  if (iter) match0 = rb_reg_nth_match(0, match);
6205  }
6206 
6207  if (iter || !NIL_P(hash)) {
6208  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6209 
6210  if (iter) {
6211  repl = rb_obj_as_string(rb_yield(match0));
6212  }
6213  else {
6214  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6215  repl = rb_obj_as_string(repl);
6216  }
6217  str_mod_check(str, p, len);
6218  rb_check_frozen(str);
6219  }
6220  else {
6221  repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6222  }
6223 
6224  enc = rb_enc_compatible(str, repl);
6225  if (!enc) {
6226  rb_encoding *str_enc = STR_ENC_GET(str);
6227  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
6228  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
6229  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
6230  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
6231  rb_enc_inspect_name(str_enc),
6232  rb_enc_inspect_name(STR_ENC_GET(repl)));
6233  }
6234  enc = STR_ENC_GET(repl);
6235  }
6236  rb_str_modify(str);
6237  rb_enc_associate(str, enc);
6238  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
6239  int cr2 = ENC_CODERANGE(repl);
6240  if (cr2 == ENC_CODERANGE_BROKEN ||
6241  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
6242  cr = ENC_CODERANGE_UNKNOWN;
6243  else
6244  cr = cr2;
6245  }
6246  plen = end0 - beg0;
6247  rlen = RSTRING_LEN(repl);
6248  len = RSTRING_LEN(str);
6249  if (rlen > plen) {
6250  RESIZE_CAPA(str, len + rlen - plen);
6251  }
6252  p = RSTRING_PTR(str);
6253  if (rlen != plen) {
6254  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
6255  }
6256  rp = RSTRING_PTR(repl);
6257  memmove(p + beg0, rp, rlen);
6258  len += rlen - plen;
6259  STR_SET_LEN(str, len);
6260  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
6261  ENC_CODERANGE_SET(str, cr);
6262 
6263  RB_GC_GUARD(match);
6264 
6265  return str;
6266  }
6267  return Qnil;
6268 }
6269 
6270 
6271 /*
6272  * call-seq:
6273  * sub(pattern, replacement) -> new_string
6274  * sub(pattern) {|match| ... } -> new_string
6275  *
6276  * Returns a copy of +self+ with only the first occurrence
6277  * (not all occurrences) of the given +pattern+ replaced.
6278  *
6279  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6280  *
6281  * Related: String#sub!, String#gsub, String#gsub!.
6282  *
6283  */
6284 
6285 static VALUE
6286 rb_str_sub(int argc, VALUE *argv, VALUE str)
6287 {
6288  str = str_duplicate(rb_cString, str);
6289  rb_str_sub_bang(argc, argv, str);
6290  return str;
6291 }
6292 
6293 static VALUE
6294 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
6295 {
6296  VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6297  long beg, beg0, end0;
6298  long offset, blen, slen, len, last;
6299  enum {STR, ITER, MAP} mode = STR;
6300  char *sp, *cp;
6301  int need_backref = -1;
6302  rb_encoding *str_enc;
6303 
6304  switch (argc) {
6305  case 1:
6306  RETURN_ENUMERATOR(str, argc, argv);
6307  mode = ITER;
6308  break;
6309  case 2:
6310  repl = argv[1];
6311  hash = rb_check_hash_type(argv[1]);
6312  if (NIL_P(hash)) {
6313  StringValue(repl);
6314  }
6315  else {
6316  mode = MAP;
6317  }
6318  break;
6319  default:
6320  rb_error_arity(argc, 1, 2);
6321  }
6322 
6323  pat = get_pat_quoted(argv[0], 1);
6324  beg = rb_pat_search(pat, str, 0, need_backref);
6325  if (beg < 0) {
6326  if (bang) return Qnil; /* no match, no substitution */
6327  return str_duplicate(rb_cString, str);
6328  }
6329 
6330  offset = 0;
6331  blen = RSTRING_LEN(str) + 30; /* len + margin */
6332  dest = rb_str_buf_new(blen);
6333  sp = RSTRING_PTR(str);
6334  slen = RSTRING_LEN(str);
6335  cp = sp;
6336  str_enc = STR_ENC_GET(str);
6337  rb_enc_associate(dest, str_enc);
6339 
6340  do {
6341  VALUE match = rb_backref_get();
6342  struct re_registers *regs = RMATCH_REGS(match);
6343  if (RB_TYPE_P(pat, T_STRING)) {
6344  beg0 = beg;
6345  end0 = beg0 + RSTRING_LEN(pat);
6346  match0 = pat;
6347  }
6348  else {
6349  beg0 = BEG(0);
6350  end0 = END(0);
6351  if (mode == ITER) match0 = rb_reg_nth_match(0, match);
6352  }
6353 
6354  if (mode) {
6355  if (mode == ITER) {
6356  val = rb_obj_as_string(rb_yield(match0));
6357  }
6358  else {
6359  val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
6360  val = rb_obj_as_string(val);
6361  }
6362  str_mod_check(str, sp, slen);
6363  if (val == dest) { /* paranoid check [ruby-dev:24827] */
6364  rb_raise(rb_eRuntimeError, "block should not cheat");
6365  }
6366  }
6367  else if (need_backref) {
6368  val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6369  if (need_backref < 0) {
6370  need_backref = val != repl;
6371  }
6372  }
6373  else {
6374  val = repl;
6375  }
6376 
6377  len = beg0 - offset; /* copy pre-match substr */
6378  if (len) {
6379  rb_enc_str_buf_cat(dest, cp, len, str_enc);
6380  }
6381 
6382  rb_str_buf_append(dest, val);
6383 
6384  last = offset;
6385  offset = end0;
6386  if (beg0 == end0) {
6387  /*
6388  * Always consume at least one character of the input string
6389  * in order to prevent infinite loops.
6390  */
6391  if (RSTRING_LEN(str) <= end0) break;
6392  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
6393  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
6394  offset = end0 + len;
6395  }
6396  cp = RSTRING_PTR(str) + offset;
6397  if (offset > RSTRING_LEN(str)) break;
6398  beg = rb_pat_search(pat, str, offset, need_backref);
6399 
6400  RB_GC_GUARD(match);
6401  } while (beg >= 0);
6402  if (RSTRING_LEN(str) > offset) {
6403  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
6404  }
6405  rb_pat_search(pat, str, last, 1);
6406  if (bang) {
6407  str_shared_replace(str, dest);
6408  }
6409  else {
6410  str = dest;
6411  }
6412 
6413  return str;
6414 }
6415 
6416 
6417 /*
6418  * call-seq:
6419  * gsub!(pattern, replacement) -> self or nil
6420  * gsub!(pattern) {|match| ... } -> self or nil
6421  * gsub!(pattern) -> an_enumerator
6422  *
6423  * Performs the specified substring replacement(s) on +self+;
6424  * returns +self+ if any replacement occurred, +nil+ otherwise.
6425  *
6426  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6427  *
6428  * Returns an Enumerator if no +replacement+ and no block given.
6429  *
6430  * Related: String#sub, String#gsub, String#sub!.
6431  *
6432  */
6433 
6434 static VALUE
6435 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6436 {
6437  str_modify_keep_cr(str);
6438  return str_gsub(argc, argv, str, 1);
6439 }
6440 
6441 
6442 /*
6443  * call-seq:
6444  * gsub(pattern, replacement) -> new_string
6445  * gsub(pattern) {|match| ... } -> new_string
6446  * gsub(pattern) -> enumerator
6447  *
6448  * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6449  *
6450  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6451  *
6452  * Returns an Enumerator if no +replacement+ and no block given.
6453  *
6454  * Related: String#sub, String#sub!, String#gsub!.
6455  *
6456  */
6457 
6458 static VALUE
6459 rb_str_gsub(int argc, VALUE *argv, VALUE str)
6460 {
6461  return str_gsub(argc, argv, str, 0);
6462 }
6463 
6464 
6465 /*
6466  * call-seq:
6467  * replace(other_string) -> self
6468  *
6469  * Replaces the contents of +self+ with the contents of +other_string+:
6470  *
6471  * s = 'foo' # => "foo"
6472  * s.replace('bar') # => "bar"
6473  *
6474  */
6475 
6476 VALUE
6478 {
6479  str_modifiable(str);
6480  if (str == str2) return str;
6481 
6482  StringValue(str2);
6483  str_discard(str);
6484  return str_replace(str, str2);
6485 }
6486 
6487 /*
6488  * call-seq:
6489  * clear -> self
6490  *
6491  * Removes the contents of +self+:
6492  *
6493  * s = 'foo' # => "foo"
6494  * s.clear # => ""
6495  *
6496  */
6497 
6498 static VALUE
6499 rb_str_clear(VALUE str)
6500 {
6501  str_discard(str);
6502  STR_SET_EMBED(str);
6503  STR_SET_LEN(str, 0);
6504  RSTRING_PTR(str)[0] = 0;
6505  if (rb_enc_asciicompat(STR_ENC_GET(str)))
6507  else
6509  return str;
6510 }
6511 
6512 /*
6513  * call-seq:
6514  * chr -> string
6515  *
6516  * Returns a string containing the first character of +self+:
6517  *
6518  * s = 'foo' # => "foo"
6519  * s.chr # => "f"
6520  *
6521  */
6522 
6523 static VALUE
6524 rb_str_chr(VALUE str)
6525 {
6526  return rb_str_substr(str, 0, 1);
6527 }
6528 
6529 /*
6530  * call-seq:
6531  * getbyte(index) -> integer or nil
6532  *
6533  * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6534  *
6535  * s = 'abcde' # => "abcde"
6536  * s.getbyte(0) # => 97
6537  * s.getbyte(-1) # => 101
6538  * s.getbyte(5) # => nil
6539  *
6540  * Related: String#setbyte.
6541  */
6542 VALUE
6543 rb_str_getbyte(VALUE str, VALUE index)
6544 {
6545  long pos = NUM2LONG(index);
6546 
6547  if (pos < 0)
6548  pos += RSTRING_LEN(str);
6549  if (pos < 0 || RSTRING_LEN(str) <= pos)
6550  return Qnil;
6551 
6552  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6553 }
6554 
6555 /*
6556  * call-seq:
6557  * setbyte(index, integer) -> integer
6558  *
6559  * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6560  *
6561  * s = 'abcde' # => "abcde"
6562  * s.setbyte(0, 98) # => 98
6563  * s # => "bbcde"
6564  *
6565  * Related: String#getbyte.
6566  */
6567 VALUE
6568 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6569 {
6570  long pos = NUM2LONG(index);
6571  long len = RSTRING_LEN(str);
6572  char *ptr, *head, *left = 0;
6573  rb_encoding *enc;
6574  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6575 
6576  if (pos < -len || len <= pos)
6577  rb_raise(rb_eIndexError, "index %ld out of string", pos);
6578  if (pos < 0)
6579  pos += len;
6580 
6581  VALUE v = rb_to_int(value);
6582  VALUE w = rb_int_and(v, INT2FIX(0xff));
6583  char byte = (char)(NUM2INT(w) & 0xFF);
6584 
6585  if (!str_independent(str))
6586  str_make_independent(str);
6587  enc = STR_ENC_GET(str);
6588  head = RSTRING_PTR(str);
6589  ptr = &head[pos];
6590  if (!STR_EMBED_P(str)) {
6591  cr = ENC_CODERANGE(str);
6592  switch (cr) {
6593  case ENC_CODERANGE_7BIT:
6594  left = ptr;
6595  *ptr = byte;
6596  if (ISASCII(byte)) goto end;
6597  nlen = rb_enc_precise_mbclen(left, head+len, enc);
6598  if (!MBCLEN_CHARFOUND_P(nlen))
6600  else
6602  goto end;
6603  case ENC_CODERANGE_VALID:
6604  left = rb_enc_left_char_head(head, ptr, head+len, enc);
6605  width = rb_enc_precise_mbclen(left, head+len, enc);
6606  *ptr = byte;
6607  nlen = rb_enc_precise_mbclen(left, head+len, enc);
6608  if (!MBCLEN_CHARFOUND_P(nlen))
6610  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6611  ENC_CODERANGE_CLEAR(str);
6612  goto end;
6613  }
6614  }
6615  ENC_CODERANGE_CLEAR(str);
6616  *ptr = byte;
6617 
6618  end:
6619  return value;
6620 }
6621 
6622 static VALUE
6623 str_byte_substr(VALUE str, long beg, long len, int empty)
6624 {
6625  long n = RSTRING_LEN(str);
6626 
6627  if (beg > n || len < 0) return Qnil;
6628  if (beg < 0) {
6629  beg += n;
6630  if (beg < 0) return Qnil;
6631  }
6632  if (len > n - beg)
6633  len = n - beg;
6634  if (len <= 0) {
6635  if (!empty) return Qnil;
6636  len = 0;
6637  }
6638 
6639  VALUE str2 = str_subseq(str, beg, len);
6640 
6641  str_enc_copy_direct(str2, str);
6642 
6643  if (RSTRING_LEN(str2) == 0) {
6644  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6646  else
6648  }
6649  else {
6650  switch (ENC_CODERANGE(str)) {
6651  case ENC_CODERANGE_7BIT:
6653  break;
6654  default:
6656  break;
6657  }
6658  }
6659 
6660  return str2;
6661 }
6662 
6663 VALUE
6664 rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6665 {
6666  return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6667 }
6668 
6669 static VALUE
6670 str_byte_aref(VALUE str, VALUE indx)
6671 {
6672  long idx;
6673  if (FIXNUM_P(indx)) {
6674  idx = FIX2LONG(indx);
6675  }
6676  else {
6677  /* check if indx is Range */
6678  long beg, len = RSTRING_LEN(str);
6679 
6680  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6681  case Qfalse:
6682  break;
6683  case Qnil:
6684  return Qnil;
6685  default:
6686  return str_byte_substr(str, beg, len, TRUE);
6687  }
6688 
6689  idx = NUM2LONG(indx);
6690  }
6691  return str_byte_substr(str, idx, 1, FALSE);
6692 }
6693 
6694 /*
6695  * call-seq:
6696  * byteslice(index, length = 1) -> string or nil
6697  * byteslice(range) -> string or nil
6698  *
6699  * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6700  *
6701  * With integer arguments +index+ and +length+ given,
6702  * returns the substring beginning at the given +index+
6703  * of the given +length+ (if possible),
6704  * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6705  *
6706  * s = '0123456789' # => "0123456789"
6707  * s.byteslice(2) # => "2"
6708  * s.byteslice(200) # => nil
6709  * s.byteslice(4, 3) # => "456"
6710  * s.byteslice(4, 30) # => "456789"
6711  * s.byteslice(4, -1) # => nil
6712  * s.byteslice(40, 2) # => nil
6713  *
6714  * In either case above, counts backwards from the end of +self+
6715  * if +index+ is negative:
6716  *
6717  * s = '0123456789' # => "0123456789"
6718  * s.byteslice(-4) # => "6"
6719  * s.byteslice(-4, 3) # => "678"
6720  *
6721  * With Range argument +range+ given, returns
6722  * <tt>byteslice(range.begin, range.size)</tt>:
6723  *
6724  * s = '0123456789' # => "0123456789"
6725  * s.byteslice(4..6) # => "456"
6726  * s.byteslice(-6..-4) # => "456"
6727  * s.byteslice(5..2) # => "" # range.size is zero.
6728  * s.byteslice(40..42) # => nil
6729  *
6730  * In all cases, a returned string has the same encoding as +self+:
6731  *
6732  * s.encoding # => #<Encoding:UTF-8>
6733  * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6734  *
6735  */
6736 
6737 static VALUE
6738 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6739 {
6740  if (argc == 2) {
6741  long beg = NUM2LONG(argv[0]);
6742  long len = NUM2LONG(argv[1]);
6743  return str_byte_substr(str, beg, len, TRUE);
6744  }
6745  rb_check_arity(argc, 1, 2);
6746  return str_byte_aref(str, argv[0]);
6747 }
6748 
6749 static void
6750 str_check_beg_len(VALUE str, long *beg, long *len)
6751 {
6752  long end, slen = RSTRING_LEN(str);
6753 
6754  if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6755  if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6756  rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6757  }
6758  if (*beg < 0) {
6759  *beg += slen;
6760  }
6761  RUBY_ASSERT(*beg >= 0);
6762  RUBY_ASSERT(*beg <= slen);
6763 
6764  if (*len > slen - *beg) {
6765  *len = slen - *beg;
6766  }
6767  end = *beg + *len;
6768  str_ensure_byte_pos(str, *beg);
6769  str_ensure_byte_pos(str, end);
6770 }
6771 
6772 /*
6773  * call-seq:
6774  * bytesplice(index, length, str) -> string
6775  * bytesplice(index, length, str, str_index, str_length) -> string
6776  * bytesplice(range, str) -> string
6777  * bytesplice(range, str, str_range) -> string
6778  *
6779  * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6780  * The portion of the string affected is determined using
6781  * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6782  * If the replacement string is not the same length as the text it is replacing,
6783  * the string will be adjusted accordingly.
6784  *
6785  * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6786  *
6787  * The forms that take an Integer will raise an IndexError if the value is out
6788  * of range; the Range form will raise a RangeError.
6789  * If the beginning or ending offset does not land on character (codepoint)
6790  * boundary, an IndexError will be raised.
6791  */
6792 
6793 static VALUE
6794 rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6795 {
6796  long beg, len, vbeg, vlen;
6797  VALUE val;
6798  int cr;
6799 
6800  rb_check_arity(argc, 2, 5);
6801  if (!(argc == 2 || argc == 3 || argc == 5)) {
6802  rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6803  }
6804  if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6805  if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6806  rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6807  rb_builtin_class_name(argv[0]));
6808  }
6809  val = argv[1];
6810  StringValue(val);
6811  if (argc == 2) {
6812  /* bytesplice(range, str) */
6813  vbeg = 0;
6814  vlen = RSTRING_LEN(val);
6815  }
6816  else {
6817  /* bytesplice(range, str, str_range) */
6818  if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6819  rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6820  rb_builtin_class_name(argv[2]));
6821  }
6822  }
6823  }
6824  else {
6825  beg = NUM2LONG(argv[0]);
6826  len = NUM2LONG(argv[1]);
6827  val = argv[2];
6828  StringValue(val);
6829  if (argc == 3) {
6830  /* bytesplice(index, length, str) */
6831  vbeg = 0;
6832  vlen = RSTRING_LEN(val);
6833  }
6834  else {
6835  /* bytesplice(index, length, str, str_index, str_length) */
6836  vbeg = NUM2LONG(argv[3]);
6837  vlen = NUM2LONG(argv[4]);
6838  }
6839  }
6840  str_check_beg_len(str, &beg, &len);
6841  str_check_beg_len(val, &vbeg, &vlen);
6842  str_modify_keep_cr(str);
6843 
6845  rb_enc_associate(str, rb_enc_check(str, val));
6846  }
6847 
6848  rb_str_update_1(str, beg, len, val, vbeg, vlen);
6850  if (cr != ENC_CODERANGE_BROKEN)
6851  ENC_CODERANGE_SET(str, cr);
6852  return str;
6853 }
6854 
6855 /*
6856  * call-seq:
6857  * reverse -> string
6858  *
6859  * Returns a new string with the characters from +self+ in reverse order.
6860  *
6861  * 'stressed'.reverse # => "desserts"
6862  *
6863  */
6864 
6865 static VALUE
6866 rb_str_reverse(VALUE str)
6867 {
6868  rb_encoding *enc;
6869  VALUE rev;
6870  char *s, *e, *p;
6871  int cr;
6872 
6873  if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6874  enc = STR_ENC_GET(str);
6875  rev = rb_str_new(0, RSTRING_LEN(str));
6876  s = RSTRING_PTR(str); e = RSTRING_END(str);
6877  p = RSTRING_END(rev);
6878  cr = ENC_CODERANGE(str);
6879 
6880  if (RSTRING_LEN(str) > 1) {
6881  if (single_byte_optimizable(str)) {
6882  while (s < e) {
6883  *--p = *s++;
6884  }
6885  }
6886  else if (cr == ENC_CODERANGE_VALID) {
6887  while (s < e) {
6888  int clen = rb_enc_fast_mbclen(s, e, enc);
6889 
6890  p -= clen;
6891  memcpy(p, s, clen);
6892  s += clen;
6893  }
6894  }
6895  else {
6896  cr = rb_enc_asciicompat(enc) ?
6898  while (s < e) {
6899  int clen = rb_enc_mbclen(s, e, enc);
6900 
6901  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6902  p -= clen;
6903  memcpy(p, s, clen);
6904  s += clen;
6905  }
6906  }
6907  }
6908  STR_SET_LEN(rev, RSTRING_LEN(str));
6909  str_enc_copy_direct(rev, str);
6910  ENC_CODERANGE_SET(rev, cr);
6911 
6912  return rev;
6913 }
6914 
6915 
6916 /*
6917  * call-seq:
6918  * reverse! -> self
6919  *
6920  * Returns +self+ with its characters reversed:
6921  *
6922  * s = 'stressed'
6923  * s.reverse! # => "desserts"
6924  * s # => "desserts"
6925  *
6926  */
6927 
6928 static VALUE
6929 rb_str_reverse_bang(VALUE str)
6930 {
6931  if (RSTRING_LEN(str) > 1) {
6932  if (single_byte_optimizable(str)) {
6933  char *s, *e, c;
6934 
6935  str_modify_keep_cr(str);
6936  s = RSTRING_PTR(str);
6937  e = RSTRING_END(str) - 1;
6938  while (s < e) {
6939  c = *s;
6940  *s++ = *e;
6941  *e-- = c;
6942  }
6943  }
6944  else {
6945  str_shared_replace(str, rb_str_reverse(str));
6946  }
6947  }
6948  else {
6949  str_modify_keep_cr(str);
6950  }
6951  return str;
6952 }
6953 
6954 
6955 /*
6956  * call-seq:
6957  * include?(other_string) -> true or false
6958  *
6959  * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6960  *
6961  * s = 'foo'
6962  * s.include?('f') # => true
6963  * s.include?('fo') # => true
6964  * s.include?('food') # => false
6965  *
6966  */
6967 
6968 VALUE
6969 rb_str_include(VALUE str, VALUE arg)
6970 {
6971  long i;
6972 
6973  StringValue(arg);
6974  i = rb_str_index(str, arg, 0);
6975 
6976  return RBOOL(i != -1);
6977 }
6978 
6979 
6980 /*
6981  * call-seq:
6982  * to_i(base = 10) -> integer
6983  *
6984  * Returns the result of interpreting leading characters in +self+
6985  * as an integer in the given +base+ (which must be in (0, 2..36)):
6986  *
6987  * '123456'.to_i # => 123456
6988  * '123def'.to_i(16) # => 1195503
6989  *
6990  * With +base+ zero, string +object+ may contain leading characters
6991  * to specify the actual base:
6992  *
6993  * '123def'.to_i(0) # => 123
6994  * '0123def'.to_i(0) # => 83
6995  * '0b123def'.to_i(0) # => 1
6996  * '0o123def'.to_i(0) # => 83
6997  * '0d123def'.to_i(0) # => 123
6998  * '0x123def'.to_i(0) # => 1195503
6999  *
7000  * Characters past a leading valid number (in the given +base+) are ignored:
7001  *
7002  * '12.345'.to_i # => 12
7003  * '12345'.to_i(2) # => 1
7004  *
7005  * Returns zero if there is no leading valid number:
7006  *
7007  * 'abcdef'.to_i # => 0
7008  * '2'.to_i(2) # => 0
7009  *
7010  */
7011 
7012 static VALUE
7013 rb_str_to_i(int argc, VALUE *argv, VALUE str)
7014 {
7015  int base = 10;
7016 
7017  if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7018  rb_raise(rb_eArgError, "invalid radix %d", base);
7019  }
7020  return rb_str_to_inum(str, base, FALSE);
7021 }
7022 
7023 
7024 /*
7025  * call-seq:
7026  * to_f -> float
7027  *
7028  * Returns the result of interpreting leading characters in +self+ as a Float:
7029  *
7030  * '3.14159'.to_f # => 3.14159
7031  * '1.234e-2'.to_f # => 0.01234
7032  *
7033  * Characters past a leading valid number are ignored:
7034  *
7035  * '3.14 (pi to two places)'.to_f # => 3.14
7036  *
7037  * Returns zero if there is no leading valid number:
7038  *
7039  * 'abcdef'.to_f # => 0.0
7040  *
7041  */
7042 
7043 static VALUE
7044 rb_str_to_f(VALUE str)
7045 {
7046  return DBL2NUM(rb_str_to_dbl(str, FALSE));
7047 }
7048 
7049 
7050 /*
7051  * call-seq:
7052  * to_s -> self or string
7053  *
7054  * Returns +self+ if +self+ is a +String+,
7055  * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7056  */
7057 
7058 static VALUE
7059 rb_str_to_s(VALUE str)
7060 {
7061  if (rb_obj_class(str) != rb_cString) {
7062  return str_duplicate(rb_cString, str);
7063  }
7064  return str;
7065 }
7066 
#if 0
/* Compiled out: encodes the code point c in enc and appends the bytes to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7078 
7079 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7080 
7081 int
7082 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7083 {
7084  char buf[CHAR_ESC_LEN + 1];
7085  int l;
7086 
7087 #if SIZEOF_INT > 4
7088  c &= 0xffffffff;
7089 #endif
7090  if (unicode_p) {
7091  if (c < 0x7F && ISPRINT(c)) {
7092  snprintf(buf, CHAR_ESC_LEN, "%c", c);
7093  }
7094  else if (c < 0x10000) {
7095  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7096  }
7097  else {
7098  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7099  }
7100  }
7101  else {
7102  if (c < 0x100) {
7103  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7104  }
7105  else {
7106  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7107  }
7108  }
7109  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7110  rb_str_buf_cat(result, buf, l);
7111  return l;
7112 }
7113 
/*
 * Returns the backslash notation for the control character c as a static
 * string, or NULL when c has no such notation in the table below.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int ch;
        const char *notation;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].ch == c) {
            return escapes[i].notation;
        }
    }
    return NULL;
}
7131 
7132 VALUE
7133 rb_str_escape(VALUE str)
7134 {
7135  int encidx = ENCODING_GET(str);
7136  rb_encoding *enc = rb_enc_from_index(encidx);
7137  const char *p = RSTRING_PTR(str);
7138  const char *pend = RSTRING_END(str);
7139  const char *prev = p;
7140  char buf[CHAR_ESC_LEN + 1];
7141  VALUE result = rb_str_buf_new(0);
7142  int unicode_p = rb_enc_unicode_p(enc);
7143  int asciicompat = rb_enc_asciicompat(enc);
7144 
7145  while (p < pend) {
7146  unsigned int c;
7147  const char *cc;
7148  int n = rb_enc_precise_mbclen(p, pend, enc);
7149  if (!MBCLEN_CHARFOUND_P(n)) {
7150  if (p > prev) str_buf_cat(result, prev, p - prev);
7151  n = rb_enc_mbminlen(enc);
7152  if (pend < p + n)
7153  n = (int)(pend - p);
7154  while (n--) {
7155  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7156  str_buf_cat(result, buf, strlen(buf));
7157  prev = ++p;
7158  }
7159  continue;
7160  }
7161  n = MBCLEN_CHARFOUND_LEN(n);
7162  c = rb_enc_mbc_to_codepoint(p, pend, enc);
7163  p += n;
7164  cc = ruby_escaped_char(c);
7165  if (cc) {
7166  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7167  str_buf_cat(result, cc, strlen(cc));
7168  prev = p;
7169  }
7170  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7171  }
7172  else {
7173  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7174  rb_str_buf_cat_escaped_char(result, c, unicode_p);
7175  prev = p;
7176  }
7177  }
7178  if (p > prev) str_buf_cat(result, prev, p - prev);
7180 
7181  return result;
7182 }
7183 
7184 /*
7185  * call-seq:
7186  * inspect -> string
7187  *
7188  * Returns a printable version of +self+, enclosed in double-quotes,
7189  * and with special characters escaped:
7190  *
7191  * s = "foo\tbar\tbaz\n"
7192  * s.inspect
7193  * # => "\"foo\\tbar\\tbaz\\n\""
7194  *
7195  */
7196 
7197 VALUE
7199 {
7200  int encidx = ENCODING_GET(str);
7201  rb_encoding *enc = rb_enc_from_index(encidx);
7202  const char *p, *pend, *prev;
7203  char buf[CHAR_ESC_LEN + 1];
7204  VALUE result = rb_str_buf_new(0);
7206  int unicode_p = rb_enc_unicode_p(enc);
7207  int asciicompat = rb_enc_asciicompat(enc);
7208 
7209  if (resenc == NULL) resenc = rb_default_external_encoding();
7210  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7211  rb_enc_associate(result, resenc);
7212  str_buf_cat2(result, "\"");
7213 
7214  p = RSTRING_PTR(str); pend = RSTRING_END(str);
7215  prev = p;
7216  while (p < pend) {
7217  unsigned int c, cc;
7218  int n;
7219 
7220  n = rb_enc_precise_mbclen(p, pend, enc);
7221  if (!MBCLEN_CHARFOUND_P(n)) {
7222  if (p > prev) str_buf_cat(result, prev, p - prev);
7223  n = rb_enc_mbminlen(enc);
7224  if (pend < p + n)
7225  n = (int)(pend - p);
7226  while (n--) {
7227  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7228  str_buf_cat(result, buf, strlen(buf));
7229  prev = ++p;
7230  }
7231  continue;
7232  }
7233  n = MBCLEN_CHARFOUND_LEN(n);
7234  c = rb_enc_mbc_to_codepoint(p, pend, enc);
7235  p += n;
7236  if ((asciicompat || unicode_p) &&
7237  (c == '"'|| c == '\\' ||
7238  (c == '#' &&
7239  p < pend &&
7241  (cc = rb_enc_codepoint(p,pend,enc),
7242  (cc == '$' || cc == '@' || cc == '{'))))) {
7243  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7244  str_buf_cat2(result, "\\");
7245  if (asciicompat || enc == resenc) {
7246  prev = p - n;
7247  continue;
7248  }
7249  }
7250  switch (c) {
7251  case '\n': cc = 'n'; break;
7252  case '\r': cc = 'r'; break;
7253  case '\t': cc = 't'; break;
7254  case '\f': cc = 'f'; break;
7255  case '\013': cc = 'v'; break;
7256  case '\010': cc = 'b'; break;
7257  case '\007': cc = 'a'; break;
7258  case 033: cc = 'e'; break;
7259  default: cc = 0; break;
7260  }
7261  if (cc) {
7262  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7263  buf[0] = '\\';
7264  buf[1] = (char)cc;
7265  str_buf_cat(result, buf, 2);
7266  prev = p;
7267  continue;
7268  }
7269  /* The special casing of 0x85 (NEXT_LINE) here is because
7270  * Oniguruma historically treats it as printable, but it
7271  * doesn't match the print POSIX bracket class or character
7272  * property in regexps.
7273  *
7274  * See Ruby Bug #16842 for details:
7275  * https://bugs.ruby-lang.org/issues/16842
7276  */
7277  if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7278  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7279  continue;
7280  }
7281  else {
7282  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7283  rb_str_buf_cat_escaped_char(result, c, unicode_p);
7284  prev = p;
7285  continue;
7286  }
7287  }
7288  if (p > prev) str_buf_cat(result, prev, p - prev);
7289  str_buf_cat2(result, "\"");
7290 
7291  return result;
7292 }
7293 
/*
 * True when p is still before e and the byte at p is '$', '@' or '{'.
 * rb_str_dump() uses it to decide whether a '#' just read needs a
 * backslash in front of it.
 */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7295 
7296 /*
7297  * call-seq:
7298  * dump -> string
7299  *
7300  * Returns a printable version of +self+, enclosed in double-quotes,
7301  * with special characters escaped, and with non-printing characters
7302  * replaced by hexadecimal notation:
7303  *
7304  * "hello \n ''".dump # => "\"hello \\n ''\""
7305  * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7306  *
7307  * Related: String#undump (inverse of String#dump).
7308  *
7309  */
7310 
7311 VALUE
7313 {
7314  int encidx = rb_enc_get_index(str);
7315  rb_encoding *enc = rb_enc_from_index(encidx);
7316  long len;
7317  const char *p, *pend;
7318  char *q, *qend;
7319  VALUE result;
7320  int u8 = (encidx == rb_utf8_encindex());
7321  static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7322 
7323  len = 2; /* "" */
7324  if (!rb_enc_asciicompat(enc)) {
7325  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7326  len += strlen(enc->name);
7327  }
7328 
7329  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7330  while (p < pend) {
7331  int clen;
7332  unsigned char c = *p++;
7333 
7334  switch (c) {
7335  case '"': case '\\':
7336  case '\n': case '\r':
7337  case '\t': case '\f':
7338  case '\013': case '\010': case '\007': case '\033':
7339  clen = 2;
7340  break;
7341 
7342  case '#':
7343  clen = IS_EVSTR(p, pend) ? 2 : 1;
7344  break;
7345 
7346  default:
7347  if (ISPRINT(c)) {
7348  clen = 1;
7349  }
7350  else {
7351  if (u8 && c > 0x7F) { /* \u notation */
7352  int n = rb_enc_precise_mbclen(p-1, pend, enc);
7353  if (MBCLEN_CHARFOUND_P(n)) {
7354  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7355  if (cc <= 0xFFFF)
7356  clen = 6; /* \uXXXX */
7357  else if (cc <= 0xFFFFF)
7358  clen = 9; /* \u{XXXXX} */
7359  else
7360  clen = 10; /* \u{XXXXXX} */
7361  p += MBCLEN_CHARFOUND_LEN(n)-1;
7362  break;
7363  }
7364  }
7365  clen = 4; /* \xNN */
7366  }
7367  break;
7368  }
7369 
7370  if (clen > LONG_MAX - len) {
7371  rb_raise(rb_eRuntimeError, "string size too big");
7372  }
7373  len += clen;
7374  }
7375 
7376  result = rb_str_new(0, len);
7377  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7378  q = RSTRING_PTR(result); qend = q + len + 1;
7379 
7380  *q++ = '"';
7381  while (p < pend) {
7382  unsigned char c = *p++;
7383 
7384  if (c == '"' || c == '\\') {
7385  *q++ = '\\';
7386  *q++ = c;
7387  }
7388  else if (c == '#') {
7389  if (IS_EVSTR(p, pend)) *q++ = '\\';
7390  *q++ = '#';
7391  }
7392  else if (c == '\n') {
7393  *q++ = '\\';
7394  *q++ = 'n';
7395  }
7396  else if (c == '\r') {
7397  *q++ = '\\';
7398  *q++ = 'r';
7399  }
7400  else if (c == '\t') {
7401  *q++ = '\\';
7402  *q++ = 't';
7403  }
7404  else if (c == '\f') {
7405  *q++ = '\\';
7406  *q++ = 'f';
7407  }
7408  else if (c == '\013') {
7409  *q++ = '\\';
7410  *q++ = 'v';
7411  }
7412  else if (c == '\010') {
7413  *q++ = '\\';
7414  *q++ = 'b';
7415  }
7416  else if (c == '\007') {
7417  *q++ = '\\';
7418  *q++ = 'a';
7419  }
7420  else if (c == '\033') {
7421  *q++ = '\\';
7422  *q++ = 'e';
7423  }
7424  else if (ISPRINT(c)) {
7425  *q++ = c;
7426  }
7427  else {
7428  *q++ = '\\';
7429  if (u8) {
7430  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7431  if (MBCLEN_CHARFOUND_P(n)) {
7432  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7433  p += n;
7434  if (cc <= 0xFFFF)
7435  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7436  else
7437  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7438  q += strlen(q);
7439  continue;
7440  }
7441  }
7442  snprintf(q, qend-q, "x%02X", c);
7443  q += 3;
7444  }
7445  }
7446  *q++ = '"';
7447  *q = '\0';
7448  if (!rb_enc_asciicompat(enc)) {
7449  snprintf(q, qend-q, nonascii_suffix, enc->name);
7450  encidx = rb_ascii8bit_encindex();
7451  }
7452  /* result from dump is ASCII */
7453  rb_enc_associate_index(result, encidx);
7455  return result;
7456 }
7457 
/*
 * Maps the letter of a one-letter escape (n, r, t, f, v, b, a, e) to the
 * control character it denotes.  Any other argument is treated as
 * unreachable; callers are expected to pass only those eight letters.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\013';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return '\033';
    }
    UNREACHABLE_RETURN(-1);
}
7481 
7482 static void
7483 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7484 {
7485  const char *s = *ss;
7486  unsigned int c;
7487  int codelen;
7488  size_t hexlen;
7489  unsigned char buf[6];
7490  static rb_encoding *enc_utf8 = NULL;
7491 
7492  switch (*s) {
7493  case '\\':
7494  case '"':
7495  case '#':
7496  rb_str_cat(undumped, s, 1); /* cat itself */
7497  s++;
7498  break;
7499  case 'n':
7500  case 'r':
7501  case 't':
7502  case 'f':
7503  case 'v':
7504  case 'b':
7505  case 'a':
7506  case 'e':
7507  *buf = unescape_ascii(*s);
7508  rb_str_cat(undumped, (char *)buf, 1);
7509  s++;
7510  break;
7511  case 'u':
7512  if (*binary) {
7513  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7514  }
7515  *utf8 = true;
7516  if (++s >= s_end) {
7517  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7518  }
7519  if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7520  if (*penc != enc_utf8) {
7521  *penc = enc_utf8;
7522  rb_enc_associate(undumped, enc_utf8);
7523  }
7524  if (*s == '{') { /* handle \u{...} form */
7525  s++;
7526  for (;;) {
7527  if (s >= s_end) {
7528  rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7529  }
7530  if (*s == '}') {
7531  s++;
7532  break;
7533  }
7534  if (ISSPACE(*s)) {
7535  s++;
7536  continue;
7537  }
7538  c = scan_hex(s, s_end-s, &hexlen);
7539  if (hexlen == 0 || hexlen > 6) {
7540  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7541  }
7542  if (c > 0x10ffff) {
7543  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7544  }
7545  if (0xd800 <= c && c <= 0xdfff) {
7546  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7547  }
7548  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7549  rb_str_cat(undumped, (char *)buf, codelen);
7550  s += hexlen;
7551  }
7552  }
7553  else { /* handle \uXXXX form */
7554  c = scan_hex(s, 4, &hexlen);
7555  if (hexlen != 4) {
7556  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7557  }
7558  if (0xd800 <= c && c <= 0xdfff) {
7559  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7560  }
7561  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7562  rb_str_cat(undumped, (char *)buf, codelen);
7563  s += hexlen;
7564  }
7565  break;
7566  case 'x':
7567  if (*utf8) {
7568  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7569  }
7570  *binary = true;
7571  if (++s >= s_end) {
7572  rb_raise(rb_eRuntimeError, "invalid hex escape");
7573  }
7574  *buf = scan_hex(s, 2, &hexlen);
7575  if (hexlen != 2) {
7576  rb_raise(rb_eRuntimeError, "invalid hex escape");
7577  }
7578  rb_str_cat(undumped, (char *)buf, 1);
7579  s += hexlen;
7580  break;
7581  default:
7582  rb_str_cat(undumped, s-1, 2);
7583  s++;
7584  }
7585 
7586  *ss = s;
7587 }
7588 
7589 static VALUE rb_str_is_ascii_only_p(VALUE str);
7590 
7591 /*
7592  * call-seq:
7593  * undump -> string
7594  *
7595  * Returns an unescaped version of +self+:
7596  *
7597  * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7598  * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7599  * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7600  * s_undumped == s_orig # => true
7601  *
7602  * Related: String#dump (inverse of String#undump).
7603  *
7604  */
7605 
7606 static VALUE
7607 str_undump(VALUE str)
7608 {
7609  const char *s = RSTRING_PTR(str);
7610  const char *s_end = RSTRING_END(str);
7611  rb_encoding *enc = rb_enc_get(str);
7612  VALUE undumped = rb_enc_str_new(s, 0L, enc);
7613  bool utf8 = false;
7614  bool binary = false;
7615  int w;
7616 
7617  rb_must_asciicompat(str);
7618  if (rb_str_is_ascii_only_p(str) == Qfalse) {
7619  rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7620  }
7621  if (!str_null_check(str, &w)) {
7622  rb_raise(rb_eRuntimeError, "string contains null byte");
7623  }
7624  if (RSTRING_LEN(str) < 2) goto invalid_format;
7625  if (*s != '"') goto invalid_format;
7626 
7627  /* strip '"' at the start */
7628  s++;
7629 
7630  for (;;) {
7631  if (s >= s_end) {
7632  rb_raise(rb_eRuntimeError, "unterminated dumped string");
7633  }
7634 
7635  if (*s == '"') {
7636  /* epilogue */
7637  s++;
7638  if (s == s_end) {
7639  /* ascii compatible dumped string */
7640  break;
7641  }
7642  else {
7643  static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7644  static const char dup_suffix[] = ".dup";
7645  const char *encname;
7646  int encidx;
7647  ptrdiff_t size;
7648 
7649  /* check separately for strings dumped by older versions */
7650  size = sizeof(dup_suffix) - 1;
7651  if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7652 
7653  size = sizeof(force_encoding_suffix) - 1;
7654  if (s_end - s <= size) goto invalid_format;
7655  if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7656  s += size;
7657 
7658  if (utf8) {
7659  rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7660  }
7661 
7662  encname = s;
7663  s = memchr(s, '"', s_end-s);
7664  size = s - encname;
7665  if (!s) goto invalid_format;
7666  if (s_end - s != 2) goto invalid_format;
7667  if (s[0] != '"' || s[1] != ')') goto invalid_format;
7668 
7669  encidx = rb_enc_find_index2(encname, (long)size);
7670  if (encidx < 0) {
7671  rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7672  }
7673  rb_enc_associate_index(undumped, encidx);
7674  }
7675  break;
7676  }
7677 
7678  if (*s == '\\') {
7679  s++;
7680  if (s >= s_end) {
7681  rb_raise(rb_eRuntimeError, "invalid escape");
7682  }
7683  undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7684  }
7685  else {
7686  rb_str_cat(undumped, s++, 1);
7687  }
7688  }
7689 
7690  RB_GC_GUARD(str);
7691 
7692  return undumped;
7693 invalid_format:
7694  rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7695 }
7696 
7697 static void
7698 rb_str_check_dummy_enc(rb_encoding *enc)
7699 {
7700  if (rb_enc_dummy_p(enc)) {
7701  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7702  rb_enc_name(enc));
7703  }
7704 }
7705 
7706 static rb_encoding *
7707 str_true_enc(VALUE str)
7708 {
7709  rb_encoding *enc = STR_ENC_GET(str);
7710  rb_str_check_dummy_enc(enc);
7711  return enc;
7712 }
7713 
7714 static OnigCaseFoldType
7715 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7716 {
7717  if (argc==0)
7718  return flags;
7719  if (argc>2)
7720  rb_raise(rb_eArgError, "too many options");
7721  if (argv[0]==sym_turkic) {
7722  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7723  if (argc==2) {
7724  if (argv[1]==sym_lithuanian)
7725  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7726  else
7727  rb_raise(rb_eArgError, "invalid second option");
7728  }
7729  }
7730  else if (argv[0]==sym_lithuanian) {
7731  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7732  if (argc==2) {
7733  if (argv[1]==sym_turkic)
7734  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7735  else
7736  rb_raise(rb_eArgError, "invalid second option");
7737  }
7738  }
7739  else if (argc>1)
7740  rb_raise(rb_eArgError, "too many options");
7741  else if (argv[0]==sym_ascii)
7742  flags |= ONIGENC_CASE_ASCII_ONLY;
7743  else if (argv[0]==sym_fold) {
7744  if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7745  flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7746  else
7747  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7748  }
7749  else
7750  rb_raise(rb_eArgError, "invalid option");
7751  return flags;
7752 }
7753 
7754 static inline bool
7755 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7756 {
7757  if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7758  return true;
7759  return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7760 }
7761 
/*
 * Extra bytes added to the capacity of every case-mapping buffer.
 * 16 should be long enough to absorb any kind of single character length
 * increase; note that the value actually used is 20.
 */
#define CASE_MAPPING_ADDITIONAL_LENGTH 20

/* When nonzero, rb_str_casemap() and rb_str_ascii_casemap() print tuning
 * and consistency information to stderr.  May be predefined by the build. */
#if !defined(CASEMAP_DEBUG)
# define CASEMAP_DEBUG 0
#endif
7767 
7768 struct mapping_buffer;
7769 typedef struct mapping_buffer {
7770  size_t capa;
7771  size_t used;
7772  struct mapping_buffer *next;
7773  OnigUChar space[FLEX_ARY_LEN];
7774 } mapping_buffer;
7775 
7776 static void
7777 mapping_buffer_free(void *p)
7778 {
7779  mapping_buffer *previous_buffer;
7780  mapping_buffer *current_buffer = p;
7781  while (current_buffer) {
7782  previous_buffer = current_buffer;
7783  current_buffer = current_buffer->next;
7784  ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7785  }
7786 }
7787 
7788 static const rb_data_type_t mapping_buffer_type = {
7789  "mapping_buffer",
7790  {0, mapping_buffer_free,},
7791  0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7792 };
7793 
7794 static VALUE
7795 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7796 {
7797  VALUE target;
7798 
7799  const OnigUChar *source_current, *source_end;
7800  int target_length = 0;
7801  VALUE buffer_anchor;
7802  mapping_buffer *current_buffer = 0;
7803  mapping_buffer **pre_buffer;
7804  size_t buffer_count = 0;
7805  int buffer_length_or_invalid;
7806 
7807  if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7808 
7809  source_current = (OnigUChar*)RSTRING_PTR(source);
7810  source_end = (OnigUChar*)RSTRING_END(source);
7811 
7812  buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7813  pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7814  while (source_current < source_end) {
7815  /* increase multiplier using buffer count to converge quickly */
7816  size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7817  if (CASEMAP_DEBUG) {
7818  fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7819  }
7820  current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7821  *pre_buffer = current_buffer;
7822  pre_buffer = &current_buffer->next;
7823  current_buffer->next = NULL;
7824  current_buffer->capa = capa;
7825  buffer_length_or_invalid = enc->case_map(flags,
7826  &source_current, source_end,
7827  current_buffer->space,
7828  current_buffer->space+current_buffer->capa,
7829  enc);
7830  if (buffer_length_or_invalid < 0) {
7831  current_buffer = DATA_PTR(buffer_anchor);
7832  DATA_PTR(buffer_anchor) = 0;
7833  mapping_buffer_free(current_buffer);
7834  rb_raise(rb_eArgError, "input string invalid");
7835  }
7836  target_length += current_buffer->used = buffer_length_or_invalid;
7837  }
7838  if (CASEMAP_DEBUG) {
7839  fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7840  }
7841 
7842  if (buffer_count==1) {
7843  target = rb_str_new((const char*)current_buffer->space, target_length);
7844  }
7845  else {
7846  char *target_current;
7847 
7848  target = rb_str_new(0, target_length);
7849  target_current = RSTRING_PTR(target);
7850  current_buffer = DATA_PTR(buffer_anchor);
7851  while (current_buffer) {
7852  memcpy(target_current, current_buffer->space, current_buffer->used);
7853  target_current += current_buffer->used;
7854  current_buffer = current_buffer->next;
7855  }
7856  }
7857  current_buffer = DATA_PTR(buffer_anchor);
7858  DATA_PTR(buffer_anchor) = 0;
7859  mapping_buffer_free(current_buffer);
7860 
7861  RB_GC_GUARD(buffer_anchor);
7862 
7863  /* TODO: check about string terminator character */
7864  str_enc_copy_direct(target, source);
7865  /*ENC_CODERANGE_SET(mapped, cr);*/
7866 
7867  return target;
7868 }
7869 
7870 static VALUE
7871 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7872 {
7873  const OnigUChar *source_current, *source_end;
7874  OnigUChar *target_current, *target_end;
7875  long old_length = RSTRING_LEN(source);
7876  int length_or_invalid;
7877 
7878  if (old_length == 0) return Qnil;
7879 
7880  source_current = (OnigUChar*)RSTRING_PTR(source);
7881  source_end = (OnigUChar*)RSTRING_END(source);
7882  if (source == target) {
7883  target_current = (OnigUChar*)source_current;
7884  target_end = (OnigUChar*)source_end;
7885  }
7886  else {
7887  target_current = (OnigUChar*)RSTRING_PTR(target);
7888  target_end = (OnigUChar*)RSTRING_END(target);
7889  }
7890 
7891  length_or_invalid = onigenc_ascii_only_case_map(flags,
7892  &source_current, source_end,
7893  target_current, target_end, enc);
7894  if (length_or_invalid < 0)
7895  rb_raise(rb_eArgError, "input string invalid");
7896  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7897  fprintf(stderr, "problem with rb_str_ascii_casemap"
7898  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7899  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7900  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7901  }
7902 
7903  str_enc_copy(target, source);
7904 
7905  return target;
7906 }
7907 
7908 static bool
7909 upcase_single(VALUE str)
7910 {
7911  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7912  bool modified = false;
7913 
7914  while (s < send) {
7915  unsigned int c = *(unsigned char*)s;
7916 
7917  if ('a' <= c && c <= 'z') {
7918  *s = 'A' + (c - 'a');
7919  modified = true;
7920  }
7921  s++;
7922  }
7923  return modified;
7924 }
7925 
7926 /*
7927  * call-seq:
7928  * upcase!(*options) -> self or nil
7929  *
7930  * Upcases the characters in +self+;
7931  * returns +self+ if any changes were made, +nil+ otherwise:
7932  *
7933  * s = 'Hello World!' # => "Hello World!"
7934  * s.upcase! # => "HELLO WORLD!"
7935  * s # => "HELLO WORLD!"
7936  * s.upcase! # => nil
7937  *
7938  * The casing may be affected by the given +options+;
7939  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7940  *
7941  * Related: String#upcase, String#downcase, String#downcase!.
7942  *
7943  */
7944 
7945 static VALUE
7946 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7947 {
7948  rb_encoding *enc;
7949  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7950 
7951  flags = check_case_options(argc, argv, flags);
7952  str_modify_keep_cr(str);
7953  enc = str_true_enc(str);
7954  if (case_option_single_p(flags, enc, str)) {
7955  if (upcase_single(str))
7956  flags |= ONIGENC_CASE_MODIFIED;
7957  }
7958  else if (flags&ONIGENC_CASE_ASCII_ONLY)
7959  rb_str_ascii_casemap(str, str, &flags, enc);
7960  else
7961  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7962 
7963  if (ONIGENC_CASE_MODIFIED&flags) return str;
7964  return Qnil;
7965 }
7966 
7967 
7968 /*
7969  * call-seq:
7970  * upcase(*options) -> string
7971  *
7972  * Returns a string containing the upcased characters in +self+:
7973  *
7974  * s = 'Hello World!' # => "Hello World!"
7975  * s.upcase # => "HELLO WORLD!"
7976  *
7977  * The casing may be affected by the given +options+;
7978  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7979  *
7980  * Related: String#upcase!, String#downcase, String#downcase!.
7981  *
7982  */
7983 
7984 static VALUE
7985 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7986 {
7987  rb_encoding *enc;
7988  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7989  VALUE ret;
7990 
7991  flags = check_case_options(argc, argv, flags);
7992  enc = str_true_enc(str);
7993  if (case_option_single_p(flags, enc, str)) {
7994  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7995  str_enc_copy_direct(ret, str);
7996  upcase_single(ret);
7997  }
7998  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7999  ret = rb_str_new(0, RSTRING_LEN(str));
8000  rb_str_ascii_casemap(str, ret, &flags, enc);
8001  }
8002  else {
8003  ret = rb_str_casemap(str, &flags, enc);
8004  }
8005 
8006  return ret;
8007 }
8008 
8009 static bool
8010 downcase_single(VALUE str)
8011 {
8012  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8013  bool modified = false;
8014 
8015  while (s < send) {
8016  unsigned int c = *(unsigned char*)s;
8017 
8018  if ('A' <= c && c <= 'Z') {
8019  *s = 'a' + (c - 'A');
8020  modified = true;
8021  }
8022  s++;
8023  }
8024 
8025  return modified;
8026 }
8027 
8028 /*
8029  * call-seq:
8030  * downcase!(*options) -> self or nil
8031  *
8032  * Downcases the characters in +self+;
8033  * returns +self+ if any changes were made, +nil+ otherwise:
8034  *
8035  * s = 'Hello World!' # => "Hello World!"
8036  * s.downcase! # => "hello world!"
8037  * s # => "hello world!"
8038  * s.downcase! # => nil
8039  *
8040  * The casing may be affected by the given +options+;
8041  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8042  *
8043  * Related: String#downcase, String#upcase, String#upcase!.
8044  *
8045  */
8046 
8047 static VALUE
8048 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8049 {
8050  rb_encoding *enc;
8051  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8052 
8053  flags = check_case_options(argc, argv, flags);
8054  str_modify_keep_cr(str);
8055  enc = str_true_enc(str);
8056  if (case_option_single_p(flags, enc, str)) {
8057  if (downcase_single(str))
8058  flags |= ONIGENC_CASE_MODIFIED;
8059  }
8060  else if (flags&ONIGENC_CASE_ASCII_ONLY)
8061  rb_str_ascii_casemap(str, str, &flags, enc);
8062  else
8063  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8064 
8065  if (ONIGENC_CASE_MODIFIED&flags) return str;
8066  return Qnil;
8067 }
8068 
8069 
8070 /*
8071  * call-seq:
8072  * downcase(*options) -> string
8073  *
8074  * Returns a string containing the downcased characters in +self+:
8075  *
8076  * s = 'Hello World!' # => "Hello World!"
8077  * s.downcase # => "hello world!"
8078  *
8079  * The casing may be affected by the given +options+;
8080  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8081  *
8082  * Related: String#downcase!, String#upcase, String#upcase!.
8083  *
8084  */
8085 
8086 static VALUE
8087 rb_str_downcase(int argc, VALUE *argv, VALUE str)
8088 {
8089  rb_encoding *enc;
8090  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8091  VALUE ret;
8092 
8093  flags = check_case_options(argc, argv, flags);
8094  enc = str_true_enc(str);
8095  if (case_option_single_p(flags, enc, str)) {
8096  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8097  str_enc_copy_direct(ret, str);
8098  downcase_single(ret);
8099  }
8100  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8101  ret = rb_str_new(0, RSTRING_LEN(str));
8102  rb_str_ascii_casemap(str, ret, &flags, enc);
8103  }
8104  else {
8105  ret = rb_str_casemap(str, &flags, enc);
8106  }
8107 
8108  return ret;
8109 }
8110 
8111 
8112 /*
8113  * call-seq:
8114  * capitalize!(*options) -> self or nil
8115  *
8116  * Upcases the first character in +self+;
8117  * downcases the remaining characters;
8118  * returns +self+ if any changes were made, +nil+ otherwise:
8119  *
8120  * s = 'hello World!' # => "hello World!"
8121  * s.capitalize! # => "Hello world!"
8122  * s # => "Hello world!"
8123  * s.capitalize! # => nil
8124  *
8125  * The casing may be affected by the given +options+;
8126  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8127  *
8128  * Related: String#capitalize.
8129  *
8130  */
8131 
8132 static VALUE
8133 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8134 {
8135  rb_encoding *enc;
8136  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8137 
8138  flags = check_case_options(argc, argv, flags);
8139  str_modify_keep_cr(str);
8140  enc = str_true_enc(str);
8141  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8142  if (flags&ONIGENC_CASE_ASCII_ONLY)
8143  rb_str_ascii_casemap(str, str, &flags, enc);
8144  else
8145  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8146 
8147  if (ONIGENC_CASE_MODIFIED&flags) return str;
8148  return Qnil;
8149 }
8150 
8151 
8152 /*
8153  * call-seq:
8154  * capitalize(*options) -> string
8155  *
8156  * Returns a string containing the characters in +self+;
8157  * the first character is upcased;
8158  * the remaining characters are downcased:
8159  *
8160  * s = 'hello World!' # => "hello World!"
8161  * s.capitalize # => "Hello world!"
8162  *
8163  * The casing may be affected by the given +options+;
8164  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8165  *
8166  * Related: String#capitalize!.
8167  *
8168  */
8169 
8170 static VALUE
8171 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8172 {
8173  rb_encoding *enc;
8174  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8175  VALUE ret;
8176 
8177  flags = check_case_options(argc, argv, flags);
8178  enc = str_true_enc(str);
8179  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8180  if (flags&ONIGENC_CASE_ASCII_ONLY) {
8181  ret = rb_str_new(0, RSTRING_LEN(str));
8182  rb_str_ascii_casemap(str, ret, &flags, enc);
8183  }
8184  else {
8185  ret = rb_str_casemap(str, &flags, enc);
8186  }
8187  return ret;
8188 }
8189 
8190 
8191 /*
8192  * call-seq:
8193  * swapcase!(*options) -> self or nil
8194  *
8195  * Upcases each lowercase character in +self+;
8196  * downcases each uppercase character;
8197  * returns +self+ if any changes were made, +nil+ otherwise:
8198  *
8199  * s = 'Hello World!' # => "Hello World!"
8200  * s.swapcase! # => "hELLO wORLD!"
8201  * s # => "hELLO wORLD!"
8202  * ''.swapcase! # => nil
8203  *
8204  * The casing may be affected by the given +options+;
8205  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8206  *
8207  * Related: String#swapcase.
8208  *
8209  */
8210 
8211 static VALUE
8212 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8213 {
8214  rb_encoding *enc;
8215  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8216 
8217  flags = check_case_options(argc, argv, flags);
8218  str_modify_keep_cr(str);
8219  enc = str_true_enc(str);
8220  if (flags&ONIGENC_CASE_ASCII_ONLY)
8221  rb_str_ascii_casemap(str, str, &flags, enc);
8222  else
8223  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8224 
8225  if (ONIGENC_CASE_MODIFIED&flags) return str;
8226  return Qnil;
8227 }
8228 
8229 
8230 /*
8231  * call-seq:
8232  * swapcase(*options) -> string
8233  *
8234  * Returns a string containing the characters in +self+, with cases reversed;
8235  * each uppercase character is downcased;
8236  * each lowercase character is upcased:
8237  *
8238  * s = 'Hello World!' # => "Hello World!"
8239  * s.swapcase # => "hELLO wORLD!"
8240  *
8241  * The casing may be affected by the given +options+;
8242  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8243  *
8244  * Related: String#swapcase!.
8245  *
8246  */
8247 
8248 static VALUE
8249 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8250 {
8251  rb_encoding *enc;
8252  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8253  VALUE ret;
8254 
8255  flags = check_case_options(argc, argv, flags);
8256  enc = str_true_enc(str);
8257  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8258  if (flags&ONIGENC_CASE_ASCII_ONLY) {
8259  ret = rb_str_new(0, RSTRING_LEN(str));
8260  rb_str_ascii_casemap(str, ret, &flags, enc);
8261  }
8262  else {
8263  ret = rb_str_casemap(str, &flags, enc);
8264  }
8265  return ret;
8266 }
8267 
typedef unsigned char *USTR;

/*
 * Cursor over a character-selector string, advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while trnext() is expanding a "a-z" style range */
    unsigned int now;   /* codepoint most recently produced */
    unsigned int max;   /* last codepoint of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* one past the last byte of the selector */
};
8275 
8276 static unsigned int
8277 trnext(struct tr *t, rb_encoding *enc)
8278 {
8279  int n;
8280 
8281  for (;;) {
8282  nextpart:
8283  if (!t->gen) {
8284  if (t->p == t->pend) return -1;
8285  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8286  t->p += n;
8287  }
8288  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8289  t->p += n;
8290  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8291  t->p += n;
8292  if (t->p < t->pend) {
8293  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8294  t->p += n;
8295  if (t->now > c) {
8296  if (t->now < 0x80 && c < 0x80) {
8298  "invalid range \"%c-%c\" in string transliteration",
8299  t->now, c);
8300  }
8301  else {
8302  rb_raise(rb_eArgError, "invalid range in string transliteration");
8303  }
8304  continue; /* not reached */
8305  }
8306  else if (t->now < c) {
8307  t->gen = 1;
8308  t->max = c;
8309  }
8310  }
8311  }
8312  return t->now;
8313  }
8314  else {
8315  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8316  if (t->now == t->max) {
8317  t->gen = 0;
8318  goto nextpart;
8319  }
8320  }
8321  if (t->now < t->max) {
8322  return t->now;
8323  }
8324  else {
8325  t->gen = 0;
8326  return t->max;
8327  }
8328  }
8329  }
8330 }
8331 
8332 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8333 
/*
 * Shared in-place implementation behind rb_str_tr_bang() (sflag == 0) and
 * rb_str_tr_s_bang() (sflag != 0).
 *
 * str   - receiver, modified in place
 * src   - selector of characters to translate (a leading '^' negates it)
 * repl  - replacement characters, matched positionally against src
 * sflag - when non-zero, consecutive characters that translate to the same
 *         replacement are collapsed into one
 *
 * Returns str when a change was recorded, Qnil otherwise.
 * Raises ArgumentError on an invalid byte sequence in str.
 */
8334 static VALUE
8335 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8336 {
 /* errc doubles as trnext()'s end marker and as "no mapping" in trans[]. */
8337  const unsigned int errc = -1;
 /* Mapping for codepoints < 256; larger codepoints go into `hash`. */
8338  unsigned int trans[256];
8339  rb_encoding *enc, *e1, *e2;
8340  struct tr trsrc, trrepl;
8341  int cflag = 0;
8342  unsigned int c, c0, last = 0;
8343  int modify = 0, i, l;
8344  unsigned char *s, *send;
8345  VALUE hash = 0;
8346  int singlebyte = single_byte_optimizable(str);
8347  int termlen;
8348  int cr;
8349 
 /* Demotes the tracked coderange from 7BIT to VALID once a non-ASCII value is seen. */
8350 #define CHECK_IF_ASCII(c) \
8351  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8352  (cr = ENC_CODERANGE_VALID) : 0)
8353 
8354  StringValue(src);
8355  StringValue(repl);
8356  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
 /* An empty replacement means "delete the selected characters". */
8357  if (RSTRING_LEN(repl) == 0) {
8358  return rb_str_delete_bang(1, &src, str);
8359  }
8360 
8361  cr = ENC_CODERANGE(str);
8362  e1 = rb_enc_check(str, src);
8363  e2 = rb_enc_check(str, repl);
8364  if (e1 == e2) {
8365  enc = e1;
8366  }
8367  else {
8368  enc = rb_enc_check(src, repl);
8369  }
8370  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
 /* A leading '^' negates the selector, but only when more characters follow it. */
8371  if (RSTRING_LEN(src) > 1 &&
8372  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8373  trsrc.p + l < trsrc.pend) {
8374  cflag = 1;
8375  trsrc.p += l;
8376  }
8377  trrepl.p = RSTRING_PTR(repl);
8378  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8379  trsrc.gen = trrepl.gen = 0;
8380  trsrc.now = trrepl.now = 0;
8381  trsrc.max = trrepl.max = 0;
8382 
8383  if (cflag) {
 /* Negated selector: listed characters are kept (errc); every other
  * slot is mapped to the last character of repl. */
8384  for (i=0; i<256; i++) {
8385  trans[i] = 1;
8386  }
8387  while ((c = trnext(&trsrc, enc)) != errc) {
8388  if (c < 256) {
8389  trans[c] = errc;
8390  }
8391  else {
8392  if (!hash) hash = rb_hash_new();
8393  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8394  }
8395  }
8396  while ((c = trnext(&trrepl, enc)) != errc)
8397  /* retrieve last replacer */;
8398  last = trrepl.now;
8399  for (i=0; i<256; i++) {
8400  if (trans[i] != errc) {
8401  trans[i] = last;
8402  }
8403  }
8404  }
8405  else {
 /* Positional mapping: the n-th selected character maps to the n-th
  * replacement; once repl runs out its last character is reused. */
8406  unsigned int r;
8407 
8408  for (i=0; i<256; i++) {
8409  trans[i] = errc;
8410  }
8411  while ((c = trnext(&trsrc, enc)) != errc) {
8412  r = trnext(&trrepl, enc);
8413  if (r == errc) r = trrepl.now;
8414  if (c < 256) {
8415  trans[c] = r;
 /* A multi-byte replacement rules out the in-place byte loop below. */
8416  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8417  }
8418  else {
8419  if (!hash) hash = rb_hash_new();
8420  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8421  }
8422  }
8423  }
8424 
 /* Restart coderange tracking at 7BIT; CHECK_IF_ASCII raises it back to
  * VALID when a non-ASCII value is encountered. */
8425  if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8426  cr = ENC_CODERANGE_7BIT;
8427  str_modify_keep_cr(str);
8428  s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8429  termlen = rb_enc_mbminlen(enc);
8430  if (sflag) {
 /* Squeezing variant: output is built in a separate buffer that then
  * replaces the string's storage. */
8431  int clen, tlen;
8432  long offset, max = RSTRING_LEN(str);
 /* `save` holds the previous replacement so repeats can be dropped. */
8433  unsigned int save = -1;
8434  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8435 
8436  while (s < send) {
8437  int may_modify = 0;
8438 
8439  int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8440  if (!MBCLEN_CHARFOUND_P(r)) {
 /* Release the scratch buffer before raising. */
8441  xfree(buf);
8442  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8443  }
8444  clen = MBCLEN_CHARFOUND_LEN(r);
8445  c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8446 
8447  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8448 
8449  s += clen;
8450  if (c < 256) {
8451  c = trans[c];
8452  }
8453  else if (hash) {
8454  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8455  if (NIL_P(tmp)) {
8456  if (cflag) c = last;
8457  else c = errc;
8458  }
8459  else if (cflag) c = errc;
8460  else c = NUM2INT(tmp);
8461  }
8462  else {
8463  c = errc;
8464  }
8465  if (c != (unsigned int)-1) {
 /* Same replacement as the previous character: squeeze it away. */
8466  if (save == c) {
8467  CHECK_IF_ASCII(c);
8468  continue;
8469  }
8470  save = c;
8471  tlen = rb_enc_codelen(c, enc);
8472  modify = 1;
8473  }
8474  else {
 /* Untranslated character: copy it through and reset the squeeze state. */
8475  save = -1;
8476  c = c0;
8477  if (enc != e1) may_modify = 1;
8478  }
 /* Grow the buffer when the next character would not fit. */
8479  if ((offset = t - buf) + tlen > max) {
8480  size_t MAYBE_UNUSED(old) = max + termlen;
8481  max = offset + tlen + (send - s);
8482  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8483  t = buf + offset;
8484  }
8485  rb_enc_mbcput(c, t, enc);
 /* NOTE(review): s was already advanced past the current character
  * above, whereas the non-squeeze branch below compares before
  * advancing - confirm this memcmp looks at the intended bytes. */
8486  if (may_modify && memcmp(s, t, tlen) != 0) {
8487  modify = 1;
8488  }
8489  CHECK_IF_ASCII(c);
8490  t += tlen;
8491  }
 /* Release the old heap storage (if any) and install the new buffer. */
8492  if (!STR_EMBED_P(str)) {
8493  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8494  }
8495  TERM_FILL((char *)t, termlen);
8496  RSTRING(str)->as.heap.ptr = (char *)buf;
8497  STR_SET_LEN(str, t - buf);
8498  STR_SET_NOEMBED(str);
8499  RSTRING(str)->as.heap.aux.capa = max;
8500  }
8501  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
 /* Byte-for-byte case: every character and replacement is one byte,
  * so the string is rewritten in place. */
8502  while (s < send) {
8503  c = (unsigned char)*s;
8504  if (trans[c] != errc) {
8505  if (!cflag) {
8506  c = trans[c];
8507  *s = c;
8508  modify = 1;
8509  }
8510  else {
8511  *s = last;
8512  modify = 1;
8513  }
8514  }
8515  CHECK_IF_ASCII(c);
8516  s++;
8517  }
8518  }
8519  else {
 /* General multi-byte case: build the result in a scratch buffer that
  * starts at 1.2x the input size and grows on demand. */
8520  int clen, tlen;
8521  long offset, max = (long)((send - s) * 1.2);
8522  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8523 
8524  while (s < send) {
8525  int may_modify = 0;
8526 
8527  int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8528  if (!MBCLEN_CHARFOUND_P(r)) {
 /* Release the scratch buffer before raising. */
8529  xfree(buf);
8530  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8531  }
8532  clen = MBCLEN_CHARFOUND_LEN(r);
8533  c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8534 
8535  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8536 
8537  if (c < 256) {
8538  c = trans[c];
8539  }
8540  else if (hash) {
8541  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8542  if (NIL_P(tmp)) {
8543  if (cflag) c = last;
8544  else c = errc;
8545  }
8546  else if (cflag) c = errc;
8547  else c = NUM2INT(tmp);
8548  }
8549  else {
 /* No hash: a codepoint >= 256 was not listed, so a negated
  * selector translates it and a plain selector leaves it alone. */
8550  c = cflag ? last : errc;
8551  }
8552  if (c != errc) {
8553  tlen = rb_enc_codelen(c, enc);
8554  modify = 1;
8555  }
8556  else {
8557  c = c0;
8558  if (enc != e1) may_modify = 1;
8559  }
 /* Grow the buffer when the next character would not fit. */
8560  if ((offset = t - buf) + tlen > max) {
8561  size_t MAYBE_UNUSED(old) = max + termlen;
8562  max = offset + tlen + (long)((send - s) * 1.2);
8563  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8564  t = buf + offset;
8565  }
8566  if (s != t) {
8567  rb_enc_mbcput(c, t, enc);
 /* Re-encoding into a different encoding may itself change the bytes. */
8568  if (may_modify && memcmp(s, t, tlen) != 0) {
8569  modify = 1;
8570  }
8571  }
8572  CHECK_IF_ASCII(c);
8573  s += clen;
8574  t += tlen;
8575  }
 /* Release the old heap storage (if any) and install the new buffer. */
8576  if (!STR_EMBED_P(str)) {
8577  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8578  }
8579  TERM_FILL((char *)t, termlen);
8580  RSTRING(str)->as.heap.ptr = (char *)buf;
8581  STR_SET_LEN(str, t - buf);
8582  STR_SET_NOEMBED(str);
8583  RSTRING(str)->as.heap.aux.capa = max;
8584  }
8585 
 /* Coderange and encoding are only updated when a change was recorded. */
8586  if (modify) {
8587  if (cr != ENC_CODERANGE_BROKEN)
8588  ENC_CODERANGE_SET(str, cr);
8589  rb_enc_associate(str, enc);
8590  return str;
8591  }
8592  return Qnil;
8593 }
8594 
8595 
8596 /*
8597  * call-seq:
8598  * tr!(selector, replacements) -> self or nil
8599  *
8600  * Like String#tr, but modifies +self+ in place.
8601  * Returns +self+ if any changes were made, +nil+ otherwise.
8602  *
8603  */
8604 
8605 static VALUE
8606 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8607 {
8608  return tr_trans(str, src, repl, 0);
8609 }
8610 
8611 
8612 /*
8613  * call-seq:
8614  * tr(selector, replacements) -> new_string
8615  *
8616  * Returns a copy of +self+ with each character specified by string +selector+
8617  * translated to the corresponding character in string +replacements+.
8618  * The correspondence is _positional_:
8619  *
8620  * - Each occurrence of the first character specified by +selector+
8621  * is translated to the first character in +replacements+.
8622  * - Each occurrence of the second character specified by +selector+
8623  * is translated to the second character in +replacements+.
8624  * - And so on.
8625  *
8626  * Example:
8627  *
8628  * 'hello'.tr('el', 'ip') #=> "hippo"
8629  *
8630  * If +replacements+ is shorter than +selector+,
8631  * it is implicitly padded with its own last character:
8632  *
8633  * 'hello'.tr('aeiou', '-') # => "h-ll-"
8634  * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8635  *
8636  * Arguments +selector+ and +replacements+ must be valid character selectors
8637  * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8638  * and may use any of its valid forms, including negation, ranges, and escaping:
8639  *
8640  * # Negation.
8641  * 'hello'.tr('^aeiou', '-') # => "-e--o"
8642  * # Ranges.
8643  * 'ibm'.tr('b-z', 'a-z') # => "hal"
8644  * # Escapes.
8645  * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8646  * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8647  * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8648  *
8649  */
8650 
8651 static VALUE
8652 rb_str_tr(VALUE str, VALUE src, VALUE repl)
8653 {
8654  str = str_duplicate(rb_cString, str);
8655  tr_trans(str, src, repl, 0);
8656  return str;
8657 }
8658 
8659 #define TR_TABLE_MAX (UCHAR_MAX+1)
8660 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8661 static void
8662 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8663  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8664 {
8665  const unsigned int errc = -1;
8666  char buf[TR_TABLE_MAX];
8667  struct tr tr;
8668  unsigned int c;
8669  VALUE table = 0, ptable = 0;
8670  int i, l, cflag = 0;
8671 
8672  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8673  tr.gen = tr.now = tr.max = 0;
8674 
8675  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8676  cflag = 1;
8677  tr.p += l;
8678  }
8679  if (first) {
8680  for (i=0; i<TR_TABLE_MAX; i++) {
8681  stable[i] = 1;
8682  }
8683  stable[TR_TABLE_MAX] = cflag;
8684  }
8685  else if (stable[TR_TABLE_MAX] && !cflag) {
8686  stable[TR_TABLE_MAX] = 0;
8687  }
8688  for (i=0; i<TR_TABLE_MAX; i++) {
8689  buf[i] = cflag;
8690  }
8691 
8692  while ((c = trnext(&tr, enc)) != errc) {
8693  if (c < TR_TABLE_MAX) {
8694  buf[(unsigned char)c] = !cflag;
8695  }
8696  else {
8697  VALUE key = UINT2NUM(c);
8698 
8699  if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8700  if (cflag) {
8701  ptable = *ctablep;
8702  table = ptable ? ptable : rb_hash_new();
8703  *ctablep = table;
8704  }
8705  else {
8706  table = rb_hash_new();
8707  ptable = *tablep;
8708  *tablep = table;
8709  }
8710  }
8711  if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8712  rb_hash_aset(table, key, Qtrue);
8713  }
8714  }
8715  }
8716  for (i=0; i<TR_TABLE_MAX; i++) {
8717  stable[i] = stable[i] && buf[i];
8718  }
8719  if (!table && !cflag) {
8720  *tablep = 0;
8721  }
8722 }
8723 
8724 
8725 static int
8726 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8727 {
8728  if (c < TR_TABLE_MAX) {
8729  return table[c] != 0;
8730  }
8731  else {
8732  VALUE v = UINT2NUM(c);
8733 
8734  if (del) {
8735  if (!NIL_P(rb_hash_lookup(del, v)) &&
8736  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8737  return TRUE;
8738  }
8739  }
8740  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8741  return FALSE;
8742  }
8743  return table[TR_TABLE_MAX] ? TRUE : FALSE;
8744  }
8745 }
8746 
8747 /*
8748  * call-seq:
8749  * delete!(*selectors) -> self or nil
8750  *
8751  * Like String#delete, but modifies +self+ in place.
8752  * Returns +self+ if any changes were made, +nil+ otherwise.
8753  *
8754  */
8755 
8756 static VALUE
8757 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8758 {
8759  char squeez[TR_TABLE_SIZE];
8760  rb_encoding *enc = 0;
8761  char *s, *send, *t;
8762  VALUE del = 0, nodel = 0;
8763  int modify = 0;
8764  int i, ascompat, cr;
8765 
8766  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8768  for (i=0; i<argc; i++) {
8769  VALUE s = argv[i];
8770 
8771  StringValue(s);
8772  enc = rb_enc_check(str, s);
8773  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8774  }
8775 
8776  str_modify_keep_cr(str);
8777  ascompat = rb_enc_asciicompat(enc);
8778  s = t = RSTRING_PTR(str);
8779  send = RSTRING_END(str);
8780  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8781  while (s < send) {
8782  unsigned int c;
8783  int clen;
8784 
8785  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8786  if (squeez[c]) {
8787  modify = 1;
8788  }
8789  else {
8790  if (t != s) *t = c;
8791  t++;
8792  }
8793  s++;
8794  }
8795  else {
8796  c = rb_enc_codepoint_len(s, send, &clen, enc);
8797 
8798  if (tr_find(c, squeez, del, nodel)) {
8799  modify = 1;
8800  }
8801  else {
8802  if (t != s) rb_enc_mbcput(c, t, enc);
8803  t += clen;
8804  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8805  }
8806  s += clen;
8807  }
8808  }
8809  TERM_FILL(t, TERM_LEN(str));
8810  STR_SET_LEN(str, t - RSTRING_PTR(str));
8811  ENC_CODERANGE_SET(str, cr);
8812 
8813  if (modify) return str;
8814  return Qnil;
8815 }
8816 
8817 
8818 /*
8819  * call-seq:
8820  * delete(*selectors) -> new_string
8821  *
8822  * Returns a copy of +self+ with characters specified by +selectors+ removed
8823  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8824  *
8825  * "hello".delete "l","lo" #=> "heo"
8826  * "hello".delete "lo" #=> "he"
8827  * "hello".delete "aeiou", "^e" #=> "hell"
8828  * "hello".delete "ej-m" #=> "ho"
8829  *
8830  */
8831 
8832 static VALUE
8833 rb_str_delete(int argc, VALUE *argv, VALUE str)
8834 {
8835  str = str_duplicate(rb_cString, str);
8836  rb_str_delete_bang(argc, argv, str);
8837  return str;
8838 }
8839 
8840 
8841 /*
8842  * call-seq:
8843  * squeeze!(*selectors) -> self or nil
8844  *
8845  * Like String#squeeze, but modifies +self+ in place.
8846  * Returns +self+ if any changes were made, +nil+ otherwise.
8847  */
8848 
8849 static VALUE
8850 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8851 {
8852  char squeez[TR_TABLE_SIZE];
8853  rb_encoding *enc = 0;
8854  VALUE del = 0, nodel = 0;
8855  unsigned char *s, *send, *t;
8856  int i, modify = 0;
8857  int ascompat, singlebyte = single_byte_optimizable(str);
8858  unsigned int save;
8859 
8860  if (argc == 0) {
8861  enc = STR_ENC_GET(str);
8862  }
8863  else {
8864  for (i=0; i<argc; i++) {
8865  VALUE s = argv[i];
8866 
8867  StringValue(s);
8868  enc = rb_enc_check(str, s);
8869  if (singlebyte && !single_byte_optimizable(s))
8870  singlebyte = 0;
8871  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8872  }
8873  }
8874 
8875  str_modify_keep_cr(str);
8876  s = t = (unsigned char *)RSTRING_PTR(str);
8877  if (!s || RSTRING_LEN(str) == 0) return Qnil;
8878  send = (unsigned char *)RSTRING_END(str);
8879  save = -1;
8880  ascompat = rb_enc_asciicompat(enc);
8881 
8882  if (singlebyte) {
8883  while (s < send) {
8884  unsigned int c = *s++;
8885  if (c != save || (argc > 0 && !squeez[c])) {
8886  *t++ = save = c;
8887  }
8888  }
8889  }
8890  else {
8891  while (s < send) {
8892  unsigned int c;
8893  int clen;
8894 
8895  if (ascompat && (c = *s) < 0x80) {
8896  if (c != save || (argc > 0 && !squeez[c])) {
8897  *t++ = save = c;
8898  }
8899  s++;
8900  }
8901  else {
8902  c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8903 
8904  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8905  if (t != s) rb_enc_mbcput(c, t, enc);
8906  save = c;
8907  t += clen;
8908  }
8909  s += clen;
8910  }
8911  }
8912  }
8913 
8914  TERM_FILL((char *)t, TERM_LEN(str));
8915  if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8916  STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8917  modify = 1;
8918  }
8919 
8920  if (modify) return str;
8921  return Qnil;
8922 }
8923 
8924 
8925 /*
8926  * call-seq:
8927  * squeeze(*selectors) -> new_string
8928  *
8929  * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8930  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8931  *
8932  * "Squeezed" means that each multiple-character run of a selected character
8933  * is squeezed down to a single character;
8934  * with no arguments given, squeezes all characters:
8935  *
8936  * "yellow moon".squeeze #=> "yelow mon"
8937  * " now is the".squeeze(" ") #=> " now is the"
8938  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8939  *
8940  */
8941 
8942 static VALUE
8943 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8944 {
8945  str = str_duplicate(rb_cString, str);
8946  rb_str_squeeze_bang(argc, argv, str);
8947  return str;
8948 }
8949 
8950 
8951 /*
8952  * call-seq:
8953  * tr_s!(selector, replacements) -> self or nil
8954  *
8955  * Like String#tr_s, but modifies +self+ in place.
8956  * Returns +self+ if any changes were made, +nil+ otherwise.
8957  *
8958  * Related: String#squeeze!.
8959  */
8960 
8961 static VALUE
8962 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8963 {
8964  return tr_trans(str, src, repl, 1);
8965 }
8966 
8967 
8968 /*
8969  * call-seq:
8970  * tr_s(selector, replacements) -> string
8971  *
8972  * Like String#tr, but also squeezes the modified portions of the translated string;
8973  * returns a new string (translated and squeezed).
8974  *
8975  * 'hello'.tr_s('l', 'r') #=> "hero"
8976  * 'hello'.tr_s('el', '-') #=> "h-o"
8977  * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8978  *
8979  * Related: String#squeeze.
8980  *
8981  */
8982 
8983 static VALUE
8984 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8985 {
8986  str = str_duplicate(rb_cString, str);
8987  tr_trans(str, src, repl, 1);
8988  return str;
8989 }
8990 
8991 
8992 /*
8993  * call-seq:
8994  * count(*selectors) -> integer
8995  *
8996  * Returns the total number of characters in +self+
8997  * that are specified by the given +selectors+
8998  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8999  *
9000  * a = "hello world"
9001  * a.count "lo" #=> 5
9002  * a.count "lo", "o" #=> 2
9003  * a.count "hello", "^l" #=> 4
9004  * a.count "ej-m" #=> 4
9005  *
9006  * "hello^world".count "\\^aeiou" #=> 4
9007  * "hello-world".count "a\\-eo" #=> 4
9008  *
9009  * c = "hello world\\r\\n"
9010  * c.count "\\" #=> 2
9011  * c.count "\\A" #=> 0
9012  * c.count "X-\\w" #=> 3
9013  */
9014 
9015 static VALUE
9016 rb_str_count(int argc, VALUE *argv, VALUE str)
9017 {
9018  char table[TR_TABLE_SIZE];
9019  rb_encoding *enc = 0;
9020  VALUE del = 0, nodel = 0, tstr;
9021  char *s, *send;
9022  int i;
9023  int ascompat;
9024  size_t n = 0;
9025 
9027 
9028  tstr = argv[0];
9029  StringValue(tstr);
9030  enc = rb_enc_check(str, tstr);
9031  if (argc == 1) {
9032  const char *ptstr;
9033  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9034  (ptstr = RSTRING_PTR(tstr),
9035  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9036  !is_broken_string(str)) {
9037  int clen;
9038  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9039 
9040  s = RSTRING_PTR(str);
9041  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9042  send = RSTRING_END(str);
9043  while (s < send) {
9044  if (*(unsigned char*)s++ == c) n++;
9045  }
9046  return SIZET2NUM(n);
9047  }
9048  }
9049 
9050  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9051  for (i=1; i<argc; i++) {
9052  tstr = argv[i];
9053  StringValue(tstr);
9054  enc = rb_enc_check(str, tstr);
9055  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9056  }
9057 
9058  s = RSTRING_PTR(str);
9059  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9060  send = RSTRING_END(str);
9061  ascompat = rb_enc_asciicompat(enc);
9062  while (s < send) {
9063  unsigned int c;
9064 
9065  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9066  if (table[c]) {
9067  n++;
9068  }
9069  s++;
9070  }
9071  else {
9072  int clen;
9073  c = rb_enc_codepoint_len(s, send, &clen, enc);
9074  if (tr_find(c, table, del, nodel)) {
9075  n++;
9076  }
9077  s += clen;
9078  }
9079  }
9080 
9081  return SIZET2NUM(n);
9082 }
9083 
9084 static VALUE
9085 rb_fs_check(VALUE val)
9086 {
9087  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9088  val = rb_check_string_type(val);
9089  if (NIL_P(val)) return 0;
9090  }
9091  return val;
9092 }
9093 
/*
 * Lookup table indexed by byte value: 1 for the six bytes 0x09-0x0d and 0x20,
 * 0 for every other byte (unnamed entries are zero-initialised).
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9114 
9115 static long
9116 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9117 {
9118  if (empty_count >= 0 && len == 0) {
9119  return empty_count + 1;
9120  }
9121  if (empty_count > 0) {
9122  /* make different substrings */
9123  if (result) {
9124  do {
9125  rb_ary_push(result, str_new_empty_String(str));
9126  } while (--empty_count > 0);
9127  }
9128  else {
9129  do {
9130  rb_yield(str_new_empty_String(str));
9131  } while (--empty_count > 0);
9132  }
9133  }
9134  str = rb_str_subseq(str, beg, len);
9135  if (result) {
9136  rb_ary_push(result, str);
9137  }
9138  else {
9139  rb_yield(str);
9140  }
9141  return empty_count;
9142 }
9143 
/* Splitting strategies selected by rb_str_split_m(). */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal separator string */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* split into individual characters */
} split_type_t;
9147 
9148 static split_type_t
9149 literal_split_pattern(VALUE spat, split_type_t default_type)
9150 {
9151  rb_encoding *enc = STR_ENC_GET(spat);
9152  const char *ptr;
9153  long len;
9154  RSTRING_GETMEM(spat, ptr, len);
9155  if (len == 0) {
9156  /* Special case - split into chars */
9157  return SPLIT_TYPE_CHARS;
9158  }
9159  else if (rb_enc_asciicompat(enc)) {
9160  if (len == 1 && ptr[0] == ' ') {
9161  return SPLIT_TYPE_AWK;
9162  }
9163  }
9164  else {
9165  int l;
9166  if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9167  return SPLIT_TYPE_AWK;
9168  }
9169  }
9170  return default_type;
9171 }
9172 
9173 /*
9174  * call-seq:
9175  * split(field_sep = $;, limit = 0) -> array
9176  * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9177  *
9178  * :include: doc/string/split.rdoc
9179  *
9180  */
9181 
9182 static VALUE
9183 rb_str_split_m(int argc, VALUE *argv, VALUE str)
9184 {
9185  rb_encoding *enc;
9186  VALUE spat;
9187  VALUE limit;
9188  split_type_t split_type;
 /* i counts fields emitted against the limit; empty_count is the
  * held-back-empties state threaded through split_string(). */
9189  long beg, end, i = 0, empty_count = -1;
9190  int lim = 0;
9191  VALUE result, tmp;
9192 
 /* Qfalse marks "yield to the block"; result is later tested with plain C
  * truthiness (presumably relying on Qfalse being 0 and Qnil non-zero). */
9193  result = rb_block_given_p() ? Qfalse : Qnil;
9194  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9195  lim = NUM2INT(limit);
9196  if (lim <= 0) limit = Qnil;
9197  else if (lim == 1) {
 /* Limit 1: the whole string is the only field. */
9198  if (RSTRING_LEN(str) == 0)
9199  return result ? rb_ary_new2(0) : str;
9200  tmp = str_duplicate(rb_cString, str);
9201  if (!result) {
9202  rb_yield(tmp);
9203  return str;
9204  }
9205  return rb_ary_new3(1, tmp);
9206  }
9207  i = 1;
9208  }
 /* Without a limit (or with limit 0), empty fields are held back by
  * split_string() and only emitted before a later non-empty field. */
9209  if (NIL_P(limit) && !lim) empty_count = 0;
9210 
9211  enc = STR_ENC_GET(str);
9212  split_type = SPLIT_TYPE_REGEXP;
 /* Separator resolution: explicit argument, else rb_fs, else awk mode. */
9213  if (!NIL_P(spat)) {
9214  spat = get_pat_quoted(spat, 0);
9215  }
9216  else if (NIL_P(spat = rb_fs)) {
9217  split_type = SPLIT_TYPE_AWK;
9218  }
9219  else if (!(spat = rb_fs_check(spat))) {
9220  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9221  }
9222  else {
9223  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9224  }
9225  if (split_type != SPLIT_TYPE_AWK) {
9226  switch (BUILTIN_TYPE(spat)) {
9227  case T_REGEXP:
9228  rb_reg_options(spat); /* check if uninitialized */
9229  tmp = RREGEXP_SRC(spat);
9230  split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
 /* A regexp whose source is a single space is split as the literal
  * string " ", not in awk mode. */
9231  if (split_type == SPLIT_TYPE_AWK) {
9232  spat = tmp;
9233  split_type = SPLIT_TYPE_STRING;
9234  }
9235  break;
9236 
9237  case T_STRING:
9238  mustnot_broken(spat);
9239  split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9240  break;
9241 
 /* NOTE(review): no statement follows this label in the listing (the
  * embedded numbering skips from 9242 to 9244) - a line appears to be
  * missing here; confirm against the upstream file. */
9242  default:
9244  }
9245  }
9246 
 /* Emits str[beg, len] via split_string() and keeps empty_count up to date. */
9247 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9248 
9249  beg = 0;
9250  char *ptr = RSTRING_PTR(str);
9251  char *eptr = RSTRING_END(str);
9252  if (split_type == SPLIT_TYPE_AWK) {
 /* awk mode: fields are separated by whitespace runs; skip is non-zero
  * while inside such a run. */
9253  char *bptr = ptr;
9254  int skip = 1;
9255  unsigned int c;
9256 
9257  if (result) result = rb_ary_new();
9258  end = beg;
9259  if (is_ascii_string(str)) {
 /* ASCII-only string: scan byte by byte with the lookup table. */
9260  while (ptr < eptr) {
9261  c = (unsigned char)*ptr++;
9262  if (skip) {
9263  if (ascii_isspace(c)) {
9264  beg = ptr - bptr;
9265  }
9266  else {
9267  end = ptr - bptr;
9268  skip = 0;
 /* Limit reached: the rest of the string becomes the last field. */
9269  if (!NIL_P(limit) && lim <= i) break;
9270  }
9271  }
9272  else if (ascii_isspace(c)) {
9273  SPLIT_STR(beg, end-beg);
9274  skip = 1;
9275  beg = ptr - bptr;
9276  if (!NIL_P(limit)) ++i;
9277  }
9278  else {
9279  end = ptr - bptr;
9280  }
9281  }
9282  }
9283  else {
 /* Same scan, decoding one character at a time. */
9284  while (ptr < eptr) {
9285  int n;
9286 
9287  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9288  ptr += n;
9289  if (skip) {
9290  if (rb_isspace(c)) {
9291  beg = ptr - bptr;
9292  }
9293  else {
9294  end = ptr - bptr;
9295  skip = 0;
9296  if (!NIL_P(limit) && lim <= i) break;
9297  }
9298  }
9299  else if (rb_isspace(c)) {
9300  SPLIT_STR(beg, end-beg);
9301  skip = 1;
9302  beg = ptr - bptr;
9303  if (!NIL_P(limit)) ++i;
9304  }
9305  else {
9306  end = ptr - bptr;
9307  }
9308  }
9309  }
9310  }
9311  else if (split_type == SPLIT_TYPE_STRING) {
 /* Literal separator: repeatedly search for spat in the remainder. */
9312  char *str_start = ptr;
9313  char *substr_start = ptr;
9314  char *sptr = RSTRING_PTR(spat);
9315  long slen = RSTRING_LEN(spat);
9316 
9317  if (result) result = rb_ary_new();
9318  mustnot_broken(str);
9319  enc = rb_enc_check(str, spat);
9320  while (ptr < eptr &&
9321  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9322  /* Check we are at the start of a char */
9323  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9324  if (t != ptr + end) {
 /* Hit inside a multi-byte character: resume at the next character head. */
9325  ptr = t;
9326  continue;
9327  }
9328  SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9329  ptr += end + slen;
9330  substr_start = ptr;
9331  if (!NIL_P(limit) && lim <= ++i) break;
9332  }
9333  beg = ptr - str_start;
9334  }
9335  else if (split_type == SPLIT_TYPE_CHARS) {
 /* Empty separator: every character becomes its own field. */
9336  char *str_start = ptr;
9337  int n;
9338 
9339  if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9340  mustnot_broken(str);
9341  enc = rb_enc_get(str);
9342  while (ptr < eptr &&
9343  (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9344  SPLIT_STR(ptr - str_start, n);
9345  ptr += n;
9346  if (!NIL_P(limit) && lim <= ++i) break;
9347  }
9348  beg = ptr - str_start;
9349  }
9350  else {
 /* Regexp separator. */
9351  if (result) result = rb_ary_new();
9352  long len = RSTRING_LEN(str);
9353  long start = beg;
9354  long idx;
 /* last_null is set after a zero-width match at `start`, so that the
  * following iteration emits one character instead of looping forever. */
9355  int last_null = 0;
9356  struct re_registers *regs;
9357  VALUE match = 0;
9358 
 /* The step expression un-busies the match and restores it as the backref. */
9359  for (; rb_reg_search(spat, str, start, 0) >= 0;
9360  (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9361  match = rb_backref_get();
 /* In block mode the match is marked busy while fields are yielded. */
9362  if (!result) rb_match_busy(match);
9363  regs = RMATCH_REGS(match);
9364  end = BEG(0);
9365  if (start == end && BEG(0) == END(0)) {
 /* Zero-width match at the current position. */
9366  if (!ptr) {
9367  SPLIT_STR(0, 0);
9368  break;
9369  }
9370  else if (last_null == 1) {
9371  SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9372  beg = start;
9373  }
9374  else {
 /* First zero-width hit: advance the search start by one
  * character and retry. */
9375  if (start == len)
9376  start++;
9377  else
9378  start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9379  last_null = 1;
9380  continue;
9381  }
9382  }
9383  else {
9384  SPLIT_STR(beg, end-beg);
9385  beg = start = END(0);
9386  }
9387  last_null = 0;
9388 
 /* Capture groups that participated in the match are emitted as fields too. */
9389  for (idx=1; idx < regs->num_regs; idx++) {
9390  if (BEG(idx) == -1) continue;
9391  SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9392  }
9393  if (!NIL_P(limit) && lim <= ++i) break;
9394  }
9395  if (match) rb_match_unbusy(match);
9396  }
 /* Emit the remainder after the last separator. */
9397  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9398  SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9399  }
9400 
 /* The array when collecting, the receiver when a block was given. */
9401  return result ? result : str;
9402 }
9403 
9404 VALUE
9405 rb_str_split(VALUE str, const char *sep0)
9406 {
9407  VALUE sep;
9408 
9409  StringValue(str);
9410  sep = rb_str_new_cstr(sep0);
9411  return rb_str_split_m(1, &sep, str);
9412 }
9413 
/*
 * Evaluates to a new array with capacity +size+ when no block is given and
 * to 0 when a block is given.  The +m+ argument does not appear in the
 * expansion.
 */
9414 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9415 
9416 static inline int
9417 enumerator_element(VALUE ary, VALUE e)
9418 {
9419  if (ary) {
9420  rb_ary_push(ary, e);
9421  return 0;
9422  }
9423  else {
9424  rb_yield(e);
9425  return 1;
9426  }
9427 }
9428 
9429 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9430 
9431 static const char *
9432 chomp_newline(const char *p, const char *e, rb_encoding *enc)
9433 {
9434  const char *prev = rb_enc_prev_char(p, e, e, enc);
9435  if (rb_enc_is_newline(prev, e, enc)) {
9436  e = prev;
9437  prev = rb_enc_prev_char(p, e, e, enc);
9438  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9439  e = prev;
9440  }
9441  return e;
9442 }
9443 
9444 static VALUE
9445 get_rs(void)
9446 {
9447  VALUE rs = rb_rs;
9448  if (!NIL_P(rs) &&
9449  (!RB_TYPE_P(rs, T_STRING) ||
9450  RSTRING_LEN(rs) != 1 ||
9451  RSTRING_PTR(rs)[0] != '\n')) {
9452  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9453  }
9454  return rs;
9455 }
9456 
9457 #define rb_rs get_rs()
9458 
/*
 * Shared worker for rb_str_each_line() and rb_str_lines().
 *
 * Cuts +str+ into lines and hands each one to ENUM_ELEM, which pushes onto
 * +ary+ when it is non-zero and yields to the block otherwise.
 *
 * argc/argv: at most one positional separator plus keyword options (scan
 *            spec "01:"); the only keyword read is +chomp+.  When no
 *            positional argument is given the separator is rb_rs.
 * str:       the receiver.
 * ary:       result array, or 0 to yield each line.
 *
 * Returns +ary+ when it is non-zero, otherwise the original +str+.
 */
9459 static VALUE
9460 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9461 {
9462  rb_encoding *enc;
9463  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9464  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9465  long pos, len, rslen;
9466  int rsnewline = 0;
9467 
9468  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9469  rs = rb_rs;
9470  if (!NIL_P(opts)) {
9471  static ID keywords[1];
9472  if (!keywords[0]) {
9473  keywords[0] = rb_intern_const("chomp");
9474  }
9475  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
/* from here on chomp is a plain C truth value, not a Ruby object */
9476  chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9477  }
9478 
/* nil separator: the whole receiver is delivered as the single element */
9479  if (NIL_P(rs)) {
9480  if (!ENUM_ELEM(ary, str)) {
9481  return ary;
9482  }
9483  else {
9484  return orig;
9485  }
9486  }
9487 
9488  if (!RSTRING_LEN(str)) goto end;
/* iterate over the result of rb_str_new_frozen(); orig is kept for the return value */
9489  str = rb_str_new_frozen(str);
9490  ptr = subptr = RSTRING_PTR(str);
9491  pend = RSTRING_END(str);
9492  len = RSTRING_LEN(str);
9493  StringValue(rs);
9494  rslen = RSTRING_LEN(rs);
9495 
/* the default separator skips the compatibility check against str */
9496  if (rs == rb_default_rs)
9497  enc = rb_enc_get(str);
9498  else
9499  enc = rb_enc_check(str, rs);
9500 
9501  if (rslen == 0) {
9502  /* paragraph mode */
9503  int n;
/* eol: end of the first newline that followed the current paragraph's text */
9504  const char *eol = NULL;
9505  subend = subptr;
9506  while (subend < pend) {
9507  long chomp_rslen = 0;
9508  do {
/* n is the width of a leading "\r", or 0 when there is none */
9509  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9510  n = 0;
9511  rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9512  if (rb_enc_is_newline(subend + n, pend, enc)) {
/* a newline starting exactly at eol is a second consecutive one: the paragraph ends; rslen stays non-zero only on this exit */
9513  if (eol == subend) break;
9514  subend += rslen;
9515  if (subptr) {
9516  eol = subend;
9517  chomp_rslen = -rslen;
9518  }
9519  }
9520  else {
/* subptr is NULL while skipping newlines between paragraphs */
9521  if (!subptr) subptr = subend;
9522  subend += rslen;
9523  }
9524  rslen = 0;
9525  } while (subend < pend);
9526  if (!subptr) break;
9527  if (rslen == 0) chomp_rslen = 0;
9528  line = rb_str_subseq(str, subptr - ptr,
9529  subend - subptr + (chomp ? chomp_rslen : rslen));
/* ENUM_ELEM returns 1 only after yielding; NOTE(review): str_mod_check presumably detects modification during the block - confirm */
9530  if (ENUM_ELEM(ary, line)) {
9531  str_mod_check(str, ptr, len);
9532  }
9533  subptr = eol = NULL;
9534  }
9535  goto end;
9536  }
9537  else {
9538  rsptr = RSTRING_PTR(rs);
/* rsnewline: the separator is exactly one minimum-width character that enc reports as a newline */
9539  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9540  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9541  rsnewline = 1;
9542  }
9543  }
9544 
/* default separator with a non-ASCII-compatible receiver: re-encode the separator into enc before searching */
9545  if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9546  rs = rb_str_new(rsptr, rslen);
9547  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9548  rsptr = RSTRING_PTR(rs);
9549  rslen = RSTRING_LEN(rs);
9550  }
9551 
9552  while (subptr < pend) {
9553  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9554  if (pos < 0) break;
9555  hit = subptr + pos;
/* a byte match that is not at a character head is skipped; searching resumes at the adjusted position */
9556  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9557  if (hit != adjusted) {
9558  subptr = adjusted;
9559  continue;
9560  }
/* hit now points just past the separator; subend is trimmed back when chomping */
9561  subend = hit += rslen;
9562  if (chomp) {
9563  if (rsnewline) {
9564  subend = chomp_newline(subptr, subend, enc);
9565  }
9566  else {
9567  subend -= rslen;
9568  }
9569  }
9570  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9571  if (ENUM_ELEM(ary, line)) {
9572  str_mod_check(str, ptr, len);
9573  }
9574  subptr = hit;
9575  }
9576 
/* whatever follows the last separator is delivered as a final line */
9577  if (subptr != pend) {
9578  if (chomp) {
9579  if (rsnewline) {
9580  pend = chomp_newline(subptr, pend, enc);
9581  }
9582  else if (pend - subptr >= rslen &&
9583  memcmp(pend - rslen, rsptr, rslen) == 0) {
9584  pend -= rslen;
9585  }
9586  }
9587  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9588  ENUM_ELEM(ary, line);
9589  RB_GC_GUARD(str);
9590  }
9591 
9592  end:
9593  if (ary)
9594  return ary;
9595  else
9596  return orig;
9597 }
9598 
9599 /*
9600  * call-seq:
9601  * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9602  * each_line(line_sep = $/, chomp: false) -> enumerator
9603  *
9604  * :include: doc/string/each_line.rdoc
9605  *
9606  */
9607 
9608 static VALUE
9609 rb_str_each_line(int argc, VALUE *argv, VALUE str)
9610 {
9611  RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9612  return rb_str_enumerate_lines(argc, argv, str, 0);
9613 }
9614 
9615 /*
9616  * call-seq:
9617  * lines(line_sep = $/, chomp: false) -> array_of_strings
9618  *
9619  * Forms substrings ("lines") of +self+ according to the given arguments
9620  * (see String#each_line for details); returns the lines in an array.
9621  *
9622  */
9623 
9624 static VALUE
9625 rb_str_lines(int argc, VALUE *argv, VALUE str)
9626 {
9627  VALUE ary = WANTARRAY("lines", 0);
9628  return rb_str_enumerate_lines(argc, argv, str, ary);
9629 }
9630 
9631 static VALUE
9632 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9633 {
9634  return LONG2FIX(RSTRING_LEN(str));
9635 }
9636 
9637 static VALUE
9638 rb_str_enumerate_bytes(VALUE str, VALUE ary)
9639 {
9640  long i;
9641 
9642  for (i=0; i<RSTRING_LEN(str); i++) {
9643  ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9644  }
9645  if (ary)
9646  return ary;
9647  else
9648  return str;
9649 }
9650 
9651 /*
9652  * call-seq:
9653  * each_byte {|byte| ... } -> self
9654  * each_byte -> enumerator
9655  *
9656  * :include: doc/string/each_byte.rdoc
9657  *
9658  */
9659 
9660 static VALUE
9661 rb_str_each_byte(VALUE str)
9662 {
9663  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9664  return rb_str_enumerate_bytes(str, 0);
9665 }
9666 
9667 /*
9668  * call-seq:
9669  * bytes -> array_of_bytes
9670  *
9671  * :include: doc/string/bytes.rdoc
9672  *
9673  */
9674 
9675 static VALUE
9676 rb_str_bytes(VALUE str)
9677 {
9678  VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9679  return rb_str_enumerate_bytes(str, ary);
9680 }
9681 
9682 static VALUE
9683 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9684 {
9685  return rb_str_length(str);
9686 }
9687 
9688 static VALUE
9689 rb_str_enumerate_chars(VALUE str, VALUE ary)
9690 {
9691  VALUE orig = str;
9692  long i, len, n;
9693  const char *ptr;
9694  rb_encoding *enc;
9695 
9696  str = rb_str_new_frozen(str);
9697  ptr = RSTRING_PTR(str);
9698  len = RSTRING_LEN(str);
9699  enc = rb_enc_get(str);
9700 
9702  for (i = 0; i < len; i += n) {
9703  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9704  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9705  }
9706  }
9707  else {
9708  for (i = 0; i < len; i += n) {
9709  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9710  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9711  }
9712  }
9713  RB_GC_GUARD(str);
9714  if (ary)
9715  return ary;
9716  else
9717  return orig;
9718 }
9719 
9720 /*
9721  * call-seq:
9722  * each_char {|c| ... } -> self
9723  * each_char -> enumerator
9724  *
9725  * :include: doc/string/each_char.rdoc
9726  *
9727  */
9728 
9729 static VALUE
9730 rb_str_each_char(VALUE str)
9731 {
9732  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9733  return rb_str_enumerate_chars(str, 0);
9734 }
9735 
9736 /*
9737  * call-seq:
9738  * chars -> array_of_characters
9739  *
9740  * :include: doc/string/chars.rdoc
9741  *
9742  */
9743 
9744 static VALUE
9745 rb_str_chars(VALUE str)
9746 {
9747  VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9748  return rb_str_enumerate_chars(str, ary);
9749 }
9750 
9751 static VALUE
9752 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9753 {
9754  VALUE orig = str;
9755  int n;
9756  unsigned int c;
9757  const char *ptr, *end;
9758  rb_encoding *enc;
9759 
9760  if (single_byte_optimizable(str))
9761  return rb_str_enumerate_bytes(str, ary);
9762 
9763  str = rb_str_new_frozen(str);
9764  ptr = RSTRING_PTR(str);
9765  end = RSTRING_END(str);
9766  enc = STR_ENC_GET(str);
9767 
9768  while (ptr < end) {
9769  c = rb_enc_codepoint_len(ptr, end, &n, enc);
9770  ENUM_ELEM(ary, UINT2NUM(c));
9771  ptr += n;
9772  }
9773  RB_GC_GUARD(str);
9774  if (ary)
9775  return ary;
9776  else
9777  return orig;
9778 }
9779 
9780 /*
9781  * call-seq:
9782  * each_codepoint {|integer| ... } -> self
9783  * each_codepoint -> enumerator
9784  *
9785  * :include: doc/string/each_codepoint.rdoc
9786  *
9787  */
9788 
9789 static VALUE
9790 rb_str_each_codepoint(VALUE str)
9791 {
9792  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9793  return rb_str_enumerate_codepoints(str, 0);
9794 }
9795 
9796 /*
9797  * call-seq:
9798  * codepoints -> array_of_integers
9799  *
9800  * :include: doc/string/codepoints.rdoc
9801  *
9802  */
9803 
9804 static VALUE
9805 rb_str_codepoints(VALUE str)
9806 {
9807  VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9808  return rb_str_enumerate_codepoints(str, ary);
9809 }
9810 
/*
 * Compiles and returns a new regex for the pattern \X in encoding +enc+.
 *
 * The pattern source is the two ASCII bytes '\' 'X' by default.  For the
 * UTF-16BE/LE and UTF-32BE/LE encoding indexes the same two characters are
 * spelled out in that encoding's byte layout instead.
 *
 * Calls rb_fatal() with the decoded error message when onig_new() reports
 * an error.  Each call builds a fresh regex; callers in this file release
 * it with onig_free() unless it is the cached UTF-8 one.
 */
9811 static regex_t *
9812 get_reg_grapheme_cluster(rb_encoding *enc)
9813 {
9814  int encidx = rb_enc_to_index(enc);
9815 
9816  const OnigUChar source_ascii[] = "\\X";
9817  const OnigUChar *source = source_ascii;
/* minus one: do not count the terminating NUL of the literal */
9818  size_t source_len = sizeof(source_ascii) - 1;
9819 
9820  switch (encidx) {
/* CHARS_* expand one character into its 2 or 4 bytes in the named byte order */
9821 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9822 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9823 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9824 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
/* CASE_UTF(e) emits one switch case that points source/source_len at a static table for that encoding */
9825 #define CASE_UTF(e) \
9826  case ENCINDEX_UTF_##e: { \
9827  static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9828  source = source_UTF_##e; \
9829  source_len = sizeof(source_UTF_##e); \
9830  break; \
9831  }
9832  CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9833 #undef CASE_UTF
9834 #undef CHARS_16BE
9835 #undef CHARS_16LE
9836 #undef CHARS_32BE
9837 #undef CHARS_32LE
9838  }
9839 
9840  regex_t *reg_grapheme_cluster;
9841  OnigErrorInfo einfo;
9842  int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9843  ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
/* any non-zero result is treated as a compile failure */
9844  if (r) {
9845  UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9846  onig_error_code_to_str(message, r, &einfo);
9847  rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9848  }
9849 
9850  return reg_grapheme_cluster;
9851 }
9852 
9853 static regex_t *
9854 get_cached_reg_grapheme_cluster(rb_encoding *enc)
9855 {
9856  int encidx = rb_enc_to_index(enc);
9857  static regex_t *reg_grapheme_cluster_utf8 = NULL;
9858 
9859  if (encidx == rb_utf8_encindex()) {
9860  if (!reg_grapheme_cluster_utf8) {
9861  reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9862  }
9863 
9864  return reg_grapheme_cluster_utf8;
9865  }
9866 
9867  return NULL;
9868 }
9869 
9870 static VALUE
9871 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9872 {
9873  size_t grapheme_cluster_count = 0;
9874  rb_encoding *enc = get_encoding(str);
9875  const char *ptr, *end;
9876 
9877  if (!rb_enc_unicode_p(enc)) {
9878  return rb_str_length(str);
9879  }
9880 
9881  bool cached_reg_grapheme_cluster = true;
9882  regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9883  if (!reg_grapheme_cluster) {
9884  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9885  cached_reg_grapheme_cluster = false;
9886  }
9887 
9888  ptr = RSTRING_PTR(str);
9889  end = RSTRING_END(str);
9890 
9891  while (ptr < end) {
9892  OnigPosition len = onig_match(reg_grapheme_cluster,
9893  (const OnigUChar *)ptr, (const OnigUChar *)end,
9894  (const OnigUChar *)ptr, NULL, 0);
9895  if (len <= 0) break;
9896  grapheme_cluster_count++;
9897  ptr += len;
9898  }
9899 
9900  if (!cached_reg_grapheme_cluster) {
9901  onig_free(reg_grapheme_cluster);
9902  }
9903 
9904  return SIZET2NUM(grapheme_cluster_count);
9905 }
9906 
9907 static VALUE
9908 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9909 {
9910  VALUE orig = str;
9911  rb_encoding *enc = get_encoding(str);
9912  const char *ptr0, *ptr, *end;
9913 
9914  if (!rb_enc_unicode_p(enc)) {
9915  return rb_str_enumerate_chars(str, ary);
9916  }
9917 
9918  if (!ary) str = rb_str_new_frozen(str);
9919 
9920  bool cached_reg_grapheme_cluster = true;
9921  regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9922  if (!reg_grapheme_cluster) {
9923  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9924  cached_reg_grapheme_cluster = false;
9925  }
9926 
9927  ptr0 = ptr = RSTRING_PTR(str);
9928  end = RSTRING_END(str);
9929 
9930  while (ptr < end) {
9931  OnigPosition len = onig_match(reg_grapheme_cluster,
9932  (const OnigUChar *)ptr, (const OnigUChar *)end,
9933  (const OnigUChar *)ptr, NULL, 0);
9934  if (len <= 0) break;
9935  ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9936  ptr += len;
9937  }
9938 
9939  if (!cached_reg_grapheme_cluster) {
9940  onig_free(reg_grapheme_cluster);
9941  }
9942 
9943  RB_GC_GUARD(str);
9944  if (ary)
9945  return ary;
9946  else
9947  return orig;
9948 }
9949 
9950 /*
9951  * call-seq:
9952  * each_grapheme_cluster {|gc| ... } -> self
9953  * each_grapheme_cluster -> enumerator
9954  *
9955  * :include: doc/string/each_grapheme_cluster.rdoc
9956  *
9957  */
9958 
9959 static VALUE
9960 rb_str_each_grapheme_cluster(VALUE str)
9961 {
9962  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9963  return rb_str_enumerate_grapheme_clusters(str, 0);
9964 }
9965 
9966 /*
9967  * call-seq:
9968  * grapheme_clusters -> array_of_grapheme_clusters
9969  *
9970  * :include: doc/string/grapheme_clusters.rdoc
9971  *
9972  */
9973 
9974 static VALUE
9975 rb_str_grapheme_clusters(VALUE str)
9976 {
9977  VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9978  return rb_str_enumerate_grapheme_clusters(str, ary);
9979 }
9980 
9981 static long
9982 chopped_length(VALUE str)
9983 {
9984  rb_encoding *enc = STR_ENC_GET(str);
9985  const char *p, *p2, *beg, *end;
9986 
9987  beg = RSTRING_PTR(str);
9988  end = beg + RSTRING_LEN(str);
9989  if (beg >= end) return 0;
9990  p = rb_enc_prev_char(beg, end, end, enc);
9991  if (!p) return 0;
9992  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9993  p2 = rb_enc_prev_char(beg, p, end, enc);
9994  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9995  }
9996  return p - beg;
9997 }
9998 
9999 /*
10000  * call-seq:
10001  * chop! -> self or nil
10002  *
10003  * Like String#chop, but modifies +self+ in place;
10004  * returns +nil+ if +self+ is empty, +self+ otherwise.
10005  *
10006  * Related: String#chomp!.
10007  */
10008 
10009 static VALUE
10010 rb_str_chop_bang(VALUE str)
10011 {
10012  str_modify_keep_cr(str);
10013  if (RSTRING_LEN(str) > 0) {
10014  long len;
10015  len = chopped_length(str);
10016  STR_SET_LEN(str, len);
10017  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10018  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10019  ENC_CODERANGE_CLEAR(str);
10020  }
10021  return str;
10022  }
10023  return Qnil;
10024 }
10025 
10026 
10027 /*
10028  * call-seq:
10029  * chop -> new_string
10030  *
10031  * :include: doc/string/chop.rdoc
10032  *
10033  */
10034 
10035 static VALUE
10036 rb_str_chop(VALUE str)
10037 {
10038  return rb_str_subseq(str, 0, chopped_length(str));
10039 }
10040 
10041 static long
10042 smart_chomp(VALUE str, const char *e, const char *p)
10043 {
10044  rb_encoding *enc = rb_enc_get(str);
10045  if (rb_enc_mbminlen(enc) > 1) {
10046  const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10047  if (rb_enc_is_newline(pp, e, enc)) {
10048  e = pp;
10049  }
10050  pp = e - rb_enc_mbminlen(enc);
10051  if (pp >= p) {
10052  pp = rb_enc_left_char_head(p, pp, e, enc);
10053  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10054  e = pp;
10055  }
10056  }
10057  }
10058  else {
10059  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10060  case '\n':
10061  if (--e > p && *(e-1) == '\r') {
10062  --e;
10063  }
10064  break;
10065  case '\r':
10066  --e;
10067  break;
10068  }
10069  }
10070  return e - p;
10071 }
10072 
/*
 * Returns the byte length +str+ would have after removing the record
 * separator +rs+ from its end.  +str+ itself is only read.
 *
 * Cases, in the order they are tested:
 *  - empty +str+: 0.
 *  - +rs+ is rb_default_rs: smart_chomp().
 *  - empty +rs+: every trailing newline is removed, each together with a
 *    "\r" directly before it.
 *  - +rs+ longer than +str+: nothing is removed.
 *  - +rs+ is a single minimum-width character that is "\n" (width 1) or a
 *    newline per rb_enc_is_newline() (wider): smart_chomp().
 *  - +rs+ is a broken string: nothing is removed.
 *  - otherwise +rs+ is removed when +str+ ends with its bytes and the match
 *    starts on a character boundary.
 */
10073 static long
10074 chompped_length(VALUE str, VALUE rs)
10075 {
10076  rb_encoding *enc;
10077  int newline;
10078  char *pp, *e, *rsptr;
10079  long rslen;
10080  char *const p = RSTRING_PTR(str);
10081  long len = RSTRING_LEN(str);
10082 
10083  if (len == 0) return 0;
10084  e = p + len;
10085  if (rs == rb_default_rs) {
10086  return smart_chomp(str, e, p);
10087  }
10088 
10089  enc = rb_enc_get(str);
10090  RSTRING_GETMEM(rs, rsptr, rslen);
/* empty separator: strip the whole run of trailing line endings */
10091  if (rslen == 0) {
10092  if (rb_enc_mbminlen(enc) > 1) {
10093  while (e > p) {
10094  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10095  if (!rb_enc_is_newline(pp, e, enc)) break;
10096  e = pp;
10097  pp -= rb_enc_mbminlen(enc);
10098  if (pp >= p) {
10099  pp = rb_enc_left_char_head(p, pp, e, enc);
10100  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10101  e = pp;
10102  }
10103  }
10104  }
10105  }
10106  else {
10107  while (e > p && *(e-1) == '\n') {
10108  --e;
10109  if (e > p && *(e-1) == '\r')
10110  --e;
10111  }
10112  }
10113  return e - p;
10114  }
10115  if (rslen > len) return len;
10116 
/* from here enc is the separator's encoding, until rb_enc_check() below */
10117  enc = rb_enc_get(rs);
/* newline holds the last byte of the separator, used as a cheap pre-check further down */
10118  newline = rsptr[rslen-1];
10119  if (rslen == rb_enc_mbminlen(enc)) {
10120  if (rslen == 1) {
10121  if (newline == '\n')
10122  return smart_chomp(str, e, p);
10123  }
10124  else {
10125  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10126  return smart_chomp(str, e, p);
10127  }
10128  }
10129 
10130  enc = rb_enc_check(str, rs);
10131  if (is_broken_string(rs)) {
10132  return len;
10133  }
10134  pp = e - rslen;
/* for a one-byte separator the last-byte comparison is already the full comparison, so memcmp is skipped */
10135  if (p[len-1] == newline &&
10136  (rslen <= 1 ||
10137  memcmp(rsptr, pp, rslen) == 0)) {
/* a byte-level match that does not start at a character boundary is not removed */
10138  if (at_char_boundary(p, pp, e, enc))
10139  return len - rslen;
10140  RB_GC_GUARD(rs);
10141  }
10142  return len;
10143 }
10144 
10150 static VALUE
10151 chomp_rs(int argc, const VALUE *argv)
10152 {
10153  rb_check_arity(argc, 0, 1);
10154  if (argc > 0) {
10155  VALUE rs = argv[0];
10156  if (!NIL_P(rs)) StringValue(rs);
10157  return rs;
10158  }
10159  else {
10160  return rb_rs;
10161  }
10162 }
10163 
10164 VALUE
10165 rb_str_chomp_string(VALUE str, VALUE rs)
10166 {
10167  long olen = RSTRING_LEN(str);
10168  long len = chompped_length(str, rs);
10169  if (len >= olen) return Qnil;
10170  str_modify_keep_cr(str);
10171  STR_SET_LEN(str, len);
10172  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10173  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10174  ENC_CODERANGE_CLEAR(str);
10175  }
10176  return str;
10177 }
10178 
10179 /*
10180  * call-seq:
10181  * chomp!(line_sep = $/) -> self or nil
10182  *
10183  * Like String#chomp, but modifies +self+ in place;
10184  * returns +nil+ if no modification made, +self+ otherwise.
10185  *
10186  */
10187 
10188 static VALUE
10189 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10190 {
10191  VALUE rs;
10192  str_modifiable(str);
10193  if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10194  rs = chomp_rs(argc, argv);
10195  if (NIL_P(rs)) return Qnil;
10196  return rb_str_chomp_string(str, rs);
10197 }
10198 
10199 
10200 /*
10201  * call-seq:
10202  * chomp(line_sep = $/) -> new_string
10203  *
10204  * :include: doc/string/chomp.rdoc
10205  *
10206  */
10207 
10208 static VALUE
10209 rb_str_chomp(int argc, VALUE *argv, VALUE str)
10210 {
10211  VALUE rs = chomp_rs(argc, argv);
10212  if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10213  return rb_str_subseq(str, 0, chompped_length(str, rs));
10214 }
10215 
10216 static long
10217 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10218 {
10219  const char *const start = s;
10220 
10221  if (!s || s >= e) return 0;
10222 
10223  /* remove spaces at head */
10224  if (single_byte_optimizable(str)) {
10225  while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10226  }
10227  else {
10228  while (s < e) {
10229  int n;
10230  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10231 
10232  if (cc && !rb_isspace(cc)) break;
10233  s += n;
10234  }
10235  }
10236  return s - start;
10237 }
10238 
10239 /*
10240  * call-seq:
10241  * lstrip! -> self or nil
10242  *
10243  * Like String#lstrip, except that any modifications are made in +self+;
10244  * returns +self+ if any modifications are made, +nil+ otherwise.
10245  *
10246  * Related: String#rstrip!, String#strip!.
10247  */
10248 
10249 static VALUE
10250 rb_str_lstrip_bang(VALUE str)
10251 {
10252  rb_encoding *enc;
10253  char *start, *s;
10254  long olen, loffset;
10255 
10256  str_modify_keep_cr(str);
10257  enc = STR_ENC_GET(str);
10258  RSTRING_GETMEM(str, start, olen);
10259  loffset = lstrip_offset(str, start, start+olen, enc);
10260  if (loffset > 0) {
10261  long len = olen-loffset;
10262  s = start + loffset;
10263  memmove(start, s, len);
10264  STR_SET_LEN(str, len);
10265  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10266  return str;
10267  }
10268  return Qnil;
10269 }
10270 
10271 
10272 /*
10273  * call-seq:
10274  * lstrip -> new_string
10275  *
10276  * Returns a copy of +self+ with leading whitespace removed;
10277  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10278  *
10279  * whitespace = "\x00\t\n\v\f\r "
10280  * s = whitespace + 'abc' + whitespace
10281  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10282  * s.lstrip # => "abc\u0000\t\n\v\f\r "
10283  *
10284  * Related: String#rstrip, String#strip.
10285  */
10286 
10287 static VALUE
10288 rb_str_lstrip(VALUE str)
10289 {
10290  char *start;
10291  long len, loffset;
10292  RSTRING_GETMEM(str, start, len);
10293  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10294  if (loffset <= 0) return str_duplicate(rb_cString, str);
10295  return rb_str_subseq(str, loffset, len - loffset);
10296 }
10297 
10298 static long
10299 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10300 {
10301  const char *t;
10302 
10303  rb_str_check_dummy_enc(enc);
10305  rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10306  }
10307  if (!s || s >= e) return 0;
10308  t = e;
10309 
10310  /* remove trailing spaces or '\0's */
10311  if (single_byte_optimizable(str)) {
10312  unsigned char c;
10313  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10314  }
10315  else {
10316  char *tp;
10317 
10318  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10319  unsigned int c = rb_enc_codepoint(tp, e, enc);
10320  if (c && !rb_isspace(c)) break;
10321  t = tp;
10322  }
10323  }
10324  return e - t;
10325 }
10326 
10327 /*
10328  * call-seq:
10329  * rstrip! -> self or nil
10330  *
10331  * Like String#rstrip, except that any modifications are made in +self+;
10332  * returns +self+ if any modifications are made, +nil+ otherwise.
10333  *
10334  * Related: String#lstrip!, String#strip!.
10335  */
10336 
10337 static VALUE
10338 rb_str_rstrip_bang(VALUE str)
10339 {
10340  rb_encoding *enc;
10341  char *start;
10342  long olen, roffset;
10343 
10344  str_modify_keep_cr(str);
10345  enc = STR_ENC_GET(str);
10346  RSTRING_GETMEM(str, start, olen);
10347  roffset = rstrip_offset(str, start, start+olen, enc);
10348  if (roffset > 0) {
10349  long len = olen - roffset;
10350 
10351  STR_SET_LEN(str, len);
10352  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10353  return str;
10354  }
10355  return Qnil;
10356 }
10357 
10358 
10359 /*
10360  * call-seq:
10361  * rstrip -> new_string
10362  *
10363  * Returns a copy of the receiver with trailing whitespace removed;
10364  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10365  *
10366  * whitespace = "\x00\t\n\v\f\r "
10367  * s = whitespace + 'abc' + whitespace
10368  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10369  * s.rstrip # => "\u0000\t\n\v\f\r abc"
10370  *
10371  * Related: String#lstrip, String#strip.
10372  */
10373 
10374 static VALUE
10375 rb_str_rstrip(VALUE str)
10376 {
10377  rb_encoding *enc;
10378  char *start;
10379  long olen, roffset;
10380 
10381  enc = STR_ENC_GET(str);
10382  RSTRING_GETMEM(str, start, olen);
10383  roffset = rstrip_offset(str, start, start+olen, enc);
10384 
10385  if (roffset <= 0) return str_duplicate(rb_cString, str);
10386  return rb_str_subseq(str, 0, olen-roffset);
10387 }
10388 
10389 
10390 /*
10391  * call-seq:
10392  * strip! -> self or nil
10393  *
10394  * Like String#strip, except that any modifications are made in +self+;
10395  * returns +self+ if any modifications are made, +nil+ otherwise.
10396  *
10397  * Related: String#lstrip!, String#rstrip!.
10398  */
10399 
10400 static VALUE
10401 rb_str_strip_bang(VALUE str)
10402 {
10403  char *start;
10404  long olen, loffset, roffset;
10405  rb_encoding *enc;
10406 
10407  str_modify_keep_cr(str);
10408  enc = STR_ENC_GET(str);
10409  RSTRING_GETMEM(str, start, olen);
10410  loffset = lstrip_offset(str, start, start+olen, enc);
10411  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10412 
10413  if (loffset > 0 || roffset > 0) {
10414  long len = olen-roffset;
10415  if (loffset > 0) {
10416  len -= loffset;
10417  memmove(start, start + loffset, len);
10418  }
10419  STR_SET_LEN(str, len);
10420  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10421  return str;
10422  }
10423  return Qnil;
10424 }
10425 
10426 
10427 /*
10428  * call-seq:
10429  * strip -> new_string
10430  *
10431  * Returns a copy of the receiver with leading and trailing whitespace removed;
10432  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10433  *
10434  * whitespace = "\x00\t\n\v\f\r "
10435  * s = whitespace + 'abc' + whitespace
10436  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10437  * s.strip # => "abc"
10438  *
10439  * Related: String#lstrip, String#rstrip.
10440  */
10441 
10442 static VALUE
10443 rb_str_strip(VALUE str)
10444 {
10445  char *start;
10446  long olen, loffset, roffset;
10447  rb_encoding *enc = STR_ENC_GET(str);
10448 
10449  RSTRING_GETMEM(str, start, olen);
10450  loffset = lstrip_offset(str, start, start+olen, enc);
10451  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10452 
10453  if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10454  return rb_str_subseq(str, loffset, olen-loffset-roffset);
10455 }
10456 
10457 static VALUE
10458 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10459 {
10460  VALUE result = Qnil;
10461  long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10462  if (pos >= 0) {
10463  VALUE match;
10464  struct re_registers *regs;
10465  if (BUILTIN_TYPE(pat) == T_STRING) {
10466  regs = NULL;
10467  end = pos + RSTRING_LEN(pat);
10468  }
10469  else {
10470  match = rb_backref_get();
10471  regs = RMATCH_REGS(match);
10472  pos = BEG(0);
10473  end = END(0);
10474  }
10475 
10476  if (pos == end) {
10477  rb_encoding *enc = STR_ENC_GET(str);
10478  /*
10479  * Always consume at least one character of the input string
10480  */
10481  if (RSTRING_LEN(str) > end)
10482  *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10483  RSTRING_END(str), enc);
10484  else
10485  *start = end + 1;
10486  }
10487  else {
10488  *start = end;
10489  }
10490 
10491  if (!regs || regs->num_regs == 1) {
10492  result = rb_str_subseq(str, pos, end - pos);
10493  return result;
10494  }
10495  else {
10496  result = rb_ary_new2(regs->num_regs);
10497  for (int i = 1; i < regs->num_regs; i++) {
10498  VALUE s = Qnil;
10499  if (BEG(i) >= 0) {
10500  s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10501  }
10502 
10503  rb_ary_push(result, s);
10504  }
10505  }
10506 
10507  RB_GC_GUARD(match);
10508  }
10509 
10510  return result;
10511 }
10512 
10513 
10514 /*
10515  * call-seq:
10516  * scan(string_or_regexp) -> array
10517  * scan(string_or_regexp) {|matches| ... } -> self
10518  *
10519  * Matches a pattern against +self+; the pattern is:
10520  *
10521  * - +string_or_regexp+ itself, if it is a Regexp.
10522  * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10523  *
10524  * Iterates through +self+, generating a collection of matching results:
10525  *
10526  * - If the pattern contains no groups, each result is the
10527  * matched string, <code>$&</code>.
10528  * - If the pattern contains groups, each result is an array
10529  * containing one entry per group.
10530  *
10531  * With no block given, returns an array of the results:
10532  *
10533  * s = 'cruel world'
10534  * s.scan(/\w+/) # => ["cruel", "world"]
10535  * s.scan(/.../) # => ["cru", "el ", "wor"]
10536  * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10537  * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10538  *
10539  * With a block given, calls the block with each result; returns +self+:
10540  *
10541  * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10542  * print "\n"
10543  * s.scan(/(.)(.)/) {|x,y| print y, x }
10544  * print "\n"
10545  *
10546  * Output:
10547  *
10548  * <<cruel>> <<world>>
10549  * rceu lowlr
10550  *
10551  */
10552 
10553 static VALUE
10554 rb_str_scan(VALUE str, VALUE pat)
10555 {
10556  VALUE result;
10557  long start = 0;
10558  long last = -1, prev = 0;
10559  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10560 
10561  pat = get_pat_quoted(pat, 1);
10562  mustnot_broken(str);
10563  if (!rb_block_given_p()) {
10564  VALUE ary = rb_ary_new();
10565 
10566  while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10567  last = prev;
10568  prev = start;
10569  rb_ary_push(ary, result);
10570  }
10571  if (last >= 0) rb_pat_search(pat, str, last, 1);
10572  else rb_backref_set(Qnil);
10573  return ary;
10574  }
10575 
10576  while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10577  last = prev;
10578  prev = start;
10579  rb_yield(result);
10580  str_mod_check(str, p, len);
10581  }
10582  if (last >= 0) rb_pat_search(pat, str, last, 1);
10583  return str;
10584 }
10585 
10586 
10587 /*
10588  * call-seq:
10589  * hex -> integer
10590  *
10591  * Interprets the leading substring of +self+ as a string of hexadecimal digits
10592  * (with an optional sign and an optional <code>0x</code>) and returns the
10593  * corresponding number;
10594  * returns zero if there is no such leading substring:
10595  *
10596  * '0x0a'.hex # => 10
10597  * '-1234'.hex # => -4660
10598  * '0'.hex # => 0
10599  * 'non-numeric'.hex # => 0
10600  *
10601  * Related: String#oct.
10602  *
10603  */
10604 
10605 static VALUE
10606 rb_str_hex(VALUE str)
10607 {
10608  return rb_str_to_inum(str, 16, FALSE);
10609 }
10610 
10611 
10612 /*
10613  * call-seq:
10614  * oct -> integer
10615  *
10616  * Interprets the leading substring of +self+ as a string of octal digits
10617  * (with an optional sign) and returns the corresponding number;
10618  * returns zero if there is no such leading substring:
10619  *
10620  * '123'.oct # => 83
10621  * '-377'.oct # => -255
10622  * '0377non-numeric'.oct # => 255
10623  * 'non-numeric'.oct # => 0
10624  *
10625  * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10626  * see Kernel#Integer.
10627  *
10628  * Related: String#hex.
10629  *
10630  */
10631 
10632 static VALUE
10633 rb_str_oct(VALUE str)
10634 {
10635  return rb_str_to_inum(str, -8, FALSE);
10636 }
10637 
10638 #ifndef HAVE_CRYPT_R
10639 # include "ruby/thread_native.h"
10640 # include "ruby/atomic.h"
10641 
10642 static struct {
10643  rb_nativethread_lock_t lock;
10644 } crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10645 
10646 static void
10647 crypt_mutex_initialize(void)
10648 {
10649 }
10650 #endif
10651 
10652 /*
10653  * call-seq:
10654  * crypt(salt_str) -> new_string
10655  *
10656  * Returns the string generated by calling <code>crypt(3)</code>
10657  * standard library function with <code>str</code> and
10658  * <code>salt_str</code>, in this order, as its arguments. Please do
10659  * not use this method any longer. It is legacy; provided only for
10660  * backward compatibility with ruby scripts in earlier days. It is
10661  * bad to use in contemporary programs for several reasons:
10662  *
10663  * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10664  * run. The generated string lacks data portability.
10665  *
10666  * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10667  * (i.e. silently ends up in unexpected results).
10668  *
10669  * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10670  * thread safe.
10671  *
10672  * * So-called "traditional" usage of <code>crypt(3)</code> is very
10673  * very very weak. According to its manpage, Linux's traditional
10674  * <code>crypt(3)</code> output has only 2**56 variations; too
10675  * easy to brute force today. And this is the default behaviour.
10676  *
10677  * * In order to make things robust some OSes implement so-called
10678  * "modular" usage. To go through, you have to do a complex
10679  * build-up of the <code>salt_str</code> parameter, by hand.
10680  * Failure in generation of a proper salt string tends not to
10681  * yield any errors; typos in parameters are normally not
10682  * detectable.
10683  *
10684  * * For instance, in the following example, the second invocation
10685  * of String#crypt is wrong; it has a typo in "round=" (lacks
10686  * "s"). However the call does not fail and something unexpected
10687  * is generated.
10688  *
10689  * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10690  * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10691  *
10692  * * Even in the "modular" mode, some hash functions are considered
10693  * archaic and no longer recommended at all; for instance module
10694  * <code>$1$</code> is officially abandoned by its author: see
10695  * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10696  * instance module <code>$3$</code> is considered completely
10697  * broken: see the manpage of FreeBSD.
10698  *
10699  * * On some OS such as Mac OS, there is no modular mode. Yet, as
10700  * written above, <code>crypt(3)</code> on Mac OS never fails.
10701  * This means even if you build up a proper salt string it
10702  * generates a traditional DES hash anyways, and there is no way
10703  * for you to be aware of.
10704  *
10705  * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10706  *
10707  * If for some reason you cannot migrate to other secure contemporary
10708  * password hashing algorithms, install the string-crypt gem and
10709  * <code>require 'string/crypt'</code> to continue using it.
10710  */
10711 
10712 static VALUE
10713 rb_str_crypt(VALUE str, VALUE salt)
10714 {
10715 #ifdef HAVE_CRYPT_R
10716  VALUE databuf;
10717  struct crypt_data *data;
10718 # define CRYPT_END() ALLOCV_END(databuf)
10719 #else
10720  extern char *crypt(const char *, const char *);
10721 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10722 #endif
10723  VALUE result;
10724  const char *s, *saltp;
10725  char *res;
10726 #ifdef BROKEN_CRYPT
10727  char salt_8bit_clean[3];
10728 #endif
10729 
10730  StringValue(salt);
10731  mustnot_wchar(str);
10732  mustnot_wchar(salt);
10733  s = StringValueCStr(str);
10734  saltp = RSTRING_PTR(salt);
10735  if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10736  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10737  }
10738 
10739 #ifdef BROKEN_CRYPT
10740  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10741  salt_8bit_clean[0] = saltp[0] & 0x7f;
10742  salt_8bit_clean[1] = saltp[1] & 0x7f;
10743  salt_8bit_clean[2] = '\0';
10744  saltp = salt_8bit_clean;
10745  }
10746 #endif
10747 #ifdef HAVE_CRYPT_R
10748  data = ALLOCV(databuf, sizeof(struct crypt_data));
10749 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10750  data->initialized = 0;
10751 # endif
10752  res = crypt_r(s, saltp, data);
10753 #else
10754  crypt_mutex_initialize();
10755  rb_nativethread_lock_lock(&crypt_mutex.lock);
10756  res = crypt(s, saltp);
10757 #endif
10758  if (!res) {
10759  int err = errno;
10760  CRYPT_END();
10761  rb_syserr_fail(err, "crypt");
10762  }
10763  result = rb_str_new_cstr(res);
10764  CRYPT_END();
10765  return result;
10766 }
10767 
10768 
10769 /*
10770  * call-seq:
10771  * ord -> integer
10772  *
10773  * :include: doc/string/ord.rdoc
10774  *
10775  */
10776 
10777 static VALUE
10778 rb_str_ord(VALUE s)
10779 {
10780  unsigned int c;
10781 
10782  c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10783  return UINT2NUM(c);
10784 }
10785 /*
10786  * call-seq:
10787  * sum(n = 16) -> integer
10788  *
10789  * :include: doc/string/sum.rdoc
10790  *
10791  */
10792 
10793 static VALUE
10794 rb_str_sum(int argc, VALUE *argv, VALUE str)
10795 {
10796  int bits = 16;
10797  char *ptr, *p, *pend;
10798  long len;
10799  VALUE sum = INT2FIX(0);
10800  unsigned long sum0 = 0;
10801 
10802  if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10803  bits = 0;
10804  }
10805  ptr = p = RSTRING_PTR(str);
10806  len = RSTRING_LEN(str);
10807  pend = p + len;
10808 
10809  while (p < pend) {
10810  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10811  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10812  str_mod_check(str, ptr, len);
10813  sum0 = 0;
10814  }
10815  sum0 += (unsigned char)*p;
10816  p++;
10817  }
10818 
10819  if (bits == 0) {
10820  if (sum0) {
10821  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10822  }
10823  }
10824  else {
10825  if (sum == INT2FIX(0)) {
10826  if (bits < (int)sizeof(long)*CHAR_BIT) {
10827  sum0 &= (((unsigned long)1)<<bits)-1;
10828  }
10829  sum = LONG2FIX(sum0);
10830  }
10831  else {
10832  VALUE mod;
10833 
10834  if (sum0) {
10835  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10836  }
10837 
10838  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10839  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10840  sum = rb_funcall(sum, '&', 1, mod);
10841  }
10842  }
10843  return sum;
10844 }
10845 
10846 static VALUE
10847 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10848 {
10849  rb_encoding *enc;
10850  VALUE w;
10851  long width, len, flen = 1, fclen = 1;
10852  VALUE res;
10853  char *p;
10854  const char *f = " ";
10855  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10856  VALUE pad;
10857  int singlebyte = 1, cr;
10858  int termlen;
10859 
10860  rb_scan_args(argc, argv, "11", &w, &pad);
10861  enc = STR_ENC_GET(str);
10862  termlen = rb_enc_mbminlen(enc);
10863  width = NUM2LONG(w);
10864  if (argc == 2) {
10865  StringValue(pad);
10866  enc = rb_enc_check(str, pad);
10867  f = RSTRING_PTR(pad);
10868  flen = RSTRING_LEN(pad);
10869  fclen = str_strlen(pad, enc); /* rb_enc_check */
10870  singlebyte = single_byte_optimizable(pad);
10871  if (flen == 0 || fclen == 0) {
10872  rb_raise(rb_eArgError, "zero width padding");
10873  }
10874  }
10875  len = str_strlen(str, enc); /* rb_enc_check */
10876  if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10877  n = width - len;
10878  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10879  rlen = n - llen;
10880  cr = ENC_CODERANGE(str);
10881  if (flen > 1) {
10882  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10883  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10884  }
10885  size = RSTRING_LEN(str);
10886  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10887  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10888  (len += llen2 + rlen2) >= LONG_MAX - size) {
10889  rb_raise(rb_eArgError, "argument too big");
10890  }
10891  len += size;
10892  res = str_enc_new(rb_cString, 0, len, enc);
10893  p = RSTRING_PTR(res);
10894  if (flen <= 1) {
10895  memset(p, *f, llen);
10896  p += llen;
10897  }
10898  else {
10899  while (llen >= fclen) {
10900  memcpy(p,f,flen);
10901  p += flen;
10902  llen -= fclen;
10903  }
10904  if (llen > 0) {
10905  memcpy(p, f, llen2);
10906  p += llen2;
10907  }
10908  }
10909  memcpy(p, RSTRING_PTR(str), size);
10910  p += size;
10911  if (flen <= 1) {
10912  memset(p, *f, rlen);
10913  p += rlen;
10914  }
10915  else {
10916  while (rlen >= fclen) {
10917  memcpy(p,f,flen);
10918  p += flen;
10919  rlen -= fclen;
10920  }
10921  if (rlen > 0) {
10922  memcpy(p, f, rlen2);
10923  p += rlen2;
10924  }
10925  }
10926  TERM_FILL(p, termlen);
10927  STR_SET_LEN(res, p-RSTRING_PTR(res));
10928 
10929  if (argc == 2)
10930  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10931  if (cr != ENC_CODERANGE_BROKEN)
10932  ENC_CODERANGE_SET(res, cr);
10933 
10934  RB_GC_GUARD(pad);
10935  return res;
10936 }
10937 
10938 
10939 /*
10940  * call-seq:
10941  * ljust(size, pad_string = ' ') -> new_string
10942  *
10943  * :include: doc/string/ljust.rdoc
10944  *
10945  * Related: String#rjust, String#center.
10946  *
10947  */
10948 
10949 static VALUE
10950 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10951 {
10952  return rb_str_justify(argc, argv, str, 'l');
10953 }
10954 
10955 /*
10956  * call-seq:
10957  * rjust(size, pad_string = ' ') -> new_string
10958  *
10959  * :include: doc/string/rjust.rdoc
10960  *
10961  * Related: String#ljust, String#center.
10962  *
10963  */
10964 
10965 static VALUE
10966 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10967 {
10968  return rb_str_justify(argc, argv, str, 'r');
10969 }
10970 
10971 
10972 /*
10973  * call-seq:
10974  * center(size, pad_string = ' ') -> new_string
10975  *
10976  * :include: doc/string/center.rdoc
10977  *
10978  * Related: String#ljust, String#rjust.
10979  *
10980  */
10981 
10982 static VALUE
10983 rb_str_center(int argc, VALUE *argv, VALUE str)
10984 {
10985  return rb_str_justify(argc, argv, str, 'c');
10986 }
10987 
10988 /*
10989  * call-seq:
10990  * partition(string_or_regexp) -> [head, match, tail]
10991  *
10992  * :include: doc/string/partition.rdoc
10993  *
10994  */
10995 
10996 static VALUE
10997 rb_str_partition(VALUE str, VALUE sep)
10998 {
10999  long pos;
11000 
11001  sep = get_pat_quoted(sep, 0);
11002  if (RB_TYPE_P(sep, T_REGEXP)) {
11003  if (rb_reg_search(sep, str, 0, 0) < 0) {
11004  goto failed;
11005  }
11006  VALUE match = rb_backref_get();
11007  struct re_registers *regs = RMATCH_REGS(match);
11008 
11009  pos = BEG(0);
11010  sep = rb_str_subseq(str, pos, END(0) - pos);
11011  }
11012  else {
11013  pos = rb_str_index(str, sep, 0);
11014  if (pos < 0) goto failed;
11015  }
11016  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11017  sep,
11018  rb_str_subseq(str, pos+RSTRING_LEN(sep),
11019  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11020 
11021  failed:
11022  return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11023 }
11024 
11025 /*
11026  * call-seq:
11027  * rpartition(sep) -> [head, match, tail]
11028  *
11029  * :include: doc/string/rpartition.rdoc
11030  *
11031  */
11032 
11033 static VALUE
11034 rb_str_rpartition(VALUE str, VALUE sep)
11035 {
11036  long pos = RSTRING_LEN(str);
11037 
11038  sep = get_pat_quoted(sep, 0);
11039  if (RB_TYPE_P(sep, T_REGEXP)) {
11040  if (rb_reg_search(sep, str, pos, 1) < 0) {
11041  goto failed;
11042  }
11043  VALUE match = rb_backref_get();
11044  struct re_registers *regs = RMATCH_REGS(match);
11045 
11046  pos = BEG(0);
11047  sep = rb_str_subseq(str, pos, END(0) - pos);
11048  }
11049  else {
11050  pos = rb_str_sublen(str, pos);
11051  pos = rb_str_rindex(str, sep, pos);
11052  if (pos < 0) {
11053  goto failed;
11054  }
11055  }
11056 
11057  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11058  sep,
11059  rb_str_subseq(str, pos+RSTRING_LEN(sep),
11060  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11061  failed:
11062  return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11063 }
11064 
11065 /*
11066  * call-seq:
11067  * start_with?(*string_or_regexp) -> true or false
11068  *
11069  * :include: doc/string/start_with_p.rdoc
11070  *
11071  */
11072 
11073 static VALUE
11074 rb_str_start_with(int argc, VALUE *argv, VALUE str)
11075 {
11076  int i;
11077 
11078  for (i=0; i<argc; i++) {
11079  VALUE tmp = argv[i];
11080  if (RB_TYPE_P(tmp, T_REGEXP)) {
11081  if (rb_reg_start_with_p(tmp, str))
11082  return Qtrue;
11083  }
11084  else {
11085  const char *p, *s, *e;
11086  long slen, tlen;
11087  rb_encoding *enc;
11088 
11089  StringValue(tmp);
11090  enc = rb_enc_check(str, tmp);
11091  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11092  if ((slen = RSTRING_LEN(str)) < tlen) continue;
11093  p = RSTRING_PTR(str);
11094  e = p + slen;
11095  s = p + tlen;
11096  if (!at_char_right_boundary(p, s, e, enc))
11097  continue;
11098  if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11099  return Qtrue;
11100  }
11101  }
11102  return Qfalse;
11103 }
11104 
11105 /*
11106  * call-seq:
11107  * end_with?(*strings) -> true or false
11108  *
11109  * :include: doc/string/end_with_p.rdoc
11110  *
11111  */
11112 
11113 static VALUE
11114 rb_str_end_with(int argc, VALUE *argv, VALUE str)
11115 {
11116  int i;
11117 
11118  for (i=0; i<argc; i++) {
11119  VALUE tmp = argv[i];
11120  const char *p, *s, *e;
11121  long slen, tlen;
11122  rb_encoding *enc;
11123 
11124  StringValue(tmp);
11125  enc = rb_enc_check(str, tmp);
11126  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11127  if ((slen = RSTRING_LEN(str)) < tlen) continue;
11128  p = RSTRING_PTR(str);
11129  e = p + slen;
11130  s = e - tlen;
11131  if (!at_char_boundary(p, s, e, enc))
11132  continue;
11133  if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11134  return Qtrue;
11135  }
11136  return Qfalse;
11137 }
11138 
11148 static long
11149 deleted_prefix_length(VALUE str, VALUE prefix)
11150 {
11151  const char *strptr, *prefixptr;
11152  long olen, prefixlen;
11153  rb_encoding *enc = rb_enc_get(str);
11154 
11155  StringValue(prefix);
11156 
11157  if (!is_broken_string(prefix) ||
11158  !rb_enc_asciicompat(enc) ||
11159  !rb_enc_asciicompat(rb_enc_get(prefix))) {
11160  enc = rb_enc_check(str, prefix);
11161  }
11162 
11163  /* return 0 if not start with prefix */
11164  prefixlen = RSTRING_LEN(prefix);
11165  if (prefixlen <= 0) return 0;
11166  olen = RSTRING_LEN(str);
11167  if (olen < prefixlen) return 0;
11168  strptr = RSTRING_PTR(str);
11169  prefixptr = RSTRING_PTR(prefix);
11170  if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11171  if (is_broken_string(prefix)) {
11172  if (!is_broken_string(str)) {
11173  /* prefix in a valid string cannot be broken */
11174  return 0;
11175  }
11176  const char *strend = strptr + olen;
11177  const char *after_prefix = strptr + prefixlen;
11178  if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11179  /* prefix does not end at char-boundary */
11180  return 0;
11181  }
11182  }
11183  /* prefix part in `str` also should be valid. */
11184 
11185  return prefixlen;
11186 }
11187 
11188 /*
11189  * call-seq:
11190  * delete_prefix!(prefix) -> self or nil
11191  *
11192  * Like String#delete_prefix, except that +self+ is modified in place.
11193  * Returns +self+ if the prefix is removed, +nil+ otherwise.
11194  *
11195  */
11196 
11197 static VALUE
11198 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11199 {
11200  long prefixlen;
11201  str_modify_keep_cr(str);
11202 
11203  prefixlen = deleted_prefix_length(str, prefix);
11204  if (prefixlen <= 0) return Qnil;
11205 
11206  return rb_str_drop_bytes(str, prefixlen);
11207 }
11208 
11209 /*
11210  * call-seq:
11211  * delete_prefix(prefix) -> new_string
11212  *
11213  * :include: doc/string/delete_prefix.rdoc
11214  *
11215  */
11216 
11217 static VALUE
11218 rb_str_delete_prefix(VALUE str, VALUE prefix)
11219 {
11220  long prefixlen;
11221 
11222  prefixlen = deleted_prefix_length(str, prefix);
11223  if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11224 
11225  return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11226 }
11227 
11237 static long
11238 deleted_suffix_length(VALUE str, VALUE suffix)
11239 {
11240  const char *strptr, *suffixptr;
11241  long olen, suffixlen;
11242  rb_encoding *enc;
11243 
11244  StringValue(suffix);
11245  if (is_broken_string(suffix)) return 0;
11246  enc = rb_enc_check(str, suffix);
11247 
11248  /* return 0 if not start with suffix */
11249  suffixlen = RSTRING_LEN(suffix);
11250  if (suffixlen <= 0) return 0;
11251  olen = RSTRING_LEN(str);
11252  if (olen < suffixlen) return 0;
11253  strptr = RSTRING_PTR(str);
11254  suffixptr = RSTRING_PTR(suffix);
11255  const char *strend = strptr + olen;
11256  const char *before_suffix = strend - suffixlen;
11257  if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11258  if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11259 
11260  return suffixlen;
11261 }
11262 
11263 /*
11264  * call-seq:
11265  * delete_suffix!(suffix) -> self or nil
11266  *
11267  * Like String#delete_suffix, except that +self+ is modified in place.
11268  * Returns +self+ if the suffix is removed, +nil+ otherwise.
11269  *
11270  */
11271 
11272 static VALUE
11273 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11274 {
11275  long olen, suffixlen, len;
11276  str_modifiable(str);
11277 
11278  suffixlen = deleted_suffix_length(str, suffix);
11279  if (suffixlen <= 0) return Qnil;
11280 
11281  olen = RSTRING_LEN(str);
11282  str_modify_keep_cr(str);
11283  len = olen - suffixlen;
11284  STR_SET_LEN(str, len);
11285  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11286  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11287  ENC_CODERANGE_CLEAR(str);
11288  }
11289  return str;
11290 }
11291 
11292 /*
11293  * call-seq:
11294  * delete_suffix(suffix) -> new_string
11295  *
11296  * :include: doc/string/delete_suffix.rdoc
11297  *
11298  */
11299 
11300 static VALUE
11301 rb_str_delete_suffix(VALUE str, VALUE suffix)
11302 {
11303  long suffixlen;
11304 
11305  suffixlen = deleted_suffix_length(str, suffix);
11306  if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11307 
11308  return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11309 }
11310 
11311 void
11312 rb_str_setter(VALUE val, ID id, VALUE *var)
11313 {
11314  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11315  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11316  }
11317  *var = val;
11318 }
11319 
11320 static void
11321 rb_fs_setter(VALUE val, ID id, VALUE *var)
11322 {
11323  val = rb_fs_check(val);
11324  if (!val) {
11326  "value of %"PRIsVALUE" must be String or Regexp",
11327  rb_id2str(id));
11328  }
11329  if (!NIL_P(val)) {
11330  rb_warn_deprecated("'$;'", NULL);
11331  }
11332  *var = val;
11333 }
11334 
11335 
11336 /*
11337  * call-seq:
11338  * force_encoding(encoding) -> self
11339  *
11340  * :include: doc/string/force_encoding.rdoc
11341  *
11342  */
11343 
11344 static VALUE
11345 rb_str_force_encoding(VALUE str, VALUE enc)
11346 {
11347  str_modifiable(str);
11348 
11349  rb_encoding *encoding = rb_to_encoding(enc);
11350  int idx = rb_enc_to_index(encoding);
11351 
11352  // If the encoding is unchanged, we do nothing.
11353  if (ENCODING_GET(str) == idx) {
11354  return str;
11355  }
11356 
11357  rb_enc_associate_index(str, idx);
11358 
11359  // If the coderange was 7bit and the new encoding is ASCII-compatible
11360  // we can keep the coderange.
11361  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11362  return str;
11363  }
11364 
11365  ENC_CODERANGE_CLEAR(str);
11366  return str;
11367 }
11368 
11369 /*
11370  * call-seq:
11371  * b -> string
11372  *
11373  * :include: doc/string/b.rdoc
11374  *
11375  */
11376 
11377 static VALUE
11378 rb_str_b(VALUE str)
11379 {
11380  VALUE str2;
11381  if (STR_EMBED_P(str)) {
11382  str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11383  }
11384  else {
11385  str2 = str_alloc_heap(rb_cString);
11386  }
11387  str_replace_shared_without_enc(str2, str);
11388 
11389  if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11390  // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11391  // If we know the receiver's code range then we know the result's code range.
11392  int cr = ENC_CODERANGE(str);
11393  switch (cr) {
11394  case ENC_CODERANGE_7BIT:
11396  break;
11397  case ENC_CODERANGE_BROKEN:
11398  case ENC_CODERANGE_VALID:
11400  break;
11401  default:
11402  ENC_CODERANGE_CLEAR(str2);
11403  break;
11404  }
11405  }
11406 
11407  return str2;
11408 }
11409 
11410 /*
11411  * call-seq:
11412  * valid_encoding? -> true or false
11413  *
11414  * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11415  *
11416  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11417  * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11418  * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11419  */
11420 
11421 static VALUE
11422 rb_str_valid_encoding_p(VALUE str)
11423 {
11424  int cr = rb_enc_str_coderange(str);
11425 
11426  return RBOOL(cr != ENC_CODERANGE_BROKEN);
11427 }
11428 
11429 /*
11430  * call-seq:
11431  * ascii_only? -> true or false
11432  *
11433  * Returns +true+ if +self+ contains only ASCII characters,
11434  * +false+ otherwise:
11435  *
11436  * 'abc'.ascii_only? # => true
11437  * "abc\u{6666}".ascii_only? # => false
11438  *
11439  */
11440 
11441 static VALUE
11442 rb_str_is_ascii_only_p(VALUE str)
11443 {
11444  int cr = rb_enc_str_coderange(str);
11445 
11446  return RBOOL(cr == ENC_CODERANGE_7BIT);
11447 }
11448 
11449 VALUE
11451 {
11452  static const char ellipsis[] = "...";
11453  const long ellipsislen = sizeof(ellipsis) - 1;
11454  rb_encoding *const enc = rb_enc_get(str);
11455  const long blen = RSTRING_LEN(str);
11456  const char *const p = RSTRING_PTR(str), *e = p + blen;
11457  VALUE estr, ret = 0;
11458 
11459  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11460  if (len * rb_enc_mbminlen(enc) >= blen ||
11461  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11462  ret = str;
11463  }
11464  else if (len <= ellipsislen ||
11465  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11466  if (rb_enc_asciicompat(enc)) {
11467  ret = rb_str_new(ellipsis, len);
11468  rb_enc_associate(ret, enc);
11469  }
11470  else {
11471  estr = rb_usascii_str_new(ellipsis, len);
11472  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11473  }
11474  }
11475  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11476  rb_str_cat(ret, ellipsis, ellipsislen);
11477  }
11478  else {
11479  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11480  rb_enc_from_encoding(enc), 0, Qnil);
11481  rb_str_append(ret, estr);
11482  }
11483  return ret;
11484 }
11485 
11486 static VALUE
11487 str_compat_and_valid(VALUE str, rb_encoding *enc)
11488 {
11489  int cr;
11490  str = StringValue(str);
11491  cr = rb_enc_str_coderange(str);
11492  if (cr == ENC_CODERANGE_BROKEN) {
11493  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11494  }
11495  else {
11496  rb_encoding *e = STR_ENC_GET(str);
11497  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11498  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11499  rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11500  }
11501  }
11502  return str;
11503 }
11504 
11505 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11506 
11507 VALUE
11509 {
11510  rb_encoding *enc = STR_ENC_GET(str);
11511  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11512 }
11513 
11514 VALUE
11515 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11516 {
11517  int cr = ENC_CODERANGE_UNKNOWN;
11518  if (enc == STR_ENC_GET(str)) {
11519  /* cached coderange makes sense only when enc equals the
11520  * actual encoding of str */
11521  cr = ENC_CODERANGE(str);
11522  }
11523  return enc_str_scrub(enc, str, repl, cr);
11524 }
11525 
11526 static VALUE
11527 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11528 {
11529  int encidx;
11530  VALUE buf = Qnil;
11531  const char *rep, *p, *e, *p1, *sp;
11532  long replen = -1;
11533  long slen;
11534 
11535  if (rb_block_given_p()) {
11536  if (!NIL_P(repl))
11537  rb_raise(rb_eArgError, "both of block and replacement given");
11538  replen = 0;
11539  }
11540 
11541  if (ENC_CODERANGE_CLEAN_P(cr))
11542  return Qnil;
11543 
11544  if (!NIL_P(repl)) {
11545  repl = str_compat_and_valid(repl, enc);
11546  }
11547 
11548  if (rb_enc_dummy_p(enc)) {
11549  return Qnil;
11550  }
11551  encidx = rb_enc_to_index(enc);
11552 
11553 #define DEFAULT_REPLACE_CHAR(str) do { \
11554  static const char replace[sizeof(str)-1] = str; \
11555  rep = replace; replen = (int)sizeof(replace); \
11556  } while (0)
11557 
11558  slen = RSTRING_LEN(str);
11559  p = RSTRING_PTR(str);
11560  e = RSTRING_END(str);
11561  p1 = p;
11562  sp = p;
11563 
11564  if (rb_enc_asciicompat(enc)) {
11565  int rep7bit_p;
11566  if (!replen) {
11567  rep = NULL;
11568  rep7bit_p = FALSE;
11569  }
11570  else if (!NIL_P(repl)) {
11571  rep = RSTRING_PTR(repl);
11572  replen = RSTRING_LEN(repl);
11573  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11574  }
11575  else if (encidx == rb_utf8_encindex()) {
11576  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11577  rep7bit_p = FALSE;
11578  }
11579  else {
11580  DEFAULT_REPLACE_CHAR("?");
11581  rep7bit_p = TRUE;
11582  }
11583  cr = ENC_CODERANGE_7BIT;
11584 
11585  p = search_nonascii(p, e);
11586  if (!p) {
11587  p = e;
11588  }
11589  while (p < e) {
11590  int ret = rb_enc_precise_mbclen(p, e, enc);
11591  if (MBCLEN_NEEDMORE_P(ret)) {
11592  break;
11593  }
11594  else if (MBCLEN_CHARFOUND_P(ret)) {
11595  cr = ENC_CODERANGE_VALID;
11596  p += MBCLEN_CHARFOUND_LEN(ret);
11597  }
11598  else if (MBCLEN_INVALID_P(ret)) {
11599  /*
11600  * p1~p: valid ascii/multibyte chars
11601  * p ~e: invalid bytes + unknown bytes
11602  */
11603  long clen = rb_enc_mbmaxlen(enc);
11604  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11605  if (p > p1) {
11606  rb_str_buf_cat(buf, p1, p - p1);
11607  }
11608 
11609  if (e - p < clen) clen = e - p;
11610  if (clen <= 2) {
11611  clen = 1;
11612  }
11613  else {
11614  const char *q = p;
11615  clen--;
11616  for (; clen > 1; clen--) {
11617  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11618  if (MBCLEN_NEEDMORE_P(ret)) break;
11619  if (MBCLEN_INVALID_P(ret)) continue;
11620  UNREACHABLE;
11621  }
11622  }
11623  if (rep) {
11624  rb_str_buf_cat(buf, rep, replen);
11625  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11626  }
11627  else {
11628  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11629  str_mod_check(str, sp, slen);
11630  repl = str_compat_and_valid(repl, enc);
11631  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11632  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11633  cr = ENC_CODERANGE_VALID;
11634  }
11635  p += clen;
11636  p1 = p;
11637  p = search_nonascii(p, e);
11638  if (!p) {
11639  p = e;
11640  break;
11641  }
11642  }
11643  else {
11644  UNREACHABLE;
11645  }
11646  }
11647  if (NIL_P(buf)) {
11648  if (p == e) {
11649  ENC_CODERANGE_SET(str, cr);
11650  return Qnil;
11651  }
11652  buf = rb_str_buf_new(RSTRING_LEN(str));
11653  }
11654  if (p1 < p) {
11655  rb_str_buf_cat(buf, p1, p - p1);
11656  }
11657  if (p < e) {
11658  if (rep) {
11659  rb_str_buf_cat(buf, rep, replen);
11660  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11661  }
11662  else {
11663  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11664  str_mod_check(str, sp, slen);
11665  repl = str_compat_and_valid(repl, enc);
11666  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11667  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11668  cr = ENC_CODERANGE_VALID;
11669  }
11670  }
11671  }
11672  else {
11673  /* ASCII incompatible */
11674  long mbminlen = rb_enc_mbminlen(enc);
11675  if (!replen) {
11676  rep = NULL;
11677  }
11678  else if (!NIL_P(repl)) {
11679  rep = RSTRING_PTR(repl);
11680  replen = RSTRING_LEN(repl);
11681  }
11682  else if (encidx == ENCINDEX_UTF_16BE) {
11683  DEFAULT_REPLACE_CHAR("\xFF\xFD");
11684  }
11685  else if (encidx == ENCINDEX_UTF_16LE) {
11686  DEFAULT_REPLACE_CHAR("\xFD\xFF");
11687  }
11688  else if (encidx == ENCINDEX_UTF_32BE) {
11689  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11690  }
11691  else if (encidx == ENCINDEX_UTF_32LE) {
11692  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11693  }
11694  else {
11695  DEFAULT_REPLACE_CHAR("?");
11696  }
11697 
11698  while (p < e) {
11699  int ret = rb_enc_precise_mbclen(p, e, enc);
11700  if (MBCLEN_NEEDMORE_P(ret)) {
11701  break;
11702  }
11703  else if (MBCLEN_CHARFOUND_P(ret)) {
11704  p += MBCLEN_CHARFOUND_LEN(ret);
11705  }
11706  else if (MBCLEN_INVALID_P(ret)) {
11707  const char *q = p;
11708  long clen = rb_enc_mbmaxlen(enc);
11709  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11710  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11711 
11712  if (e - p < clen) clen = e - p;
11713  if (clen <= mbminlen * 2) {
11714  clen = mbminlen;
11715  }
11716  else {
11717  clen -= mbminlen;
11718  for (; clen > mbminlen; clen-=mbminlen) {
11719  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11720  if (MBCLEN_NEEDMORE_P(ret)) break;
11721  if (MBCLEN_INVALID_P(ret)) continue;
11722  UNREACHABLE;
11723  }
11724  }
11725  if (rep) {
11726  rb_str_buf_cat(buf, rep, replen);
11727  }
11728  else {
11729  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11730  str_mod_check(str, sp, slen);
11731  repl = str_compat_and_valid(repl, enc);
11732  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11733  }
11734  p += clen;
11735  p1 = p;
11736  }
11737  else {
11738  UNREACHABLE;
11739  }
11740  }
11741  if (NIL_P(buf)) {
11742  if (p == e) {
11744  return Qnil;
11745  }
11746  buf = rb_str_buf_new(RSTRING_LEN(str));
11747  }
11748  if (p1 < p) {
11749  rb_str_buf_cat(buf, p1, p - p1);
11750  }
11751  if (p < e) {
11752  if (rep) {
11753  rb_str_buf_cat(buf, rep, replen);
11754  }
11755  else {
11756  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11757  str_mod_check(str, sp, slen);
11758  repl = str_compat_and_valid(repl, enc);
11759  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11760  }
11761  }
11762  cr = ENC_CODERANGE_VALID;
11763  }
11764  ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11765  return buf;
11766 }
11767 
11768 /*
11769  * call-seq:
11770  * scrub(replacement_string = default_replacement) -> new_string
11771  * scrub{|bytes| ... } -> new_string
11772  *
11773  * :include: doc/string/scrub.rdoc
11774  *
11775  */
11776 static VALUE
11777 str_scrub(int argc, VALUE *argv, VALUE str)
11778 {
11779  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11780  VALUE new = rb_str_scrub(str, repl);
11781  return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11782 }
11783 
11784 /*
11785  * call-seq:
11786  * scrub! -> self
11787  * scrub!(replacement_string = default_replacement) -> self
11788  * scrub!{|bytes| ... } -> self
11789  *
11790  * Like String#scrub, except that any replacements are made in +self+.
11791  *
11792  */
11793 static VALUE
11794 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11795 {
11796  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11797  VALUE new = rb_str_scrub(str, repl);
11798  if (!NIL_P(new)) rb_str_replace(str, new);
11799  return str;
11800 }
11801 
11802 static ID id_normalize;
11803 static ID id_normalized_p;
11804 static VALUE mUnicodeNormalize;
11805 
11806 static VALUE
11807 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11808 {
11809  static int UnicodeNormalizeRequired = 0;
11810  VALUE argv2[2];
11811 
11812  if (!UnicodeNormalizeRequired) {
11813  rb_require("unicode_normalize/normalize.rb");
11814  UnicodeNormalizeRequired = 1;
11815  }
11816  argv2[0] = str;
11817  if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11818  return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11819 }
11820 
11821 /*
11822  * call-seq:
11823  * unicode_normalize(form = :nfc) -> string
11824  *
11825  * Returns a copy of +self+ with
11826  * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11827  *
11828  * Argument +form+ must be one of the following symbols
11829  * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11830  *
11831  * - +:nfc+: Canonical decomposition, followed by canonical composition.
11832  * - +:nfd+: Canonical decomposition.
11833  * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11834  * - +:nfkd+: Compatibility decomposition.
11835  *
11836  * The encoding of +self+ must be one of:
11837  *
11838  * - Encoding::UTF_8
11839  * - Encoding::UTF_16BE
11840  * - Encoding::UTF_16LE
11841  * - Encoding::UTF_32BE
11842  * - Encoding::UTF_32LE
11843  * - Encoding::GB18030
11844  * - Encoding::UCS_2BE
11845  * - Encoding::UCS_4BE
11846  *
11847  * Examples:
11848  *
11849  * "a\u0300".unicode_normalize # => "\u00E0"
11850  * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11851  *
11852  * Related: String#unicode_normalize!, String#unicode_normalized?.
11853  */
11854 static VALUE
11855 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11856 {
11857  return unicode_normalize_common(argc, argv, str, id_normalize);
11858 }
11859 
11860 /*
11861  * call-seq:
11862  * unicode_normalize!(form = :nfc) -> self
11863  *
11864  * Like String#unicode_normalize, except that the normalization
11865  * is performed on +self+.
11866  *
11867  * Related: String#unicode_normalized?.
11868  *
11869  */
11870 static VALUE
11871 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11872 {
11873  return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11874 }
11875 
11876 /* call-seq:
11877  * unicode_normalized?(form = :nfc) -> true or false
11878  *
11879  * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11880  * +false+ otherwise.
11881  * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11882  *
11883  * Examples:
11884  *
11885  * "a\u0300".unicode_normalized? # => false
11886  * "a\u0300".unicode_normalized?(:nfd) # => true
11887  * "\u00E0".unicode_normalized? # => true
11888  * "\u00E0".unicode_normalized?(:nfd) # => false
11889  *
11890  *
11891  * Raises an exception if +self+ is not in a Unicode encoding:
11892  *
11893  * s = "\xE0".force_encoding('ISO-8859-1')
11894  * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11895  *
11896  * Related: String#unicode_normalize, String#unicode_normalize!.
11897  *
11898  */
11899 static VALUE
11900 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11901 {
11902  return unicode_normalize_common(argc, argv, str, id_normalized_p);
11903 }
11904 
11905 /**********************************************************************
11906  * Document-class: Symbol
11907  *
11908  * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11909  *
11910  * You can create a +Symbol+ object explicitly with:
11911  *
11912  * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11913  *
11914  * The same +Symbol+ object will be
11915  * created for a given name or string for the duration of a program's
11916  * execution, regardless of the context or meaning of that name. Thus
11917  * if <code>Fred</code> is a constant in one context, a method in
11918  * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11919  * will be the same object in all three contexts.
11920  *
11921  * module One
11922  * class Fred
11923  * end
11924  * $f1 = :Fred
11925  * end
11926  * module Two
11927  * Fred = 1
11928  * $f2 = :Fred
11929  * end
11930  * def Fred()
11931  * end
11932  * $f3 = :Fred
11933  * $f1.object_id #=> 2514190
11934  * $f2.object_id #=> 2514190
11935  * $f3.object_id #=> 2514190
11936  *
11937  * Constant, method, and variable names are returned as symbols:
11938  *
11939  * module One
11940  * Two = 2
11941  * def three; 3 end
11942  * @four = 4
11943  * @@five = 5
11944  * $six = 6
11945  * end
11946  * seven = 7
11947  *
11948  * One.constants
11949  * # => [:Two]
11950  * One.instance_methods(true)
11951  * # => [:three]
11952  * One.instance_variables
11953  * # => [:@four]
11954  * One.class_variables
11955  * # => [:@@five]
11956  * global_variables.grep(/six/)
11957  * # => [:$six]
11958  * local_variables
11959  * # => [:seven]
11960  *
11961  * A +Symbol+ object differs from a String object in that
11962  * a +Symbol+ object represents an identifier, while a String object
11963  * represents text or data.
11964  *
11965  * == What's Here
11966  *
11967  * First, what's elsewhere. \Class +Symbol+:
11968  *
11969  * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11970  * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11971  *
11972  * Here, class +Symbol+ provides methods that are useful for:
11973  *
11974  * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11975  * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11976  * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11977  *
11978  * === Methods for Querying
11979  *
11980  * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11981  * - #=~: Returns the index of the first substring in symbol that matches a
11982  * given Regexp or other object; returns +nil+ if no match is found.
11983  * - #[], #slice: Returns a substring of symbol
11984  * determined by a given index, start/length, or range, or string.
11985  * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11986  * - #encoding: Returns the Encoding object that represents the encoding
11987  * of symbol.
11988  * - #end_with?: Returns +true+ if symbol ends with
11989  * any of the given strings.
11990  * - #match: Returns a MatchData object if symbol
11991  * matches a given Regexp; +nil+ otherwise.
11992  * - #match?: Returns +true+ if symbol
11993  * matches a given Regexp; +false+ otherwise.
11994  * - #length, #size: Returns the number of characters in symbol.
11995  * - #start_with?: Returns +true+ if symbol starts with
11996  * any of the given strings.
11997  *
11998  * === Methods for Comparing
11999  *
12000  * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
12001  * or larger than symbol.
12002  * - #==, #===: Returns +true+ if a given symbol has the same content and
12003  * encoding.
12004  * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12005  * symbol is smaller than, equal to, or larger than symbol.
12006  * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12007  * after Unicode case folding; +false+ otherwise.
12008  *
12009  * === Methods for Converting
12010  *
12011  * - #capitalize: Returns symbol with the first character upcased
12012  * and all other characters downcased.
12013  * - #downcase: Returns symbol with all characters downcased.
12014  * - #inspect: Returns the string representation of +self+ as a symbol literal.
12015  * - #name: Returns the frozen string corresponding to symbol.
12016  * - #succ, #next: Returns the symbol that is the successor to symbol.
12017  * - #swapcase: Returns symbol with all upcase characters downcased
12018  * and all downcase characters upcased.
12019  * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12020  * - #to_s, #id2name: Returns the string corresponding to +self+.
12021  * - #to_sym, #intern: Returns +self+.
12022  * - #upcase: Returns symbol with all characters upcased.
12023  *
12024  */
12025 
12026 
12027 /*
12028  * call-seq:
12029  * symbol == object -> true or false
12030  *
12031  * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12032  */
12033 
12034 #define sym_equal rb_obj_equal
12035 
12036 static int
12037 sym_printable(const char *s, const char *send, rb_encoding *enc)
12038 {
12039  while (s < send) {
12040  int n;
12041  int c = rb_enc_precise_mbclen(s, send, enc);
12042 
12043  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12044  n = MBCLEN_CHARFOUND_LEN(c);
12045  c = rb_enc_mbc_to_codepoint(s, send, enc);
12046  if (!rb_enc_isprint(c, enc)) return FALSE;
12047  s += n;
12048  }
12049  return TRUE;
12050 }
12051 
12052 int
12053 rb_str_symname_p(VALUE sym)
12054 {
12055  rb_encoding *enc;
12056  const char *ptr;
12057  long len;
12059 
12060  if (resenc == NULL) resenc = rb_default_external_encoding();
12061  enc = STR_ENC_GET(sym);
12062  ptr = RSTRING_PTR(sym);
12063  len = RSTRING_LEN(sym);
12064  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12065  !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12066  return FALSE;
12067  }
12068  return TRUE;
12069 }
12070 
12071 VALUE
12072 rb_str_quote_unprintable(VALUE str)
12073 {
12074  rb_encoding *enc;
12075  const char *ptr;
12076  long len;
12077  rb_encoding *resenc;
12078 
12079  Check_Type(str, T_STRING);
12080  resenc = rb_default_internal_encoding();
12081  if (resenc == NULL) resenc = rb_default_external_encoding();
12082  enc = STR_ENC_GET(str);
12083  ptr = RSTRING_PTR(str);
12084  len = RSTRING_LEN(str);
12085  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12086  !sym_printable(ptr, ptr + len, enc)) {
12087  return rb_str_escape(str);
12088  }
12089  return str;
12090 }
12091 
12092 VALUE
12093 rb_id_quote_unprintable(ID id)
12094 {
12095  VALUE str = rb_id2str(id);
12096  if (!rb_str_symname_p(str)) {
12097  return rb_str_escape(str);
12098  }
12099  return str;
12100 }
12101 
12102 /*
12103  * call-seq:
12104  * inspect -> string
12105  *
12106  * Returns a string representation of +self+ (including the leading colon):
12107  *
12108  * :foo.inspect # => ":foo"
12109  *
12110  * Related: Symbol#to_s, Symbol#name.
12111  *
12112  */
12113 
12114 static VALUE
12115 sym_inspect(VALUE sym)
12116 {
12117  VALUE str = rb_sym2str(sym);
12118  const char *ptr;
12119  long len;
12120  char *dest;
12121 
12122  if (!rb_str_symname_p(str)) {
12123  str = rb_str_inspect(str);
12124  len = RSTRING_LEN(str);
12125  rb_str_resize(str, len + 1);
12126  dest = RSTRING_PTR(str);
12127  memmove(dest + 1, dest, len);
12128  }
12129  else {
12130  rb_encoding *enc = STR_ENC_GET(str);
12131  VALUE orig_str = str;
12132 
12133  len = RSTRING_LEN(orig_str);
12134  str = rb_enc_str_new(0, len + 1, enc);
12135 
12136  // Get data pointer after allocation
12137  ptr = RSTRING_PTR(orig_str);
12138  dest = RSTRING_PTR(str);
12139  memcpy(dest + 1, ptr, len);
12140 
12141  RB_GC_GUARD(orig_str);
12142  }
12143  dest[0] = ':';
12144 
12146 
12147  return str;
12148 }
12149 
12150 VALUE
12152 {
12153  VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12154  FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12155  return str;
12156 }
12157 
12158 VALUE
12159 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12160 {
12161  VALUE obj;
12162 
12163  if (argc < 1) {
12164  rb_raise(rb_eArgError, "no receiver given");
12165  }
12166  obj = argv[0];
12167  return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12168 }
12169 
12170 /*
12171  * call-seq:
12172  * succ
12173  *
12174  * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12175  *
12176  * :foo.succ # => :fop
12177  *
12178  * Related: String#succ.
12179  */
12180 
12181 static VALUE
12182 sym_succ(VALUE sym)
12183 {
12184  return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12185 }
12186 
12187 /*
12188  * call-seq:
12189  * symbol <=> object -> -1, 0, +1, or nil
12190  *
12191  * If +object+ is a symbol,
12192  * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12193  *
12194  * :bar <=> :foo # => -1
12195  * :foo <=> :foo # => 0
12196  * :foo <=> :bar # => 1
12197  *
12198  * Otherwise, returns +nil+:
12199  *
12200  * :foo <=> 'bar' # => nil
12201  *
12202  * Related: String#<=>.
12203  */
12204 
12205 static VALUE
12206 sym_cmp(VALUE sym, VALUE other)
12207 {
12208  if (!SYMBOL_P(other)) {
12209  return Qnil;
12210  }
12211  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12212 }
12213 
12214 /*
12215  * call-seq:
12216  * casecmp(object) -> -1, 0, 1, or nil
12217  *
12218  * :include: doc/symbol/casecmp.rdoc
12219  *
12220  */
12221 
12222 static VALUE
12223 sym_casecmp(VALUE sym, VALUE other)
12224 {
12225  if (!SYMBOL_P(other)) {
12226  return Qnil;
12227  }
12228  return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12229 }
12230 
12231 /*
12232  * call-seq:
12233  * casecmp?(object) -> true, false, or nil
12234  *
12235  * :include: doc/symbol/casecmp_p.rdoc
12236  *
12237  */
12238 
12239 static VALUE
12240 sym_casecmp_p(VALUE sym, VALUE other)
12241 {
12242  if (!SYMBOL_P(other)) {
12243  return Qnil;
12244  }
12245  return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12246 }
12247 
12248 /*
12249  * call-seq:
12250  * symbol =~ object -> integer or nil
12251  *
12252  * Equivalent to <tt>symbol.to_s =~ object</tt>,
12253  * including possible updates to global variables;
12254  * see String#=~.
12255  *
12256  */
12257 
12258 static VALUE
12259 sym_match(VALUE sym, VALUE other)
12260 {
12261  return rb_str_match(rb_sym2str(sym), other);
12262 }
12263 
12264 /*
12265  * call-seq:
12266  * match(pattern, offset = 0) -> matchdata or nil
12267  * match(pattern, offset = 0) {|matchdata| } -> object
12268  *
12269  * Equivalent to <tt>self.to_s.match</tt>,
12270  * including possible updates to global variables;
12271  * see String#match.
12272  *
12273  */
12274 
12275 static VALUE
12276 sym_match_m(int argc, VALUE *argv, VALUE sym)
12277 {
12278  return rb_str_match_m(argc, argv, rb_sym2str(sym));
12279 }
12280 
12281 /*
12282  * call-seq:
12283  * match?(pattern, offset = 0) -> true or false
12284  *
12285  * Equivalent to <tt>sym.to_s.match?</tt>;
12286  * see String#match?.
12287  *
12288  */
12289 
12290 static VALUE
12291 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12292 {
12293  return rb_str_match_m_p(argc, argv, sym);
12294 }
12295 
12296 /*
12297  * call-seq:
12298  * symbol[index] -> string or nil
12299  * symbol[start, length] -> string or nil
12300  * symbol[range] -> string or nil
12301  * symbol[regexp, capture = 0] -> string or nil
12302  * symbol[substring] -> string or nil
12303  *
12304  * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12305  *
12306  */
12307 
12308 static VALUE
12309 sym_aref(int argc, VALUE *argv, VALUE sym)
12310 {
12311  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12312 }
12313 
12314 /*
12315  * call-seq:
12316  * length -> integer
12317  *
12318  * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12319  */
12320 
12321 static VALUE
12322 sym_length(VALUE sym)
12323 {
12324  return rb_str_length(rb_sym2str(sym));
12325 }
12326 
12327 /*
12328  * call-seq:
12329  * empty? -> true or false
12330  *
12331  * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12332  *
12333  */
12334 
12335 static VALUE
12336 sym_empty(VALUE sym)
12337 {
12338  return rb_str_empty(rb_sym2str(sym));
12339 }
12340 
12341 /*
12342  * call-seq:
12343  * upcase(*options) -> symbol
12344  *
12345  * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12346  *
12347  * See String#upcase.
12348  *
12349  */
12350 
12351 static VALUE
12352 sym_upcase(int argc, VALUE *argv, VALUE sym)
12353 {
12354  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12355 }
12356 
12357 /*
12358  * call-seq:
12359  * downcase(*options) -> symbol
12360  *
12361  * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12362  *
12363  * See String#downcase.
12364  *
12365  * Related: Symbol#upcase.
12366  *
12367  */
12368 
12369 static VALUE
12370 sym_downcase(int argc, VALUE *argv, VALUE sym)
12371 {
12372  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12373 }
12374 
12375 /*
12376  * call-seq:
12377  * capitalize(*options) -> symbol
12378  *
12379  * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12380  *
12381  * See String#capitalize.
12382  *
12383  */
12384 
12385 static VALUE
12386 sym_capitalize(int argc, VALUE *argv, VALUE sym)
12387 {
12388  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12389 }
12390 
12391 /*
12392  * call-seq:
12393  * swapcase(*options) -> symbol
12394  *
12395  * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12396  *
12397  * See String#swapcase.
12398  *
12399  */
12400 
12401 static VALUE
12402 sym_swapcase(int argc, VALUE *argv, VALUE sym)
12403 {
12404  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12405 }
12406 
12407 /*
12408  * call-seq:
12409  * start_with?(*string_or_regexp) -> true or false
12410  *
12411  * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12412  *
12413  */
12414 
12415 static VALUE
12416 sym_start_with(int argc, VALUE *argv, VALUE sym)
12417 {
12418  return rb_str_start_with(argc, argv, rb_sym2str(sym));
12419 }
12420 
12421 /*
12422  * call-seq:
12423  * end_with?(*strings) -> true or false
12424  *
12425  *
12426  * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12427  *
12428  */
12429 
12430 static VALUE
12431 sym_end_with(int argc, VALUE *argv, VALUE sym)
12432 {
12433  return rb_str_end_with(argc, argv, rb_sym2str(sym));
12434 }
12435 
12436 /*
12437  * call-seq:
12438  * encoding -> encoding
12439  *
12440  * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12441  *
12442  */
12443 
12444 static VALUE
12445 sym_encoding(VALUE sym)
12446 {
12447  return rb_obj_encoding(rb_sym2str(sym));
12448 }
12449 
12450 static VALUE
12451 string_for_symbol(VALUE name)
12452 {
12453  if (!RB_TYPE_P(name, T_STRING)) {
12454  VALUE tmp = rb_check_string_type(name);
12455  if (NIL_P(tmp)) {
12456  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12457  name);
12458  }
12459  name = tmp;
12460  }
12461  return name;
12462 }
12463 
12464 ID
12466 {
12467  if (SYMBOL_P(name)) {
12468  return SYM2ID(name);
12469  }
12470  name = string_for_symbol(name);
12471  return rb_intern_str(name);
12472 }
12473 
12474 VALUE
12476 {
12477  if (SYMBOL_P(name)) {
12478  return name;
12479  }
12480  name = string_for_symbol(name);
12481  return rb_str_intern(name);
12482 }
12483 
12484 /*
12485  * call-seq:
12486  * Symbol.all_symbols -> array_of_symbols
12487  *
12488  * Returns an array of all symbols currently in Ruby's symbol table:
12489  *
12490  * Symbol.all_symbols.size # => 9334
12491  * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12492  *
12493  */
12494 
12495 static VALUE
12496 sym_all_symbols(VALUE _)
12497 {
12498  return rb_sym_all_symbols();
12499 }
12500 
12501 VALUE
12503 {
12504  return rb_fstring(str);
12505 }
12506 
12507 VALUE
12508 rb_interned_str(const char *ptr, long len)
12509 {
12510  struct RString fake_str;
12511  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12512 }
12513 
12514 VALUE
12516 {
12517  return rb_interned_str(ptr, strlen(ptr));
12518 }
12519 
12520 VALUE
12521 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12522 {
12523  if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12524  rb_enc_autoload(enc);
12525  }
12526 
12527  struct RString fake_str;
12528  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12529 }
12530 
12531 VALUE
12532 rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12533 {
12534  if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12535  rb_enc_autoload(enc);
12536  }
12537 
12538  struct RString fake_str;
12539  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12540 }
12541 
12542 VALUE
12544 {
12545  return rb_enc_interned_str(ptr, strlen(ptr), enc);
12546 }
12547 
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.  A single byte is appended directly when
 * the fast-path conditions hold; every other case goes through
 * rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    /*
     * NOTE(review): the opening of this guard was absent and the braces
     * did not balance; it is restored as a binary-encoding check, since
     * appending a raw byte is only safe there -- confirm against upstream.
     */
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t code = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(code >= 0 && code < 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12564 
12565 void
12566 Init_String(void)
12567 {
12568  rb_cString = rb_define_class("String", rb_cObject);
12569  RUBY_ASSERT(rb_vm_fstring_table());
12570  st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12572  rb_define_alloc_func(rb_cString, empty_str_alloc);
12573  rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12574  rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12575  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12576  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12577  rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12580  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12581  rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12582  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12583  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12586  rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12587  rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12588  rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12589  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12590  rb_define_method(rb_cString, "length", rb_str_length, 0);
12592  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12593  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12594  rb_define_method(rb_cString, "=~", rb_str_match, 1);
12595  rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12596  rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12598  rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12600  rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12601  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12602  rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12603  rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12604  rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12605  rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12606  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12607  rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12608  rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12609  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12610  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12611  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12612  rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12613  rb_define_method(rb_cString, "scrub", str_scrub, -1);
12614  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12615  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12616  rb_define_method(rb_cString, "+@", str_uplus, 0);
12617  rb_define_method(rb_cString, "-@", str_uminus, 0);
12618  rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12619  rb_define_alias(rb_cString, "dedup", "-@");
12620 
12621  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12622  rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12623  rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12624  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12625  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12627  rb_define_method(rb_cString, "undump", str_undump, 0);
12628 
12629  sym_ascii = ID2SYM(rb_intern_const("ascii"));
12630  sym_turkic = ID2SYM(rb_intern_const("turkic"));
12631  sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12632  sym_fold = ID2SYM(rb_intern_const("fold"));
12633 
12634  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12635  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12636  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12637  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12638 
12639  rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12640  rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12641  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12642  rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12643 
12644  rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12645  rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12646  rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12647  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12648  rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12649  rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12650  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12651  rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12652  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12653  rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12654  rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12655  rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12657  rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12658  rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12659  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12660  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12661  rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12662 
12663  rb_define_method(rb_cString, "include?", rb_str_include, 1);
12664  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12665  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12666 
12667  rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12668 
12669  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12670  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12671  rb_define_method(rb_cString, "center", rb_str_center, -1);
12672 
12673  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12674  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12675  rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12676  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12677  rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12678  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12679  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12680  rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12681  rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12682 
12683  rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12684  rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12685  rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12686  rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12687  rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12688  rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12689  rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12690  rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12691  rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12692 
12693  rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12694  rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12695  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12696  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12697  rb_define_method(rb_cString, "count", rb_str_count, -1);
12698 
12699  rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12700  rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12701  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12702  rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12703 
12704  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12705  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12706  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12707  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12708  rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12709 
12710  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12711 
12712  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12713  rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12714 
12715  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12716  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12717 
12718  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12719  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12720  rb_define_method(rb_cString, "b", rb_str_b, 0);
12721  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12722  rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12723 
12724  /* define UnicodeNormalize module here so that we don't have to look it up */
12725  mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12726  id_normalize = rb_intern_const("normalize");
12727  id_normalized_p = rb_intern_const("normalized?");
12728 
12729  rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12730  rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12731  rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12732 
12733  rb_fs = Qnil;
12734  rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12735  rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12737 
12738  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12742  rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12743 
12744  rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12745  rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12746  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12747  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12748  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12749  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12750 
12751  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12752  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12753  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12754  rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12755 
12756  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12757  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12758  rb_define_method(rb_cSymbol, "length", sym_length, 0);
12759  rb_define_method(rb_cSymbol, "size", sym_length, 0);
12760  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12761  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12762  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12763 
12764  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12765  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12766  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12767  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12768 
12769  rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12770  rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12771 
12772  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12773 }
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that is not affected by RUBY_DEBUG; it always asserts.
Definition: assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG is truthy or the built-in type of obj is type.
Definition: assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition: assert.h:219
Atomic operations.
#define RB_LIKELY(x)
Asserts that the given Boolean expression likely holds.
Definition: assume.h:43
#define RB_UNLIKELY(x)
Asserts that the given Boolean expression likely doesn't hold.
Definition: assume.h:50
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition: coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition: coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition: ctype.h:395
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition: ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition: ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition: ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition: ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition: fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition: fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition: fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition: class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition: class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Undefines a method of the given class.
Definition: class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition: class.c:2635
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:2142
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:916
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition: class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition: value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition: value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition: fl_type.h:134
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition: fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition: memory.h:399
#define ISSPACE
Old name of rb_isspace.
Definition: ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition: coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition: coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:137
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition: assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition: value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition: value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition: symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition: coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition: size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition: fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition: encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition: long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition: coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition: memory.h:396
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:394
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition: fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition: fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition: array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition: long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition: fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition: ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition: encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition: st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition: encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition: fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition: long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition: util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition: memory.h:400
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition: fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition: double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition: ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition: encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition: fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition: fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition: fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition: int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition: encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition: coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition: fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition: encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition: error.c:476
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition: error.c:3635
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:676
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition: error.c:3747
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:1089
VALUE rb_eRangeError
RangeError exception.
Definition: error.c:1412
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1408
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition: error.c:3686
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition: error.c:1415
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1406
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1409
VALUE rb_eIndexError
IndexError exception.
Definition: error.c:1410
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:1045
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition: error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition: object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition: object.c:2093
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
Definition: object.c:2111
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition: object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition: object.c:3479
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition: object.c:576
VALUE rb_cSymbol
Symbol class.
Definition: string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition: object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition: object.c:1260
VALUE rb_mComparable
Comparable module.
Definition: compar.c:19
VALUE rb_cString
String class.
Definition: string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:3188
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition: gc.h:603
Encoding-related APIs.
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1523
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1589
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:197
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1191
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:920
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1022
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1487
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes required to represent the passed code point using the passed encoding.
Definition: encoding.c:1241
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:683
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1149
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1481
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1173
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1469
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1140
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1227
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:638
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1676
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:191
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:704
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:323
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:986
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
Definition: encoding.c:1463
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1062
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:182
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:768
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1475
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:662
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:571
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:643
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:402
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:447
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1179
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:994
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1028
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition: encoding.h:99
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:591
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:417
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:726
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:432
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:619
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1203
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1493
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1537
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1285
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition: string.c:2930
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:900
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition: string.c:1150
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition: string.c:1169
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns an "f"string.
Definition: string.c:12521
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition: re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition: string.c:2260
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition: string.c:3612
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1098
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:1068
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition: string.c:1390
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition: string.c:1291
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:919
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1163
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns an "f"string.
Definition: string.c:12543
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:784
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition: symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2914
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2651
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition: vm_eval.c:1099
VALUE rb_funcallv(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcall(), except it takes the method arguments as a C array.
Definition: vm_eval.c:1058
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition: vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition: gc.h:479
void rb_gc_register_address(VALUE *valptr)
Inform the garbage collector that the global or static variable pointed by valptr stores a live Ruby ...
Definition: gc.c:2829
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:1014
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:747
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should allocate.
Definition: array.c:741
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
Definition: array.c:1384
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
Definition: array.c:648
VALUE rb_ary_new_from_args(long n,...)
Constructs an array from the passed objects.
Definition: array.c:753
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Identical to rb_cstr2inum(), except it takes Ruby's strings instead of C's.
Definition: bignum.c:4308
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition: enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition: enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition: error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:284
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1864
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2073
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2893
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Identical to rb_hash_aref(), except it always returns RUBY_Qnil for misshits.
Definition: hash.c:2099
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1475
VALUE rb_rs
The record separator character for inputs, or the $/.
Definition: io.c:205
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition: string.c:669
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition: io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition: vm.c:1825
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current process.
Definition: symbol.c:1042
void rb_backref_set(VALUE md)
Updates $~.
Definition: vm.c:1831
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition: range.c:1842
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition: re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition: re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition: re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition: re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition: re.c:1905
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition: string.c:12502
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition: string.c:1677
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition: string.c:1455
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition: string.c:2411
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition: string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition: string.h:939
VALUE rb_utf8_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "UTF-8" encoding.
Definition: string.c:1062
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition: string.c:3677
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition: string.c:1366
VALUE rb_utf8_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "UTF-8" encoding.
Definition: string.c:1092
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition: string.c:12151
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition: string.c:2483
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition: string.c:1342
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1671
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition: string.c:2958
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition: string.c:5269
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition: string.c:4046
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character offsets.
Definition: string.c:3055
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:11450
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition: random.c:1752
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1724
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition: string.c:1132
VALUE rb_str_buf_cat(VALUE, const char *, long)
Just another name of rb_str_cat.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:954
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1461
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1927
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2651
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition: string.c:4032
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition: string.c:3445
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition: string.c:2349
VALUE rb_str_resurrect(VALUE str)
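I guess there is no use case of this function in extension libraries, but this is a routine identical to rb_str_dup(), except it always creates an instance of rb_cString regardless of the given object's class.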
Definition: string.c:1945
VALUE rb_usascii_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "US ASCII" encoding.
Definition: string.c:1056
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition: string.c:6477
VALUE rb_usascii_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "US ASCII" encoding.
Definition: string.c:1086
VALUE rb_str_buf_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition: string.c:3063
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition: string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition: string.c:12515
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1372
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition: string.c:3643
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition: string.c:3005
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition: string.c:4148
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3269
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition: string.c:7198
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition: string.c:2703
VALUE rb_str_buf_new_cstr(const char *ptr)
This is a rb_str_buf_new() + rb_str_buf_cat() combo.
Definition: string.c:1659
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition: string.c:12508
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition: string.c:4102
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition: string.c:3919
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition: string.c:4077
#define rb_strlen_lit(str)
Length of a string literal.
Definition: string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition: string.c:3619
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition: string.c:3178
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition: string.c:5779
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:1050
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition: string.c:11508
VALUE rb_str_dup_frozen(VALUE)
Just another name of rb_str_new_frozen.
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition: string.c:1627
VALUE rb_locale_str_new_cstr(const char *ptr)
Identical to rb_locale_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1360
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition: string.c:2854
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition: string.c:3150
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition: string.c:3252
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1074
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3317
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition: string.c:1144
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2659
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:7312
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition: string.c:1354
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1643
VALUE rb_external_str_new_cstr(const char *ptr)
Identical to rb_external_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1348
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition: string.c:2363
VALUE rb_str_cat_cstr(VALUE dst, const char *src)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:3455
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string by the given number of bytes.
Definition: string.c:5697
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition: string.c:9405
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition: string.c:1138
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition: symbol.c:878
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition: string.c:1786
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1859
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition: variable.c:1876
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2955
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition: vm_method.c:1286
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:823
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition: symbol.c:970
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition: string.c:12475
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: string.c:12465
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: symbol.c:829
VALUE rb_id2str(ID id)
Identical to rb_id2name(), except it returns a Ruby String instead of a C string.
Definition: symbol.c:986
void rb_define_hooked_variable(const char *name, VALUE *var, rb_gvar_getter_t *getter, rb_gvar_setter_t *setter)
Identical to rb_define_virtual_variable(), but can also specify a storage.
Definition: variable.c:707
int capa
Designed capacity of the buffer.
Definition: io.h:11
char * ptr
Pointer to the underlying memory region, of at least capa bytes.
Definition: io.h:2
int off
Offset inside of ptr.
Definition: io.h:5
int len
Length of the buffer.
Definition: io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition: re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition: re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition: re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition: sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition: vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:367
#define ALLOCA_N(type, n)
Definition: memory.h:287
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition: memory.h:355
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:162
VALUE type(ANYARGS)
ANYARGS-ed function type.
Definition: cxxanyargs.hpp:56
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition: rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition: rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition: rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition: rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition: rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition: rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition: string.c:1384
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:442
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:416
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition: rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition: rstring.h:488
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition: string.c:2726
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition: string.c:2831
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition: string.c:2715
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:367
#define RSTRING(obj)
Convenient casting macro.
Definition: rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition: string.c:1378
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition: string.c:1715
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition: rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition: load.c:1416
#define errno
Ractor-aware version of errno.
Definition: ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition: size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition: stdarg.h:35
VALUE flags
Per-object flags.
Definition: rbasic.h:75
Ruby's String.
Definition: rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition: rstring.h:199
union RString::@48 as
String's specific fields.
long len
Length of the string, not including terminating NUL character.
Definition: rstring.h:206
struct RString::@48::@50 embed
Embedded contents.
struct RString::@48::@49 heap
Strings that use separated memory region for contents use this pattern.
VALUE shared
Parent of the string.
Definition: rstring.h:240
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:200
Definition: st.h:79
Definition: string.c:8270
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition: thread.c:298
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition: value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions when the predicate fails.
Definition: value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:376
ruby_value_type
C-level type of an object.
Definition: value_type.h:113
void ruby_xfree(void *ptr)
Deallocates a storage instance.
Definition: gc.c:4299