Ruby  3.4.0dev (2024-12-06 revision 892c46283a5ea4179500d951c9d4866c0051f27b)
string.c (892c46283a5ea4179500d951c9d4866c0051f27b)
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author$
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "ruby/internal/config.h"
15 
16 #include <ctype.h>
17 #include <errno.h>
18 #include <math.h>
19 
20 #ifdef HAVE_UNISTD_H
21 # include <unistd.h>
22 #endif
23 
24 #include "debug_counter.h"
25 #include "encindex.h"
26 #include "id.h"
27 #include "internal.h"
28 #include "internal/array.h"
29 #include "internal/compar.h"
30 #include "internal/compilers.h"
31 #include "internal/encoding.h"
32 #include "internal/error.h"
33 #include "internal/gc.h"
34 #include "internal/numeric.h"
35 #include "internal/object.h"
36 #include "internal/proc.h"
37 #include "internal/re.h"
38 #include "internal/sanitizers.h"
39 #include "internal/string.h"
40 #include "internal/transcode.h"
41 #include "probes.h"
42 #include "ruby/encoding.h"
43 #include "ruby/re.h"
44 #include "ruby/util.h"
45 #include "ruby_assert.h"
46 #include "vm_sync.h"
47 
48 #if defined HAVE_CRYPT_R
49 # if defined HAVE_CRYPT_H
50 # include <crypt.h>
51 # endif
52 #elif !defined HAVE_CRYPT
53 # include "missing/crypt.h"
54 # define HAVE_CRYPT_R 1
55 #endif
56 
57 #define BEG(no) (regs->beg[(no)])
58 #define END(no) (regs->end[(no)])
59 
60 #undef rb_str_new
61 #undef rb_usascii_str_new
62 #undef rb_utf8_str_new
63 #undef rb_enc_str_new
64 #undef rb_str_new_cstr
65 #undef rb_usascii_str_new_cstr
66 #undef rb_utf8_str_new_cstr
67 #undef rb_enc_str_new_cstr
68 #undef rb_external_str_new_cstr
69 #undef rb_locale_str_new_cstr
70 #undef rb_str_dup_frozen
71 #undef rb_str_buf_new_cstr
72 #undef rb_str_buf_cat
73 #undef rb_str_buf_cat2
74 #undef rb_str_cat2
75 #undef rb_str_cat_cstr
76 #undef rb_fstring_cstr
77 
80 
81 /* Flags of RString
82  *
83  * 0: STR_SHARED (equal to ELTS_SHARED)
84  * The string is shared. The buffer this string points to is owned by
85  * another string (the shared root).
86  * 1: RSTRING_NOEMBED
87  * The string is not embedded. When a string is embedded, the contents
88  * follow the header. When a string is not embedded, the contents are
89  * stored in a separately allocated buffer.
90  * 2: STR_CHILLED_LITERAL (will be frozen in a future version)
91  * The string was allocated as a literal in a file without an explicit `frozen_string_literal` comment.
92  * It emits a deprecation warning when mutated for the first time.
93  * 3: STR_CHILLED_SYMBOL_TO_S (will be frozen in a future version)
94  * The string was allocated by the `Symbol#to_s` method.
95  * It emits a deprecation warning when mutated for the first time.
96  * 4: STR_PRECOMPUTED_HASH
97  * The string is embedded and has its precomputed hashcode stored
98  * after the terminator.
99  * 5: STR_SHARED_ROOT
100  * Other strings may point to the contents of this string. When this
101  * flag is set, STR_SHARED must not be set.
102  * 6: STR_BORROWED
103  * When RSTRING_NOEMBED is set and klass is 0, this string is unsafe
104  * to be unshared by rb_str_tmp_frozen_release.
105  * 7: STR_TMPLOCK
106  * The pointer to the buffer is passed to a system call such as
107  * read(2). Any modification and realloc is prohibited.
108  * 8-9: ENC_CODERANGE
109  * Stores the coderange of the string.
110  * 10-16: ENCODING
111  * Stores the encoding of the string.
112  * 17: RSTRING_FSTR
113  * The string is a fstring. The string is deduplicated in the fstring
114  * table.
115  * 18: STR_NOFREE
116  * Do not free this string's buffer when the string is reclaimed
117  * by the garbage collector. Used for when the string buffer is a C
118  * string literal.
119  * 19: STR_FAKESTR
120  * The string is not allocated or managed by the garbage collector.
121  * Typically, the string object header (struct RString) is temporarily
122  * allocated on C stack.
123  */
124 
125 #define RUBY_MAX_CHAR_LEN 16
/* RString-specific aliases for the user flag bits 4-7, 18 and 19 listed in
 * the flag table above. */
126 #define STR_PRECOMPUTED_HASH FL_USER4
127 #define STR_SHARED_ROOT FL_USER5
128 #define STR_BORROWED FL_USER6
129 #define STR_TMPLOCK FL_USER7
130 #define STR_NOFREE FL_USER18
131 #define STR_FAKESTR FL_USER19
132 
/* Mark +str+ as non-embedded: sets STR_NOEMBED and clears the
 * STR_SHARED, STR_SHARED_ROOT and STR_BORROWED flags. */
133 #define STR_SET_NOEMBED(str) do {\
134  FL_SET((str), STR_NOEMBED);\
135  FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
136 } while (0)
/* Mark +str+ as embedded by clearing STR_NOEMBED, STR_SHARED and STR_NOFREE. */
137 #define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED | STR_SHARED | STR_NOFREE)
138 
/* Store +n+ in the len field of +str+; no other field is touched. */
139 #define STR_SET_LEN(str, n) do { \
140  RSTRING(str)->len = (n); \
141 } while (0)
142 
143 static inline bool
144 str_encindex_fastpath(int encindex)
145 {
146  // The overwhelming majority of strings are in one of these 3 encodings.
147  switch (encindex) {
148  case ENCINDEX_ASCII_8BIT:
149  case ENCINDEX_UTF_8:
150  case ENCINDEX_US_ASCII:
151  return true;
152  default:
153  return false;
154  }
155 }
156 
157 static inline bool
158 str_enc_fastpath(VALUE str)
159 {
160  return str_encindex_fastpath(ENCODING_GET_INLINED(str));
161 }
162 
/* Terminator length of +str+ in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
163 #define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of +termlen+ bytes at +ptr+. The first byte is always
 * zeroed; the memset only runs for terminators wider than one byte.
 * Both arguments are evaluated exactly once. */
164 #define TERM_FILL(ptr, termlen) do {\
165  char *const term_fill_ptr = (ptr);\
166  const int term_fill_len = (termlen);\
167  *term_fill_ptr = '\0';\
168  if (UNLIKELY(term_fill_len > 1))\
169  memset(term_fill_ptr, 0, term_fill_len);\
170 } while (0)
171 
/* Resize the buffer of +str+ so it can hold +capacity+ bytes plus the
 * terminator whose length is derived from the string's encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of +str+ to +capacity+ bytes plus +termlen+ terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the embedded area is large
 *   enough; otherwise the contents are copied into a freshly allocated heap
 *   buffer and the string is switched to non-embedded.
 * - Heap string: the buffer is reallocated in place. The string must not be
 *   shared (asserted).
 *
 * +capacity+ and +termlen+ are parenthesized at every use so that compound
 * argument expressions cannot change the meaning of the comparison.
 * Note that +str+ and +capacity+ are still evaluated more than once. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        RUBY_ASSERT(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
195 
/* Make +str+ share the buffer of +shared_str+. Does nothing when +str+ is a
 * fake string (STR_FAKESTR). Otherwise: asserts that the pointer of +str+
 * lies within the buffer of +shared_str+, records +shared_str+ in
 * as.heap.aux.shared through the write barrier, flags +str+ as STR_SHARED
 * and +shared_str+ as STR_SHARED_ROOT, and additionally flags +shared_str+
 * as STR_BORROWED when it has no class (klass == 0). */
196 #define STR_SET_SHARED(str, shared_str) do { \
197  if (!FL_TEST(str, STR_FAKESTR)) { \
198  RUBY_ASSERT(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
199  RUBY_ASSERT(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
200  RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
201  FL_SET((str), STR_SHARED); \
202  FL_SET((shared_str), STR_SHARED_ROOT); \
203  if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
204  FL_SET_RAW((shared_str), STR_BORROWED); \
205  } \
206 } while (0)
207 
/* Heap buffer pointer of a non-embedded string. */
208 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: the recorded capacity plus the terminator
 * length (capa itself does not include the terminator). */
209 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
210 /* TODO: include the terminator size in capa. */
211 
212 #define STR_ENC_GET(str) get_encoding(str)
213 
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0. In that configuration
 * SHARABLE_SUBSTRING_P only holds when the range [beg, beg+len) ends exactly
 * at +end+; when the option is enabled the predicate is always 1. */
214 #if !defined SHARABLE_MIDDLE_SUBSTRING
215 # define SHARABLE_MIDDLE_SUBSTRING 0
216 #endif
217 #if !SHARABLE_MIDDLE_SUBSTRING
218 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
219 #else
220 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
221 #endif
222 
223 
224 static inline long
225 str_embed_capa(VALUE str)
226 {
227  return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
228 }
229 
230 bool
231 rb_str_reembeddable_p(VALUE str)
232 {
233  return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
234 }
235 
236 static inline size_t
237 rb_str_embed_size(long capa)
238 {
239  return offsetof(struct RString, as.embed.ary) + capa;
240 }
241 
242 size_t
243 rb_str_size_as_embedded(VALUE str)
244 {
245  size_t real_size;
246  if (STR_EMBED_P(str)) {
247  real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
248  }
249  /* if the string is not currently embedded, but it can be embedded, how
250  * much space would it require */
251  else if (rb_str_reembeddable_p(str)) {
252  real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
253  }
254  else {
255  real_size = sizeof(struct RString);
256  }
257 
258  if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
259  real_size += sizeof(st_index_t);
260  }
261 
262  return real_size;
263 }
264 
/* True when a string of +len+ bytes plus a +termlen+-byte terminator fits
 * in an object size the GC is able to allocate. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);
    return rb_gc_size_allocatable_p(needed);
}
270 
271 static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
272 static VALUE str_new_frozen(VALUE klass, VALUE orig);
273 static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
274 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
275 static VALUE str_new(VALUE klass, const char *ptr, long len);
276 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
277 static inline void str_modifiable(VALUE str);
278 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
279 static inline VALUE str_alloc_embed(VALUE klass, size_t capa);
280 
281 static inline void
282 str_make_independent(VALUE str)
283 {
284  long len = RSTRING_LEN(str);
285  int termlen = TERM_LEN(str);
286  str_make_independent_expand((str), len, 0L, termlen);
287 }
288 
289 static inline int str_dependent_p(VALUE str);
290 
291 void
292 rb_str_make_independent(VALUE str)
293 {
294  if (str_dependent_p(str)) {
295  str_make_independent(str);
296  }
297 }
298 
299 void
300 rb_str_make_embedded(VALUE str)
301 {
302  RUBY_ASSERT(rb_str_reembeddable_p(str));
303  RUBY_ASSERT(!STR_EMBED_P(str));
304 
305  char *buf = RSTRING(str)->as.heap.ptr;
306  long len = RSTRING(str)->len;
307 
308  STR_SET_EMBED(str);
309  STR_SET_LEN(str, len);
310 
311  if (len > 0) {
312  memcpy(RSTRING_PTR(str), buf, len);
313  ruby_xfree(buf);
314  }
315 
316  TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
317 }
318 
/* Print a diagnostic to stderr saying that +func+ is about to return NULL.
 * Only reports; it does not abort or raise. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
328 
329 /* symbols for [up|down|swap]case/capitalize options */
330 static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
331 
332 static rb_encoding *
333 get_encoding(VALUE str)
334 {
335  return rb_enc_from_index(ENCODING_GET(str));
336 }
337 
338 static void
339 mustnot_broken(VALUE str)
340 {
341  if (is_broken_string(str)) {
342  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
343  }
344 }
345 
346 static void
347 mustnot_wchar(VALUE str)
348 {
349  rb_encoding *enc = STR_ENC_GET(str);
350  if (rb_enc_mbminlen(enc) > 1) {
351  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
352  }
353 }
354 
355 static int fstring_cmp(VALUE a, VALUE b);
356 
357 static VALUE register_fstring(VALUE str, bool copy, bool force_precompute_hash);
358 
359 #if SIZEOF_LONG == SIZEOF_VOIDP
360 #define PRECOMPUTED_FAKESTR_HASH 1
361 #else
362 #endif
363 
#ifdef PRECOMPUTED_FAKESTR_HASH
/* Hash function of the fstring table. For fake strings the hash was already
 * computed by register_fstring and stashed in the capa field, so it is read
 * back from there; every other string is hashed with rb_str_hash. */
static st_index_t
fstring_hash(VALUE str)
{
    if (!FL_TEST_RAW(str, STR_FAKESTR)) {
        return rb_str_hash(str);
    }
    return (st_index_t)RSTRING(str)->as.heap.aux.capa;
}
#else
/* No stashed hash in this configuration: always hash the contents. */
#define fstring_hash rb_str_hash
#endif
379 
/* st_table callbacks for the fstring table: fstring_cmp as the key
 * comparison and fstring_hash as the hash function, given positionally. */
380 const struct st_hash_type rb_fstring_hash_type = {
381  fstring_cmp,
382  fstring_hash,
383 };
384 
385 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
386 
387 static inline st_index_t
388 str_do_hash(VALUE str)
389 {
390  st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
391  int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
392  if (e && !is_ascii_string(str)) {
393  h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
394  }
395  return h;
396 }
397 
398 static VALUE
399 str_store_precomputed_hash(VALUE str, st_index_t hash)
400 {
401  RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
402  RUBY_ASSERT(STR_EMBED_P(str));
403 
404 #if RUBY_DEBUG
405  size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
406  size_t free_bytes = str_embed_capa(str) - used_bytes;
407  RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
408 #endif
409 
410  memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
411 
412  FL_SET(str, STR_PRECOMPUTED_HASH);
413 
414  return str;
415 }
416 
/* NOTE(review): the opening line of this struct is not present in this
 * listing (numbering jumps from 415 to 418). fstr_update_callback() casts
 * its data argument to `struct fstr_update_arg *` and uses these fields. */
418  VALUE fstr; /* result slot: set by the callback to the table entry, or Qundef to retry */
419  bool copy; /* fake strings: copy the bytes into a new string instead of wrapping the pointer */
420  bool force_precompute_hash; /* with copy: prefer an embedded string that has room for the hash */
421 };
422 
/* st_update() callback used by register_fstring().
 *
 * key/value: table slot being looked up or inserted; data: pointer to a
 * struct fstr_update_arg; existing: non-zero when the key is already present.
 *
 * Existing entry: if the stored string is already garbage, report Qundef and
 * delete the entry (the caller loops and retries); otherwise return it via
 * arg->fstr and stop.
 * New entry: build the string to be stored (see branches below), restore the
 * coderange captured from the key, set RSTRING_FSTR, and publish it as key,
 * value and arg->fstr. */
423 static int
424 fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
425 {
426  struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
427  VALUE str = (VALUE)*key;
428 
429  if (existing) {
430  /* because of lazy sweep, str may be unmarked already and swept
431  * at next time */
432 
433  if (rb_objspace_garbage_object_p(str)) {
434  arg->fstr = Qundef;
435  return ST_DELETE;
436  }
437 
438  arg->fstr = str;
439  return ST_STOP;
440  }
441  else {
442  // Unless the string is empty or binary, its coderange has been precomputed.
443  int coderange = ENC_CODERANGE(str);
444 
/* Fake string key: a real string object has to be created from it. */
445  if (FL_TEST_RAW(str, STR_FAKESTR)) {
446  if (arg->copy) {
447  VALUE new_str;
448  long len = RSTRING_LEN(str);
/* capacity that leaves room for the hash after the contents */
449  long capa = len + sizeof(st_index_t);
450  int term_len = TERM_LEN(str);
451 
452  if (arg->force_precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
453  new_str = str_alloc_embed(rb_cString, capa + term_len);
454  memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
455  STR_SET_LEN(new_str, RSTRING_LEN(str));
456  TERM_FILL(RSTRING_END(new_str), TERM_LEN(str));
457  rb_enc_copy(new_str, str);
458  str_store_precomputed_hash(new_str, fstring_hash(str));
459  }
460  else {
461  new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
462  rb_enc_copy(new_str, str);
463 #ifdef PRECOMPUTED_FAKESTR_HASH
/* Store the hash only when the new string happens to have enough
 * spare capacity; the hash value is the one stashed in the fake
 * string's capa field. */
464  if (rb_str_capacity(new_str) >= RSTRING_LEN(str) + term_len + sizeof(st_index_t)) {
465  str_store_precomputed_hash(new_str, (st_index_t)RSTRING(str)->as.heap.aux.capa);
466  }
467 #endif
468  }
469  str = new_str;
470  }
471  else {
/* No copy requested: wrap the fake string's pointer directly. */
472  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
473  RSTRING(str)->len,
474  ENCODING_GET(str));
475  }
476  OBJ_FREEZE(str);
477  }
478  else {
/* Real string key: make sure what gets stored is frozen, not shared,
 * and a bare String (see BARE_STRING_P). */
479  if (!OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
480  str = str_new_frozen(rb_cString, str);
481  }
482  if (STR_SHARED_P(str)) { /* str should not be shared */
483  /* shared substring */
484  str_make_independent(str);
485  RUBY_ASSERT(OBJ_FROZEN(str));
486  }
487  if (!BARE_STRING_P(str)) {
488  str = str_new_frozen(rb_cString, str);
489  }
490  }
491 
492  ENC_CODERANGE_SET(str, coderange);
493  RBASIC(str)->flags |= RSTRING_FSTR;
494 
495  *key = *value = arg->fstr = str;
496  return ST_CONTINUE;
497  }
498 }
499 
500 VALUE
501 rb_fstring(VALUE str)
502 {
503  VALUE fstr;
504  int bare;
505 
506  Check_Type(str, T_STRING);
507 
508  if (FL_TEST(str, RSTRING_FSTR))
509  return str;
510 
511  bare = BARE_STRING_P(str);
512  if (!bare) {
513  if (STR_EMBED_P(str)) {
514  OBJ_FREEZE(str);
515  return str;
516  }
517 
518  if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
519  RUBY_ASSERT(OBJ_FROZEN(str));
520  return str;
521  }
522  }
523 
524  if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
525  rb_str_resize(str, RSTRING_LEN(str));
526 
527  fstr = register_fstring(str, false, false);
528 
529  if (!bare) {
530  str_replace_shared_without_enc(str, fstr);
531  OBJ_FREEZE(str);
532  return str;
533  }
534  return fstr;
535 }
536 
537 static VALUE
538 register_fstring(VALUE str, bool copy, bool force_precompute_hash)
539 {
540  struct fstr_update_arg args = {
541  .copy = copy,
542  .force_precompute_hash = force_precompute_hash
543  };
544 
545 #if SIZEOF_VOIDP == SIZEOF_LONG
546  if (FL_TEST_RAW(str, STR_FAKESTR)) {
547  // if the string hasn't been interned, we'll need the hash twice, so we
548  // compute it once and store it in capa
549  RSTRING(str)->as.heap.aux.capa = (long)str_do_hash(str);
550  }
551 #endif
552 
553  RB_VM_LOCK_ENTER();
554  {
555  st_table *frozen_strings = rb_vm_fstring_table();
556  do {
557  args.fstr = str;
558  st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
559  } while (UNDEF_P(args.fstr));
560  }
561  RB_VM_LOCK_LEAVE();
562 
563  RUBY_ASSERT(OBJ_FROZEN(args.fstr));
564  RUBY_ASSERT(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
565  RUBY_ASSERT(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
566  RUBY_ASSERT(RBASIC_CLASS(args.fstr) == rb_cString);
567 
568  return args.fstr;
569 }
570 
571 static VALUE
572 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
573 {
574  fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
575 
576  if (!name) {
577  RUBY_ASSERT_ALWAYS(len == 0);
578  name = "";
579  }
580 
581  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
582 
583  RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
584  fake_str->len = len;
585  fake_str->as.heap.ptr = (char *)name;
586  fake_str->as.heap.aux.capa = len;
587  return (VALUE)fake_str;
588 }
589 
590 /*
591  * set up a fake string which refers a static string literal.
592  */
593 VALUE
594 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
595 {
596  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
597 }
598 
599 /*
600  * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
601  * shared string which refers a static string literal. `ptr` must
602  * point a constant string.
603  */
604 VALUE
605 rb_fstring_new(const char *ptr, long len)
606 {
607  struct RString fake_str;
608  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
609 }
610 
611 VALUE
612 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
613 {
614  struct RString fake_str;
615  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
616 }
617 
618 VALUE
619 rb_fstring_cstr(const char *ptr)
620 {
621  return rb_fstring_new(ptr, strlen(ptr));
622 }
623 
624 static int
625 fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
626 {
627  RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
628  return ST_CONTINUE;
629 }
630 
631 static int
632 fstring_cmp(VALUE a, VALUE b)
633 {
634  long alen, blen;
635  const char *aptr, *bptr;
636  RSTRING_GETMEM(a, aptr, alen);
637  RSTRING_GETMEM(b, bptr, blen);
638  return (alen != blen ||
639  ENCODING_GET(a) != ENCODING_GET(b) ||
640  memcmp(aptr, bptr, alen) != 0);
641 }
642 
/* Tells whether +str+ can be processed one byte per character.
 * ASCII-8BIT and US-ASCII strings always qualify; otherwise the string
 * qualifies when its coderange is known to be 7bit or its encoding has a
 * maximum character length of 1.
 * NOTE(review): the statement of the UTF-8 case is missing from this listing
 * (numbering jumps from 652 to 654); as shown, the case has no body. */
643 static inline bool
644 single_byte_optimizable(VALUE str)
645 {
646  int encindex = ENCODING_GET(str);
647  switch (encindex) {
648  case ENCINDEX_ASCII_8BIT:
649  case ENCINDEX_US_ASCII:
650  return true;
651  case ENCINDEX_UTF_8:
652  // For UTF-8 it's worth scanning the string coderange when unknown.
654  }
655  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
656  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
657  return true;
658  }
659 
660  if (rb_enc_mbmaxlen(rb_enc_from_index(encindex)) == 1) {
661  return true;
662  }
663 
664  /* Conservative. Possibly single byte.
665  * "\xa1" in Shift_JIS for example. */
666  return false;
667 }
668 
670 
/* Find the first byte in [p, e) whose high bit (0x80) is set.
 * Returns a pointer to that byte, or NULL when every byte is below 0x80.
 *
 * Three phases: (1) when unaligned word access is not available, leading
 * bytes are tested one at a time until p is word-aligned; (2) whole words
 * are tested against NONASCII_MASK; (3) the remaining tail bytes are tested
 * individually. The switch statements fall through on purpose. */
671 static inline const char *
672 search_nonascii(const char *p, const char *e)
673 {
674  const uintptr_t *s, *t;
675 
/* NONASCII_MASK: 0x80 repeated in every byte of a uintptr_t. */
676 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
677 # if SIZEOF_UINTPTR_T == 8
678 # define NONASCII_MASK UINT64_C(0x8080808080808080)
679 # elif SIZEOF_UINTPTR_T == 4
680 # define NONASCII_MASK UINT32_C(0x80808080)
681 # else
682 # error "don't know what to do."
683 # endif
684 #else
685 # if SIZEOF_UINTPTR_T == 8
686 # define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
687 # elif SIZEOF_UINTPTR_T == 4
688 # define NONASCII_MASK 0x80808080UL /* or...? */
689 # else
690 # error "don't know what to do."
691 # endif
692 #endif
693 
694  if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
695 #if !UNALIGNED_WORD_ACCESS
696  if ((uintptr_t)p % SIZEOF_VOIDP) {
/* l = number of bytes up to the next word boundary; p is advanced first
 * and the skipped bytes are then checked through negative offsets. */
697  int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
698  p += l;
699  switch (l) {
700  default: UNREACHABLE;
701 #if SIZEOF_VOIDP > 4
702  case 7: if (p[-7]&0x80) return p-7;
703  case 6: if (p[-6]&0x80) return p-6;
704  case 5: if (p[-5]&0x80) return p-5;
705  case 4: if (p[-4]&0x80) return p-4;
706 #endif
707  case 3: if (p[-3]&0x80) return p-3;
708  case 2: if (p[-2]&0x80) return p-2;
709  case 1: if (p[-1]&0x80) return p-1;
710  case 0: break;
711  }
712  }
713 #endif
714 #if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
715 #define aligned_ptr(value) \
716  __builtin_assume_aligned((value), sizeof(uintptr_t))
717 #else
718 #define aligned_ptr(value) (uintptr_t *)(value)
719 #endif
720  s = aligned_ptr(p);
/* t bounds the word loop so that every word read ends at or before e */
721  t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
722 #undef aligned_ptr
723  for (;s < t; s++) {
724  if (*s & NONASCII_MASK) {
/* The byte position inside the word comes from the leading (big-endian)
 * or trailing (little-endian) zero-bit count divided by 8. */
725 #ifdef WORDS_BIGENDIAN
726  return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
727 #else
728  return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
729 #endif
730  }
731  }
732  p = (const char *)s;
733  }
734 
/* Tail: fewer than one word of bytes is left between p and e. */
735  switch (e - p) {
736  default: UNREACHABLE;
737 #if SIZEOF_VOIDP > 4
738  case 7: if (e[-7]&0x80) return e-7;
739  case 6: if (e[-6]&0x80) return e-6;
740  case 5: if (e[-5]&0x80) return e-5;
741  case 4: if (e[-4]&0x80) return e-4;
742 #endif
743  case 3: if (e[-3]&0x80) return e-3;
744  case 2: if (e[-2]&0x80) return e-2;
745  case 1: if (e[-1]&0x80) return e-1;
746  case 0: return NULL;
747  }
748 }
749 
/* Compute the coderange of the +len+ bytes at +p+ interpreted in +enc+.
 * ASCII-compatible encodings: 7BIT when no byte has the high bit set;
 * otherwise each character starting at a non-ASCII byte is validated with
 * rb_enc_precise_mbclen, giving BROKEN on the first invalid one and VALID if
 * all pass. Other encodings are validated character by character.
 * NOTE(review): the statement that ends the ASCII-8BIT branch is missing
 * from this listing (numbering jumps from 757 to 759). */
750 static int
751 coderange_scan(const char *p, long len, rb_encoding *enc)
752 {
753  const char *e = p + len;
754 
755  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
756  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
757  p = search_nonascii(p, e);
759  }
760 
761  if (rb_enc_asciicompat(enc)) {
762  p = search_nonascii(p, e);
763  if (!p) return ENC_CODERANGE_7BIT;
764  for (;;) {
765  int ret = rb_enc_precise_mbclen(p, e, enc);
766  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
767  p += MBCLEN_CHARFOUND_LEN(ret);
768  if (p == e) break;
/* skip the ASCII run up to the next non-ASCII byte */
769  p = search_nonascii(p, e);
770  if (!p) break;
771  }
772  }
773  else {
774  while (p < e) {
775  int ret = rb_enc_precise_mbclen(p, e, enc);
776  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
777  p += MBCLEN_CHARFOUND_LEN(ret);
778  }
779  }
780  return ENC_CODERANGE_VALID;
781 }
782 
/* Scan [s, e) in +enc+, updating the coderange in *cr, and return the number
 * of bytes consumed. *cr is both input (state from a previous call) and
 * output. When *cr is already BROKEN the whole range is reported as consumed
 * without scanning. On an invalid or incomplete character the offset of that
 * character is returned.
 * NOTE(review): several statements that assign *cr are missing from this
 * listing (numbering skips 795, 807 and 820). */
783 long
784 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
785 {
786  const char *p = s;
787 
788  if (*cr == ENC_CODERANGE_BROKEN)
789  return e - s;
790 
791  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
792  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
793  if (*cr == ENC_CODERANGE_VALID) return e - s;
794  p = search_nonascii(p, e);
796  return e - s;
797  }
798  else if (rb_enc_asciicompat(enc)) {
799  p = search_nonascii(p, e);
800  if (!p) {
/* all ASCII: keep an earlier VALID, otherwise report 7BIT */
801  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
802  return e - s;
803  }
804  for (;;) {
805  int ret = rb_enc_precise_mbclen(p, e, enc);
806  if (!MBCLEN_CHARFOUND_P(ret)) {
808  return p - s;
809  }
810  p += MBCLEN_CHARFOUND_LEN(ret);
811  if (p == e) break;
812  p = search_nonascii(p, e);
813  if (!p) break;
814  }
815  }
816  else {
817  while (p < e) {
818  int ret = rb_enc_precise_mbclen(p, e, enc);
819  if (!MBCLEN_CHARFOUND_P(ret)) {
821  return p - s;
822  }
823  p += MBCLEN_CHARFOUND_LEN(ret);
824  }
825  }
826  *cr = ENC_CODERANGE_VALID;
827  return e - s;
828 }
829 
830 static inline void
831 str_enc_copy(VALUE str1, VALUE str2)
832 {
833  rb_enc_set_index(str1, ENCODING_GET(str2));
834 }
835 
836 /* Like str_enc_copy, but does not check frozen status of str1.
837  * You should use this only if you're certain that str1 is not frozen. */
838 static inline void
839 str_enc_copy_direct(VALUE str1, VALUE str2)
840 {
841  int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
842  if (inlined_encoding == ENCODING_INLINE_MAX) {
843  rb_enc_set_index(str1, rb_enc_get_index(str2));
844  }
845  else {
846  ENCODING_SET_INLINED(str1, inlined_encoding);
847  }
848 }
849 
/* NOTE(review): the statements that set the coderange of +dest+ are missing
 * from this listing (numbering skips 859, 861, 866, 871 and 873), so several
 * if/else/case arms appear empty here. Only the decision structure is
 * visible: an empty +dest+ is handled first, then the coderange of +src+
 * (7BIT, VALID, other) selects the result; in the VALID case the bytes of
 * +dest+ are searched for non-ASCII bytes. */
850 static void
851 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
852 {
853  /* this function is designed for copying encoding and coderange
854  * from src to new string "dest" which is made from the part of src.
855  */
856  str_enc_copy(dest, src);
857  if (RSTRING_LEN(dest) == 0) {
858  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
860  else
862  return;
863  }
864  switch (ENC_CODERANGE(src)) {
865  case ENC_CODERANGE_7BIT:
867  break;
868  case ENC_CODERANGE_VALID:
869  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
870  search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
872  else
874  break;
875  default:
876  break;
877  }
878 }
879 
880 static void
881 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
882 {
883  str_enc_copy(dest, src);
884  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
885 }
886 
887 static int
888 enc_coderange_scan(VALUE str, rb_encoding *enc)
889 {
890  return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
891 }
892 
893 int
894 rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
895 {
896  return enc_coderange_scan(str, enc);
897 }
898 
/* NOTE(review): the line holding this function's name and parameter list is
 * missing from this listing (numbering jumps from 899 to 901); the body
 * uses a variable named `str`.
 * Returns the coderange of str. When it is still UNKNOWN, the string is
 * scanned in its own encoding and the result is cached on the string. */
899 int
901 {
902  int cr = ENC_CODERANGE(str);
903 
904  if (cr == ENC_CODERANGE_UNKNOWN) {
905  cr = enc_coderange_scan(str, get_encoding(str));
906  ENC_CODERANGE_SET(str, cr);
907  }
908  return cr;
909 }
910 
911 static inline bool
912 rb_enc_str_asciicompat(VALUE str)
913 {
914  int encindex = ENCODING_GET_INLINED(str);
915  return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex));
916 }
917 
/* NOTE(review): the signature line and the first case label are missing from
 * this listing (numbering skips 919 and 922). Visible behaviour: a 7BIT
 * coderange yields true, and the default yields false; the arm whose label
 * is missing returns whether the string's encoding is ASCII-compatible and
 * the string is ASCII-only. */
918 int
920 {
921  switch(ENC_CODERANGE(str)) {
923  return rb_enc_str_asciicompat(str) && is_ascii_string(str);
924  case ENC_CODERANGE_7BIT:
925  return true;
926  default:
927  return false;
928  }
929 }
930 
931 static inline void
932 str_mod_check(VALUE s, const char *p, long len)
933 {
934  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
935  rb_raise(rb_eRuntimeError, "string modified");
936  }
937 }
938 
939 static size_t
940 str_capacity(VALUE str, const int termlen)
941 {
942  if (STR_EMBED_P(str)) {
943  return str_embed_capa(str) - termlen;
944  }
945  else if (FL_ANY_RAW(str, STR_SHARED|STR_NOFREE)) {
946  return RSTRING(str)->len;
947  }
948  else {
949  return RSTRING(str)->as.heap.aux.capa;
950  }
951 }
952 
/* NOTE(review): the line holding this function's name and parameter list is
 * missing from this listing (numbering jumps from 953 to 955).
 * Returns str_capacity() of str using the terminator length derived from the
 * string's own encoding. */
953 size_t
955 {
956  return str_capacity(str, TERM_LEN(str));
957 }
958 
959 static inline void
960 must_not_null(const char *ptr)
961 {
962  if (!ptr) {
963  rb_raise(rb_eArgError, "NULL pointer given");
964  }
965 }
966 
/* Allocate an embedded string object of class +klass+ sized to hold +capa+
 * bytes (the caller includes the terminator in +capa+). The computed object
 * size must be non-zero and allocatable by the GC (asserted).
 * NOTE(review): the continuation line of the NEWOBJ_OF call is missing from
 * this listing (numbering jumps from 974 to 976). */
967 static inline VALUE
968 str_alloc_embed(VALUE klass, size_t capa)
969 {
970  size_t size = rb_str_embed_size(capa);
971  RUBY_ASSERT(size > 0);
972  RUBY_ASSERT(rb_gc_size_allocatable_p(size));
973 
974  NEWOBJ_OF(str, struct RString, klass,
976 
977  return (VALUE)str;
978 }
979 
980 static inline VALUE
981 str_alloc_heap(VALUE klass)
982 {
983  NEWOBJ_OF(str, struct RString, klass,
984  T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
985 
986  return (VALUE)str;
987 }
988 
/* Allocate an empty embedded string of class +klass+ and zero its whole
 * embedded area.
 * NOTE(review): one statement before the return is missing from this listing
 * (numbering jumps from 994 to 996). */
990 static inline VALUE
991 empty_str_alloc(VALUE klass)
992 {
993  RUBY_DTRACE_CREATE_HOOK(STRING, 0);
994  VALUE str = str_alloc_embed(klass, 0);
995  memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
996  return str;
997 }
998 
/* Create a new string of class +klass+ with encoding +enc+ and length +len+.
 *
 * ptr: bytes to copy, or NULL to leave the contents uninitialized.
 * len: byte length; a negative value raises ArgumentError.
 * enc: NULL selects ASCII-8BIT.
 *
 * The string is embedded when len plus the encoding's terminator fits in an
 * allocatable slot; otherwise a heap buffer of len + termlen bytes is
 * allocated. The terminator is always written.
 * NOTE(review): the body of the `len == 0` branch is missing from this
 * listing (numbering jumps from 1018 to 1020). */
999 static VALUE
1000 str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
1001 {
1002  VALUE str;
1003 
1004  if (len < 0) {
1005  rb_raise(rb_eArgError, "negative string size (or size too big)");
1006  }
1007 
1008  if (enc == NULL) {
1009  enc = rb_ascii8bit_encoding();
1010  }
1011 
1012  RUBY_DTRACE_CREATE_HOOK(STRING, len);
1013 
1014  int termlen = rb_enc_mbminlen(enc);
1015 
1016  if (STR_EMBEDDABLE_P(len, termlen)) {
1017  str = str_alloc_embed(klass, len + termlen);
1018  if (len == 0) {
1020  }
1021  }
1022  else {
1023  str = str_alloc_heap(klass);
1024  RSTRING(str)->as.heap.aux.capa = len;
1025  /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
1026  * integer overflow. If we can STATIC_ASSERT that, the following
1027  * mul_add_mul can be reverted to a simple ALLOC_N. */
1028  RSTRING(str)->as.heap.ptr =
1029  rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
1030  }
1031 
1032  rb_enc_raw_set(str, enc);
1033 
1034  if (ptr) {
1035  memcpy(RSTRING_PTR(str), ptr, len);
1036  }
1037 
1038  STR_SET_LEN(str, len);
1039  TERM_FILL(RSTRING_PTR(str) + len, termlen);
1040  return str;
1041 }
1042 
1043 static VALUE
1044 str_new(VALUE klass, const char *ptr, long len)
1045 {
1046  return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
1047 }
1048 
1049 VALUE
1050 rb_str_new(const char *ptr, long len)
1051 {
1052  return str_new(rb_cString, ptr, len);
1053 }
1054 
1055 VALUE
1056 rb_usascii_str_new(const char *ptr, long len)
1057 {
1058  return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
1059 }
1060 
1061 VALUE
1062 rb_utf8_str_new(const char *ptr, long len)
1063 {
1064  return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
1065 }
1066 
1067 VALUE
1068 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
1069 {
1070  return str_enc_new(rb_cString, ptr, len, enc);
1071 }
1072 
1073 VALUE
1074 rb_str_new_cstr(const char *ptr)
1075 {
1076  must_not_null(ptr);
1077  /* rb_str_new_cstr() can take pointer from non-malloc-generated
1078  * memory regions, and that cannot be detected by the MSAN. Just
1079  * trust the programmer that the argument passed here is a sane C
1080  * string. */
1081  __msan_unpoison_string(ptr);
1082  return rb_str_new(ptr, strlen(ptr));
1083 }
1084 
/* NOTE(review): the signature line and the body of this function are missing
 * from this listing (numbering skips 1086 and 1088); only the return type
 * and the braces are visible. */
1085 VALUE
1087 {
1089 }
1090 
/* NOTE(review): the signature line and the body of this function are missing
 * from this listing (numbering skips 1092 and 1094); only the return type
 * and the braces are visible. */
1091 VALUE
1093 {
1095 }
1096 
/* NOTE(review): the line holding this function's name and parameter list is
 * missing from this listing (numbering jumps from 1097 to 1099); the body
 * uses `ptr` and `enc`.
 * Creates a string in encoding enc from the NUL-terminated C string ptr.
 * Raises ArgumentError when ptr is NULL or when the encoding's minimum
 * character length is not 1, since strlen() is used to find the length. */
1097 VALUE
1099 {
1100  must_not_null(ptr);
1101  if (rb_enc_mbminlen(enc) != 1) {
1102  rb_raise(rb_eArgError, "wchar encoding given");
1103  }
1104  return rb_enc_str_new(ptr, strlen(ptr), enc);
1105 }
1106 
1107 static VALUE
1108 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1109 {
1110  VALUE str;
1111 
1112  if (len < 0) {
1113  rb_raise(rb_eArgError, "negative string size (or size too big)");
1114  }
1115 
1116  if (!ptr) {
1117  str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
1118  }
1119  else {
1120  RUBY_DTRACE_CREATE_HOOK(STRING, len);
1121  str = str_alloc_heap(klass);
1122  RSTRING(str)->len = len;
1123  RSTRING(str)->as.heap.ptr = (char *)ptr;
1124  RSTRING(str)->as.heap.aux.capa = len;
1125  RBASIC(str)->flags |= STR_NOFREE;
1126  rb_enc_associate_index(str, encindex);
1127  }
1128  return str;
1129 }
1130 
1131 VALUE
1132 rb_str_new_static(const char *ptr, long len)
1133 {
1134  return str_new_static(rb_cString, ptr, len, 0);
1135 }
1136 
/* NOTE(review): the line holding this function's name and parameter list is
 * missing from this listing (numbering jumps from 1137 to 1139); the body
 * uses `ptr` and `len`.
 * Creates a US-ASCII String that references ptr without copying. */
1137 VALUE
1139 {
1140  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1141 }
1142 
1143 VALUE
1144 rb_utf8_str_new_static(const char *ptr, long len)
1145 {
1146  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1147 }
1148 
1149 VALUE
1150 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
1151 {
1152  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1153 }
1154 
1155 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1156  rb_encoding *from, rb_encoding *to,
1157  int ecflags, VALUE ecopts);
1158 
1159 static inline bool
1160 is_enc_ascii_string(VALUE str, rb_encoding *enc)
1161 {
1162  int encidx = rb_enc_to_index(enc);
1163  if (rb_enc_get_index(str) == encidx)
1164  return is_ascii_string(str);
1165  return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1166 }
1167 
1168 VALUE
1169 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1170 {
1171  long len;
1172  const char *ptr;
1173  VALUE newstr;
1174 
1175  if (!to) return str;
1176  if (!from) from = rb_enc_get(str);
1177  if (from == to) return str;
1178  if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1179  rb_is_ascii8bit_enc(to)) {
1180  if (STR_ENC_GET(str) != to) {
1181  str = rb_str_dup(str);
1182  rb_enc_associate(str, to);
1183  }
1184  return str;
1185  }
1186 
1187  RSTRING_GETMEM(str, ptr, len);
1188  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1189  from, to, ecflags, ecopts);
1190  if (NIL_P(newstr)) {
1191  /* some error, return original */
1192  return str;
1193  }
1194  return newstr;
1195 }
1196 
1197 VALUE
1198 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1199  rb_encoding *from, int ecflags, VALUE ecopts)
1200 {
1201  long olen;
1202 
1203  olen = RSTRING_LEN(newstr);
1204  if (ofs < -olen || olen < ofs)
1205  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1206  if (ofs < 0) ofs += olen;
1207  if (!from) {
1208  STR_SET_LEN(newstr, ofs);
1209  return rb_str_cat(newstr, ptr, len);
1210  }
1211 
1212  rb_str_modify(newstr);
1213  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1214  rb_enc_get(newstr),
1215  ecflags, ecopts);
1216 }
1217 
1218 VALUE
1219 rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1220 {
1221  STR_SET_LEN(str, 0);
1222  rb_enc_associate(str, enc);
1223  rb_str_cat(str, ptr, len);
1224  return str;
1225 }
1226 
/*
 * Transcodes ptr[0, len) from encoding +from+ to encoding +to+ and writes the
 * result into +newstr+ starting at byte offset +ofs+.
 *
 * Returns +newstr+ (length set, tagged with +to+) when the converter reports
 * econv_finished, and Qnil when the converter cannot be opened or ends in any
 * other state.
 *
 * NOTE(review): the last clause of the `while` condition is not visible in
 * this listing; judging from the body comment it presumably tests for a
 * "destination buffer full" result -- confirm against the full file.
 */
1227 static VALUE
1228 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1229  rb_encoding *from, rb_encoding *to,
1230  int ecflags, VALUE ecopts)
1231 {
1232  rb_econv_t *ec;
1233  rb_econv_result_t ret;
1234  long olen;
1235  VALUE econv_wrapper;
1236  const unsigned char *start, *sp;
1237  unsigned char *dest, *dp;
1238  size_t converted_output = (size_t)ofs;
1239 
1240  olen = rb_str_capacity(newstr);
1241 
/* A class-less EncodingConverter object holds the raw converter while the
 * loop runs; presumably so the converter is reclaimed if an exception
 * unwinds past this frame -- TODO confirm. */
1242  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1243  RBASIC_CLEAR_CLASS(econv_wrapper);
1244  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1245  if (!ec) return Qnil;
1246  DATA_PTR(econv_wrapper) = ec;
1247 
1248  sp = (unsigned char*)ptr;
1249  start = sp;
/* Each pass re-reads the buffer pointer (the string may have been resized),
 * resumes writing at the number of bytes already produced, and converts as
 * much input as fits before dest + olen. */
1250  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1251  (dp = dest + converted_output),
1252  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1254  /* destination buffer short */
1255  size_t converted_input = sp - start;
1256  size_t rest = len - converted_input;
1257  converted_output = dp - dest;
1258  rb_str_set_len(newstr, converted_output);
/* Estimate the output still needed from the output/input ratio seen so far,
 * unless the product could overflow; otherwise grow by the current size. */
1259  if (converted_input && converted_output &&
1260  rest < (LONG_MAX / converted_output)) {
1261  rest = (rest * converted_output) / converted_input;
1262  }
1263  else {
1264  rest = olen;
1265  }
/* Always grow by at least two bytes. */
1266  olen += rest < 2 ? 2 : rest;
1267  rb_str_resize(newstr, olen);
1268  }
/* Detach the converter from the wrapper before closing it by hand. */
1269  DATA_PTR(econv_wrapper) = 0;
1270  RB_GC_GUARD(econv_wrapper);
1271  rb_econv_close(ec);
1272  switch (ret) {
1273  case econv_finished:
1274  len = dp - (unsigned char*)RSTRING_PTR(newstr);
1275  rb_str_set_len(newstr, len);
1276  rb_enc_associate(newstr, to);
1277  return newstr;
1278 
1279  default:
1280  return Qnil;
1281  }
1282 }
1283 
1284 VALUE
1286 {
1287  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1288 }
1289 
1290 VALUE
1292 {
1293  rb_encoding *ienc;
1294  VALUE str;
1295  const int eidx = rb_enc_to_index(eenc);
1296 
1297  if (!ptr) {
1298  return rb_enc_str_new(ptr, len, eenc);
1299  }
1300 
1301  /* ASCII-8BIT case, no conversion */
1302  if ((eidx == rb_ascii8bit_encindex()) ||
1303  (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1304  return rb_str_new(ptr, len);
1305  }
1306  /* no default_internal or same encoding, no conversion */
1308  if (!ienc || eenc == ienc) {
1309  return rb_enc_str_new(ptr, len, eenc);
1310  }
1311  /* ASCII compatible, and ASCII only string, no conversion in
1312  * default_internal */
1313  if ((eidx == rb_ascii8bit_encindex()) ||
1314  (eidx == rb_usascii_encindex()) ||
1315  (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1316  return rb_enc_str_new(ptr, len, ienc);
1317  }
1318  /* convert from the given encoding to default_internal */
1319  str = rb_enc_str_new(NULL, 0, ienc);
1320  /* when the conversion failed for some reason, just ignore the
1321  * default_internal and result in the given encoding as-is. */
1322  if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1323  rb_str_initialize(str, ptr, len, eenc);
1324  }
1325  return str;
1326 }
1327 
/*
 * Tags +str+ as being in the external encoding +eenc+ and converts it to the
 * default internal encoding via rb_str_conv_enc().
 *
 * When +eenc+ is US-ASCII but +str+ contains non-ASCII bytes, +str+ is
 * returned without conversion.
 * NOTE(review): a statement inside that branch is missing from this listing;
 * whatever it does to +str+ before the return cannot be seen here.
 */
1328 VALUE
1329 rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1330 {
1331  int eidx = rb_enc_to_index(eenc);
1332  if (eidx == rb_usascii_encindex() &&
1333  !is_ascii_string(str)) {
1335  return str;
1336  }
1337  rb_enc_associate_index(str, eidx);
1338  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1339 }
1340 
1341 VALUE
1342 rb_external_str_new(const char *ptr, long len)
1343 {
1345 }
1346 
1347 VALUE
1349 {
1351 }
1352 
1353 VALUE
1354 rb_locale_str_new(const char *ptr, long len)
1355 {
1357 }
1358 
1359 VALUE
1361 {
1363 }
1364 
1365 VALUE
1366 rb_filesystem_str_new(const char *ptr, long len)
1367 {
1369 }
1370 
1371 VALUE
1373 {
1375 }
1376 
1377 VALUE
1379 {
1381 }
1382 
1383 VALUE
1385 {
1387 }
1388 
1389 VALUE
1391 {
1392  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1393 }
1394 
1395 static VALUE
1396 str_replace_shared_without_enc(VALUE str2, VALUE str)
1397 {
1398  const int termlen = TERM_LEN(str);
1399  char *ptr;
1400  long len;
1401 
1402  RSTRING_GETMEM(str, ptr, len);
1403  if (str_embed_capa(str2) >= len + termlen) {
1404  char *ptr2 = RSTRING(str2)->as.embed.ary;
1405  STR_SET_EMBED(str2);
1406  memcpy(ptr2, RSTRING_PTR(str), len);
1407  TERM_FILL(ptr2+len, termlen);
1408  }
1409  else {
1410  VALUE root;
1411  if (STR_SHARED_P(str)) {
1412  root = RSTRING(str)->as.heap.aux.shared;
1413  RSTRING_GETMEM(str, ptr, len);
1414  }
1415  else {
1416  root = rb_str_new_frozen(str);
1417  RSTRING_GETMEM(root, ptr, len);
1418  }
1419  RUBY_ASSERT(OBJ_FROZEN(root));
1420 
1421  if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1422  if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1423  rb_fatal("about to free a possible shared root");
1424  }
1425  char *ptr2 = STR_HEAP_PTR(str2);
1426  if (ptr2 != ptr) {
1427  ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1428  }
1429  }
1430  FL_SET(str2, STR_NOEMBED);
1431  RSTRING(str2)->as.heap.ptr = ptr;
1432  STR_SET_SHARED(str2, root);
1433  }
1434 
1435  STR_SET_LEN(str2, len);
1436 
1437  return str2;
1438 }
1439 
1440 static VALUE
1441 str_replace_shared(VALUE str2, VALUE str)
1442 {
1443  str_replace_shared_without_enc(str2, str);
1444  rb_enc_cr_str_exact_copy(str2, str);
1445  return str2;
1446 }
1447 
1448 static VALUE
1449 str_new_shared(VALUE klass, VALUE str)
1450 {
1451  return str_replace_shared(str_alloc_heap(klass), str);
1452 }
1453 
1454 VALUE
1456 {
1457  return str_new_shared(rb_obj_class(str), str);
1458 }
1459 
1460 VALUE
1462 {
1463  if (RB_FL_TEST_RAW(orig, FL_FREEZE | STR_CHILLED) == FL_FREEZE) return orig;
1464  return str_new_frozen(rb_obj_class(orig), orig);
1465 }
1466 
1467 static VALUE
1468 rb_str_new_frozen_String(VALUE orig)
1469 {
1470  if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1471  return str_new_frozen(rb_cString, orig);
1472 }
1473 
1474 VALUE
1475 rb_str_tmp_frozen_acquire(VALUE orig)
1476 {
1477  if (OBJ_FROZEN_RAW(orig)) return orig;
1478  return str_new_frozen_buffer(0, orig, FALSE);
1479 }
1480 
1481 VALUE
1482 rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1483 {
1484  if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1485  if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1486 
1487  VALUE str = str_alloc_heap(0);
1488  OBJ_FREEZE(str);
1489  /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1490  FL_SET(str, STR_SHARED_ROOT);
1491 
1492  size_t capa = str_capacity(orig, TERM_LEN(orig));
1493 
1494  /* If the string is embedded then we want to create a copy that is heap
1495  * allocated. If the string is shared then the shared root must be
1496  * embedded, so we want to create a copy. If the string is a shared root
1497  * then it must be embedded, so we want to create a copy. */
1498  if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
1499  RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1500  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1501  }
1502  else {
1503  /* orig must be heap allocated and not shared, so we can safely transfer
1504  * the pointer to str. */
1505  RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
1506  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1507  RBASIC(orig)->flags &= ~STR_NOFREE;
1508  STR_SET_SHARED(orig, str);
1509  }
1510 
1511  RSTRING(str)->len = RSTRING(orig)->len;
1512  RSTRING(str)->as.heap.aux.capa = capa;
1513 
1514  return str;
1515 }
1516 
/*
 * Gives back a temporary frozen string +tmp+ that was acquired for +orig+.
 *
 * Only class-less temporaries are handled; a +tmp+ that has a class is left
 * alone.  If +orig+ is still an unlocked, unfrozen child of +tmp+ and +tmp+
 * is not marked STR_BORROWED, +orig+ takes the buffer back (capacity and
 * STR_NOFREE are restored) and +tmp+ is turned into an empty embedded string.
 *
 * NOTE(review): the listing omits lines in the embedded branch and after the
 * STR_NOFREE transfer; their content is not visible here.
 */
1517 void
1518 rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1519 {
/* A tmp with a class is not a temporary buffer created for this purpose. */
1520  if (RBASIC_CLASS(tmp) != 0)
1521  return;
1522 
1523  if (STR_EMBED_P(tmp)) {
1525  }
1526  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1527  !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1528  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1529 
1530  if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1531  RUBY_ASSERT(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1532  RUBY_ASSERT(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1533 
1534  /* Unshare orig since the root (tmp) only has this one child. */
1535  FL_UNSET_RAW(orig, STR_SHARED);
1536  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1537  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1539 
1540  /* Make tmp embedded and empty so it is safe for sweeping. */
1541  STR_SET_EMBED(tmp);
1542  STR_SET_LEN(tmp, 0);
1543  }
1544  }
1545 }
1546 
1547 static VALUE
1548 str_new_frozen(VALUE klass, VALUE orig)
1549 {
1550  return str_new_frozen_buffer(klass, orig, TRUE);
1551 }
1552 
1553 static VALUE
1554 heap_str_make_shared(VALUE klass, VALUE orig)
1555 {
1556  RUBY_ASSERT(!STR_EMBED_P(orig));
1557  RUBY_ASSERT(!STR_SHARED_P(orig));
1558 
1559  VALUE str = str_alloc_heap(klass);
1560  STR_SET_LEN(str, RSTRING_LEN(orig));
1561  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1562  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1563  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1564  RBASIC(orig)->flags &= ~STR_NOFREE;
1565  STR_SET_SHARED(orig, str);
1566  if (klass == 0)
1567  FL_UNSET_RAW(str, STR_BORROWED);
1568  return str;
1569 }
1570 
/*
 * Returns a frozen string of class +klass+ holding the content of +orig+.
 *
 * copy_encoding: when true the result carries the encoding/coderange of
 * +orig+; when false the ASCII-8BIT encoding and a one-byte terminator are
 * used for sizing and no encoding information is copied.
 *
 * Small contents are copied into an embedded string.  A shared +orig+ reuses
 * its root: directly when the root matches exactly (same range, class and
 * encoding), otherwise through a new shared string narrowed to the range of
 * +orig+.  Any other heap string is converted into a child of a new root via
 * heap_str_make_shared().
 */
1571 static VALUE
1572 str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1573 {
1574  VALUE str;
1575 
1576  long len = RSTRING_LEN(orig);
1577  rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
1578  int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1579 
1580  if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1581  str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
1582  RUBY_ASSERT(STR_EMBED_P(str));
1583  }
1584  else {
1585  if (FL_TEST_RAW(orig, STR_SHARED)) {
1586  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs: bytes of the root before orig's start; rest: bytes of the root after
 * orig's end. */
1587  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1588  long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1589  RUBY_ASSERT(ofs >= 0);
1590  RUBY_ASSERT(rest >= 0);
1591  RUBY_ASSERT(ofs + rest <= RSTRING_LEN(shared));
1593 
/* The root can only be handed out directly if it matches orig exactly. */
1594  if ((ofs > 0) || (rest > 0) ||
1595  (klass != RBASIC(shared)->klass) ||
1596  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1597  str = str_new_shared(klass, shared);
1598  RUBY_ASSERT(!STR_EMBED_P(str));
/* Narrow the new shared string to the byte range covered by orig. */
1599  RSTRING(str)->as.heap.ptr += ofs;
1600  STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1601  }
1602  else {
1603  if (RBASIC_CLASS(shared) == 0)
1604  FL_SET_RAW(shared, STR_BORROWED);
1605  return shared;
1606  }
1607  }
1608  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1609  str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1610  STR_SET_EMBED(str);
1611  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1612  STR_SET_LEN(str, RSTRING_LEN(orig));
1613  ENC_CODERANGE_SET(str, ENC_CODERANGE(orig));
1614  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1615  }
1616  else {
1617  str = heap_str_make_shared(klass, orig);
1618  }
1619  }
1620 
1621  if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1622  OBJ_FREEZE(str);
1623  return str;
1624 }
1625 
1626 VALUE
1627 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1628 {
1629  return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
1630 }
1631 
1632 static VALUE
1633 str_new_empty_String(VALUE str)
1634 {
1635  VALUE v = rb_str_new(0, 0);
1636  rb_enc_copy(v, str);
1637  return v;
1638 }
1639 
1640 #define STR_BUF_MIN_SIZE 63
1641 
1642 VALUE
1644 {
1645  if (STR_EMBEDDABLE_P(capa, 1)) {
1646  return str_alloc_embed(rb_cString, capa + 1);
1647  }
1648 
1649  VALUE str = str_alloc_heap(rb_cString);
1650 
1651  RSTRING(str)->as.heap.aux.capa = capa;
1652  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1653  RSTRING(str)->as.heap.ptr[0] = '\0';
1654 
1655  return str;
1656 }
1657 
1658 VALUE
1660 {
1661  VALUE str;
1662  long len = strlen(ptr);
1663 
1664  str = rb_str_buf_new(len);
1665  rb_str_buf_cat(str, ptr, len);
1666 
1667  return str;
1668 }
1669 
1670 VALUE
1672 {
1673  return str_new(0, 0, len);
1674 }
1675 
1676 void
1678 {
1679  if (STR_EMBED_P(str)) {
1680  RB_DEBUG_COUNTER_INC(obj_str_embed);
1681  }
1682  else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1683  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1684  (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1685  }
1686  else {
1687  RB_DEBUG_COUNTER_INC(obj_str_ptr);
1688  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1689  }
1690 }
1691 
1692 size_t
1693 rb_str_memsize(VALUE str)
1694 {
1695  if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1696  return STR_HEAP_SIZE(str);
1697  }
1698  else {
1699  return 0;
1700  }
1701 }
1702 
1703 VALUE
1705 {
1706  return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1707 }
1708 
1709 static inline void str_discard(VALUE str);
1710 static void str_shared_replace(VALUE str, VALUE str2);
1711 
1712 void
1714 {
1715  if (str != str2) str_shared_replace(str, str2);
1716 }
1717 
1718 static void
1719 str_shared_replace(VALUE str, VALUE str2)
1720 {
1721  rb_encoding *enc;
1722  int cr;
1723  int termlen;
1724 
1725  RUBY_ASSERT(str2 != str);
1726  enc = STR_ENC_GET(str2);
1727  cr = ENC_CODERANGE(str2);
1728  str_discard(str);
1729  termlen = rb_enc_mbminlen(enc);
1730 
1731  STR_SET_LEN(str, RSTRING_LEN(str2));
1732 
1733  if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1734  STR_SET_EMBED(str);
1735  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1736  rb_enc_associate(str, enc);
1737  ENC_CODERANGE_SET(str, cr);
1738  }
1739  else {
1740  if (STR_EMBED_P(str2)) {
1741  RUBY_ASSERT(!FL_TEST(str2, STR_SHARED));
1742  long len = RSTRING_LEN(str2);
1743  RUBY_ASSERT(len + termlen <= str_embed_capa(str2));
1744 
1745  char *new_ptr = ALLOC_N(char, len + termlen);
1746  memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1747  RSTRING(str2)->as.heap.ptr = new_ptr;
1748  STR_SET_LEN(str2, len);
1749  RSTRING(str2)->as.heap.aux.capa = len;
1750  STR_SET_NOEMBED(str2);
1751  }
1752 
1753  STR_SET_NOEMBED(str);
1754  FL_UNSET(str, STR_SHARED);
1755  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1756 
1757  if (FL_TEST(str2, STR_SHARED)) {
1758  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1759  STR_SET_SHARED(str, shared);
1760  }
1761  else {
1762  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1763  }
1764 
1765  /* abandon str2 */
1766  STR_SET_EMBED(str2);
1767  RSTRING_PTR(str2)[0] = 0;
1768  STR_SET_LEN(str2, 0);
1769  rb_enc_associate(str, enc);
1770  ENC_CODERANGE_SET(str, cr);
1771  }
1772 }
1773 
1774 VALUE
1776 {
1777  VALUE str;
1778 
1779  if (RB_TYPE_P(obj, T_STRING)) {
1780  return obj;
1781  }
1782  str = rb_funcall(obj, idTo_s, 0);
1783  return rb_obj_as_string_result(str, obj);
1784 }
1785 
1786 VALUE
1787 rb_obj_as_string_result(VALUE str, VALUE obj)
1788 {
1789  if (!RB_TYPE_P(str, T_STRING))
1790  return rb_any_to_s(obj);
1791  return str;
1792 }
1793 
1794 static VALUE
1795 str_replace(VALUE str, VALUE str2)
1796 {
1797  long len;
1798 
1799  len = RSTRING_LEN(str2);
1800  if (STR_SHARED_P(str2)) {
1801  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1803  STR_SET_NOEMBED(str);
1804  STR_SET_LEN(str, len);
1805  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1806  STR_SET_SHARED(str, shared);
1807  rb_enc_cr_str_exact_copy(str, str2);
1808  }
1809  else {
1810  str_replace_shared(str, str2);
1811  }
1812 
1813  return str;
1814 }
1815 
1816 static inline VALUE
1817 ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1818 {
1819  size_t size = rb_str_embed_size(capa);
1820  RUBY_ASSERT(size > 0);
1821  RUBY_ASSERT(rb_gc_size_allocatable_p(size));
1822 
1823  NEWOBJ_OF(str, struct RString, klass,
1825 
1826  return (VALUE)str;
1827 }
1828 
1829 static inline VALUE
1830 ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1831 {
1832  NEWOBJ_OF(str, struct RString, klass,
1833  T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1834 
1835  return (VALUE)str;
1836 }
1837 
1838 static inline VALUE
1839 str_duplicate_setup_encoding(VALUE str, VALUE dup, VALUE flags)
1840 {
1841  int encidx = 0;
1842  if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1843  encidx = rb_enc_get_index(str);
1844  flags &= ~ENCODING_MASK;
1845  }
1846  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1847  if (encidx) rb_enc_associate_index(dup, encidx);
1848  return dup;
1849 }
1850 
1851 static const VALUE flag_mask = ENC_CODERANGE_MASK | ENCODING_MASK | FL_FREEZE;
1852 
1853 static inline VALUE
1854 str_duplicate_setup_embed(VALUE klass, VALUE str, VALUE dup)
1855 {
1856  VALUE flags = FL_TEST_RAW(str, flag_mask);
1857  long len = RSTRING_LEN(str);
1858 
1859  RUBY_ASSERT(STR_EMBED_P(dup));
1860  RUBY_ASSERT(str_embed_capa(dup) >= len + 1);
1861  MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1862  STR_SET_LEN(dup, RSTRING_LEN(str));
1863  return str_duplicate_setup_encoding(str, dup, flags);
1864 }
1865 
1866 static inline VALUE
1867 str_duplicate_setup_heap(VALUE klass, VALUE str, VALUE dup)
1868 {
1869  VALUE flags = FL_TEST_RAW(str, flag_mask);
1870  VALUE root = str;
1871  if (FL_TEST_RAW(str, STR_SHARED)) {
1872  root = RSTRING(str)->as.heap.aux.shared;
1873  }
1874  else if (UNLIKELY(!(flags & FL_FREEZE))) {
1875  root = str = str_new_frozen(klass, str);
1876  flags = FL_TEST_RAW(str, flag_mask);
1877  }
1878  RUBY_ASSERT(!STR_SHARED_P(root));
1880 
1881  RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1882  FL_SET(root, STR_SHARED_ROOT);
1883  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1884  flags |= RSTRING_NOEMBED | STR_SHARED;
1885 
1886  STR_SET_LEN(dup, RSTRING_LEN(str));
1887  return str_duplicate_setup_encoding(str, dup, flags);
1888 }
1889 
1890 static inline VALUE
1891 str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1892 {
1893  if (STR_EMBED_P(str)) {
1894  return str_duplicate_setup_embed(klass, str, dup);
1895  }
1896  else {
1897  return str_duplicate_setup_heap(klass, str, dup);
1898  }
1899 }
1900 
1901 static inline VALUE
1902 str_duplicate(VALUE klass, VALUE str)
1903 {
1904  VALUE dup;
1905  if (STR_EMBED_P(str)) {
1906  dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1907  }
1908  else {
1909  dup = str_alloc_heap(klass);
1910  }
1911 
1912  return str_duplicate_setup(klass, str, dup);
1913 }
1914 
1915 VALUE
1917 {
1918  return str_duplicate(rb_obj_class(str), str);
1919 }
1920 
1921 /* :nodoc: */
1922 VALUE
1923 rb_str_dup_m(VALUE str)
1924 {
1925  if (LIKELY(BARE_STRING_P(str))) {
1926  return str_duplicate(rb_obj_class(str), str);
1927  }
1928  else {
1929  return rb_obj_dup(str);
1930  }
1931 }
1932 
1933 VALUE
1935 {
1936  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1937  return str_duplicate(rb_cString, str);
1938 }
1939 
1940 VALUE
1941 rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled)
1942 {
1943  RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1944  VALUE new_str, klass = rb_cString;
1945 
1946  if (!(chilled && RTEST(rb_ivar_defined(str, id_debug_created_info))) && STR_EMBED_P(str)) {
1947  new_str = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1948  str_duplicate_setup_embed(klass, str, new_str);
1949  }
1950  else {
1951  new_str = ec_str_alloc_heap(ec, klass);
1952  str_duplicate_setup_heap(klass, str, new_str);
1953  }
1954  if (chilled) {
1955  FL_SET_RAW(new_str, STR_CHILLED_LITERAL);
1956  }
1957  return new_str;
1958 }
1959 
1960 VALUE
1961 rb_str_with_debug_created_info(VALUE str, VALUE path, int line)
1962 {
1963  VALUE debug_info = rb_ary_new_from_args(2, path, INT2FIX(line));
1964  if (OBJ_FROZEN_RAW(str)) str = rb_str_dup(str);
1965  rb_ivar_set(str, id_debug_created_info, rb_ary_freeze(debug_info));
1966  FL_SET_RAW(str, STR_CHILLED_LITERAL);
1967  return rb_str_freeze(str);
1968 }
1969 
1970 /*
1971  *
1972  * call-seq:
1973  * String.new(string = '', **opts) -> new_string
1974  *
1975  * :include: doc/string/new.rdoc
1976  *
1977  */
1978 
1979 static VALUE
1980 rb_str_init(int argc, VALUE *argv, VALUE str)
1981 {
1982  static ID keyword_ids[2];
1983  VALUE orig, opt, venc, vcapa;
1984  VALUE kwargs[2];
1985  rb_encoding *enc = 0;
1986  int n;
1987 
1988  if (!keyword_ids[0]) {
1989  keyword_ids[0] = rb_id_encoding();
1990  CONST_ID(keyword_ids[1], "capacity");
1991  }
1992 
1993  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1994  if (!NIL_P(opt)) {
1995  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1996  venc = kwargs[0];
1997  vcapa = kwargs[1];
1998  if (!UNDEF_P(venc) && !NIL_P(venc)) {
1999  enc = rb_to_encoding(venc);
2000  }
2001  if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
2002  long capa = NUM2LONG(vcapa);
2003  long len = 0;
2004  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
2005 
2006  if (capa < STR_BUF_MIN_SIZE) {
2007  capa = STR_BUF_MIN_SIZE;
2008  }
2009  if (n == 1) {
2010  StringValue(orig);
2011  len = RSTRING_LEN(orig);
2012  if (capa < len) {
2013  capa = len;
2014  }
2015  if (orig == str) n = 0;
2016  }
2017  str_modifiable(str);
2018  if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2019  /* make noembed always */
2020  const size_t size = (size_t)capa + termlen;
2021  const char *const old_ptr = RSTRING_PTR(str);
2022  const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
2023  char *new_ptr = ALLOC_N(char, size);
2024  if (STR_EMBED_P(str)) RUBY_ASSERT((long)osize <= str_embed_capa(str));
2025  memcpy(new_ptr, old_ptr, osize < size ? osize : size);
2026  FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
2027  RSTRING(str)->as.heap.ptr = new_ptr;
2028  }
2029  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
2030  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
2031  (size_t)capa + termlen, STR_HEAP_SIZE(str));
2032  }
2033  STR_SET_LEN(str, len);
2034  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
2035  if (n == 1) {
2036  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
2037  rb_enc_cr_str_exact_copy(str, orig);
2038  }
2039  FL_SET(str, STR_NOEMBED);
2040  RSTRING(str)->as.heap.aux.capa = capa;
2041  }
2042  else if (n == 1) {
2043  rb_str_replace(str, orig);
2044  }
2045  if (enc) {
2046  rb_enc_associate(str, enc);
2047  ENC_CODERANGE_CLEAR(str);
2048  }
2049  }
2050  else if (n == 1) {
2051  rb_str_replace(str, orig);
2052  }
2053  return str;
2054 }
2055 
2056 /* :nodoc: */
2057 static VALUE
2058 rb_str_s_new(int argc, VALUE *argv, VALUE klass)
2059 {
2060  if (klass != rb_cString) {
2061  return rb_class_new_instance_pass_kw(argc, argv, klass);
2062  }
2063 
2064  static ID keyword_ids[2];
2065  VALUE orig, opt, encoding = Qnil, capacity = Qnil;
2066  VALUE kwargs[2];
2067  rb_encoding *enc = NULL;
2068 
2069  int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
2070  if (NIL_P(opt)) {
2071  return rb_class_new_instance_pass_kw(argc, argv, klass);
2072  }
2073 
2074  keyword_ids[0] = rb_id_encoding();
2075  CONST_ID(keyword_ids[1], "capacity");
2076  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
2077  encoding = kwargs[0];
2078  capacity = kwargs[1];
2079 
2080  if (n == 1) {
2081  orig = StringValue(orig);
2082  }
2083  else {
2084  orig = Qnil;
2085  }
2086 
2087  if (UNDEF_P(encoding)) {
2088  if (!NIL_P(orig)) {
2089  encoding = rb_obj_encoding(orig);
2090  }
2091  }
2092 
2093  if (!UNDEF_P(encoding)) {
2094  enc = rb_to_encoding(encoding);
2095  }
2096 
2097  // If capacity is nil, we're basically just duping `orig`.
2098  if (UNDEF_P(capacity)) {
2099  if (NIL_P(orig)) {
2100  VALUE empty_str = str_new(klass, "", 0);
2101  if (enc) {
2102  rb_enc_associate(empty_str, enc);
2103  }
2104  return empty_str;
2105  }
2106  VALUE copy = str_duplicate(klass, orig);
2107  rb_enc_associate(copy, enc);
2108  ENC_CODERANGE_CLEAR(copy);
2109  return copy;
2110  }
2111 
2112  long capa = 0;
2113  capa = NUM2LONG(capacity);
2114  if (capa < 0) {
2115  capa = 0;
2116  }
2117 
2118  if (!NIL_P(orig)) {
2119  long orig_capa = rb_str_capacity(orig);
2120  if (orig_capa > capa) {
2121  capa = orig_capa;
2122  }
2123  }
2124 
2125  VALUE str = str_enc_new(klass, NULL, capa, enc);
2126  STR_SET_LEN(str, 0);
2127  TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
2128 
2129  if (!NIL_P(orig)) {
2130  rb_str_buf_append(str, orig);
2131  }
2132 
2133  return str;
2134 }
2135 
2136 #ifdef NONASCII_MASK
2137 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
2138 
2139 /*
2140  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
2141  * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
2142  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2143  *
2144  * if (!(byte & 0x80))
2145  * byte |= 0x40; // turn on bit6
2146  * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2147  *
2148  * This function calculates whether a byte is leading or not for all bytes
2149  * in the argument word by concurrently using the above logic, and then
2150  * adds up the number of leading bytes in the word.
2151  */
2152 static inline uintptr_t
2153 count_utf8_lead_bytes_with_word(const uintptr_t *s)
2154 {
2155  uintptr_t d = *s;
2156 
2157  /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2158  d = (d>>6) | (~d>>7);
2159  d &= NONASCII_MASK >> 7;
2160 
2161  /* Gather all bytes. */
2162 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2163  /* use only if it can use POPCNT */
2164  return rb_popcount_intptr(d);
2165 #else
2166  d += (d>>8);
2167  d += (d>>16);
2168 # if SIZEOF_VOIDP == 8
2169  d += (d>>32);
2170 # endif
2171  return (d&0xF);
2172 #endif
2173 }
2174 #endif
2175 
2176 static inline long
2177 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2178 {
2179  long c;
2180  const char *q;
2181 
2182  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2183  long diff = (long)(e - p);
2184  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2185  }
2186 #ifdef NONASCII_MASK
2187  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2188  uintptr_t len = 0;
2189  if ((int)sizeof(uintptr_t) * 2 < e - p) {
2190  const uintptr_t *s, *t;
2191  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2192  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2193  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2194  while (p < (const char *)s) {
2195  if (is_utf8_lead_byte(*p)) len++;
2196  p++;
2197  }
2198  while (s < t) {
2199  len += count_utf8_lead_bytes_with_word(s);
2200  s++;
2201  }
2202  p = (const char *)s;
2203  }
2204  while (p < e) {
2205  if (is_utf8_lead_byte(*p)) len++;
2206  p++;
2207  }
2208  return (long)len;
2209  }
2210 #endif
2211  else if (rb_enc_asciicompat(enc)) {
2212  c = 0;
2213  if (ENC_CODERANGE_CLEAN_P(cr)) {
2214  while (p < e) {
2215  if (ISASCII(*p)) {
2216  q = search_nonascii(p, e);
2217  if (!q)
2218  return c + (e - p);
2219  c += q - p;
2220  p = q;
2221  }
2222  p += rb_enc_fast_mbclen(p, e, enc);
2223  c++;
2224  }
2225  }
2226  else {
2227  while (p < e) {
2228  if (ISASCII(*p)) {
2229  q = search_nonascii(p, e);
2230  if (!q)
2231  return c + (e - p);
2232  c += q - p;
2233  p = q;
2234  }
2235  p += rb_enc_mbclen(p, e, enc);
2236  c++;
2237  }
2238  }
2239  return c;
2240  }
2241 
2242  for (c=0; p<e; c++) {
2243  p += rb_enc_mbclen(p, e, enc);
2244  }
2245  return c;
2246 }
2247 
2248 long
2249 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2250 {
2251  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2252 }
2253 
2254 /* To get strlen with cr
2255  * Note that given cr is not used.
2256  */
2257 long
2258 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2259 {
2260  long c;
2261  const char *q;
2262  int ret;
2263 
2264  *cr = 0;
2265  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2266  long diff = (long)(e - p);
2267  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2268  }
2269  else if (rb_enc_asciicompat(enc)) {
2270  c = 0;
2271  while (p < e) {
2272  if (ISASCII(*p)) {
2273  q = search_nonascii(p, e);
2274  if (!q) {
2275  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2276  return c + (e - p);
2277  }
2278  c += q - p;
2279  p = q;
2280  }
2281  ret = rb_enc_precise_mbclen(p, e, enc);
2282  if (MBCLEN_CHARFOUND_P(ret)) {
2283  *cr |= ENC_CODERANGE_VALID;
2284  p += MBCLEN_CHARFOUND_LEN(ret);
2285  }
2286  else {
2287  *cr = ENC_CODERANGE_BROKEN;
2288  p++;
2289  }
2290  c++;
2291  }
2292  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2293  return c;
2294  }
2295 
2296  for (c=0; p<e; c++) {
2297  ret = rb_enc_precise_mbclen(p, e, enc);
2298  if (MBCLEN_CHARFOUND_P(ret)) {
2299  *cr |= ENC_CODERANGE_VALID;
2300  p += MBCLEN_CHARFOUND_LEN(ret);
2301  }
2302  else {
2303  *cr = ENC_CODERANGE_BROKEN;
2304  if (p + rb_enc_mbminlen(enc) <= e)
2305  p += rb_enc_mbminlen(enc);
2306  else
2307  p = e;
2308  }
2309  }
2310  if (!*cr) *cr = ENC_CODERANGE_7BIT;
2311  return c;
2312 }
2313 
2314 /* enc must be str's enc or rb_enc_check(str, str2) */
2315 static long
2316 str_strlen(VALUE str, rb_encoding *enc)
2317 {
2318  const char *p, *e;
2319  int cr;
2320 
2321  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2322  if (!enc) enc = STR_ENC_GET(str);
2323  p = RSTRING_PTR(str);
2324  e = RSTRING_END(str);
2325  cr = ENC_CODERANGE(str);
2326 
2327  if (cr == ENC_CODERANGE_UNKNOWN) {
2328  long n = rb_enc_strlen_cr(p, e, enc, &cr);
2329  if (cr) ENC_CODERANGE_SET(str, cr);
2330  return n;
2331  }
2332  else {
2333  return enc_strlen(p, e, enc, cr);
2334  }
2335 }
2336 
2337 long
2339 {
2340  return str_strlen(str, NULL);
2341 }
2342 
2343 /*
2344  * call-seq:
2345  * length -> integer
2346  *
2347  * :include: doc/string/length.rdoc
2348  *
2349  */
2350 
2351 VALUE
2353 {
2354  return LONG2NUM(str_strlen(str, NULL));
2355 }
2356 
2357 /*
2358  * call-seq:
2359  * bytesize -> integer
2360  *
2361  * :include: doc/string/bytesize.rdoc
2362  *
2363  */
2364 
2365 VALUE
2366 rb_str_bytesize(VALUE str)
2367 {
2368  return LONG2NUM(RSTRING_LEN(str));
2369 }
2370 
2371 /*
2372  * call-seq:
2373  * empty? -> true or false
2374  *
2375  * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2376  *
2377  * "hello".empty? # => false
2378  * " ".empty? # => false
2379  * "".empty? # => true
2380  *
2381  */
2382 
2383 static VALUE
2384 rb_str_empty(VALUE str)
2385 {
2386  return RBOOL(RSTRING_LEN(str) == 0);
2387 }
2388 
2389 /*
2390  * call-seq:
2391  * string + other_string -> new_string
2392  *
2393  * Returns a new +String+ containing +other_string+ concatenated to +self+:
2394  *
2395  * "Hello from " + self.to_s # => "Hello from main"
2396  *
2397  */
2398 
2399 VALUE
2401 {
2402  VALUE str3;
2403  rb_encoding *enc;
2404  char *ptr1, *ptr2, *ptr3;
2405  long len1, len2;
2406  int termlen;
2407 
2408  StringValue(str2);
2409  enc = rb_enc_check_str(str1, str2);
2410  RSTRING_GETMEM(str1, ptr1, len1);
2411  RSTRING_GETMEM(str2, ptr2, len2);
2412  termlen = rb_enc_mbminlen(enc);
2413  if (len1 > LONG_MAX - len2) {
2414  rb_raise(rb_eArgError, "string size too big");
2415  }
2416  str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
2417  ptr3 = RSTRING_PTR(str3);
2418  memcpy(ptr3, ptr1, len1);
2419  memcpy(ptr3+len1, ptr2, len2);
2420  TERM_FILL(&ptr3[len1+len2], termlen);
2421 
2424  RB_GC_GUARD(str1);
2425  RB_GC_GUARD(str2);
2426  return str3;
2427 }
2428 
2429 /* A variant of rb_str_plus that does not raise but returns Qundef instead. */
2430 VALUE
2431 rb_str_opt_plus(VALUE str1, VALUE str2)
2432 {
2435  long len1, len2;
2436  MAYBE_UNUSED(char) *ptr1, *ptr2;
2437  RSTRING_GETMEM(str1, ptr1, len1);
2438  RSTRING_GETMEM(str2, ptr2, len2);
2439  int enc1 = rb_enc_get_index(str1);
2440  int enc2 = rb_enc_get_index(str2);
2441 
2442  if (enc1 < 0) {
2443  return Qundef;
2444  }
2445  else if (enc2 < 0) {
2446  return Qundef;
2447  }
2448  else if (enc1 != enc2) {
2449  return Qundef;
2450  }
2451  else if (len1 > LONG_MAX - len2) {
2452  return Qundef;
2453  }
2454  else {
2455  return rb_str_plus(str1, str2);
2456  }
2457 
2458 }
2459 
2460 /*
2461  * call-seq:
2462  * string * integer -> new_string
2463  *
2464  * Returns a new +String+ containing +integer+ copies of +self+:
2465  *
2466  * "Ho! " * 3 # => "Ho! Ho! Ho! "
2467  * "Ho! " * 0 # => ""
2468  *
2469  */
2470 
2471 VALUE
2473 {
2474  VALUE str2;
2475  long n, len;
2476  char *ptr2;
2477  int termlen;
2478 
2479  if (times == INT2FIX(1)) {
2480  return str_duplicate(rb_cString, str);
2481  }
2482  if (times == INT2FIX(0)) {
2483  str2 = str_alloc_embed(rb_cString, 0);
2484  rb_enc_copy(str2, str);
2485  return str2;
2486  }
2487  len = NUM2LONG(times);
2488  if (len < 0) {
2489  rb_raise(rb_eArgError, "negative argument");
2490  }
2491  if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2492  if (STR_EMBEDDABLE_P(len, 1)) {
2493  str2 = str_alloc_embed(rb_cString, len + 1);
2494  memset(RSTRING_PTR(str2), 0, len + 1);
2495  }
2496  else {
2497  str2 = str_alloc_heap(rb_cString);
2498  RSTRING(str2)->as.heap.aux.capa = len;
2499  RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2500  }
2501  STR_SET_LEN(str2, len);
2502  rb_enc_copy(str2, str);
2503  return str2;
2504  }
2505  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2506  rb_raise(rb_eArgError, "argument too big");
2507  }
2508 
2509  len *= RSTRING_LEN(str);
2510  termlen = TERM_LEN(str);
2511  str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
2512  ptr2 = RSTRING_PTR(str2);
2513  if (len) {
2514  n = RSTRING_LEN(str);
2515  memcpy(ptr2, RSTRING_PTR(str), n);
2516  while (n <= len/2) {
2517  memcpy(ptr2 + n, ptr2, n);
2518  n *= 2;
2519  }
2520  memcpy(ptr2 + n, ptr2, len-n);
2521  }
2522  STR_SET_LEN(str2, len);
2523  TERM_FILL(&ptr2[len], termlen);
2524  rb_enc_cr_str_copy_for_substr(str2, str);
2525 
2526  return str2;
2527 }
2528 
2529 /*
2530  * call-seq:
2531  * string % object -> new_string
2532  *
2533  * Returns the result of formatting +object+ into the format specification +self+
2534  * (see Kernel#sprintf for formatting details):
2535  *
2536  * "%05d" % 123 # => "00123"
2537  *
2538  * If +self+ contains multiple substitutions, +object+ must be
2539  * an Array or Hash containing the values to be substituted:
2540  *
2541  * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2542  * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2543  * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2544  *
2545  */
2546 
2547 static VALUE
2548 rb_str_format_m(VALUE str, VALUE arg)
2549 {
2550  VALUE tmp = rb_check_array_type(arg);
2551 
2552  if (!NIL_P(tmp)) {
2553  return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2554  }
2555  return rb_str_format(1, &arg, str);
2556 }
2557 
2558 static inline void
2559 rb_check_lockedtmp(VALUE str)
2560 {
2561  if (FL_TEST(str, STR_TMPLOCK)) {
2562  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2563  }
2564 }
2565 
2566 // If none of these flags are set, we know we have a modifiable string.
2567 // If any is set, we need to do more detailed checks.
2568 #define STR_UNMODIFIABLE_MASK (FL_FREEZE | STR_TMPLOCK | STR_CHILLED)
2569 static inline void
2570 str_modifiable(VALUE str)
2571 {
2572  if (RB_UNLIKELY(FL_ANY_RAW(str, STR_UNMODIFIABLE_MASK))) {
2573  if (CHILLED_STRING_P(str)) {
2574  CHILLED_STRING_MUTATED(str);
2575  }
2576  rb_check_lockedtmp(str);
2577  rb_check_frozen(str);
2578  }
2579 }
2580 
2581 static inline int
2582 str_dependent_p(VALUE str)
2583 {
2584  if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2585  return FALSE;
2586  }
2587  else {
2588  return TRUE;
2589  }
2590 }
2591 
2592 // If none of these flags are set, we know we have an independent string.
2593 // If any is set, we need to do more detailed checks.
2594 #define STR_DEPENDANT_MASK (STR_UNMODIFIABLE_MASK | STR_SHARED | STR_NOFREE)
2595 static inline int
2596 str_independent(VALUE str)
2597 {
2598  if (RB_UNLIKELY(FL_ANY_RAW(str, STR_DEPENDANT_MASK))) {
2599  str_modifiable(str);
2600  return !str_dependent_p(str);
2601  }
2602  return TRUE;
2603 }
2604 
2605 static void
2606 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2607 {
2608  char *ptr;
2609  char *oldptr;
2610  long capa = len + expand;
2611 
2612  if (len > capa) len = capa;
2613 
2614  if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2615  ptr = RSTRING(str)->as.heap.ptr;
2616  STR_SET_EMBED(str);
2617  memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2618  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2619  STR_SET_LEN(str, len);
2620  return;
2621  }
2622 
2623  ptr = ALLOC_N(char, (size_t)capa + termlen);
2624  oldptr = RSTRING_PTR(str);
2625  if (oldptr) {
2626  memcpy(ptr, oldptr, len);
2627  }
2628  if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2629  xfree(oldptr);
2630  }
2631  STR_SET_NOEMBED(str);
2632  FL_UNSET(str, STR_SHARED|STR_NOFREE);
2633  TERM_FILL(ptr + len, termlen);
2634  RSTRING(str)->as.heap.ptr = ptr;
2635  STR_SET_LEN(str, len);
2636  RSTRING(str)->as.heap.aux.capa = capa;
2637 }
2638 
2639 void
2641 {
2642  if (!str_independent(str))
2643  str_make_independent(str);
2644  ENC_CODERANGE_CLEAR(str);
2645 }
2646 
2647 void
2648 rb_str_modify_expand(VALUE str, long expand)
2649 {
2650  int termlen = TERM_LEN(str);
2651  long len = RSTRING_LEN(str);
2652 
2653  if (expand < 0) {
2654  rb_raise(rb_eArgError, "negative expanding string size");
2655  }
2656  if (expand >= LONG_MAX - len) {
2657  rb_raise(rb_eArgError, "string size too big");
2658  }
2659 
2660  if (!str_independent(str)) {
2661  str_make_independent_expand(str, len, expand, termlen);
2662  }
2663  else if (expand > 0) {
2664  RESIZE_CAPA_TERM(str, len + expand, termlen);
2665  }
2666  ENC_CODERANGE_CLEAR(str);
2667 }
2668 
2669 /* As rb_str_modify(), but don't clear coderange */
2670 static void
2671 str_modify_keep_cr(VALUE str)
2672 {
2673  if (!str_independent(str))
2674  str_make_independent(str);
2675  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2676  /* Force re-scan later */
2677  ENC_CODERANGE_CLEAR(str);
2678 }
2679 
2680 static inline void
2681 str_discard(VALUE str)
2682 {
2683  str_modifiable(str);
2684  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2685  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2686  RSTRING(str)->as.heap.ptr = 0;
2687  STR_SET_LEN(str, 0);
2688  }
2689 }
2690 
2691 void
2693 {
2694  int encindex = rb_enc_get_index(str);
2695 
2696  if (RB_UNLIKELY(encindex == -1)) {
2697  rb_raise(rb_eTypeError, "not encoding capable object");
2698  }
2699 
2700  if (RB_LIKELY(str_encindex_fastpath(encindex))) {
2701  return;
2702  }
2703 
2704  rb_encoding *enc = rb_enc_from_index(encindex);
2705  if (!rb_enc_asciicompat(enc)) {
2706  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2707  }
2708 }
2709 
2710 VALUE
2712 {
2713  VALUE s = *ptr;
2714  if (!RB_TYPE_P(s, T_STRING)) {
2715  s = rb_str_to_str(s);
2716  *ptr = s;
2717  }
2718  return s;
2719 }
2720 
2721 char *
2723 {
2724  VALUE str = rb_string_value(ptr);
2725  return RSTRING_PTR(str);
2726 }
2727 
/* Returns 1 when the first n bytes at s are all zero (trivially so for
 * n <= 0), 0 otherwise. */
static int
zero_filled(const char *s, int n)
{
    const char *const stop = s + (n > 0 ? n : 0);

    while (s < stop) {
        if (*s != '\0') return 0;
        s++;
    }
    return 1;
}
2736 
2737 static const char *
2738 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2739 {
2740  const char *e = s + len;
2741 
2742  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2743  if (zero_filled(s, minlen)) return s;
2744  }
2745  return 0;
2746 }
2747 
2748 static char *
2749 str_fill_term(VALUE str, char *s, long len, int termlen)
2750 {
2751  /* This function assumes that (capa + termlen) bytes of memory
2752  * is allocated, like many other functions in this file.
2753  */
2754  if (str_dependent_p(str)) {
2755  if (!zero_filled(s + len, termlen))
2756  str_make_independent_expand(str, len, 0L, termlen);
2757  }
2758  else {
2759  TERM_FILL(s + len, termlen);
2760  return s;
2761  }
2762  return RSTRING_PTR(str);
2763 }
2764 
2765 void
2766 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2767 {
2768  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2769  long len = RSTRING_LEN(str);
2770 
2771  RUBY_ASSERT(capa >= len);
2772  if (capa - len < termlen) {
2773  rb_check_lockedtmp(str);
2774  str_make_independent_expand(str, len, 0L, termlen);
2775  }
2776  else if (str_dependent_p(str)) {
2777  if (termlen > oldtermlen)
2778  str_make_independent_expand(str, len, 0L, termlen);
2779  }
2780  else {
2781  if (!STR_EMBED_P(str)) {
2782  /* modify capa instead of realloc */
2783  RUBY_ASSERT(!FL_TEST((str), STR_SHARED));
2784  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2785  }
2786  if (termlen > oldtermlen) {
2787  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2788  }
2789  }
2790 
2791  return;
2792 }
2793 
2794 static char *
2795 str_null_check(VALUE str, int *w)
2796 {
2797  char *s = RSTRING_PTR(str);
2798  long len = RSTRING_LEN(str);
2799  rb_encoding *enc = rb_enc_get(str);
2800  const int minlen = rb_enc_mbminlen(enc);
2801 
2802  if (minlen > 1) {
2803  *w = 1;
2804  if (str_null_char(s, len, minlen, enc)) {
2805  return NULL;
2806  }
2807  return str_fill_term(str, s, len, minlen);
2808  }
2809  *w = 0;
2810  if (!s || memchr(s, 0, len)) {
2811  return NULL;
2812  }
2813  if (s[len]) {
2814  s = str_fill_term(str, s, len, minlen);
2815  }
2816  return s;
2817 }
2818 
2819 char *
2820 rb_str_to_cstr(VALUE str)
2821 {
2822  int w;
2823  return str_null_check(str, &w);
2824 }
2825 
2826 char *
2828 {
2829  VALUE str = rb_string_value(ptr);
2830  int w;
2831  char *s = str_null_check(str, &w);
2832  if (!s) {
2833  if (w) {
2834  rb_raise(rb_eArgError, "string contains null char");
2835  }
2836  rb_raise(rb_eArgError, "string contains null byte");
2837  }
2838  return s;
2839 }
2840 
2841 char *
2842 rb_str_fill_terminator(VALUE str, const int newminlen)
2843 {
2844  char *s = RSTRING_PTR(str);
2845  long len = RSTRING_LEN(str);
2846  return str_fill_term(str, s, len, newminlen);
2847 }
2848 
2849 VALUE
2851 {
2852  str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2853  return str;
2854 }
2855 
2856 /*
2857  * call-seq:
2858  * String.try_convert(object) -> object, new_string, or nil
2859  *
2860  * If +object+ is a +String+ object, returns +object+.
2861  *
2862  * Otherwise if +object+ responds to <tt>:to_str</tt>,
2863  * calls <tt>object.to_str</tt> and returns the result.
2864  *
2865  * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2866  *
2867  * Raises an exception unless <tt>object.to_str</tt> returns a +String+ object.
2868  */
2869 static VALUE
2870 rb_str_s_try_convert(VALUE dummy, VALUE str)
2871 {
2872  return rb_check_string_type(str);
2873 }
2874 
2875 static char*
2876 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2877 {
2878  long nth = *nthp;
2879  if (rb_enc_mbmaxlen(enc) == 1) {
2880  p += nth;
2881  }
2882  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2883  p += nth * rb_enc_mbmaxlen(enc);
2884  }
2885  else if (rb_enc_asciicompat(enc)) {
2886  const char *p2, *e2;
2887  int n;
2888 
2889  while (p < e && 0 < nth) {
2890  e2 = p + nth;
2891  if (e < e2) {
2892  *nthp = nth;
2893  return (char *)e;
2894  }
2895  if (ISASCII(*p)) {
2896  p2 = search_nonascii(p, e2);
2897  if (!p2) {
2898  nth -= e2 - p;
2899  *nthp = nth;
2900  return (char *)e2;
2901  }
2902  nth -= p2 - p;
2903  p = p2;
2904  }
2905  n = rb_enc_mbclen(p, e, enc);
2906  p += n;
2907  nth--;
2908  }
2909  *nthp = nth;
2910  if (nth != 0) {
2911  return (char *)e;
2912  }
2913  return (char *)p;
2914  }
2915  else {
2916  while (p < e && nth--) {
2917  p += rb_enc_mbclen(p, e, enc);
2918  }
2919  }
2920  if (p > e) p = e;
2921  *nthp = nth;
2922  return (char*)p;
2923 }
2924 
2925 char*
2926 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2927 {
2928  return str_nth_len(p, e, &nth, enc);
2929 }
2930 
2931 static char*
2932 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2933 {
2934  if (singlebyte)
2935  p += nth;
2936  else {
2937  p = str_nth_len(p, e, &nth, enc);
2938  }
2939  if (!p) return 0;
2940  if (p > e) p = e;
2941  return (char *)p;
2942 }
2943 
2944 /* char offset to byte offset */
2945 static long
2946 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2947 {
2948  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2949  if (!pp) return e - p;
2950  return pp - p;
2951 }
2952 
2953 long
2954 rb_str_offset(VALUE str, long pos)
2955 {
2956  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2957  STR_ENC_GET(str), single_byte_optimizable(str));
2958 }
2959 
2960 #ifdef NONASCII_MASK
2961 static char *
2962 str_utf8_nth(const char *p, const char *e, long *nthp)
2963 {
2964  long nth = *nthp;
2965  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2966  const uintptr_t *s, *t;
2967  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2968  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2969  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2970  while (p < (const char *)s) {
2971  if (is_utf8_lead_byte(*p)) nth--;
2972  p++;
2973  }
2974  do {
2975  nth -= count_utf8_lead_bytes_with_word(s);
2976  s++;
2977  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2978  p = (char *)s;
2979  }
2980  while (p < e) {
2981  if (is_utf8_lead_byte(*p)) {
2982  if (nth == 0) break;
2983  nth--;
2984  }
2985  p++;
2986  }
2987  *nthp = nth;
2988  return (char *)p;
2989 }
2990 
/* Byte distance from p to the position str_utf8_nth() reports for `nth`. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const hit = str_utf8_nth(p, e, &remaining);
    return hit - p;
}
2997 #endif
2998 
2999 /* byte offset to char offset */
3000 long
3001 rb_str_sublen(VALUE str, long pos)
3002 {
3003  if (single_byte_optimizable(str) || pos < 0)
3004  return pos;
3005  else {
3006  char *p = RSTRING_PTR(str);
3007  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
3008  }
3009 }
3010 
3011 static VALUE
3012 str_subseq(VALUE str, long beg, long len)
3013 {
3014  VALUE str2;
3015 
3016  RUBY_ASSERT(beg >= 0);
3017  RUBY_ASSERT(len >= 0);
3018  RUBY_ASSERT(beg+len <= RSTRING_LEN(str));
3019 
3020  const int termlen = TERM_LEN(str);
3021  if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
3022  str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
3023  RB_GC_GUARD(str);
3024  return str2;
3025  }
3026 
3027  str2 = str_alloc_heap(rb_cString);
3028  if (str_embed_capa(str2) >= len + termlen) {
3029  char *ptr2 = RSTRING(str2)->as.embed.ary;
3030  STR_SET_EMBED(str2);
3031  memcpy(ptr2, RSTRING_PTR(str) + beg, len);
3032  TERM_FILL(ptr2+len, termlen);
3033 
3034  STR_SET_LEN(str2, len);
3035  RB_GC_GUARD(str);
3036  }
3037  else {
3038  str_replace_shared(str2, str);
3039  RUBY_ASSERT(!STR_EMBED_P(str2));
3040  ENC_CODERANGE_CLEAR(str2);
3041  RSTRING(str2)->as.heap.ptr += beg;
3042  if (RSTRING_LEN(str2) > len) {
3043  STR_SET_LEN(str2, len);
3044  }
3045  }
3046 
3047  return str2;
3048 }
3049 
3050 VALUE
3051 rb_str_subseq(VALUE str, long beg, long len)
3052 {
3053  VALUE str2 = str_subseq(str, beg, len);
3054  rb_enc_cr_str_copy_for_substr(str2, str);
3055  return str2;
3056 }
3057 
3058 char *
3059 rb_str_subpos(VALUE str, long beg, long *lenp)
3060 {
3061  long len = *lenp;
3062  long slen = -1L;
3063  const long blen = RSTRING_LEN(str);
3064  rb_encoding *enc = STR_ENC_GET(str);
3065  char *p, *s = RSTRING_PTR(str), *e = s + blen;
3066 
3067  if (len < 0) return 0;
3068  if (beg < 0 && -beg < 0) return 0;
3069  if (!blen) {
3070  len = 0;
3071  }
3072  if (single_byte_optimizable(str)) {
3073  if (beg > blen) return 0;
3074  if (beg < 0) {
3075  beg += blen;
3076  if (beg < 0) return 0;
3077  }
3078  if (len > blen - beg)
3079  len = blen - beg;
3080  if (len < 0) return 0;
3081  p = s + beg;
3082  goto end;
3083  }
3084  if (beg < 0) {
3085  if (len > -beg) len = -beg;
3086  if ((ENC_CODERANGE(str) == ENC_CODERANGE_VALID) &&
3087  (-beg * rb_enc_mbmaxlen(enc) < blen / 8)) {
3088  beg = -beg;
3089  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
3090  p = e;
3091  if (!p) return 0;
3092  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
3093  if (!p) return 0;
3094  len = e - p;
3095  goto end;
3096  }
3097  else {
3098  slen = str_strlen(str, enc);
3099  beg += slen;
3100  if (beg < 0) return 0;
3101  p = s + beg;
3102  if (len == 0) goto end;
3103  }
3104  }
3105  else if (beg > 0 && beg > blen) {
3106  return 0;
3107  }
3108  if (len == 0) {
3109  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
3110  p = s + beg;
3111  }
3112 #ifdef NONASCII_MASK
3113  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
3114  enc == rb_utf8_encoding()) {
3115  p = str_utf8_nth(s, e, &beg);
3116  if (beg > 0) return 0;
3117  len = str_utf8_offset(p, e, len);
3118  }
3119 #endif
3120  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
3121  int char_sz = rb_enc_mbmaxlen(enc);
3122 
3123  p = s + beg * char_sz;
3124  if (p > e) {
3125  return 0;
3126  }
3127  else if (len * char_sz > e - p)
3128  len = e - p;
3129  else
3130  len *= char_sz;
3131  }
3132  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
3133  if (beg > 0) return 0;
3134  len = 0;
3135  }
3136  else {
3137  len = str_offset(p, e, len, enc, 0);
3138  }
3139  end:
3140  *lenp = len;
3141  RB_GC_GUARD(str);
3142  return p;
3143 }
3144 
3145 static VALUE str_substr(VALUE str, long beg, long len, int empty);
3146 
3147 VALUE
3148 rb_str_substr(VALUE str, long beg, long len)
3149 {
3150  return str_substr(str, beg, len, TRUE);
3151 }
3152 
3153 VALUE
3154 rb_str_substr_two_fixnums(VALUE str, VALUE beg, VALUE len, int empty)
3155 {
3156  return str_substr(str, NUM2LONG(beg), NUM2LONG(len), empty);
3157 }
3158 
3159 static VALUE
3160 str_substr(VALUE str, long beg, long len, int empty)
3161 {
3162  char *p = rb_str_subpos(str, beg, &len);
3163 
3164  if (!p) return Qnil;
3165  if (!len && !empty) return Qnil;
3166 
3167  beg = p - RSTRING_PTR(str);
3168 
3169  VALUE str2 = str_subseq(str, beg, len);
3170  rb_enc_cr_str_copy_for_substr(str2, str);
3171  return str2;
3172 }
3173 
3174 /* :nodoc: */
3175 VALUE
3177 {
3178  if (CHILLED_STRING_P(str)) {
3179  FL_UNSET_RAW(str, STR_CHILLED);
3180  }
3181 
3182  if (OBJ_FROZEN(str)) return str;
3183  rb_str_resize(str, RSTRING_LEN(str));
3184  return rb_obj_freeze(str);
3185 }
3186 
3187 /*
3188  * call-seq:
3189  * +string -> new_string or self
3190  *
3191  * Returns +self+ if +self+ is not frozen.
3192  *
3193  * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3194  */
3195 static VALUE
3196 str_uplus(VALUE str)
3197 {
3198  if (OBJ_FROZEN(str) || CHILLED_STRING_P(str)) {
3199  return rb_str_dup(str);
3200  }
3201  else {
3202  return str;
3203  }
3204 }
3205 
3206 /*
3207  * call-seq:
3208  * -string -> frozen_string
3209  * dedup -> frozen_string
3210  *
3211  * Returns a frozen, possibly pre-existing copy of the string.
3212  *
3213  * The returned +String+ will be deduplicated as long as it does not have
3214  * any instance variables set on it and is not a String subclass.
3215  *
3216  * Note that <tt>-string</tt> variant is more convenient for defining
3217  * constants:
3218  *
3219  * FILENAME = -'config/database.yml'
3220  *
3221  * while +dedup+ is better suited for using the method in chains
3222  * of calculations:
3223  *
3224  * @url_list.concat(urls.map(&:dedup))
3225  *
3226  */
3227 static VALUE
3228 str_uminus(VALUE str)
3229 {
3230  if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3231  str = rb_str_dup(str);
3232  }
3233  return rb_fstring(str);
3234 }
3235 
/* Provide rb_str_dup_frozen(str) in terms of rb_str_new_frozen(str); how the
 * alias is emitted depends on RUBY_ALIAS_FUNCTION -- presumably a forwarding
 * definition, confirm against the macro. */
3236 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
/* From here on, uses of rb_str_dup_frozen in this file are rewritten to
 * rb_str_new_frozen by the preprocessor. */
3237 #define rb_str_dup_frozen rb_str_new_frozen
3238 
3239 VALUE
3240 rb_str_locktmp(VALUE str)
3241 {
3242  if (FL_TEST(str, STR_TMPLOCK)) {
3243  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3244  }
3245  FL_SET(str, STR_TMPLOCK);
3246  return str;
3247 }
3248 
3249 VALUE
3251 {
3252  if (!FL_TEST(str, STR_TMPLOCK)) {
3253  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3254  }
3255  FL_UNSET(str, STR_TMPLOCK);
3256  return str;
3257 }
3258 
3259 VALUE
3260 rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3261 {
3262  rb_str_locktmp(str);
3263  return rb_ensure(func, arg, rb_str_unlocktmp, str);
3264 }
3265 
3266 void
3268 {
3269  long capa;
3270  const int termlen = TERM_LEN(str);
3271 
3272  str_modifiable(str);
3273  if (STR_SHARED_P(str)) {
3274  rb_raise(rb_eRuntimeError, "can't set length of shared string");
3275  }
3276  if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3277  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3278  }
3279 
3280  int cr = ENC_CODERANGE(str);
3281  if (len == 0) {
3282  /* Empty string does not contain non-ASCII */
3284  }
3285  else if (cr == ENC_CODERANGE_UNKNOWN) {
3286  /* Leave unknown. */
3287  }
3288  else if (len > RSTRING_LEN(str)) {
3289  if (ENC_CODERANGE_CLEAN_P(cr)) {
3290  /* Update the coderange regarding the extended part. */
3291  const char *const prev_end = RSTRING_END(str);
3292  const char *const new_end = RSTRING_PTR(str) + len;
3293  rb_encoding *enc = rb_enc_get(str);
3294  rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3295  ENC_CODERANGE_SET(str, cr);
3296  }
3297  else if (cr == ENC_CODERANGE_BROKEN) {
3298  /* May be valid now, by appended part. */
3300  }
3301  }
3302  else if (len < RSTRING_LEN(str)) {
3303  if (cr != ENC_CODERANGE_7BIT) {
3304  /* ASCII-only string is keeping after truncated. Valid
3305  * and broken may be invalid or valid, leave unknown. */
3307  }
3308  }
3309 
3310  STR_SET_LEN(str, len);
3311  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3312 }
3313 
3314 VALUE
3316 {
3317  if (len < 0) {
3318  rb_raise(rb_eArgError, "negative string size (or size too big)");
3319  }
3320 
3321  int independent = str_independent(str);
3322  long slen = RSTRING_LEN(str);
3323  const int termlen = TERM_LEN(str);
3324 
3325  if (slen > len || (termlen != 1 && slen < len)) {
3326  ENC_CODERANGE_CLEAR(str);
3327  }
3328 
3329  {
3330  long capa;
3331  if (STR_EMBED_P(str)) {
3332  if (len == slen) return str;
3333  if (str_embed_capa(str) >= len + termlen) {
3334  STR_SET_LEN(str, len);
3335  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3336  return str;
3337  }
3338  str_make_independent_expand(str, slen, len - slen, termlen);
3339  }
3340  else if (str_embed_capa(str) >= len + termlen) {
3341  char *ptr = STR_HEAP_PTR(str);
3342  STR_SET_EMBED(str);
3343  if (slen > len) slen = len;
3344  if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3345  TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3346  STR_SET_LEN(str, len);
3347  if (independent) ruby_xfree(ptr);
3348  return str;
3349  }
3350  else if (!independent) {
3351  if (len == slen) return str;
3352  str_make_independent_expand(str, slen, len - slen, termlen);
3353  }
3354  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3355  (capa - len) > (len < 1024 ? len : 1024)) {
3356  SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3357  (size_t)len + termlen, STR_HEAP_SIZE(str));
3358  RSTRING(str)->as.heap.aux.capa = len;
3359  }
3360  else if (len == slen) return str;
3361  STR_SET_LEN(str, len);
3362  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3363  }
3364  return str;
3365 }
3366 
3367 static void
3368 str_ensure_available_capa(VALUE str, long len)
3369 {
3370  str_modify_keep_cr(str);
3371 
3372  const int termlen = TERM_LEN(str);
3373  long olen = RSTRING_LEN(str);
3374 
3375  if (RB_UNLIKELY(olen > LONG_MAX - len)) {
3376  rb_raise(rb_eArgError, "string sizes too big");
3377  }
3378 
3379  long total = olen + len;
3380  long capa = str_capacity(str, termlen);
3381 
3382  if (capa < total) {
3383  if (total >= LONG_MAX / 2) {
3384  capa = total;
3385  }
3386  while (total > capa) {
3387  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3388  }
3389  RESIZE_CAPA_TERM(str, capa, termlen);
3390  }
3391 }
3392 
3393 static VALUE
3394 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3395 {
3396  if (keep_cr) {
3397  str_modify_keep_cr(str);
3398  }
3399  else {
3400  rb_str_modify(str);
3401  }
3402  if (len == 0) return 0;
3403 
3404  long total, olen, off = -1;
3405  char *sptr;
3406  const int termlen = TERM_LEN(str);
3407 
3408  RSTRING_GETMEM(str, sptr, olen);
3409  if (ptr >= sptr && ptr <= sptr + olen) {
3410  off = ptr - sptr;
3411  }
3412 
3413  long capa = str_capacity(str, termlen);
3414 
3415  if (olen > LONG_MAX - len) {
3416  rb_raise(rb_eArgError, "string sizes too big");
3417  }
3418  total = olen + len;
3419  if (capa < total) {
3420  if (total >= LONG_MAX / 2) {
3421  capa = total;
3422  }
3423  while (total > capa) {
3424  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3425  }
3426  RESIZE_CAPA_TERM(str, capa, termlen);
3427  sptr = RSTRING_PTR(str);
3428  }
3429  if (off != -1) {
3430  ptr = sptr + off;
3431  }
3432  memcpy(sptr + olen, ptr, len);
3433  STR_SET_LEN(str, total);
3434  TERM_FILL(sptr + total, termlen); /* sentinel */
3435 
3436  return str;
3437 }
3438 
/* Convenience wrappers around str_buf_cat4() that do not keep the code
 * range. str_buf_cat2() takes the length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3441 
3442 VALUE
3443 rb_str_cat(VALUE str, const char *ptr, long len)
3444 {
3445  if (len == 0) return str;
3446  if (len < 0) {
3447  rb_raise(rb_eArgError, "negative string size (or size too big)");
3448  }
3449  return str_buf_cat(str, ptr, len);
3450 }
3451 
3452 VALUE
3453 rb_str_cat_cstr(VALUE str, const char *ptr)
3454 {
3455  must_not_null(ptr);
3456  return rb_str_buf_cat(str, ptr, strlen(ptr));
3457 }
3458 
3459 static void
3460 rb_str_buf_cat_byte(VALUE str, unsigned char byte)
3461 {
3462  RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT || RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII);
3463 
3464  // We can't write directly to shared strings without impacting others, so we must make the string independent.
3465  if (UNLIKELY(!str_independent(str))) {
3466  str_make_independent(str);
3467  }
3468 
3469  long string_length = -1;
3470  const int null_terminator_length = 1;
3471  char *sptr;
3472  RSTRING_GETMEM(str, sptr, string_length);
3473 
3474  // Ensure the resulting string wouldn't be too long.
3475  if (UNLIKELY(string_length > LONG_MAX - 1)) {
3476  rb_raise(rb_eArgError, "string sizes too big");
3477  }
3478 
3479  long string_capacity = str_capacity(str, null_terminator_length);
3480 
3481  // Get the code range before any modifications since those might clear the code range.
3482  int cr = ENC_CODERANGE(str);
3483 
3484  // Check if the string has spare string_capacity to write the new byte.
3485  if (LIKELY(string_capacity >= string_length + 1)) {
3486  // In fast path we can write the new byte and note the string's new length.
3487  sptr[string_length] = byte;
3488  STR_SET_LEN(str, string_length + 1);
3489  TERM_FILL(sptr + string_length + 1, null_terminator_length);
3490  }
3491  else {
3492  // If there's not enough string_capacity, make a call into the general string concatenation function.
3493  str_buf_cat(str, (char *)&byte, 1);
3494  }
3495 
3496  // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
3497  // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
3498  // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
3499  // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
3500  if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
3501  if (ISASCII(byte)) {
3503  }
3504  else {
3506 
3507  // Promote a US-ASCII string to ASCII-8BIT when a non-ASCII byte is appended.
3508  if (UNLIKELY(RB_ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)) {
3509  rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
3510  }
3511  }
3512  }
3513 }
3514 
/* Older entry-point names kept as aliases: rb_str_buf_cat maps to
 * rb_str_cat, and both rb_str_buf_cat2 and rb_str_cat2 map to
 * rb_str_cat_cstr. How each alias is emitted depends on RUBY_ALIAS_FUNCTION
 * -- presumably a forwarding definition, confirm against the macro. */
3515 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3516 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3517 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3518 
3519 static VALUE
3520 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3521  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3522 {
3523  int str_encindex = ENCODING_GET(str);
3524  int res_encindex;
3525  int str_cr, res_cr;
3526  rb_encoding *str_enc, *ptr_enc;
3527 
3528  str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3529 
3530  if (str_encindex == ptr_encindex) {
3531  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3532  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3533  }
3534  }
3535  else {
3536  str_enc = rb_enc_from_index(str_encindex);
3537  ptr_enc = rb_enc_from_index(ptr_encindex);
3538  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3539  if (len == 0)
3540  return str;
3541  if (RSTRING_LEN(str) == 0) {
3542  rb_str_buf_cat(str, ptr, len);
3543  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3544  rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3545  return str;
3546  }
3547  goto incompatible;
3548  }
3549  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3550  ptr_cr = coderange_scan(ptr, len, ptr_enc);
3551  }
3552  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3553  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3554  str_cr = rb_enc_str_coderange(str);
3555  }
3556  }
3557  }
3558  if (ptr_cr_ret)
3559  *ptr_cr_ret = ptr_cr;
3560 
3561  if (str_encindex != ptr_encindex &&
3562  str_cr != ENC_CODERANGE_7BIT &&
3563  ptr_cr != ENC_CODERANGE_7BIT) {
3564  str_enc = rb_enc_from_index(str_encindex);
3565  ptr_enc = rb_enc_from_index(ptr_encindex);
3566  goto incompatible;
3567  }
3568 
3569  if (str_cr == ENC_CODERANGE_UNKNOWN) {
3570  res_encindex = str_encindex;
3571  res_cr = ENC_CODERANGE_UNKNOWN;
3572  }
3573  else if (str_cr == ENC_CODERANGE_7BIT) {
3574  if (ptr_cr == ENC_CODERANGE_7BIT) {
3575  res_encindex = str_encindex;
3576  res_cr = ENC_CODERANGE_7BIT;
3577  }
3578  else {
3579  res_encindex = ptr_encindex;
3580  res_cr = ptr_cr;
3581  }
3582  }
3583  else if (str_cr == ENC_CODERANGE_VALID) {
3584  res_encindex = str_encindex;
3585  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3586  res_cr = str_cr;
3587  else
3588  res_cr = ptr_cr;
3589  }
3590  else { /* str_cr == ENC_CODERANGE_BROKEN */
3591  res_encindex = str_encindex;
3592  res_cr = str_cr;
3593  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3594  }
3595 
3596  if (len < 0) {
3597  rb_raise(rb_eArgError, "negative string size (or size too big)");
3598  }
3599  str_buf_cat(str, ptr, len);
3600  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3601  return str;
3602 
3603  incompatible:
3604  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3605  rb_enc_inspect_name(str_enc), rb_enc_inspect_name(ptr_enc));
3607 }
3608 
3609 VALUE
3610 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3611 {
3612  return rb_enc_cr_str_buf_cat(str, ptr, len,
3613  rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3614 }
3615 
3616 VALUE
3617 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
3618 {
3619  /* ptr must reference NUL terminated ASCII string. */
3620  int encindex = ENCODING_GET(str);
3621  rb_encoding *enc = rb_enc_from_index(encindex);
3622  if (rb_enc_asciicompat(enc)) {
3623  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3624  encindex, ENC_CODERANGE_7BIT, 0);
3625  }
3626  else {
3627  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3628  while (*ptr) {
3629  unsigned int c = (unsigned char)*ptr;
3630  int len = rb_enc_codelen(c, enc);
3631  rb_enc_mbcput(c, buf, enc);
3632  rb_enc_cr_str_buf_cat(str, buf, len,
3633  encindex, ENC_CODERANGE_VALID, 0);
3634  ptr++;
3635  }
3636  return str;
3637  }
3638 }
3639 
3640 VALUE
3642 {
3643  int str2_cr = rb_enc_str_coderange(str2);
3644 
3645  if (str_enc_fastpath(str)) {
3646  switch (str2_cr) {
3647  case ENC_CODERANGE_7BIT:
3648  // If RHS is 7bit we can do simple concatenation
3649  str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3650  RB_GC_GUARD(str2);
3651  return str;
3652  case ENC_CODERANGE_VALID:
3653  // If RHS is valid, we can do simple concatenation if encodings are the same
3654  if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3655  str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3656  int str_cr = ENC_CODERANGE(str);
3657  if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3658  ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3659  }
3660  RB_GC_GUARD(str2);
3661  return str;
3662  }
3663  }
3664  }
3665 
3666  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3667  ENCODING_GET(str2), str2_cr, &str2_cr);
3668 
3669  ENC_CODERANGE_SET(str2, str2_cr);
3670 
3671  return str;
3672 }
3673 
3674 VALUE
3676 {
3677  StringValue(str2);
3678  return rb_str_buf_append(str, str2);
3679 }
3680 
3681 VALUE
3682 rb_str_concat_literals(size_t num, const VALUE *strary)
3683 {
3684  VALUE str;
3685  size_t i, s = 0;
3686  unsigned long len = 1;
3687 
3688  if (UNLIKELY(!num)) return rb_str_new(0, 0);
3689  if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3690 
3691  for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3692  str = rb_str_buf_new(len);
3693  str_enc_copy_direct(str, strary[0]);
3694 
3695  for (i = s; i < num; ++i) {
3696  const VALUE v = strary[i];
3697  int encidx = ENCODING_GET(v);
3698 
3699  rb_str_buf_append(str, v);
3700  if (encidx != ENCINDEX_US_ASCII) {
3701  if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3702  rb_enc_set_index(str, encidx);
3703  }
3704  }
3705  return str;
3706 }
3707 
3708 /*
3709  * call-seq:
3710  * concat(*objects) -> string
3711  *
3712  * Concatenates each object in +objects+ to +self+ and returns +self+:
3713  *
3714  * s = 'foo'
3715  * s.concat('bar', 'baz') # => "foobarbaz"
3716  * s # => "foobarbaz"
3717  *
3718  * For each given object +object+ that is an Integer,
3719  * the value is considered a codepoint and converted to a character before concatenation:
3720  *
3721  * s = 'foo'
3722  * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3723  *
3724  * Related: String#<<, which takes a single argument.
3725  */
3726 static VALUE
3727 rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3728 {
3729  str_modifiable(str);
3730 
3731  if (argc == 1) {
3732  return rb_str_concat(str, argv[0]);
3733  }
3734  else if (argc > 1) {
3735  int i;
3736  VALUE arg_str = rb_str_tmp_new(0);
3737  rb_enc_copy(arg_str, str);
3738  for (i = 0; i < argc; i++) {
3739  rb_str_concat(arg_str, argv[i]);
3740  }
3741  rb_str_buf_append(str, arg_str);
3742  }
3743 
3744  return str;
3745 }
3746 
3747 /*
3748  * call-seq:
3749  * append_as_bytes(*objects) -> string
3750  *
3751  * Concatenates each object in +objects+ into +self+ without any encoding
3752  * validation or conversion and returns +self+:
3753  *
3754  * s = 'foo'
3755  * s.append_as_bytes(" \xE2\x82") # => "foo \xE2\x82"
3756  * s.valid_encoding? # => false
3757  * s.append_as_bytes("\xAC 12")
3758  * s.valid_encoding? # => true
3759  *
3760  * For each given object +object+ that is an Integer,
3761  * the value is considered a Byte. If the Integer is bigger
3762  * than one byte, only the lower byte is considered, similar to String#setbyte:
3763  *
3764  * s = ""
3765  * s.append_as_bytes(0, 257) # => "\u0000\u0001"
3766  *
3767  * Related: String#<<, String#concat, which do an encoding aware concatenation.
3768  */
3769 
3770 VALUE
3771 rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
3772 {
3773  long needed_capacity = 0;
3774  volatile VALUE t0;
3775  enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
3776 
3777  for (int index = 0; index < argc; index++) {
3778  VALUE obj = argv[index];
3779  enum ruby_value_type type = types[index] = rb_type(obj);
3780  switch (type) {
3781  case T_FIXNUM:
3782  case T_BIGNUM:
3783  needed_capacity++;
3784  break;
3785  case T_STRING:
3786  needed_capacity += RSTRING_LEN(obj);
3787  break;
3788  default:
3789  rb_raise(
3790  rb_eTypeError,
3791  "wrong argument type %"PRIsVALUE" (expected String or Integer)",
3792  rb_obj_class(obj)
3793  );
3794  break;
3795  }
3796  }
3797 
3798  str_ensure_available_capa(str, needed_capacity);
3799  char *sptr = RSTRING_END(str);
3800 
3801  for (int index = 0; index < argc; index++) {
3802  VALUE obj = argv[index];
3803  enum ruby_value_type type = types[index];
3804  switch (type) {
3805  case T_FIXNUM:
3806  case T_BIGNUM: {
3807  argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
3808  char byte = (char)(NUM2INT(obj) & 0xFF);
3809  *sptr = byte;
3810  sptr++;
3811  break;
3812  }
3813  case T_STRING: {
3814  const char *ptr;
3815  long len;
3816  RSTRING_GETMEM(obj, ptr, len);
3817  memcpy(sptr, ptr, len);
3818  sptr += len;
3819  break;
3820  }
3821  default:
3822  rb_bug("append_as_bytes arguments should have been validated");
3823  }
3824  }
3825 
3826  STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
3827  TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
3828 
3829  int cr = ENC_CODERANGE(str);
3830  switch (cr) {
3831  case ENC_CODERANGE_7BIT: {
3832  for (int index = 0; index < argc; index++) {
3833  VALUE obj = argv[index];
3834  enum ruby_value_type type = types[index];
3835  switch (type) {
3836  case T_FIXNUM:
3837  case T_BIGNUM: {
3838  if (!ISASCII(NUM2INT(obj))) {
3839  goto clear_cr;
3840  }
3841  break;
3842  }
3843  case T_STRING: {
3844  if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
3845  goto clear_cr;
3846  }
3847  break;
3848  }
3849  default:
3850  rb_bug("append_as_bytes arguments should have been validated");
3851  }
3852  }
3853  break;
3854  }
3855  case ENC_CODERANGE_VALID:
3856  if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
3857  goto keep_cr;
3858  }
3859  else {
3860  goto clear_cr;
3861  }
3862  break;
3863  default:
3864  goto clear_cr;
3865  break;
3866  }
3867 
3868  RB_GC_GUARD(t0);
3869 
3870  clear_cr:
3871  // If no fast path was hit, we clear the coderange.
3872  // append_as_bytes is predominently meant to be used in
3873  // buffering situation, hence it's likely the coderange
3874  // will never be scanned, so it's not worth spending time
3875  // precomputing the coderange except for simple and common
3876  // situations.
3877  ENC_CODERANGE_CLEAR(str);
3878  keep_cr:
3879  return str;
3880 }
3881 
3882 /*
3883  * call-seq:
3884  * string << object -> string
3885  *
3886  * Concatenates +object+ to +self+ and returns +self+:
3887  *
3888  * s = 'foo'
3889  * s << 'bar' # => "foobar"
3890  * s # => "foobar"
3891  *
3892  * If +object+ is an Integer,
3893  * the value is considered a codepoint and converted to a character before concatenation:
3894  *
3895  * s = 'foo'
3896  * s << 33 # => "foo!"
3897  *
3898  * If that codepoint is not representable in the encoding of
3899  * _string_, RangeError is raised.
3900  *
3901  * s = 'foo'
3902  * s.encoding # => <Encoding:UTF-8>
3903  * s << 0x00110000 # 1114112 out of char range (RangeError)
3904  * s = 'foo'.encode('EUC-JP')
3905  * s << 0x00800080 # invalid codepoint 0x800080 in EUC-JP (RangeError)
3906  *
3907  * If the encoding is US-ASCII and the codepoint is 0..0xff, _string_
3908  * is automatically promoted to ASCII-8BIT.
3909  *
3910  * s = 'foo'.encode('US-ASCII')
3911  * s << 0xff
3912  * s.encoding # => #<Encoding:BINARY (ASCII-8BIT)>
3913  *
3914  * Related: String#concat, which takes multiple arguments.
3915  */
3916 VALUE
3918 {
3919  unsigned int code;
3920  rb_encoding *enc = STR_ENC_GET(str1);
3921  int encidx;
3922 
3923  if (RB_INTEGER_TYPE_P(str2)) {
3924  if (rb_num_to_uint(str2, &code) == 0) {
3925  }
3926  else if (FIXNUM_P(str2)) {
3927  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3928  }
3929  else {
3930  rb_raise(rb_eRangeError, "bignum out of char range");
3931  }
3932  }
3933  else {
3934  return rb_str_append(str1, str2);
3935  }
3936 
3937  encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3938 
3939  if (encidx >= 0) {
3940  rb_str_buf_cat_byte(str1, (unsigned char)code);
3941  }
3942  else {
3943  long pos = RSTRING_LEN(str1);
3944  int cr = ENC_CODERANGE(str1);
3945  int len;
3946  char *buf;
3947 
3948  switch (len = rb_enc_codelen(code, enc)) {
3949  case ONIGERR_INVALID_CODE_POINT_VALUE:
3950  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3951  break;
3952  case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3953  case 0:
3954  rb_raise(rb_eRangeError, "%u out of char range", code);
3955  break;
3956  }
3957  buf = ALLOCA_N(char, len + 1);
3958  rb_enc_mbcput(code, buf, enc);
3959  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3960  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3961  }
3962  rb_str_resize(str1, pos+len);
3963  memcpy(RSTRING_PTR(str1) + pos, buf, len);
3964  if (cr == ENC_CODERANGE_7BIT && code > 127) {
3965  cr = ENC_CODERANGE_VALID;
3966  }
3967  else if (cr == ENC_CODERANGE_BROKEN) {
3968  cr = ENC_CODERANGE_UNKNOWN;
3969  }
3970  ENC_CODERANGE_SET(str1, cr);
3971  }
3972  return str1;
3973 }
3974 
3975 int
3976 rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3977 {
3978  int encidx = rb_enc_to_index(enc);
3979 
3980  if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3981  /* US-ASCII automatically extended to ASCII-8BIT */
3982  if (code > 0xFF) {
3983  rb_raise(rb_eRangeError, "%u out of char range", code);
3984  }
3985  if (encidx == ENCINDEX_US_ASCII && code > 127) {
3986  return ENCINDEX_ASCII_8BIT;
3987  }
3988  return encidx;
3989  }
3990  else {
3991  return -1;
3992  }
3993 }
3994 
3995 /*
3996  * call-seq:
3997  * prepend(*other_strings) -> string
3998  *
3999  * Prepends each string in +other_strings+ to +self+ and returns +self+:
4000  *
4001  * s = 'foo'
4002  * s.prepend('bar', 'baz') # => "barbazfoo"
4003  * s # => "barbazfoo"
4004  *
4005  * Related: String#concat.
4006  */
4007 
4008 static VALUE
4009 rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
4010 {
4011  str_modifiable(str);
4012 
4013  if (argc == 1) {
4014  rb_str_update(str, 0L, 0L, argv[0]);
4015  }
4016  else if (argc > 1) {
4017  int i;
4018  VALUE arg_str = rb_str_tmp_new(0);
4019  rb_enc_copy(arg_str, str);
4020  for (i = 0; i < argc; i++) {
4021  rb_str_append(arg_str, argv[i]);
4022  }
4023  rb_str_update(str, 0L, 0L, arg_str);
4024  }
4025 
4026  return str;
4027 }
4028 
4029 st_index_t
4031 {
4032  if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
4033  st_index_t precomputed_hash;
4034  memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash));
4035 
4036  RUBY_ASSERT(precomputed_hash == str_do_hash(str));
4037  return precomputed_hash;
4038  }
4039 
4040  return str_do_hash(str);
4041 }
4042 
4043 int
4045 {
4046  long len1, len2;
4047  const char *ptr1, *ptr2;
4048  RSTRING_GETMEM(str1, ptr1, len1);
4049  RSTRING_GETMEM(str2, ptr2, len2);
4050  return (len1 != len2 ||
4051  !rb_str_comparable(str1, str2) ||
4052  memcmp(ptr1, ptr2, len1) != 0);
4053 }
4054 
4055 /*
4056  * call-seq:
4057  * hash -> integer
4058  *
4059  * Returns the integer hash value for +self+.
4060  * The value is based on the length, content and encoding of +self+.
4061  *
4062  * Related: Object#hash.
4063  */
4064 
4065 static VALUE
4066 rb_str_hash_m(VALUE str)
4067 {
4068  st_index_t hval = rb_str_hash(str);
4069  return ST2FIX(hval);
4070 }
4071 
/* Smaller of a and b.  Each argument appears twice in the expansion, so
 * arguments with side effects must not be passed. */
4072 #define lesser(a,b) (((a)>(b))?(b):(a))
4073 
4074 int
4076 {
4077  int idx1, idx2;
4078  int rc1, rc2;
4079 
4080  if (RSTRING_LEN(str1) == 0) return TRUE;
4081  if (RSTRING_LEN(str2) == 0) return TRUE;
4082  idx1 = ENCODING_GET(str1);
4083  idx2 = ENCODING_GET(str2);
4084  if (idx1 == idx2) return TRUE;
4085  rc1 = rb_enc_str_coderange(str1);
4086  rc2 = rb_enc_str_coderange(str2);
4087  if (rc1 == ENC_CODERANGE_7BIT) {
4088  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
4090  return TRUE;
4091  }
4092  if (rc2 == ENC_CODERANGE_7BIT) {
4094  return TRUE;
4095  }
4096  return FALSE;
4097 }
4098 
4099 int
4101 {
4102  long len1, len2;
4103  const char *ptr1, *ptr2;
4104  int retval;
4105 
4106  if (str1 == str2) return 0;
4107  RSTRING_GETMEM(str1, ptr1, len1);
4108  RSTRING_GETMEM(str2, ptr2, len2);
4109  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
4110  if (len1 == len2) {
4111  if (!rb_str_comparable(str1, str2)) {
4112  if (ENCODING_GET(str1) > ENCODING_GET(str2))
4113  return 1;
4114  return -1;
4115  }
4116  return 0;
4117  }
4118  if (len1 > len2) return 1;
4119  return -1;
4120  }
4121  if (retval > 0) return 1;
4122  return -1;
4123 }
4124 
4125 /*
4126  * call-seq:
4127  * string == object -> true or false
4128  * string === object -> true or false
4129  *
4130  * Returns +true+ if +object+ has the same length and content
4131  * as +self+; +false+ otherwise:
4132  *
4133  * s = 'foo'
4134  * s == 'foo' # => true
4135  * s == 'food' # => false
4136  * s == 'FOO' # => false
4137  *
4138  * Returns +false+ if the two strings' encodings are not compatible:
4139  * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
4140  *
4141  * If +object+ is not an instance of +String+ but responds to +to_str+, then the
4142  * two strings are compared using <code>object.==</code>.
4143  */
4144 
4145 VALUE
4147 {
4148  if (str1 == str2) return Qtrue;
4149  if (!RB_TYPE_P(str2, T_STRING)) {
4150  if (!rb_respond_to(str2, idTo_str)) {
4151  return Qfalse;
4152  }
4153  return rb_equal(str2, str1);
4154  }
4155  return rb_str_eql_internal(str1, str2);
4156 }
4157 
4158 /*
4159  * call-seq:
4160  * eql?(object) -> true or false
4161  *
4162  * Returns +true+ if +object+ has the same length and content
4163  * as +self+; +false+ otherwise:
4164  *
4165  * s = 'foo'
4166  * s.eql?('foo') # => true
4167  * s.eql?('food') # => false
4168  * s.eql?('FOO') # => false
4169  *
4170  * Returns +false+ if the two strings' encodings are not compatible:
4171  *
4172  * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
4173  *
4174  */
4175 
4176 VALUE
4177 rb_str_eql(VALUE str1, VALUE str2)
4178 {
4179  if (str1 == str2) return Qtrue;
4180  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
4181  return rb_str_eql_internal(str1, str2);
4182 }
4183 
4184 /*
4185  * call-seq:
4186  * string <=> other_string -> -1, 0, 1, or nil
4187  *
4188  * Compares +self+ and +other_string+, returning:
4189  *
4190  * - -1 if +other_string+ is larger.
4191  * - 0 if the two are equal.
4192  * - 1 if +other_string+ is smaller.
4193  * - +nil+ if the two are incomparable.
4194  *
4195  * Examples:
4196  *
4197  * 'foo' <=> 'foo' # => 0
4198  * 'foo' <=> 'food' # => -1
4199  * 'food' <=> 'foo' # => 1
4200  * 'FOO' <=> 'foo' # => -1
4201  * 'foo' <=> 'FOO' # => 1
4202  * 'foo' <=> 1 # => nil
4203  *
4204  */
4205 
4206 static VALUE
4207 rb_str_cmp_m(VALUE str1, VALUE str2)
4208 {
4209  int result;
4210  VALUE s = rb_check_string_type(str2);
4211  if (NIL_P(s)) {
4212  return rb_invcmp(str1, str2);
4213  }
4214  result = rb_str_cmp(str1, s);
4215  return INT2FIX(result);
4216 }
4217 
4218 static VALUE str_casecmp(VALUE str1, VALUE str2);
4219 static VALUE str_casecmp_p(VALUE str1, VALUE str2);
4220 
4221 /*
4222  * call-seq:
4223  * casecmp(other_string) -> -1, 0, 1, or nil
4224  *
4225  * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
4226  *
4227  * - -1 if <tt>other_string.downcase</tt> is larger.
4228  * - 0 if the two are equal.
4229  * - 1 if <tt>other_string.downcase</tt> is smaller.
4230  * - +nil+ if the two are incomparable.
4231  *
4232  * Examples:
4233  *
4234  * 'foo'.casecmp('foo') # => 0
4235  * 'foo'.casecmp('food') # => -1
4236  * 'food'.casecmp('foo') # => 1
4237  * 'FOO'.casecmp('foo') # => 0
4238  * 'foo'.casecmp('FOO') # => 0
4239  * 'foo'.casecmp(1) # => nil
4240  *
4241  * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4242  *
4243  * Related: String#casecmp?.
4244  *
4245  */
4246 
4247 static VALUE
4248 rb_str_casecmp(VALUE str1, VALUE str2)
4249 {
4250  VALUE s = rb_check_string_type(str2);
4251  if (NIL_P(s)) {
4252  return Qnil;
4253  }
4254  return str_casecmp(str1, s);
4255 }
4256 
4257 static VALUE
4258 str_casecmp(VALUE str1, VALUE str2)
4259 {
4260  long len;
4261  rb_encoding *enc;
4262  const char *p1, *p1end, *p2, *p2end;
4263 
4264  enc = rb_enc_compatible(str1, str2);
4265  if (!enc) {
4266  return Qnil;
4267  }
4268 
4269  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
4270  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
4271  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
4272  while (p1 < p1end && p2 < p2end) {
4273  if (*p1 != *p2) {
4274  unsigned int c1 = TOLOWER(*p1 & 0xff);
4275  unsigned int c2 = TOLOWER(*p2 & 0xff);
4276  if (c1 != c2)
4277  return INT2FIX(c1 < c2 ? -1 : 1);
4278  }
4279  p1++;
4280  p2++;
4281  }
4282  }
4283  else {
4284  while (p1 < p1end && p2 < p2end) {
4285  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
4286  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
4287 
4288  if (0 <= c1 && 0 <= c2) {
4289  c1 = TOLOWER(c1);
4290  c2 = TOLOWER(c2);
4291  if (c1 != c2)
4292  return INT2FIX(c1 < c2 ? -1 : 1);
4293  }
4294  else {
4295  int r;
4296  l1 = rb_enc_mbclen(p1, p1end, enc);
4297  l2 = rb_enc_mbclen(p2, p2end, enc);
4298  len = l1 < l2 ? l1 : l2;
4299  r = memcmp(p1, p2, len);
4300  if (r != 0)
4301  return INT2FIX(r < 0 ? -1 : 1);
4302  if (l1 != l2)
4303  return INT2FIX(l1 < l2 ? -1 : 1);
4304  }
4305  p1 += l1;
4306  p2 += l2;
4307  }
4308  }
4309  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
4310  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
4311  return INT2FIX(-1);
4312 }
4313 
4314 /*
4315  * call-seq:
4316  * casecmp?(other_string) -> true, false, or nil
4317  *
4318  * Returns +true+ if +self+ and +other_string+ are equal after
4319  * Unicode case folding, otherwise +false+:
4320  *
4321  * 'foo'.casecmp?('foo') # => true
4322  * 'foo'.casecmp?('food') # => false
4323  * 'food'.casecmp?('foo') # => false
4324  * 'FOO'.casecmp?('foo') # => true
4325  * 'foo'.casecmp?('FOO') # => true
4326  *
4327  * Returns +nil+ if the two values are incomparable:
4328  *
4329  * 'foo'.casecmp?(1) # => nil
4330  *
4331  * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
4332  *
4333  * Related: String#casecmp.
4334  *
4335  */
4336 
4337 static VALUE
4338 rb_str_casecmp_p(VALUE str1, VALUE str2)
4339 {
4340  VALUE s = rb_check_string_type(str2);
4341  if (NIL_P(s)) {
4342  return Qnil;
4343  }
4344  return str_casecmp_p(str1, s);
4345 }
4346 
4347 static VALUE
4348 str_casecmp_p(VALUE str1, VALUE str2)
4349 {
4350  rb_encoding *enc;
4351  VALUE folded_str1, folded_str2;
4352  VALUE fold_opt = sym_fold;
4353 
4354  enc = rb_enc_compatible(str1, str2);
4355  if (!enc) {
4356  return Qnil;
4357  }
4358 
4359  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
4360  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
4361 
4362  return rb_str_eql(folded_str1, folded_str2);
4363 }
4364 
4365 static long
4366 strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
4367  const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
4368 {
4369  const char *search_start = str_ptr;
4370  long pos, search_len = str_len - offset;
4371 
4372  for (;;) {
4373  const char *t;
4374  pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
4375  if (pos < 0) return pos;
4376  t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
4377  if (t == search_start + pos) break;
4378  search_len -= t - search_start;
4379  if (search_len <= 0) return -1;
4380  offset += t - search_start;
4381  search_start = t;
4382  }
4383  return pos + offset;
4384 }
4385 
4386 /* found index in byte */
4387 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
4388 #define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
4389 
4390 static long
4391 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
4392 {
4393  const char *str_ptr, *str_ptr_end, *sub_ptr;
4394  long str_len, sub_len;
4395  rb_encoding *enc;
4396 
4397  enc = rb_enc_check(str, sub);
4398  if (is_broken_string(sub)) return -1;
4399 
4400  str_ptr = RSTRING_PTR(str);
4401  str_ptr_end = RSTRING_END(str);
4402  str_len = RSTRING_LEN(str);
4403  sub_ptr = RSTRING_PTR(sub);
4404  sub_len = RSTRING_LEN(sub);
4405 
4406  if (str_len < sub_len) return -1;
4407 
4408  if (offset != 0) {
4409  long str_len_char, sub_len_char;
4410  int single_byte = single_byte_optimizable(str);
4411  str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4412  sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4413  if (offset < 0) {
4414  offset += str_len_char;
4415  if (offset < 0) return -1;
4416  }
4417  if (str_len_char - offset < sub_len_char) return -1;
4418  if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4419  str_ptr += offset;
4420  }
4421  if (sub_len == 0) return offset;
4422 
4423  /* need proceed one character at a time */
4424  return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4425 }
4426 
4427 
4428 /*
4429  * call-seq:
4430  * index(substring, offset = 0) -> integer or nil
4431  * index(regexp, offset = 0) -> integer or nil
4432  *
4433  * :include: doc/string/index.rdoc
4434  *
4435  */
4436 
4437 static VALUE
4438 rb_str_index_m(int argc, VALUE *argv, VALUE str)
4439 {
4440  VALUE sub;
4441  VALUE initpos;
4442  rb_encoding *enc = STR_ENC_GET(str);
4443  long pos;
4444 
4445  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4446  long slen = str_strlen(str, enc); /* str's enc */
4447  pos = NUM2LONG(initpos);
4448  if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4449  if (RB_TYPE_P(sub, T_REGEXP)) {
4451  }
4452  return Qnil;
4453  }
4454  }
4455  else {
4456  pos = 0;
4457  }
4458 
4459  if (RB_TYPE_P(sub, T_REGEXP)) {
4460  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4461  enc, single_byte_optimizable(str));
4462 
4463  if (rb_reg_search(sub, str, pos, 0) >= 0) {
4464  VALUE match = rb_backref_get();
4465  struct re_registers *regs = RMATCH_REGS(match);
4466  pos = rb_str_sublen(str, BEG(0));
4467  return LONG2NUM(pos);
4468  }
4469  }
4470  else {
4471  StringValue(sub);
4472  pos = rb_str_index(str, sub, pos);
4473  if (pos >= 0) {
4474  pos = rb_str_sublen(str, pos);
4475  return LONG2NUM(pos);
4476  }
4477  }
4478  return Qnil;
4479 }
4480 
4481 /* Ensure that the given pos is a valid character boundary.
4482  * Note that in this function, "character" means a code point
4483  * (Unicode scalar value), not a grapheme cluster.
4484  */
4485 static void
4486 str_ensure_byte_pos(VALUE str, long pos)
4487 {
4488  if (!single_byte_optimizable(str)) {
4489  const char *s = RSTRING_PTR(str);
4490  const char *e = RSTRING_END(str);
4491  const char *p = s + pos;
4492  if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4494  "offset %ld does not land on character boundary", pos);
4495  }
4496  }
4497 }
4498 
4499 /*
4500  * call-seq:
4501  * byteindex(substring, offset = 0) -> integer or nil
4502  * byteindex(regexp, offset = 0) -> integer or nil
4503  *
4504  * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4505  * or +nil+ if none found:
4506  *
4507  * 'foo'.byteindex('f') # => 0
4508  * 'foo'.byteindex('o') # => 1
4509  * 'foo'.byteindex('oo') # => 1
4510  * 'foo'.byteindex('ooo') # => nil
4511  *
4512  * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4513  * or +nil+ if none found:
4514  *
4515  * 'foo'.byteindex(/f/) # => 0
4516  * 'foo'.byteindex(/o/) # => 1
4517  * 'foo'.byteindex(/oo/) # => 1
4518  * 'foo'.byteindex(/ooo/) # => nil
4519  *
4520  * Integer argument +offset+, if given, specifies the byte-based position in the
4521  * string to begin the search:
4522  *
4523  * 'foo'.byteindex('o', 1) # => 1
4524  * 'foo'.byteindex('o', 2) # => 2
4525  * 'foo'.byteindex('o', 3) # => nil
4526  *
4527  * If +offset+ is negative, counts backward from the end of +self+:
4528  *
4529  * 'foo'.byteindex('o', -1) # => 2
4530  * 'foo'.byteindex('o', -2) # => 1
4531  * 'foo'.byteindex('o', -3) # => 1
4532  * 'foo'.byteindex('o', -4) # => nil
4533  *
4534  * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4535  * raised.
4536  *
4537  * Related: String#index, String#byterindex.
4538  */
4539 
4540 static VALUE
4541 rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4542 {
4543  VALUE sub;
4544  VALUE initpos;
4545  long pos;
4546 
4547  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4548  long slen = RSTRING_LEN(str);
4549  pos = NUM2LONG(initpos);
4550  if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4551  if (RB_TYPE_P(sub, T_REGEXP)) {
4553  }
4554  return Qnil;
4555  }
4556  }
4557  else {
4558  pos = 0;
4559  }
4560 
4561  str_ensure_byte_pos(str, pos);
4562 
4563  if (RB_TYPE_P(sub, T_REGEXP)) {
4564  if (rb_reg_search(sub, str, pos, 0) >= 0) {
4565  VALUE match = rb_backref_get();
4566  struct re_registers *regs = RMATCH_REGS(match);
4567  pos = BEG(0);
4568  return LONG2NUM(pos);
4569  }
4570  }
4571  else {
4572  StringValue(sub);
4573  pos = rb_str_byteindex(str, sub, pos);
4574  if (pos >= 0) return LONG2NUM(pos);
4575  }
4576  return Qnil;
4577 }
4578 
#ifndef HAVE_MEMRCHR
/*
 * Fallback used when HAVE_MEMRCHR is not defined: returns a pointer to the
 * last byte equal to +chr+ within the first +search_len+ bytes of
 * +search_str+, or a null pointer when there is none.  Bytes are compared as
 * unsigned char.
 */
static void*
memrchr(const char *search_str, int chr, long search_len)
{
    for (long remaining = search_len; remaining > 0; remaining--) {
        const char *candidate = search_str + (remaining - 1);
        if ((unsigned char)*candidate == chr) {
            return (void *)candidate;
        }
    }

    return ((void *)0);
}
#endif
4591 
4592 static long
4593 str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4594 {
4595  char *hit, *adjusted;
4596  int c;
4597  long slen, searchlen;
4598  char *sbeg, *e, *t;
4599 
4600  sbeg = RSTRING_PTR(str);
4601  slen = RSTRING_LEN(sub);
4602  if (slen == 0) return s - sbeg;
4603  e = RSTRING_END(str);
4604  t = RSTRING_PTR(sub);
4605  c = *t & 0xff;
4606  searchlen = s - sbeg + 1;
4607 
4608  if (memcmp(s, t, slen) == 0) {
4609  return s - sbeg;
4610  }
4611 
4612  do {
4613  hit = memrchr(sbeg, c, searchlen);
4614  if (!hit) break;
4615  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4616  if (hit != adjusted) {
4617  searchlen = adjusted - sbeg;
4618  continue;
4619  }
4620  if (memcmp(hit, t, slen) == 0)
4621  return hit - sbeg;
4622  searchlen = adjusted - sbeg;
4623  } while (searchlen > 0);
4624 
4625  return -1;
4626 }
4627 
4628 /* found index in byte */
4629 static long
4630 rb_str_rindex(VALUE str, VALUE sub, long pos)
4631 {
4632  long len, slen;
4633  char *sbeg, *s;
4634  rb_encoding *enc;
4635  int singlebyte;
4636 
4637  enc = rb_enc_check(str, sub);
4638  if (is_broken_string(sub)) return -1;
4639  singlebyte = single_byte_optimizable(str);
4640  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4641  slen = str_strlen(sub, enc); /* rb_enc_check */
4642 
4643  /* substring longer than string */
4644  if (len < slen) return -1;
4645  if (len - pos < slen) pos = len - slen;
4646  if (len == 0) return pos;
4647 
4648  sbeg = RSTRING_PTR(str);
4649 
4650  if (pos == 0) {
4651  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4652  return 0;
4653  else
4654  return -1;
4655  }
4656 
4657  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4658  return str_rindex(str, sub, s, enc);
4659 }
4660 
4661 /*
4662  * call-seq:
4663  * rindex(substring, offset = self.length) -> integer or nil
4664  * rindex(regexp, offset = self.length) -> integer or nil
4665  *
4666  * Returns the Integer index of the _last_ occurrence of the given +substring+,
4667  * or +nil+ if none found:
4668  *
4669  * 'foo'.rindex('f') # => 0
4670  * 'foo'.rindex('o') # => 2
4671  * 'foo'.rindex('oo') # => 1
4672  * 'foo'.rindex('ooo') # => nil
4673  *
4674  * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4675  * or +nil+ if none found:
4676  *
4677  * 'foo'.rindex(/f/) # => 0
4678  * 'foo'.rindex(/o/) # => 2
4679  * 'foo'.rindex(/oo/) # => 1
4680  * 'foo'.rindex(/ooo/) # => nil
4681  *
4682  * The _last_ match means starting at the possible last position, not
4683  * the last of longest matches.
4684  *
4685  * 'foo'.rindex(/o+/) # => 2
4686  * $~ #=> #<MatchData "o">
4687  *
4688  * To get the last longest match, combine the pattern with a negative
4689  * lookbehind.
4690  *
4691  * 'foo'.rindex(/(?<!o)o+/) # => 1
4692  * $~ #=> #<MatchData "oo">
4693  *
4694  * Or use String#index with a negative lookahead.
4695  *
4696  * 'foo'.index(/o+(?!.*o)/) # => 1
4697  * $~ #=> #<MatchData "oo">
4698  *
4699  * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4700  * string to _end_ the search:
4701  *
4702  * 'foo'.rindex('o', 0) # => nil
4703  * 'foo'.rindex('o', 1) # => 1
4704  * 'foo'.rindex('o', 2) # => 2
4705  * 'foo'.rindex('o', 3) # => 2
4706  *
4707  * If +offset+ is a negative Integer, the maximum starting position in the
4708  * string to _end_ the search is the sum of the string's length and +offset+:
4709  *
4710  * 'foo'.rindex('o', -1) # => 2
4711  * 'foo'.rindex('o', -2) # => 1
4712  * 'foo'.rindex('o', -3) # => nil
4713  * 'foo'.rindex('o', -4) # => nil
4714  *
4715  * Related: String#index.
4716  */
4717 
4718 static VALUE
4719 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4720 {
4721  VALUE sub;
4722  VALUE initpos;
4723  rb_encoding *enc = STR_ENC_GET(str);
4724  long pos, len = str_strlen(str, enc); /* str's enc */
4725 
4726  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4727  pos = NUM2LONG(initpos);
4728  if (pos < 0 && (pos += len) < 0) {
4729  if (RB_TYPE_P(sub, T_REGEXP)) {
4731  }
4732  return Qnil;
4733  }
4734  if (pos > len) pos = len;
4735  }
4736  else {
4737  pos = len;
4738  }
4739 
4740  if (RB_TYPE_P(sub, T_REGEXP)) {
4741  /* enc = rb_enc_check(str, sub); */
4742  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4743  enc, single_byte_optimizable(str));
4744 
4745  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4746  VALUE match = rb_backref_get();
4747  struct re_registers *regs = RMATCH_REGS(match);
4748  pos = rb_str_sublen(str, BEG(0));
4749  return LONG2NUM(pos);
4750  }
4751  }
4752  else {
4753  StringValue(sub);
4754  pos = rb_str_rindex(str, sub, pos);
4755  if (pos >= 0) {
4756  pos = rb_str_sublen(str, pos);
4757  return LONG2NUM(pos);
4758  }
4759  }
4760  return Qnil;
4761 }
4762 
4763 static long
4764 rb_str_byterindex(VALUE str, VALUE sub, long pos)
4765 {
4766  long len, slen;
4767  char *sbeg, *s;
4768  rb_encoding *enc;
4769 
4770  enc = rb_enc_check(str, sub);
4771  if (is_broken_string(sub)) return -1;
4772  len = RSTRING_LEN(str);
4773  slen = RSTRING_LEN(sub);
4774 
4775  /* substring longer than string */
4776  if (len < slen) return -1;
4777  if (len - pos < slen) pos = len - slen;
4778  if (len == 0) return pos;
4779 
4780  sbeg = RSTRING_PTR(str);
4781 
4782  if (pos == 0) {
4783  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4784  return 0;
4785  else
4786  return -1;
4787  }
4788 
4789  s = sbeg + pos;
4790  return str_rindex(str, sub, s, enc);
4791 }
4792 
4793 
4794 /*
4795  * call-seq:
4796  * byterindex(substring, offset = self.bytesize) -> integer or nil
4797  * byterindex(regexp, offset = self.bytesize) -> integer or nil
4798  *
4799  * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4800  * or +nil+ if none found:
4801  *
4802  * 'foo'.byterindex('f') # => 0
4803  * 'foo'.byterindex('o') # => 2
4804  * 'foo'.byterindex('oo') # => 1
4805  * 'foo'.byterindex('ooo') # => nil
4806  *
4807  * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4808  * or +nil+ if none found:
4809  *
4810  * 'foo'.byterindex(/f/) # => 0
4811  * 'foo'.byterindex(/o/) # => 2
4812  * 'foo'.byterindex(/oo/) # => 1
4813  * 'foo'.byterindex(/ooo/) # => nil
4814  *
4815  * The _last_ match means starting at the possible last position, not
4816  * the last of longest matches.
4817  *
4818  * 'foo'.byterindex(/o+/) # => 2
4819  * $~ #=> #<MatchData "o">
4820  *
4821  * To get the last longest match, combine the pattern with a negative
4822  * lookbehind.
4823  *
4824  * 'foo'.byterindex(/(?<!o)o+/) # => 1
4825  * $~ #=> #<MatchData "oo">
4826  *
4827  * Or use String#byteindex with a negative lookahead.
4828  *
4829  * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4830  * $~ #=> #<MatchData "oo">
4831  *
4832  * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4833  * string to _end_ the search:
4834  *
4835  * 'foo'.byterindex('o', 0) # => nil
4836  * 'foo'.byterindex('o', 1) # => 1
4837  * 'foo'.byterindex('o', 2) # => 2
4838  * 'foo'.byterindex('o', 3) # => 2
4839  *
4840  * If +offset+ is a negative Integer, the maximum starting position in the
4841  * string to _end_ the search is the sum of the string's length and +offset+:
4842  *
4843  * 'foo'.byterindex('o', -1) # => 2
4844  * 'foo'.byterindex('o', -2) # => 1
4845  * 'foo'.byterindex('o', -3) # => nil
4846  * 'foo'.byterindex('o', -4) # => nil
4847  *
4848  * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4849  * raised.
4850  *
4851  * Related: String#byteindex.
4852  */
4853 
4854 static VALUE
4855 rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4856 {
4857  VALUE sub;
4858  VALUE initpos;
4859  long pos, len = RSTRING_LEN(str);
4860 
4861  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4862  pos = NUM2LONG(initpos);
4863  if (pos < 0 && (pos += len) < 0) {
4864  if (RB_TYPE_P(sub, T_REGEXP)) {
4866  }
4867  return Qnil;
4868  }
4869  if (pos > len) pos = len;
4870  }
4871  else {
4872  pos = len;
4873  }
4874 
4875  str_ensure_byte_pos(str, pos);
4876 
4877  if (RB_TYPE_P(sub, T_REGEXP)) {
4878  if (rb_reg_search(sub, str, pos, 1) >= 0) {
4879  VALUE match = rb_backref_get();
4880  struct re_registers *regs = RMATCH_REGS(match);
4881  pos = BEG(0);
4882  return LONG2NUM(pos);
4883  }
4884  }
4885  else {
4886  StringValue(sub);
4887  pos = rb_str_byterindex(str, sub, pos);
4888  if (pos >= 0) return LONG2NUM(pos);
4889  }
4890  return Qnil;
4891 }
4892 
4893 /*
4894  * call-seq:
4895  * string =~ regexp -> integer or nil
4896  * string =~ object -> integer or nil
4897  *
4898  * Returns the Integer index of the first substring that matches
4899  * the given +regexp+, or +nil+ if no match found:
4900  *
4901  * 'foo' =~ /f/ # => 0
4902  * 'foo' =~ /o/ # => 1
4903  * 'foo' =~ /x/ # => nil
4904  *
4905  * Note: also updates Regexp@Global+Variables.
4906  *
4907  * If the given +object+ is not a Regexp, returns the value
4908  * returned by <tt>object =~ self</tt>.
4909  *
4910  * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4911  * (see Regexp#=~):
4912  *
4913  * number= nil
4914  * "no. 9" =~ /(?<number>\d+)/
4915  * number # => nil (not assigned)
4916  * /(?<number>\d+)/ =~ "no. 9"
4917  * number #=> "9"
4918  *
4919  */
4920 
4921 static VALUE
4922 rb_str_match(VALUE x, VALUE y)
4923 {
4924  switch (OBJ_BUILTIN_TYPE(y)) {
4925  case T_STRING:
4926  rb_raise(rb_eTypeError, "type mismatch: String given");
4927 
4928  case T_REGEXP:
4929  return rb_reg_match(y, x);
4930 
4931  default:
4932  return rb_funcall(y, idEqTilde, 1, x);
4933  }
4934 }
4935 
4936 
4937 static VALUE get_pat(VALUE);
4938 
4939 
4940 /*
4941  * call-seq:
4942  * match(pattern, offset = 0) -> matchdata or nil
4943  * match(pattern, offset = 0) {|matchdata| ... } -> object
4944  *
4945  * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4946  *
4947  * Note: also updates Regexp@Global+Variables.
4948  *
4949  * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4950  * regexp = Regexp.new(pattern)
4951  * - Computes +matchdata+, which will be either a MatchData object or +nil+
4952  * (see Regexp#match):
4953  * matchdata = regexp.match(self)
4954  *
4955  * With no block given, returns the computed +matchdata+:
4956  *
4957  * 'foo'.match('f') # => #<MatchData "f">
4958  * 'foo'.match('o') # => #<MatchData "o">
4959  * 'foo'.match('x') # => nil
4960  *
4961  * If Integer argument +offset+ is given, the search begins at index +offset+:
4962  *
4963  * 'foo'.match('f', 1) # => nil
4964  * 'foo'.match('o', 1) # => #<MatchData "o">
4965  *
4966  * With a block given, calls the block with the computed +matchdata+
4967  * and returns the block's return value:
4968  *
4969  * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4970  * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4971  * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4972  *
4973  */
4974 
4975 static VALUE
4976 rb_str_match_m(int argc, VALUE *argv, VALUE str)
4977 {
4978  VALUE re, result;
4979  if (argc < 1)
4980  rb_check_arity(argc, 1, 2);
4981  re = argv[0];
4982  argv[0] = str;
4983  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4984  if (!NIL_P(result) && rb_block_given_p()) {
4985  return rb_yield(result);
4986  }
4987  return result;
4988 }
4989 
4990 /*
4991  * call-seq:
4992  * match?(pattern, offset = 0) -> true or false
4993  *
4994  * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4995  *
4996  * Note: does not update Regexp@Global+Variables.
4997  *
4998  * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4999  * regexp = Regexp.new(pattern)
5000  *
5001  * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
5002  * +false+ otherwise:
5003  *
5004  * 'foo'.match?(/o/) # => true
5005  * 'foo'.match?('o') # => true
5006  * 'foo'.match?(/x/) # => false
5007  *
5008  * If Integer argument +offset+ is given, the search begins at index +offset+:
5009  * 'foo'.match?('f', 1) # => false
5010  * 'foo'.match?('o', 1) # => true
5011  *
5012  */
5013 
5014 static VALUE
5015 rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
5016 {
5017  VALUE re;
5018  rb_check_arity(argc, 1, 2);
5019  re = get_pat(argv[0]);
5020  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
5021 }
5022 
/* Result of stepping one character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* no usable neighbor (e.g. invalid character) */
    NEIGHBOR_FOUND    = 1, /* the neighbor was written in place */
    NEIGHBOR_WRAPPED  = 2  /* stepping ran off the end of the range */
};
5028 
5029 static enum neighbor_char
5030 enc_succ_char(char *p, long len, rb_encoding *enc)
5031 {
5032  long i;
5033  int l;
5034 
5035  if (rb_enc_mbminlen(enc) > 1) {
5036  /* wchar, trivial case */
5037  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5038  if (!MBCLEN_CHARFOUND_P(r)) {
5039  return NEIGHBOR_NOT_CHAR;
5040  }
5041  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
5042  l = rb_enc_code_to_mbclen(c, enc);
5043  if (!l) return NEIGHBOR_NOT_CHAR;
5044  if (l != len) return NEIGHBOR_WRAPPED;
5045  rb_enc_mbcput(c, p, enc);
5046  r = rb_enc_precise_mbclen(p, p + len, enc);
5047  if (!MBCLEN_CHARFOUND_P(r)) {
5048  return NEIGHBOR_NOT_CHAR;
5049  }
5050  return NEIGHBOR_FOUND;
5051  }
5052  while (1) {
5053  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
5054  p[i] = '\0';
5055  if (i < 0)
5056  return NEIGHBOR_WRAPPED;
5057  ++((unsigned char*)p)[i];
5058  l = rb_enc_precise_mbclen(p, p+len, enc);
5059  if (MBCLEN_CHARFOUND_P(l)) {
5060  l = MBCLEN_CHARFOUND_LEN(l);
5061  if (l == len) {
5062  return NEIGHBOR_FOUND;
5063  }
5064  else {
5065  memset(p+l, 0xff, len-l);
5066  }
5067  }
5068  if (MBCLEN_INVALID_P(l) && i < len-1) {
5069  long len2;
5070  int l2;
5071  for (len2 = len-1; 0 < len2; len2--) {
5072  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5073  if (!MBCLEN_INVALID_P(l2))
5074  break;
5075  }
5076  memset(p+len2+1, 0xff, len-(len2+1));
5077  }
5078  }
5079 }
5080 
5081 static enum neighbor_char
5082 enc_pred_char(char *p, long len, rb_encoding *enc)
5083 {
5084  long i;
5085  int l;
5086  if (rb_enc_mbminlen(enc) > 1) {
5087  /* wchar, trivial case */
5088  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
5089  if (!MBCLEN_CHARFOUND_P(r)) {
5090  return NEIGHBOR_NOT_CHAR;
5091  }
5092  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
5093  if (!c) return NEIGHBOR_NOT_CHAR;
5094  --c;
5095  l = rb_enc_code_to_mbclen(c, enc);
5096  if (!l) return NEIGHBOR_NOT_CHAR;
5097  if (l != len) return NEIGHBOR_WRAPPED;
5098  rb_enc_mbcput(c, p, enc);
5099  r = rb_enc_precise_mbclen(p, p + len, enc);
5100  if (!MBCLEN_CHARFOUND_P(r)) {
5101  return NEIGHBOR_NOT_CHAR;
5102  }
5103  return NEIGHBOR_FOUND;
5104  }
5105  while (1) {
5106  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
5107  p[i] = '\xff';
5108  if (i < 0)
5109  return NEIGHBOR_WRAPPED;
5110  --((unsigned char*)p)[i];
5111  l = rb_enc_precise_mbclen(p, p+len, enc);
5112  if (MBCLEN_CHARFOUND_P(l)) {
5113  l = MBCLEN_CHARFOUND_LEN(l);
5114  if (l == len) {
5115  return NEIGHBOR_FOUND;
5116  }
5117  else {
5118  memset(p+l, 0, len-l);
5119  }
5120  }
5121  if (MBCLEN_INVALID_P(l) && i < len-1) {
5122  long len2;
5123  int l2;
5124  for (len2 = len-1; 0 < len2; len2--) {
5125  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
5126  if (!MBCLEN_INVALID_P(l2))
5127  break;
5128  }
5129  memset(p+len2+1, 0, len-(len2+1));
5130  }
5131  }
5132 }
5133 
5134 /*
5135  overwrite +p+ by succeeding letter in +enc+ and returns
5136  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
5137  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
5138  assuming each ranges are successive, and mbclen
5139  never change in each ranges.
5140  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
5141  character.
5142  */
5143 static enum neighbor_char
5144 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
5145 {
5146  enum neighbor_char ret;
5147  unsigned int c;
5148  int ctype;
5149  int range;
5150  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
5151 
5152  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
5153  int try;
5154  const int max_gaps = 1;
5155 
5156  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5157  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
5158  ctype = ONIGENC_CTYPE_DIGIT;
5159  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
5160  ctype = ONIGENC_CTYPE_ALPHA;
5161  else
5162  return NEIGHBOR_NOT_CHAR;
5163 
5164  MEMCPY(save, p, char, len);
5165  for (try = 0; try <= max_gaps; ++try) {
5166  ret = enc_succ_char(p, len, enc);
5167  if (ret == NEIGHBOR_FOUND) {
5168  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5169  if (rb_enc_isctype(c, ctype, enc))
5170  return NEIGHBOR_FOUND;
5171  }
5172  }
5173  MEMCPY(p, save, char, len);
5174  range = 1;
5175  while (1) {
5176  MEMCPY(save, p, char, len);
5177  ret = enc_pred_char(p, len, enc);
5178  if (ret == NEIGHBOR_FOUND) {
5179  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
5180  if (!rb_enc_isctype(c, ctype, enc)) {
5181  MEMCPY(p, save, char, len);
5182  break;
5183  }
5184  }
5185  else {
5186  MEMCPY(p, save, char, len);
5187  break;
5188  }
5189  range++;
5190  }
5191  if (range == 1) {
5192  return NEIGHBOR_NOT_CHAR;
5193  }
5194 
5195  if (ctype != ONIGENC_CTYPE_DIGIT) {
5196  MEMCPY(carry, p, char, len);
5197  return NEIGHBOR_WRAPPED;
5198  }
5199 
5200  MEMCPY(carry, p, char, len);
5201  enc_succ_char(carry, len, enc);
5202  return NEIGHBOR_WRAPPED;
5203 }
5204 
5205 
5206 static VALUE str_succ(VALUE str);
5207 
5208 /*
5209  * call-seq:
5210  * succ -> new_str
5211  *
5212  * Returns the successor to +self+. The successor is calculated by
5213  * incrementing characters.
5214  *
5215  * The first character to be incremented is the rightmost alphanumeric,
5216  * or, if no alphanumerics, the rightmost character:
5217  *
5218  * 'THX1138'.succ # => "THX1139"
5219  * '<<koala>>'.succ # => "<<koalb>>"
5220  * '***'.succ # => '**+'
5221  *
5222  * The successor to a digit is another digit, "carrying" to the next-left
5223  * character for a "rollover" from 9 to 0, and prepending another digit
5224  * if necessary:
5225  *
5226  * '00'.succ # => "01"
5227  * '09'.succ # => "10"
5228  * '99'.succ # => "100"
5229  *
5230  * The successor to a letter is another letter of the same case,
5231  * carrying to the next-left character for a rollover,
5232  * and prepending another same-case letter if necessary:
5233  *
5234  * 'aa'.succ # => "ab"
5235  * 'az'.succ # => "ba"
5236  * 'zz'.succ # => "aaa"
5237  * 'AA'.succ # => "AB"
5238  * 'AZ'.succ # => "BA"
5239  * 'ZZ'.succ # => "AAA"
5240  *
5241  * The successor to a non-alphanumeric character is the next character
5242  * in the underlying character set's collating sequence,
5243  * carrying to the next-left character for a rollover,
5244  * and prepending another character if necessary:
5245  *
5246  * s = 0.chr * 3
5247  * s # => "\x00\x00\x00"
5248  * s.succ # => "\x00\x00\x01"
5249  * s = 255.chr * 3
5250  * s # => "\xFF\xFF\xFF"
5251  * s.succ # => "\x01\x00\x00\x00"
5252  *
5253  * Carrying can occur between and among mixtures of alphanumeric characters:
5254  *
5255  * s = 'zz99zz99'
5256  * s.succ # => "aaa00aa00"
5257  * s = '99zz99zz'
5258  * s.succ # => "100aa00aa"
5259  *
5260  * The successor to an empty +String+ is a new empty +String+:
5261  *
5262  * ''.succ # => ""
5263  *
5264  */
5265 
5266 VALUE
5268 {
5269  VALUE str;
5270  str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
5271  rb_enc_cr_str_copy_for_substr(str, orig);
5272  return str_succ(str);
5273 }
5274 
5275 static VALUE
5276 str_succ(VALUE str)
5277 {
5278  rb_encoding *enc;
5279  char *sbeg, *s, *e, *last_alnum = 0;
5280  int found_alnum = 0;
5281  long l, slen;
5282  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
5283  long carry_pos = 0, carry_len = 1;
5284  enum neighbor_char neighbor = NEIGHBOR_FOUND;
5285 
5286  slen = RSTRING_LEN(str);
5287  if (slen == 0) return str;
5288 
5289  enc = STR_ENC_GET(str);
5290  sbeg = RSTRING_PTR(str);
5291  s = e = sbeg + slen;
5292 
5293  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5294  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
5295  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
5296  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
5297  break;
5298  }
5299  }
5300  l = rb_enc_precise_mbclen(s, e, enc);
5301  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5302  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5303  neighbor = enc_succ_alnum_char(s, l, enc, carry);
5304  switch (neighbor) {
5305  case NEIGHBOR_NOT_CHAR:
5306  continue;
5307  case NEIGHBOR_FOUND:
5308  return str;
5309  case NEIGHBOR_WRAPPED:
5310  last_alnum = s;
5311  break;
5312  }
5313  found_alnum = 1;
5314  carry_pos = s - sbeg;
5315  carry_len = l;
5316  }
5317  if (!found_alnum) { /* str contains no alnum */
5318  s = e;
5319  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
5320  enum neighbor_char neighbor;
5321  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
5322  l = rb_enc_precise_mbclen(s, e, enc);
5323  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
5324  l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
5325  MEMCPY(tmp, s, char, l);
5326  neighbor = enc_succ_char(tmp, l, enc);
5327  switch (neighbor) {
5328  case NEIGHBOR_FOUND:
5329  MEMCPY(s, tmp, char, l);
5330  return str;
5331  break;
5332  case NEIGHBOR_WRAPPED:
5333  MEMCPY(s, tmp, char, l);
5334  break;
5335  case NEIGHBOR_NOT_CHAR:
5336  break;
5337  }
5338  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
5339  /* wrapped to \0...\0. search next valid char. */
5340  enc_succ_char(s, l, enc);
5341  }
5342  if (!rb_enc_asciicompat(enc)) {
5343  MEMCPY(carry, s, char, l);
5344  carry_len = l;
5345  }
5346  carry_pos = s - sbeg;
5347  }
5349  }
5350  RESIZE_CAPA(str, slen + carry_len);
5351  sbeg = RSTRING_PTR(str);
5352  s = sbeg + carry_pos;
5353  memmove(s + carry_len, s, slen - carry_pos);
5354  memmove(s, carry, carry_len);
5355  slen += carry_len;
5356  STR_SET_LEN(str, slen);
5357  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
5358  rb_enc_str_coderange(str);
5359  return str;
5360 }
5361 
5362 
5363 /*
5364  * call-seq:
5365  * succ! -> self
5366  *
5367  * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
5368  */
5369 
5370 static VALUE
5371 rb_str_succ_bang(VALUE str)
5372 {
5373  rb_str_modify(str);
5374  str_succ(str);
5375  return str;
5376 }
5377 
/* Returns 1 if each of the first +len+ bytes at +s+ is a digit, else 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
5387 
5388 static int
5389 str_upto_i(VALUE str, VALUE arg)
5390 {
5391  rb_yield(str);
5392  return 0;
5393 }
5394 
5395 /*
5396  * call-seq:
5397  * upto(other_string, exclusive = false) {|string| ... } -> self
5398  * upto(other_string, exclusive = false) -> new_enumerator
5399  *
5400  * With a block given, calls the block with each +String+ value
5401  * returned by successive calls to String#succ;
5402  * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
5403  * the sequence terminates when value +other_string+ is reached;
5404  * returns +self+:
5405  *
5406  * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5407  * Output:
5408  *
5409  * a8 a9 b0 b1 b2 b3 b4 b5 b6
5410  *
5411  * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5412  *
5413  * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5414  *
5415  * Output:
5416  *
5417  * a8 a9 b0 b1 b2 b3 b4 b5
5418  *
5419  * If +other_string+ would not be reached, does not call the block:
5420  *
5421  * '25'.upto('5') {|s| fail s }
5422  * 'aa'.upto('a') {|s| fail s }
5423  *
5424  * With no block given, returns a new Enumerator:
5425  *
5426  * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5427  *
5428  */
5429 
5430 static VALUE
5431 rb_str_upto(int argc, VALUE *argv, VALUE beg)
5432 {
5433  VALUE end, exclusive;
5434 
5435  rb_scan_args(argc, argv, "11", &end, &exclusive);
5436  RETURN_ENUMERATOR(beg, argc, argv);
5437  return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5438 }
5439 
5440 VALUE
5441 rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5442 {
5443  VALUE current, after_end;
5444  ID succ;
5445  int n, ascii;
5446  rb_encoding *enc;
5447 
5448  CONST_ID(succ, "succ");
5449  StringValue(end);
5450  enc = rb_enc_check(beg, end);
5451  ascii = (is_ascii_string(beg) && is_ascii_string(end));
5452  /* single character */
5453  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5454  char c = RSTRING_PTR(beg)[0];
5455  char e = RSTRING_PTR(end)[0];
5456 
5457  if (c > e || (excl && c == e)) return beg;
5458  for (;;) {
5459  VALUE str = rb_enc_str_new(&c, 1, enc);
5461  if ((*each)(str, arg)) break;
5462  if (!excl && c == e) break;
5463  c++;
5464  if (excl && c == e) break;
5465  }
5466  return beg;
5467  }
5468  /* both edges are all digits */
5469  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5470  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5471  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5472  VALUE b, e;
5473  int width;
5474 
5475  width = RSTRING_LENINT(beg);
5476  b = rb_str_to_inum(beg, 10, FALSE);
5477  e = rb_str_to_inum(end, 10, FALSE);
5478  if (FIXNUM_P(b) && FIXNUM_P(e)) {
5479  long bi = FIX2LONG(b);
5480  long ei = FIX2LONG(e);
5481  rb_encoding *usascii = rb_usascii_encoding();
5482 
5483  while (bi <= ei) {
5484  if (excl && bi == ei) break;
5485  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5486  bi++;
5487  }
5488  }
5489  else {
5490  ID op = excl ? '<' : idLE;
5491  VALUE args[2], fmt = rb_fstring_lit("%.*d");
5492 
5493  args[0] = INT2FIX(width);
5494  while (rb_funcall(b, op, 1, e)) {
5495  args[1] = b;
5496  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5497  b = rb_funcallv(b, succ, 0, 0);
5498  }
5499  }
5500  return beg;
5501  }
5502  /* normal case */
5503  n = rb_str_cmp(beg, end);
5504  if (n > 0 || (excl && n == 0)) return beg;
5505 
5506  after_end = rb_funcallv(end, succ, 0, 0);
5507  current = str_duplicate(rb_cString, beg);
5508  while (!rb_str_equal(current, after_end)) {
5509  VALUE next = Qnil;
5510  if (excl || !rb_str_equal(current, end))
5511  next = rb_funcallv(current, succ, 0, 0);
5512  if ((*each)(current, arg)) break;
5513  if (NIL_P(next)) break;
5514  current = next;
5515  StringValue(current);
5516  if (excl && rb_str_equal(current, end)) break;
5517  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5518  break;
5519  }
5520 
5521  return beg;
5522 }
5523 
5524 VALUE
5525 rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5526 {
5527  VALUE current;
5528  ID succ;
5529 
5530  CONST_ID(succ, "succ");
5531  /* both edges are all digits */
5532  if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5533  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5534  VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5535  int width = RSTRING_LENINT(beg);
5536  b = rb_str_to_inum(beg, 10, FALSE);
5537  if (FIXNUM_P(b)) {
5538  long bi = FIX2LONG(b);
5539  rb_encoding *usascii = rb_usascii_encoding();
5540 
5541  while (FIXABLE(bi)) {
5542  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5543  bi++;
5544  }
5545  b = LONG2NUM(bi);
5546  }
5547  args[0] = INT2FIX(width);
5548  while (1) {
5549  args[1] = b;
5550  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5551  b = rb_funcallv(b, succ, 0, 0);
5552  }
5553  }
5554  /* normal case */
5555  current = str_duplicate(rb_cString, beg);
5556  while (1) {
5557  VALUE next = rb_funcallv(current, succ, 0, 0);
5558  if ((*each)(current, arg)) break;
5559  current = next;
5560  StringValue(current);
5561  if (RSTRING_LEN(current) == 0)
5562  break;
5563  }
5564 
5565  return beg;
5566 }
5567 
5568 static int
5569 include_range_i(VALUE str, VALUE arg)
5570 {
5571  VALUE *argp = (VALUE *)arg;
5572  if (!rb_equal(str, *argp)) return 0;
5573  *argp = Qnil;
5574  return 1;
5575 }
5576 
5577 VALUE
5578 rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5579 {
5580  beg = rb_str_new_frozen(beg);
5581  StringValue(end);
5582  end = rb_str_new_frozen(end);
5583  if (NIL_P(val)) return Qfalse;
5584  val = rb_check_string_type(val);
5585  if (NIL_P(val)) return Qfalse;
5586  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5587  rb_enc_asciicompat(STR_ENC_GET(end)) &&
5588  rb_enc_asciicompat(STR_ENC_GET(val))) {
5589  const char *bp = RSTRING_PTR(beg);
5590  const char *ep = RSTRING_PTR(end);
5591  const char *vp = RSTRING_PTR(val);
5592  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5593  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5594  return Qfalse;
5595  else {
5596  char b = *bp;
5597  char e = *ep;
5598  char v = *vp;
5599 
5600  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5601  if (b <= v && v < e) return Qtrue;
5602  return RBOOL(!RTEST(exclusive) && v == e);
5603  }
5604  }
5605  }
5606 #if 0
5607  /* both edges are all digits */
5608  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5609  all_digits_p(bp, RSTRING_LEN(beg)) &&
5610  all_digits_p(ep, RSTRING_LEN(end))) {
5611  /* TODO */
5612  }
5613 #endif
5614  }
5615  rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5616 
5617  return RBOOL(NIL_P(val));
5618 }
5619 
5620 static VALUE
5621 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5622 {
5623  if (rb_reg_search(re, str, 0, 0) >= 0) {
5624  VALUE match = rb_backref_get();
5625  int nth = rb_reg_backref_number(match, backref);
5626  return rb_reg_nth_match(nth, match);
5627  }
5628  return Qnil;
5629 }
5630 
5631 static VALUE
5632 rb_str_aref(VALUE str, VALUE indx)
5633 {
5634  long idx;
5635 
5636  if (FIXNUM_P(indx)) {
5637  idx = FIX2LONG(indx);
5638  }
5639  else if (RB_TYPE_P(indx, T_REGEXP)) {
5640  return rb_str_subpat(str, indx, INT2FIX(0));
5641  }
5642  else if (RB_TYPE_P(indx, T_STRING)) {
5643  if (rb_str_index(str, indx, 0) != -1)
5644  return str_duplicate(rb_cString, indx);
5645  return Qnil;
5646  }
5647  else {
5648  /* check if indx is Range */
5649  long beg, len = str_strlen(str, NULL);
5650  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5651  case Qfalse:
5652  break;
5653  case Qnil:
5654  return Qnil;
5655  default:
5656  return rb_str_substr(str, beg, len);
5657  }
5658  idx = NUM2LONG(indx);
5659  }
5660 
5661  return str_substr(str, idx, 1, FALSE);
5662 }
5663 
5664 
5665 /*
5666  * call-seq:
5667  * string[index] -> new_string or nil
5668  * string[start, length] -> new_string or nil
5669  * string[range] -> new_string or nil
5670  * string[regexp, capture = 0] -> new_string or nil
5671  * string[substring] -> new_string or nil
5672  *
5673  * Returns the substring of +self+ specified by the arguments.
5674  * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5675  *
5676  *
5677  */
5678 
5679 static VALUE
5680 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5681 {
5682  if (argc == 2) {
5683  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5684  return rb_str_subpat(str, argv[0], argv[1]);
5685  }
5686  else {
5687  return rb_str_substr_two_fixnums(str, argv[0], argv[1], TRUE);
5688  }
5689  }
5690  rb_check_arity(argc, 1, 2);
5691  return rb_str_aref(str, argv[0]);
5692 }
5693 
5694 VALUE
5696 {
5697  char *ptr = RSTRING_PTR(str);
5698  long olen = RSTRING_LEN(str), nlen;
5699 
5700  str_modifiable(str);
5701  if (len > olen) len = olen;
5702  nlen = olen - len;
5703  if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5704  char *oldptr = ptr;
5705  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5706  STR_SET_EMBED(str);
5707  ptr = RSTRING(str)->as.embed.ary;
5708  memmove(ptr, oldptr + len, nlen);
5709  if (fl == STR_NOEMBED) xfree(oldptr);
5710  }
5711  else {
5712  if (!STR_SHARED_P(str)) {
5713  VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5714  rb_enc_cr_str_exact_copy(shared, str);
5715  OBJ_FREEZE(shared);
5716  }
5717  ptr = RSTRING(str)->as.heap.ptr += len;
5718  }
5719  STR_SET_LEN(str, nlen);
5720 
5721  if (!SHARABLE_MIDDLE_SUBSTRING) {
5722  TERM_FILL(ptr + nlen, TERM_LEN(str));
5723  }
5724  ENC_CODERANGE_CLEAR(str);
5725  return str;
5726 }
5727 
5728 static void
5729 rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5730 {
5731  char *sptr;
5732  long slen;
5733  int cr;
5734 
5735  if (beg == 0 && vlen == 0) {
5736  rb_str_drop_bytes(str, len);
5737  return;
5738  }
5739 
5740  str_modify_keep_cr(str);
5741  RSTRING_GETMEM(str, sptr, slen);
5742  if (len < vlen) {
5743  /* expand string */
5744  RESIZE_CAPA(str, slen + vlen - len);
5745  sptr = RSTRING_PTR(str);
5746  }
5747 
5748  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
5749  cr = rb_enc_str_coderange(val);
5750  else
5751  cr = ENC_CODERANGE_UNKNOWN;
5752 
5753  if (vlen != len) {
5754  memmove(sptr + beg + vlen,
5755  sptr + beg + len,
5756  slen - (beg + len));
5757  }
5758  if (vlen < beg && len < 0) {
5759  MEMZERO(sptr + slen, char, -len);
5760  }
5761  if (vlen > 0) {
5762  memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5763  }
5764  slen += vlen - len;
5765  STR_SET_LEN(str, slen);
5766  TERM_FILL(&sptr[slen], TERM_LEN(str));
5767  ENC_CODERANGE_SET(str, cr);
5768 }
5769 
5770 static inline void
5771 rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5772 {
5773  rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5774 }
5775 
5776 void
5777 rb_str_update(VALUE str, long beg, long len, VALUE val)
5778 {
5779  long slen;
5780  char *p, *e;
5781  rb_encoding *enc;
5782  int singlebyte = single_byte_optimizable(str);
5783  int cr;
5784 
5785  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5786 
5787  StringValue(val);
5788  enc = rb_enc_check(str, val);
5789  slen = str_strlen(str, enc); /* rb_enc_check */
5790 
5791  if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5792  rb_raise(rb_eIndexError, "index %ld out of string", beg);
5793  }
5794  if (beg < 0) {
5795  beg += slen;
5796  }
5797  RUBY_ASSERT(beg >= 0);
5798  RUBY_ASSERT(beg <= slen);
5799 
5800  if (len > slen - beg) {
5801  len = slen - beg;
5802  }
5803  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5804  if (!p) p = RSTRING_END(str);
5805  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5806  if (!e) e = RSTRING_END(str);
5807  /* error check */
5808  beg = p - RSTRING_PTR(str); /* physical position */
5809  len = e - p; /* physical length */
5810  rb_str_update_0(str, beg, len, val);
5811  rb_enc_associate(str, enc);
5813  if (cr != ENC_CODERANGE_BROKEN)
5814  ENC_CODERANGE_SET(str, cr);
5815 }
5816 
5817 static void
5818 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5819 {
5820  int nth;
5821  VALUE match;
5822  long start, end, len;
5823  rb_encoding *enc;
5824  struct re_registers *regs;
5825 
5826  if (rb_reg_search(re, str, 0, 0) < 0) {
5827  rb_raise(rb_eIndexError, "regexp not matched");
5828  }
5829  match = rb_backref_get();
5830  nth = rb_reg_backref_number(match, backref);
5831  regs = RMATCH_REGS(match);
5832  if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5833  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5834  }
5835  if (nth < 0) {
5836  nth += regs->num_regs;
5837  }
5838 
5839  start = BEG(nth);
5840  if (start == -1) {
5841  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5842  }
5843  end = END(nth);
5844  len = end - start;
5845  StringValue(val);
5846  enc = rb_enc_check_str(str, val);
5847  rb_str_update_0(str, start, len, val);
5848  rb_enc_associate(str, enc);
5849 }
5850 
5851 static VALUE
5852 rb_str_aset(VALUE str, VALUE indx, VALUE val)
5853 {
5854  long idx, beg;
5855 
5856  switch (TYPE(indx)) {
5857  case T_REGEXP:
5858  rb_str_subpat_set(str, indx, INT2FIX(0), val);
5859  return val;
5860 
5861  case T_STRING:
5862  beg = rb_str_index(str, indx, 0);
5863  if (beg < 0) {
5864  rb_raise(rb_eIndexError, "string not matched");
5865  }
5866  beg = rb_str_sublen(str, beg);
5867  rb_str_update(str, beg, str_strlen(indx, NULL), val);
5868  return val;
5869 
5870  default:
5871  /* check if indx is Range */
5872  {
5873  long beg, len;
5874  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5875  rb_str_update(str, beg, len, val);
5876  return val;
5877  }
5878  }
5879  /* FALLTHROUGH */
5880 
5881  case T_FIXNUM:
5882  idx = NUM2LONG(indx);
5883  rb_str_update(str, idx, 1, val);
5884  return val;
5885  }
5886 }
5887 
5888 /*
5889  * call-seq:
5890  * string[index] = new_string
5891  * string[start, length] = new_string
5892  * string[range] = new_string
5893  * string[regexp, capture = 0] = new_string
5894  * string[substring] = new_string
5895  *
5896  * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5897  * See {String Slices}[rdoc-ref:String@String+Slices].
5898  *
5899  * A few examples:
5900  *
5901  * s = 'foo'
5902  * s[2] = 'rtune' # => "rtune"
5903  * s # => "fortune"
5904  * s[1, 5] = 'init' # => "init"
5905  * s # => "finite"
5906  * s[3..4] = 'al' # => "al"
5907  * s # => "finale"
5908  * s[/e$/] = 'ly' # => "ly"
5909  * s # => "finally"
5910  * s['lly'] = 'ncial' # => "ncial"
5911  * s # => "financial"
5912  *
5913  */
5914 
5915 static VALUE
5916 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5917 {
5918  if (argc == 3) {
5919  if (RB_TYPE_P(argv[0], T_REGEXP)) {
5920  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5921  }
5922  else {
5923  rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5924  }
5925  return argv[2];
5926  }
5927  rb_check_arity(argc, 2, 3);
5928  return rb_str_aset(str, argv[0], argv[1]);
5929 }
5930 
5931 /*
5932  * call-seq:
5933  * insert(index, other_string) -> self
5934  *
5935  * Inserts the given +other_string+ into +self+; returns +self+.
5936  *
5937  * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5938  *
5939  * 'foo'.insert(1, 'bar') # => "fbaroo"
5940  *
5941  * If the Integer +index+ is negative, counts backward from the end of +self+
5942  * and inserts +other_string+ at offset <tt>index+1</tt>
5943  * (that is, _after_ <tt>self[index]</tt>):
5944  *
5945  * 'foo'.insert(-2, 'bar') # => "fobaro"
5946  *
5947  */
5948 
5949 static VALUE
5950 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5951 {
5952  long pos = NUM2LONG(idx);
5953 
5954  if (pos == -1) {
5955  return rb_str_append(str, str2);
5956  }
5957  else if (pos < 0) {
5958  pos++;
5959  }
5960  rb_str_update(str, pos, 0, str2);
5961  return str;
5962 }
5963 
5964 
5965 /*
5966  * call-seq:
5967  * slice!(index) -> new_string or nil
5968  * slice!(start, length) -> new_string or nil
5969  * slice!(range) -> new_string or nil
5970  * slice!(regexp, capture = 0) -> new_string or nil
5971  * slice!(substring) -> new_string or nil
5972  *
5973  * Removes and returns the substring of +self+ specified by the arguments.
5974  * See {String Slices}[rdoc-ref:String@String+Slices].
5975  *
5976  * A few examples:
5977  *
5978  * string = "This is a string"
5979  * string.slice!(2) #=> "i"
5980  * string.slice!(3..6) #=> " is "
5981  * string.slice!(/s.*t/) #=> "sa st"
5982  * string.slice!("r") #=> "r"
5983  * string #=> "Thing"
5984  *
5985  */
5986 
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;          /* len defaults to one character for slice!(index) */
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    /*
     * Each branch below either returns Qnil or computes beg/len and jumps
     * to one of the labels at the end:
     *   num_index - beg/len still have to go through rb_str_subpos()
     *   subseq    - beg/len are used as byte offset/length into str
     *   squash    - result is already built; only the removal remains
     */
    if (RB_TYPE_P(indx, T_REGEXP)) {
        /* slice!(regexp, capture = 0) */
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            /* negative capture numbers count back from the last register */
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index) */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the returned string is a duplicate of indx */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* slice!(range), or an index that still needs NUM2LONG conversion */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: handled like the FIXNUM_P branch above */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* copy the slice out before it is removed from str */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    if (len > 0) {
        if (beg == 0) {
            rb_str_drop_bytes(str, len);
        }
        else {
            /* close the gap by moving the tail down over the removed bytes */
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
6073 
6074 static VALUE
6075 get_pat(VALUE pat)
6076 {
6077  VALUE val;
6078 
6079  switch (OBJ_BUILTIN_TYPE(pat)) {
6080  case T_REGEXP:
6081  return pat;
6082 
6083  case T_STRING:
6084  break;
6085 
6086  default:
6087  val = rb_check_string_type(pat);
6088  if (NIL_P(val)) {
6089  Check_Type(pat, T_REGEXP);
6090  }
6091  pat = val;
6092  }
6093 
6094  return rb_reg_regcomp(pat);
6095 }
6096 
6097 static VALUE
6098 get_pat_quoted(VALUE pat, int check)
6099 {
6100  VALUE val;
6101 
6102  switch (OBJ_BUILTIN_TYPE(pat)) {
6103  case T_REGEXP:
6104  return pat;
6105 
6106  case T_STRING:
6107  break;
6108 
6109  default:
6110  val = rb_check_string_type(pat);
6111  if (NIL_P(val)) {
6112  Check_Type(pat, T_REGEXP);
6113  }
6114  pat = val;
6115  }
6116  if (check && is_broken_string(pat)) {
6117  rb_exc_raise(rb_reg_check_preprocess(pat));
6118  }
6119  return pat;
6120 }
6121 
6122 static long
6123 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6124 {
6125  if (BUILTIN_TYPE(pat) == T_STRING) {
6126  pos = rb_str_byteindex(str, pat, pos);
6127  if (set_backref_str) {
6128  if (pos >= 0) {
6129  str = rb_str_new_frozen_String(str);
6130  rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6131  }
6132  else {
6134  }
6135  }
6136  return pos;
6137  }
6138  else {
6139  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6140  }
6141 }
6142 
6143 
6144 /*
6145  * call-seq:
6146  * sub!(pattern, replacement) -> self or nil
6147  * sub!(pattern) {|match| ... } -> self or nil
6148  *
6149  * Replaces the first occurrence (not all occurrences) of the given +pattern+
6150  * on +self+; returns +self+ if a replacement occurred, +nil+ otherwise.
6151  *
6152  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6153  *
6154  * Related: String#sub, String#gsub, String#gsub!.
6155  *
6156  */
6157 
static VALUE
rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE pat, repl, hash = Qnil;
    int iter = 0;               /* non-zero: the replacement comes from the block */
    long plen;
    int min_arity = rb_block_given_p() ? 1 : 2; /* the replacement may be omitted only with a block */
    long beg;

    rb_check_arity(argc, min_arity, 2);
    if (argc == 1) {
        iter = 1;
    }
    else {
        /* the replacement is either a Hash or something converted with StringValue() */
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
    }

    pat = get_pat_quoted(argv[0], 1);

    str_modifiable(str);
    beg = rb_pat_search(pat, str, 0, 1);
    if (beg >= 0) {
        rb_encoding *enc;
        int cr = ENC_CODERANGE(str);
        long beg0, end0;        /* byte range of the matched text in str */
        VALUE match, match0 = Qnil;
        struct re_registers *regs;
        char *p, *rp;
        long len, rlen;

        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (iter) match0 = rb_reg_nth_match(0, match);
        }

        if (iter || !NIL_P(hash)) {
            /* remember pointer and length so str_mod_check() can compare them after user code ran */
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);

            if (iter) {
                repl = rb_obj_as_string(rb_yield(match0));
            }
            else {
                repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                repl = rb_obj_as_string(repl);
            }
            str_mod_check(str, p, len);
            rb_check_frozen(str);
        }
        else {
            repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
        }

        enc = rb_enc_compatible(str, repl);
        if (!enc) {
            /*
             * No compatible encoding: accept the replacement's encoding
             * only if the text before and after the match scans as 7bit.
             */
            rb_encoding *str_enc = STR_ENC_GET(str);
            p = RSTRING_PTR(str); len = RSTRING_LEN(str);
            if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
                coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
                rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                         rb_enc_inspect_name(str_enc),
                         rb_enc_inspect_name(STR_ENC_GET(repl)));
            }
            enc = STR_ENC_GET(repl);
        }
        rb_str_modify(str);
        rb_enc_associate(str, enc);
        if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
            /*
             * A broken replacement, or a 7bit replacement spliced into a
             * VALID string, leaves the result unknown; otherwise the
             * result takes the replacement's code range.
             */
            int cr2 = ENC_CODERANGE(repl);
            if (cr2 == ENC_CODERANGE_BROKEN ||
                (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
                cr = ENC_CODERANGE_UNKNOWN;
            else
                cr = cr2;
        }
        plen = end0 - beg0;             /* bytes being replaced */
        rlen = RSTRING_LEN(repl);       /* bytes being inserted */
        len = RSTRING_LEN(str);
        if (rlen > plen) {
            RESIZE_CAPA(str, len + rlen - plen);
        }
        p = RSTRING_PTR(str);           /* re-read after the possible resize */
        if (rlen != plen) {
            /* shift the text after the match to its new position */
            memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
        }
        rp = RSTRING_PTR(repl);
        memmove(p + beg0, rp, rlen);
        len += rlen - plen;
        STR_SET_LEN(str, len);
        TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
        ENC_CODERANGE_SET(str, cr);

        RB_GC_GUARD(match);

        return str;
    }
    return Qnil;
}
6267 
6268 
6269 /*
6270  * call-seq:
6271  * sub(pattern, replacement) -> new_string
6272  * sub(pattern) {|match| ... } -> new_string
6273  *
6274  * Returns a copy of +self+ with only the first occurrence
6275  * (not all occurrences) of the given +pattern+ replaced.
6276  *
6277  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6278  *
6279  * Related: String#sub!, String#gsub, String#gsub!.
6280  *
6281  */
6282 
6283 static VALUE
6284 rb_str_sub(int argc, VALUE *argv, VALUE str)
6285 {
6286  str = str_duplicate(rb_cString, str);
6287  rb_str_sub_bang(argc, argv, str);
6288  return str;
6289 }
6290 
/*
 * Shared implementation of String#gsub (+bang+ == 0) and String#gsub!
 * (+bang+ != 0).
 *
 * The replacement is taken from argv[1] (a String or a Hash) or, with a
 * single argument, from the block.  The result is built in a separate
 * buffer +dest+.
 *
 * Returns Qnil when +bang+ is set and nothing matched, a duplicate of
 * +str+ when +bang+ is clear and nothing matched, and otherwise +str+
 * (after str_shared_replace()) or +dest+ respectively.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    int need_backref = -1;      /* -1: not yet known; decided after the first rb_reg_regsub() */
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
        if (bang) return Qnil; /* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);

    do {
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        /* beg0/end0: byte range of the current match */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode) {
            /* ITER or MAP: the replacement comes from the block or the hash */
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
            str_mod_check(str, sp, slen);
            if (val == dest) { /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref < 0) {
                /* if regsub returned repl itself, later rounds take the plain-repl branch */
                need_backref = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset; /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        last = offset;          /* start offset of the search that produced this match */
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_pat_search(pat, str, offset, need_backref);

        RB_GC_GUARD(match);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /* search once more from `last', this time always setting the backref */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
6413 
6414 
6415 /*
6416  * call-seq:
6417  * gsub!(pattern, replacement) -> self or nil
6418  * gsub!(pattern) {|match| ... } -> self or nil
6419  * gsub!(pattern) -> an_enumerator
6420  *
6421  * Performs the specified substring replacement(s) on +self+;
6422  * returns +self+ if any replacement occurred, +nil+ otherwise.
6423  *
6424  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6425  *
6426  * Returns an Enumerator if no +replacement+ and no block given.
6427  *
6428  * Related: String#sub, String#gsub, String#sub!.
6429  *
6430  */
6431 
6432 static VALUE
6433 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6434 {
6435  str_modify_keep_cr(str);
6436  return str_gsub(argc, argv, str, 1);
6437 }
6438 
6439 
6440 /*
6441  * call-seq:
6442  * gsub(pattern, replacement) -> new_string
6443  * gsub(pattern) {|match| ... } -> new_string
6444  * gsub(pattern) -> enumerator
6445  *
6446  * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6447  *
6448  * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6449  *
6450  * Returns an Enumerator if no +replacement+ and no block given.
6451  *
6452  * Related: String#sub, String#sub!, String#gsub!.
6453  *
6454  */
6455 
6456 static VALUE
6457 rb_str_gsub(int argc, VALUE *argv, VALUE str)
6458 {
6459  return str_gsub(argc, argv, str, 0);
6460 }
6461 
6462 
6463 /*
6464  * call-seq:
6465  * replace(other_string) -> self
6466  *
6467  * Replaces the contents of +self+ with the contents of +other_string+:
6468  *
6469  * s = 'foo' # => "foo"
6470  * s.replace('bar') # => "bar"
6471  *
6472  */
6473 
6474 VALUE
6476 {
6477  str_modifiable(str);
6478  if (str == str2) return str;
6479 
6480  StringValue(str2);
6481  str_discard(str);
6482  return str_replace(str, str2);
6483 }
6484 
6485 /*
6486  * call-seq:
6487  * clear -> self
6488  *
6489  * Removes the contents of +self+:
6490  *
6491  * s = 'foo' # => "foo"
6492  * s.clear # => ""
6493  *
6494  */
6495 
6496 static VALUE
6497 rb_str_clear(VALUE str)
6498 {
6499  str_discard(str);
6500  STR_SET_EMBED(str);
6501  STR_SET_LEN(str, 0);
6502  RSTRING_PTR(str)[0] = 0;
6503  if (rb_enc_asciicompat(STR_ENC_GET(str)))
6505  else
6507  return str;
6508 }
6509 
6510 /*
6511  * call-seq:
6512  * chr -> string
6513  *
6514  * Returns a string containing the first character of +self+:
6515  *
6516  * s = 'foo' # => "foo"
6517  * s.chr # => "f"
6518  *
6519  */
6520 
6521 static VALUE
6522 rb_str_chr(VALUE str)
6523 {
6524  return rb_str_substr(str, 0, 1);
6525 }
6526 
6527 /*
6528  * call-seq:
6529  * getbyte(index) -> integer or nil
6530  *
6531  * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6532  *
6533  * s = 'abcde' # => "abcde"
6534  * s.getbyte(0) # => 97
6535  * s.getbyte(-1) # => 101
6536  * s.getbyte(5) # => nil
6537  *
6538  * Related: String#setbyte.
6539  */
6540 VALUE
6541 rb_str_getbyte(VALUE str, VALUE index)
6542 {
6543  long pos = NUM2LONG(index);
6544 
6545  if (pos < 0)
6546  pos += RSTRING_LEN(str);
6547  if (pos < 0 || RSTRING_LEN(str) <= pos)
6548  return Qnil;
6549 
6550  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6551 }
6552 
6553 /*
6554  * call-seq:
6555  * setbyte(index, integer) -> integer
6556  *
6557  * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6558  *
6559  * s = 'abcde' # => "abcde"
6560  * s.setbyte(0, 98) # => 98
6561  * s # => "bbcde"
6562  *
6563  * Related: String#getbyte.
6564  */
6565 VALUE
6566 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6567 {
6568  long pos = NUM2LONG(index);
6569  long len = RSTRING_LEN(str);
6570  char *ptr, *head, *left = 0;
6571  rb_encoding *enc;
6572  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6573 
6574  if (pos < -len || len <= pos)
6575  rb_raise(rb_eIndexError, "index %ld out of string", pos);
6576  if (pos < 0)
6577  pos += len;
6578 
6579  VALUE v = rb_to_int(value);
6580  VALUE w = rb_int_and(v, INT2FIX(0xff));
6581  char byte = (char)(NUM2INT(w) & 0xFF);
6582 
6583  if (!str_independent(str))
6584  str_make_independent(str);
6585  enc = STR_ENC_GET(str);
6586  head = RSTRING_PTR(str);
6587  ptr = &head[pos];
6588  if (!STR_EMBED_P(str)) {
6589  cr = ENC_CODERANGE(str);
6590  switch (cr) {
6591  case ENC_CODERANGE_7BIT:
6592  left = ptr;
6593  *ptr = byte;
6594  if (ISASCII(byte)) goto end;
6595  nlen = rb_enc_precise_mbclen(left, head+len, enc);
6596  if (!MBCLEN_CHARFOUND_P(nlen))
6598  else
6600  goto end;
6601  case ENC_CODERANGE_VALID:
6602  left = rb_enc_left_char_head(head, ptr, head+len, enc);
6603  width = rb_enc_precise_mbclen(left, head+len, enc);
6604  *ptr = byte;
6605  nlen = rb_enc_precise_mbclen(left, head+len, enc);
6606  if (!MBCLEN_CHARFOUND_P(nlen))
6608  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6609  ENC_CODERANGE_CLEAR(str);
6610  goto end;
6611  }
6612  }
6613  ENC_CODERANGE_CLEAR(str);
6614  *ptr = byte;
6615 
6616  end:
6617  return value;
6618 }
6619 
6620 static VALUE
6621 str_byte_substr(VALUE str, long beg, long len, int empty)
6622 {
6623  long n = RSTRING_LEN(str);
6624 
6625  if (beg > n || len < 0) return Qnil;
6626  if (beg < 0) {
6627  beg += n;
6628  if (beg < 0) return Qnil;
6629  }
6630  if (len > n - beg)
6631  len = n - beg;
6632  if (len <= 0) {
6633  if (!empty) return Qnil;
6634  len = 0;
6635  }
6636 
6637  VALUE str2 = str_subseq(str, beg, len);
6638 
6639  str_enc_copy_direct(str2, str);
6640 
6641  if (RSTRING_LEN(str2) == 0) {
6642  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6644  else
6646  }
6647  else {
6648  switch (ENC_CODERANGE(str)) {
6649  case ENC_CODERANGE_7BIT:
6651  break;
6652  default:
6654  break;
6655  }
6656  }
6657 
6658  return str2;
6659 }
6660 
6661 VALUE
6662 rb_str_byte_substr(VALUE str, VALUE beg, VALUE len)
6663 {
6664  return str_byte_substr(str, NUM2LONG(beg), NUM2LONG(len), TRUE);
6665 }
6666 
6667 static VALUE
6668 str_byte_aref(VALUE str, VALUE indx)
6669 {
6670  long idx;
6671  if (FIXNUM_P(indx)) {
6672  idx = FIX2LONG(indx);
6673  }
6674  else {
6675  /* check if indx is Range */
6676  long beg, len = RSTRING_LEN(str);
6677 
6678  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6679  case Qfalse:
6680  break;
6681  case Qnil:
6682  return Qnil;
6683  default:
6684  return str_byte_substr(str, beg, len, TRUE);
6685  }
6686 
6687  idx = NUM2LONG(indx);
6688  }
6689  return str_byte_substr(str, idx, 1, FALSE);
6690 }
6691 
6692 /*
6693  * call-seq:
6694  * byteslice(index, length = 1) -> string or nil
6695  * byteslice(range) -> string or nil
6696  *
6697  * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6698  *
6699  * With integer arguments +index+ and +length+ given,
6700  * returns the substring beginning at the given +index+
6701  * of the given +length+ (if possible),
6702  * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6703  *
6704  * s = '0123456789' # => "0123456789"
6705  * s.byteslice(2) # => "2"
6706  * s.byteslice(200) # => nil
6707  * s.byteslice(4, 3) # => "456"
6708  * s.byteslice(4, 30) # => "456789"
6709  * s.byteslice(4, -1) # => nil
6710  * s.byteslice(40, 2) # => nil
6711  *
6712  * In either case above, counts backwards from the end of +self+
6713  * if +index+ is negative:
6714  *
6715  * s = '0123456789' # => "0123456789"
6716  * s.byteslice(-4) # => "6"
6717  * s.byteslice(-4, 3) # => "678"
6718  *
6719  * With Range argument +range+ given, returns
6720  * <tt>byteslice(range.begin, range.size)</tt>:
6721  *
6722  * s = '0123456789' # => "0123456789"
6723  * s.byteslice(4..6) # => "456"
6724  * s.byteslice(-6..-4) # => "456"
6725  * s.byteslice(5..2) # => "" # range.size is zero.
6726  * s.byteslice(40..42) # => nil
6727  *
6728  * In all cases, a returned string has the same encoding as +self+:
6729  *
6730  * s.encoding # => #<Encoding:UTF-8>
6731  * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6732  *
6733  */
6734 
6735 static VALUE
6736 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6737 {
6738  if (argc == 2) {
6739  long beg = NUM2LONG(argv[0]);
6740  long len = NUM2LONG(argv[1]);
6741  return str_byte_substr(str, beg, len, TRUE);
6742  }
6743  rb_check_arity(argc, 1, 2);
6744  return str_byte_aref(str, argv[0]);
6745 }
6746 
6747 static void
6748 str_check_beg_len(VALUE str, long *beg, long *len)
6749 {
6750  long end, slen = RSTRING_LEN(str);
6751 
6752  if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6753  if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6754  rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6755  }
6756  if (*beg < 0) {
6757  *beg += slen;
6758  }
6759  RUBY_ASSERT(*beg >= 0);
6760  RUBY_ASSERT(*beg <= slen);
6761 
6762  if (*len > slen - *beg) {
6763  *len = slen - *beg;
6764  }
6765  end = *beg + *len;
6766  str_ensure_byte_pos(str, *beg);
6767  str_ensure_byte_pos(str, end);
6768 }
6769 
6770 /*
6771  * call-seq:
6772  * bytesplice(index, length, str) -> string
6773  * bytesplice(index, length, str, str_index, str_length) -> string
6774  * bytesplice(range, str) -> string
6775  * bytesplice(range, str, str_range) -> string
6776  *
6777  * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6778  * The portion of the string affected is determined using
6779  * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6780  * If the replacement string is not the same length as the text it is replacing,
6781  * the string will be adjusted accordingly.
6782  *
6783  * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6784  *
 * The forms that take an Integer raise an IndexError if a value is out
 * of range; the Range forms raise a RangeError.
 * If the beginning or ending offset does not land on a character (codepoint)
 * boundary, an IndexError is raised.
6789  */
6790 
6791 static VALUE
6792 rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6793 {
6794  long beg, len, vbeg, vlen;
6795  VALUE val;
6796  int cr;
6797 
6798  rb_check_arity(argc, 2, 5);
6799  if (!(argc == 2 || argc == 3 || argc == 5)) {
6800  rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6801  }
6802  if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6803  if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6804  rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6805  rb_builtin_class_name(argv[0]));
6806  }
6807  val = argv[1];
6808  StringValue(val);
6809  if (argc == 2) {
6810  /* bytesplice(range, str) */
6811  vbeg = 0;
6812  vlen = RSTRING_LEN(val);
6813  }
6814  else {
6815  /* bytesplice(range, str, str_range) */
6816  if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6817  rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6818  rb_builtin_class_name(argv[2]));
6819  }
6820  }
6821  }
6822  else {
6823  beg = NUM2LONG(argv[0]);
6824  len = NUM2LONG(argv[1]);
6825  val = argv[2];
6826  StringValue(val);
6827  if (argc == 3) {
6828  /* bytesplice(index, length, str) */
6829  vbeg = 0;
6830  vlen = RSTRING_LEN(val);
6831  }
6832  else {
6833  /* bytesplice(index, length, str, str_index, str_length) */
6834  vbeg = NUM2LONG(argv[3]);
6835  vlen = NUM2LONG(argv[4]);
6836  }
6837  }
6838  str_check_beg_len(str, &beg, &len);
6839  str_check_beg_len(val, &vbeg, &vlen);
6840  str_modify_keep_cr(str);
6841 
6843  rb_enc_associate(str, rb_enc_check(str, val));
6844  }
6845 
6846  rb_str_update_1(str, beg, len, val, vbeg, vlen);
6848  if (cr != ENC_CODERANGE_BROKEN)
6849  ENC_CODERANGE_SET(str, cr);
6850  return str;
6851 }
6852 
6853 /*
6854  * call-seq:
6855  * reverse -> string
6856  *
6857  * Returns a new string with the characters from +self+ in reverse order.
6858  *
6859  * 'stressed'.reverse # => "desserts"
6860  *
6861  */
6862 
6863 static VALUE
6864 rb_str_reverse(VALUE str)
6865 {
6866  rb_encoding *enc;
6867  VALUE rev;
6868  char *s, *e, *p;
6869  int cr;
6870 
6871  if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6872  enc = STR_ENC_GET(str);
6873  rev = rb_str_new(0, RSTRING_LEN(str));
6874  s = RSTRING_PTR(str); e = RSTRING_END(str);
6875  p = RSTRING_END(rev);
6876  cr = ENC_CODERANGE(str);
6877 
6878  if (RSTRING_LEN(str) > 1) {
6879  if (single_byte_optimizable(str)) {
6880  while (s < e) {
6881  *--p = *s++;
6882  }
6883  }
6884  else if (cr == ENC_CODERANGE_VALID) {
6885  while (s < e) {
6886  int clen = rb_enc_fast_mbclen(s, e, enc);
6887 
6888  p -= clen;
6889  memcpy(p, s, clen);
6890  s += clen;
6891  }
6892  }
6893  else {
6894  cr = rb_enc_asciicompat(enc) ?
6896  while (s < e) {
6897  int clen = rb_enc_mbclen(s, e, enc);
6898 
6899  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6900  p -= clen;
6901  memcpy(p, s, clen);
6902  s += clen;
6903  }
6904  }
6905  }
6906  STR_SET_LEN(rev, RSTRING_LEN(str));
6907  str_enc_copy_direct(rev, str);
6908  ENC_CODERANGE_SET(rev, cr);
6909 
6910  return rev;
6911 }
6912 
6913 
6914 /*
6915  * call-seq:
6916  * reverse! -> self
6917  *
6918  * Returns +self+ with its characters reversed:
6919  *
6920  * s = 'stressed'
6921  * s.reverse! # => "desserts"
6922  * s # => "desserts"
6923  *
6924  */
6925 
6926 static VALUE
6927 rb_str_reverse_bang(VALUE str)
6928 {
6929  if (RSTRING_LEN(str) > 1) {
6930  if (single_byte_optimizable(str)) {
6931  char *s, *e, c;
6932 
6933  str_modify_keep_cr(str);
6934  s = RSTRING_PTR(str);
6935  e = RSTRING_END(str) - 1;
6936  while (s < e) {
6937  c = *s;
6938  *s++ = *e;
6939  *e-- = c;
6940  }
6941  }
6942  else {
6943  str_shared_replace(str, rb_str_reverse(str));
6944  }
6945  }
6946  else {
6947  str_modify_keep_cr(str);
6948  }
6949  return str;
6950 }
6951 
6952 
6953 /*
6954  * call-seq:
6955  * include?(other_string) -> true or false
6956  *
6957  * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6958  *
6959  * s = 'foo'
6960  * s.include?('f') # => true
6961  * s.include?('fo') # => true
6962  * s.include?('food') # => false
6963  *
6964  */
6965 
6966 VALUE
6967 rb_str_include(VALUE str, VALUE arg)
6968 {
6969  long i;
6970 
6971  StringValue(arg);
6972  i = rb_str_index(str, arg, 0);
6973 
6974  return RBOOL(i != -1);
6975 }
6976 
6977 
6978 /*
6979  * call-seq:
6980  * to_i(base = 10) -> integer
6981  *
6982  * Returns the result of interpreting leading characters in +self+
6983  * as an integer in the given +base+ (which must be in (0, 2..36)):
6984  *
6985  * '123456'.to_i # => 123456
6986  * '123def'.to_i(16) # => 1195503
6987  *
6988  * With +base+ zero, string +object+ may contain leading characters
6989  * to specify the actual base:
6990  *
6991  * '123def'.to_i(0) # => 123
6992  * '0123def'.to_i(0) # => 83
6993  * '0b123def'.to_i(0) # => 1
6994  * '0o123def'.to_i(0) # => 83
6995  * '0d123def'.to_i(0) # => 123
6996  * '0x123def'.to_i(0) # => 1195503
6997  *
6998  * Characters past a leading valid number (in the given +base+) are ignored:
6999  *
7000  * '12.345'.to_i # => 12
7001  * '12345'.to_i(2) # => 1
7002  *
7003  * Returns zero if there is no leading valid number:
7004  *
7005  * 'abcdef'.to_i # => 0
7006  * '2'.to_i(2) # => 0
7007  *
7008  */
7009 
7010 static VALUE
7011 rb_str_to_i(int argc, VALUE *argv, VALUE str)
7012 {
7013  int base = 10;
7014 
7015  if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
7016  rb_raise(rb_eArgError, "invalid radix %d", base);
7017  }
7018  return rb_str_to_inum(str, base, FALSE);
7019 }
7020 
7021 
7022 /*
7023  * call-seq:
7024  * to_f -> float
7025  *
7026  * Returns the result of interpreting leading characters in +self+ as a Float:
7027  *
7028  * '3.14159'.to_f # => 3.14159
7029  * '1.234e-2'.to_f # => 0.01234
7030  *
 * Characters past a leading valid number are ignored:
7032  *
7033  * '3.14 (pi to two places)'.to_f # => 3.14
7034  *
7035  * Returns zero if there is no leading valid number:
7036  *
7037  * 'abcdef'.to_f # => 0.0
7038  *
7039  */
7040 
7041 static VALUE
7042 rb_str_to_f(VALUE str)
7043 {
7044  return DBL2NUM(rb_str_to_dbl(str, FALSE));
7045 }
7046 
7047 
7048 /*
7049  * call-seq:
7050  * to_s -> self or string
7051  *
7052  * Returns +self+ if +self+ is a +String+,
7053  * or +self+ converted to a +String+ if +self+ is a subclass of +String+.
7054  */
7055 
7056 static VALUE
7057 rb_str_to_s(VALUE str)
7058 {
7059  if (rb_obj_class(str) != rb_cString) {
7060  return str_duplicate(rb_cString, str);
7061  }
7062  return str;
7063 }
7064 
#if 0
/*
 * Compiled out.  Encodes code point +c+ in +enc+ into a scratch buffer and
 * appends the encoded bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
7076 
7077 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
7078 
7079 int
7080 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
7081 {
7082  char buf[CHAR_ESC_LEN + 1];
7083  int l;
7084 
7085 #if SIZEOF_INT > 4
7086  c &= 0xffffffff;
7087 #endif
7088  if (unicode_p) {
7089  if (c < 0x7F && ISPRINT(c)) {
7090  snprintf(buf, CHAR_ESC_LEN, "%c", c);
7091  }
7092  else if (c < 0x10000) {
7093  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
7094  }
7095  else {
7096  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
7097  }
7098  }
7099  else {
7100  if (c < 0x100) {
7101  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
7102  }
7103  else {
7104  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
7105  }
7106  }
7107  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
7108  rb_str_buf_cat(result, buf, l);
7109  return l;
7110 }
7111 
/*
 * Maps a control character to its backslash escape sequence.
 *
 * Returns a pointer to a static string ("\\n", "\\t", ...) when +c+ is one
 * of NUL, LF, CR, TAB, FF, VT, BS, BEL, ESC or DEL, and NULL for every
 * other value.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) {
            return escapes[i].escape;
        }
    }
    return NULL;
}
7129 
7130 VALUE
7131 rb_str_escape(VALUE str)
7132 {
7133  int encidx = ENCODING_GET(str);
7134  rb_encoding *enc = rb_enc_from_index(encidx);
7135  const char *p = RSTRING_PTR(str);
7136  const char *pend = RSTRING_END(str);
7137  const char *prev = p;
7138  char buf[CHAR_ESC_LEN + 1];
7139  VALUE result = rb_str_buf_new(0);
7140  int unicode_p = rb_enc_unicode_p(enc);
7141  int asciicompat = rb_enc_asciicompat(enc);
7142 
7143  while (p < pend) {
7144  unsigned int c;
7145  const char *cc;
7146  int n = rb_enc_precise_mbclen(p, pend, enc);
7147  if (!MBCLEN_CHARFOUND_P(n)) {
7148  if (p > prev) str_buf_cat(result, prev, p - prev);
7149  n = rb_enc_mbminlen(enc);
7150  if (pend < p + n)
7151  n = (int)(pend - p);
7152  while (n--) {
7153  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7154  str_buf_cat(result, buf, strlen(buf));
7155  prev = ++p;
7156  }
7157  continue;
7158  }
7159  n = MBCLEN_CHARFOUND_LEN(n);
7160  c = rb_enc_mbc_to_codepoint(p, pend, enc);
7161  p += n;
7162  cc = ruby_escaped_char(c);
7163  if (cc) {
7164  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7165  str_buf_cat(result, cc, strlen(cc));
7166  prev = p;
7167  }
7168  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
7169  }
7170  else {
7171  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7172  rb_str_buf_cat_escaped_char(result, c, unicode_p);
7173  prev = p;
7174  }
7175  }
7176  if (p > prev) str_buf_cat(result, prev, p - prev);
7178 
7179  return result;
7180 }
7181 
7182 /*
7183  * call-seq:
7184  * inspect -> string
7185  *
7186  * Returns a printable version of +self+, enclosed in double-quotes,
7187  * and with special characters escaped:
7188  *
7189  * s = "foo\tbar\tbaz\n"
7190  * s.inspect
7191  * # => "\"foo\\tbar\\tbaz\\n\""
7192  *
7193  */
7194 
7195 VALUE
7197 {
7198  int encidx = ENCODING_GET(str);
7199  rb_encoding *enc = rb_enc_from_index(encidx);
7200  const char *p, *pend, *prev;
7201  char buf[CHAR_ESC_LEN + 1];
7202  VALUE result = rb_str_buf_new(0);
7204  int unicode_p = rb_enc_unicode_p(enc);
7205  int asciicompat = rb_enc_asciicompat(enc);
7206 
7207  if (resenc == NULL) resenc = rb_default_external_encoding();
7208  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
7209  rb_enc_associate(result, resenc);
7210  str_buf_cat2(result, "\"");
7211 
7212  p = RSTRING_PTR(str); pend = RSTRING_END(str);
7213  prev = p;
7214  while (p < pend) {
7215  unsigned int c, cc;
7216  int n;
7217 
7218  n = rb_enc_precise_mbclen(p, pend, enc);
7219  if (!MBCLEN_CHARFOUND_P(n)) {
7220  if (p > prev) str_buf_cat(result, prev, p - prev);
7221  n = rb_enc_mbminlen(enc);
7222  if (pend < p + n)
7223  n = (int)(pend - p);
7224  while (n--) {
7225  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
7226  str_buf_cat(result, buf, strlen(buf));
7227  prev = ++p;
7228  }
7229  continue;
7230  }
7231  n = MBCLEN_CHARFOUND_LEN(n);
7232  c = rb_enc_mbc_to_codepoint(p, pend, enc);
7233  p += n;
7234  if ((asciicompat || unicode_p) &&
7235  (c == '"'|| c == '\\' ||
7236  (c == '#' &&
7237  p < pend &&
7239  (cc = rb_enc_codepoint(p,pend,enc),
7240  (cc == '$' || cc == '@' || cc == '{'))))) {
7241  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7242  str_buf_cat2(result, "\\");
7243  if (asciicompat || enc == resenc) {
7244  prev = p - n;
7245  continue;
7246  }
7247  }
7248  switch (c) {
7249  case '\n': cc = 'n'; break;
7250  case '\r': cc = 'r'; break;
7251  case '\t': cc = 't'; break;
7252  case '\f': cc = 'f'; break;
7253  case '\013': cc = 'v'; break;
7254  case '\010': cc = 'b'; break;
7255  case '\007': cc = 'a'; break;
7256  case 033: cc = 'e'; break;
7257  default: cc = 0; break;
7258  }
7259  if (cc) {
7260  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7261  buf[0] = '\\';
7262  buf[1] = (char)cc;
7263  str_buf_cat(result, buf, 2);
7264  prev = p;
7265  continue;
7266  }
7267  /* The special casing of 0x85 (NEXT_LINE) here is because
7268  * Oniguruma historically treats it as printable, but it
7269  * doesn't match the print POSIX bracket class or character
7270  * property in regexps.
7271  *
7272  * See Ruby Bug #16842 for details:
7273  * https://bugs.ruby-lang.org/issues/16842
7274  */
7275  if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
7276  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
7277  continue;
7278  }
7279  else {
7280  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
7281  rb_str_buf_cat_escaped_char(result, c, unicode_p);
7282  prev = p;
7283  continue;
7284  }
7285  }
7286  if (p > prev) str_buf_cat(result, prev, p - prev);
7287  str_buf_cat2(result, "\"");
7288 
7289  return result;
7290 }
7291 
/*
 * True when +p+ lies before +e+ and points at '$', '@' or '{'.
 * Both arguments are evaluated more than once.
 */
#define IS_EVSTR(p,e) \
    ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
7293 
7294 /*
7295  * call-seq:
7296  * dump -> string
7297  *
7298  * Returns a printable version of +self+, enclosed in double-quotes,
7299  * with special characters escaped, and with non-printing characters
7300  * replaced by hexadecimal notation:
7301  *
7302  * "hello \n ''".dump # => "\"hello \\n ''\""
7303  * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7304  *
7305  * Related: String#undump (inverse of String#dump).
7306  *
7307  */
7308 
7309 VALUE
7311 {
7312  int encidx = rb_enc_get_index(str);
7313  rb_encoding *enc = rb_enc_from_index(encidx);
7314  long len;
7315  const char *p, *pend;
7316  char *q, *qend;
7317  VALUE result;
7318  int u8 = (encidx == rb_utf8_encindex());
7319  static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
7320 
7321  len = 2; /* "" */
7322  if (!rb_enc_asciicompat(enc)) {
7323  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
7324  len += strlen(enc->name);
7325  }
7326 
7327  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7328  while (p < pend) {
7329  int clen;
7330  unsigned char c = *p++;
7331 
7332  switch (c) {
7333  case '"': case '\\':
7334  case '\n': case '\r':
7335  case '\t': case '\f':
7336  case '\013': case '\010': case '\007': case '\033':
7337  clen = 2;
7338  break;
7339 
7340  case '#':
7341  clen = IS_EVSTR(p, pend) ? 2 : 1;
7342  break;
7343 
7344  default:
7345  if (ISPRINT(c)) {
7346  clen = 1;
7347  }
7348  else {
7349  if (u8 && c > 0x7F) { /* \u notation */
7350  int n = rb_enc_precise_mbclen(p-1, pend, enc);
7351  if (MBCLEN_CHARFOUND_P(n)) {
7352  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7353  if (cc <= 0xFFFF)
7354  clen = 6; /* \uXXXX */
7355  else if (cc <= 0xFFFFF)
7356  clen = 9; /* \u{XXXXX} */
7357  else
7358  clen = 10; /* \u{XXXXXX} */
7359  p += MBCLEN_CHARFOUND_LEN(n)-1;
7360  break;
7361  }
7362  }
7363  clen = 4; /* \xNN */
7364  }
7365  break;
7366  }
7367 
7368  if (clen > LONG_MAX - len) {
7369  rb_raise(rb_eRuntimeError, "string size too big");
7370  }
7371  len += clen;
7372  }
7373 
7374  result = rb_str_new(0, len);
7375  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
7376  q = RSTRING_PTR(result); qend = q + len + 1;
7377 
7378  *q++ = '"';
7379  while (p < pend) {
7380  unsigned char c = *p++;
7381 
7382  if (c == '"' || c == '\\') {
7383  *q++ = '\\';
7384  *q++ = c;
7385  }
7386  else if (c == '#') {
7387  if (IS_EVSTR(p, pend)) *q++ = '\\';
7388  *q++ = '#';
7389  }
7390  else if (c == '\n') {
7391  *q++ = '\\';
7392  *q++ = 'n';
7393  }
7394  else if (c == '\r') {
7395  *q++ = '\\';
7396  *q++ = 'r';
7397  }
7398  else if (c == '\t') {
7399  *q++ = '\\';
7400  *q++ = 't';
7401  }
7402  else if (c == '\f') {
7403  *q++ = '\\';
7404  *q++ = 'f';
7405  }
7406  else if (c == '\013') {
7407  *q++ = '\\';
7408  *q++ = 'v';
7409  }
7410  else if (c == '\010') {
7411  *q++ = '\\';
7412  *q++ = 'b';
7413  }
7414  else if (c == '\007') {
7415  *q++ = '\\';
7416  *q++ = 'a';
7417  }
7418  else if (c == '\033') {
7419  *q++ = '\\';
7420  *q++ = 'e';
7421  }
7422  else if (ISPRINT(c)) {
7423  *q++ = c;
7424  }
7425  else {
7426  *q++ = '\\';
7427  if (u8) {
7428  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7429  if (MBCLEN_CHARFOUND_P(n)) {
7430  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7431  p += n;
7432  if (cc <= 0xFFFF)
7433  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7434  else
7435  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7436  q += strlen(q);
7437  continue;
7438  }
7439  }
7440  snprintf(q, qend-q, "x%02X", c);
7441  q += 3;
7442  }
7443  }
7444  *q++ = '"';
7445  *q = '\0';
7446  if (!rb_enc_asciicompat(enc)) {
7447  snprintf(q, qend-q, nonascii_suffix, enc->name);
7448  encidx = rb_ascii8bit_encindex();
7449  }
7450  /* result from dump is ASCII */
7451  rb_enc_associate_index(result, encidx);
7453  return result;
7454 }
7455 
/*
 * Returns the control character denoted by the letter of a single-letter
 * backslash escape (n, r, t, f, v, b, a, e).  Callers are expected to pass
 * only those letters; any other value reaches UNREACHABLE_RETURN(-1).
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\v';
      case 'b': return '\b';
      case 'a': return '\a';
      case 'e': return 0x1B;
    }
    UNREACHABLE_RETURN(-1);
}
7479 
7480 static void
7481 undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7482 {
7483  const char *s = *ss;
7484  unsigned int c;
7485  int codelen;
7486  size_t hexlen;
7487  unsigned char buf[6];
7488  static rb_encoding *enc_utf8 = NULL;
7489 
7490  switch (*s) {
7491  case '\\':
7492  case '"':
7493  case '#':
7494  rb_str_cat(undumped, s, 1); /* cat itself */
7495  s++;
7496  break;
7497  case 'n':
7498  case 'r':
7499  case 't':
7500  case 'f':
7501  case 'v':
7502  case 'b':
7503  case 'a':
7504  case 'e':
7505  *buf = unescape_ascii(*s);
7506  rb_str_cat(undumped, (char *)buf, 1);
7507  s++;
7508  break;
7509  case 'u':
7510  if (*binary) {
7511  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7512  }
7513  *utf8 = true;
7514  if (++s >= s_end) {
7515  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7516  }
7517  if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7518  if (*penc != enc_utf8) {
7519  *penc = enc_utf8;
7520  rb_enc_associate(undumped, enc_utf8);
7521  }
7522  if (*s == '{') { /* handle \u{...} form */
7523  s++;
7524  for (;;) {
7525  if (s >= s_end) {
7526  rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7527  }
7528  if (*s == '}') {
7529  s++;
7530  break;
7531  }
7532  if (ISSPACE(*s)) {
7533  s++;
7534  continue;
7535  }
7536  c = scan_hex(s, s_end-s, &hexlen);
7537  if (hexlen == 0 || hexlen > 6) {
7538  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7539  }
7540  if (c > 0x10ffff) {
7541  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7542  }
7543  if (0xd800 <= c && c <= 0xdfff) {
7544  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7545  }
7546  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7547  rb_str_cat(undumped, (char *)buf, codelen);
7548  s += hexlen;
7549  }
7550  }
7551  else { /* handle \uXXXX form */
7552  c = scan_hex(s, 4, &hexlen);
7553  if (hexlen != 4) {
7554  rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7555  }
7556  if (0xd800 <= c && c <= 0xdfff) {
7557  rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7558  }
7559  codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7560  rb_str_cat(undumped, (char *)buf, codelen);
7561  s += hexlen;
7562  }
7563  break;
7564  case 'x':
7565  if (*utf8) {
7566  rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7567  }
7568  *binary = true;
7569  if (++s >= s_end) {
7570  rb_raise(rb_eRuntimeError, "invalid hex escape");
7571  }
7572  *buf = scan_hex(s, 2, &hexlen);
7573  if (hexlen != 2) {
7574  rb_raise(rb_eRuntimeError, "invalid hex escape");
7575  }
7576  rb_str_cat(undumped, (char *)buf, 1);
7577  s += hexlen;
7578  break;
7579  default:
7580  rb_str_cat(undumped, s-1, 2);
7581  s++;
7582  }
7583 
7584  *ss = s;
7585 }
7586 
7587 static VALUE rb_str_is_ascii_only_p(VALUE str);
7588 
7589 /*
7590  * call-seq:
7591  * undump -> string
7592  *
7593  * Returns an unescaped version of +self+:
7594  *
7595  * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7596  * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7597  * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7598  * s_undumped == s_orig # => true
7599  *
7600  * Related: String#dump (inverse of String#undump).
7601  *
7602  */
7603 
7604 static VALUE
7605 str_undump(VALUE str)
7606 {
7607  const char *s = RSTRING_PTR(str);
7608  const char *s_end = RSTRING_END(str);
7609  rb_encoding *enc = rb_enc_get(str);
7610  VALUE undumped = rb_enc_str_new(s, 0L, enc);
7611  bool utf8 = false;
7612  bool binary = false;
7613  int w;
7614 
7615  rb_must_asciicompat(str);
7616  if (rb_str_is_ascii_only_p(str) == Qfalse) {
7617  rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7618  }
7619  if (!str_null_check(str, &w)) {
7620  rb_raise(rb_eRuntimeError, "string contains null byte");
7621  }
7622  if (RSTRING_LEN(str) < 2) goto invalid_format;
7623  if (*s != '"') goto invalid_format;
7624 
7625  /* strip '"' at the start */
7626  s++;
7627 
7628  for (;;) {
7629  if (s >= s_end) {
7630  rb_raise(rb_eRuntimeError, "unterminated dumped string");
7631  }
7632 
7633  if (*s == '"') {
7634  /* epilogue */
7635  s++;
7636  if (s == s_end) {
7637  /* ascii compatible dumped string */
7638  break;
7639  }
7640  else {
7641  static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7642  static const char dup_suffix[] = ".dup";
7643  const char *encname;
7644  int encidx;
7645  ptrdiff_t size;
7646 
7647  /* check separately for strings dumped by older versions */
7648  size = sizeof(dup_suffix) - 1;
7649  if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7650 
7651  size = sizeof(force_encoding_suffix) - 1;
7652  if (s_end - s <= size) goto invalid_format;
7653  if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7654  s += size;
7655 
7656  if (utf8) {
7657  rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7658  }
7659 
7660  encname = s;
7661  s = memchr(s, '"', s_end-s);
7662  size = s - encname;
7663  if (!s) goto invalid_format;
7664  if (s_end - s != 2) goto invalid_format;
7665  if (s[0] != '"' || s[1] != ')') goto invalid_format;
7666 
7667  encidx = rb_enc_find_index2(encname, (long)size);
7668  if (encidx < 0) {
7669  rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7670  }
7671  rb_enc_associate_index(undumped, encidx);
7672  }
7673  break;
7674  }
7675 
7676  if (*s == '\\') {
7677  s++;
7678  if (s >= s_end) {
7679  rb_raise(rb_eRuntimeError, "invalid escape");
7680  }
7681  undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7682  }
7683  else {
7684  rb_str_cat(undumped, s++, 1);
7685  }
7686  }
7687 
7688  RB_GC_GUARD(str);
7689 
7690  return undumped;
7691 invalid_format:
7692  rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7693 }
7694 
7695 static void
7696 rb_str_check_dummy_enc(rb_encoding *enc)
7697 {
7698  if (rb_enc_dummy_p(enc)) {
7699  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7700  rb_enc_name(enc));
7701  }
7702 }
7703 
7704 static rb_encoding *
7705 str_true_enc(VALUE str)
7706 {
7707  rb_encoding *enc = STR_ENC_GET(str);
7708  rb_str_check_dummy_enc(enc);
7709  return enc;
7710 }
7711 
7712 static OnigCaseFoldType
7713 check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7714 {
7715  if (argc==0)
7716  return flags;
7717  if (argc>2)
7718  rb_raise(rb_eArgError, "too many options");
7719  if (argv[0]==sym_turkic) {
7720  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7721  if (argc==2) {
7722  if (argv[1]==sym_lithuanian)
7723  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7724  else
7725  rb_raise(rb_eArgError, "invalid second option");
7726  }
7727  }
7728  else if (argv[0]==sym_lithuanian) {
7729  flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7730  if (argc==2) {
7731  if (argv[1]==sym_turkic)
7732  flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7733  else
7734  rb_raise(rb_eArgError, "invalid second option");
7735  }
7736  }
7737  else if (argc>1)
7738  rb_raise(rb_eArgError, "too many options");
7739  else if (argv[0]==sym_ascii)
7740  flags |= ONIGENC_CASE_ASCII_ONLY;
7741  else if (argv[0]==sym_fold) {
7742  if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7743  flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7744  else
7745  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7746  }
7747  else
7748  rb_raise(rb_eArgError, "invalid option");
7749  return flags;
7750 }
7751 
7752 static inline bool
7753 case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7754 {
7755  if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7756  return true;
7757  return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7758 }
7759 
7760 /* extra bytes added to each mapping buffer; should be long enough to absorb any kind of single character length increase */
7761 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
7762 #ifndef CASEMAP_DEBUG
7763 # define CASEMAP_DEBUG 0
7764 #endif
7765 
7766 struct mapping_buffer;
7767 typedef struct mapping_buffer {
7768  size_t capa;
7769  size_t used;
7770  struct mapping_buffer *next;
7771  OnigUChar space[FLEX_ARY_LEN];
7772 } mapping_buffer;
7773 
7774 static void
7775 mapping_buffer_free(void *p)
7776 {
7777  mapping_buffer *previous_buffer;
7778  mapping_buffer *current_buffer = p;
7779  while (current_buffer) {
7780  previous_buffer = current_buffer;
7781  current_buffer = current_buffer->next;
7782  ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7783  }
7784 }
7785 
7786 static const rb_data_type_t mapping_buffer_type = {
7787  "mapping_buffer",
7788  {0, mapping_buffer_free,},
7789  0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7790 };
7791 
7792 static VALUE
7793 rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7794 {
7795  VALUE target;
7796 
7797  const OnigUChar *source_current, *source_end;
7798  int target_length = 0;
7799  VALUE buffer_anchor;
7800  mapping_buffer *current_buffer = 0;
7801  mapping_buffer **pre_buffer;
7802  size_t buffer_count = 0;
7803  int buffer_length_or_invalid;
7804 
7805  if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7806 
7807  source_current = (OnigUChar*)RSTRING_PTR(source);
7808  source_end = (OnigUChar*)RSTRING_END(source);
7809 
7810  buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7811  pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7812  while (source_current < source_end) {
7813  /* increase multiplier using buffer count to converge quickly */
7814  size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7815  if (CASEMAP_DEBUG) {
7816  fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7817  }
7818  current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7819  *pre_buffer = current_buffer;
7820  pre_buffer = &current_buffer->next;
7821  current_buffer->next = NULL;
7822  current_buffer->capa = capa;
7823  buffer_length_or_invalid = enc->case_map(flags,
7824  &source_current, source_end,
7825  current_buffer->space,
7826  current_buffer->space+current_buffer->capa,
7827  enc);
7828  if (buffer_length_or_invalid < 0) {
7829  current_buffer = DATA_PTR(buffer_anchor);
7830  DATA_PTR(buffer_anchor) = 0;
7831  mapping_buffer_free(current_buffer);
7832  rb_raise(rb_eArgError, "input string invalid");
7833  }
7834  target_length += current_buffer->used = buffer_length_or_invalid;
7835  }
7836  if (CASEMAP_DEBUG) {
7837  fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7838  }
7839 
7840  if (buffer_count==1) {
7841  target = rb_str_new((const char*)current_buffer->space, target_length);
7842  }
7843  else {
7844  char *target_current;
7845 
7846  target = rb_str_new(0, target_length);
7847  target_current = RSTRING_PTR(target);
7848  current_buffer = DATA_PTR(buffer_anchor);
7849  while (current_buffer) {
7850  memcpy(target_current, current_buffer->space, current_buffer->used);
7851  target_current += current_buffer->used;
7852  current_buffer = current_buffer->next;
7853  }
7854  }
7855  current_buffer = DATA_PTR(buffer_anchor);
7856  DATA_PTR(buffer_anchor) = 0;
7857  mapping_buffer_free(current_buffer);
7858 
7859  RB_GC_GUARD(buffer_anchor);
7860 
7861  /* TODO: check about string terminator character */
7862  str_enc_copy_direct(target, source);
7863  /*ENC_CODERANGE_SET(mapped, cr);*/
7864 
7865  return target;
7866 }
7867 
7868 static VALUE
7869 rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7870 {
7871  const OnigUChar *source_current, *source_end;
7872  OnigUChar *target_current, *target_end;
7873  long old_length = RSTRING_LEN(source);
7874  int length_or_invalid;
7875 
7876  if (old_length == 0) return Qnil;
7877 
7878  source_current = (OnigUChar*)RSTRING_PTR(source);
7879  source_end = (OnigUChar*)RSTRING_END(source);
7880  if (source == target) {
7881  target_current = (OnigUChar*)source_current;
7882  target_end = (OnigUChar*)source_end;
7883  }
7884  else {
7885  target_current = (OnigUChar*)RSTRING_PTR(target);
7886  target_end = (OnigUChar*)RSTRING_END(target);
7887  }
7888 
7889  length_or_invalid = onigenc_ascii_only_case_map(flags,
7890  &source_current, source_end,
7891  target_current, target_end, enc);
7892  if (length_or_invalid < 0)
7893  rb_raise(rb_eArgError, "input string invalid");
7894  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7895  fprintf(stderr, "problem with rb_str_ascii_casemap"
7896  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7897  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7898  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7899  }
7900 
7901  str_enc_copy(target, source);
7902 
7903  return target;
7904 }
7905 
7906 static bool
7907 upcase_single(VALUE str)
7908 {
7909  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7910  bool modified = false;
7911 
7912  while (s < send) {
7913  unsigned int c = *(unsigned char*)s;
7914 
7915  if ('a' <= c && c <= 'z') {
7916  *s = 'A' + (c - 'a');
7917  modified = true;
7918  }
7919  s++;
7920  }
7921  return modified;
7922 }
7923 
7924 /*
7925  * call-seq:
7926  * upcase!(*options) -> self or nil
7927  *
7928  * Upcases the characters in +self+;
7929  * returns +self+ if any changes were made, +nil+ otherwise:
7930  *
7931  * s = 'Hello World!' # => "Hello World!"
7932  * s.upcase! # => "HELLO WORLD!"
7933  * s # => "HELLO WORLD!"
7934  * s.upcase! # => nil
7935  *
7936  * The casing may be affected by the given +options+;
7937  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7938  *
7939  * Related: String#upcase, String#downcase, String#downcase!.
7940  *
7941  */
7942 
7943 static VALUE
7944 rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7945 {
7946  rb_encoding *enc;
7947  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7948 
7949  flags = check_case_options(argc, argv, flags);
7950  str_modify_keep_cr(str);
7951  enc = str_true_enc(str);
7952  if (case_option_single_p(flags, enc, str)) {
7953  if (upcase_single(str))
7954  flags |= ONIGENC_CASE_MODIFIED;
7955  }
7956  else if (flags&ONIGENC_CASE_ASCII_ONLY)
7957  rb_str_ascii_casemap(str, str, &flags, enc);
7958  else
7959  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7960 
7961  if (ONIGENC_CASE_MODIFIED&flags) return str;
7962  return Qnil;
7963 }
7964 
7965 
7966 /*
7967  * call-seq:
7968  * upcase(*options) -> string
7969  *
7970  * Returns a string containing the upcased characters in +self+:
7971  *
7972  * s = 'Hello World!' # => "Hello World!"
7973  * s.upcase # => "HELLO WORLD!"
7974  *
7975  * The casing may be affected by the given +options+;
7976  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7977  *
7978  * Related: String#upcase!, String#downcase, String#downcase!.
7979  *
7980  */
7981 
7982 static VALUE
7983 rb_str_upcase(int argc, VALUE *argv, VALUE str)
7984 {
7985  rb_encoding *enc;
7986  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7987  VALUE ret;
7988 
7989  flags = check_case_options(argc, argv, flags);
7990  enc = str_true_enc(str);
7991  if (case_option_single_p(flags, enc, str)) {
7992  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7993  str_enc_copy_direct(ret, str);
7994  upcase_single(ret);
7995  }
7996  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7997  ret = rb_str_new(0, RSTRING_LEN(str));
7998  rb_str_ascii_casemap(str, ret, &flags, enc);
7999  }
8000  else {
8001  ret = rb_str_casemap(str, &flags, enc);
8002  }
8003 
8004  return ret;
8005 }
8006 
8007 static bool
8008 downcase_single(VALUE str)
8009 {
8010  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
8011  bool modified = false;
8012 
8013  while (s < send) {
8014  unsigned int c = *(unsigned char*)s;
8015 
8016  if ('A' <= c && c <= 'Z') {
8017  *s = 'a' + (c - 'A');
8018  modified = true;
8019  }
8020  s++;
8021  }
8022 
8023  return modified;
8024 }
8025 
8026 /*
8027  * call-seq:
8028  * downcase!(*options) -> self or nil
8029  *
8030  * Downcases the characters in +self+;
8031  * returns +self+ if any changes were made, +nil+ otherwise:
8032  *
8033  * s = 'Hello World!' # => "Hello World!"
8034  * s.downcase! # => "hello world!"
8035  * s # => "hello world!"
8036  * s.downcase! # => nil
8037  *
8038  * The casing may be affected by the given +options+;
8039  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8040  *
8041  * Related: String#downcase, String#upcase, String#upcase!.
8042  *
8043  */
8044 
8045 static VALUE
8046 rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
8047 {
8048  rb_encoding *enc;
8049  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8050 
8051  flags = check_case_options(argc, argv, flags);
8052  str_modify_keep_cr(str);
8053  enc = str_true_enc(str);
8054  if (case_option_single_p(flags, enc, str)) {
8055  if (downcase_single(str))
8056  flags |= ONIGENC_CASE_MODIFIED;
8057  }
8058  else if (flags&ONIGENC_CASE_ASCII_ONLY)
8059  rb_str_ascii_casemap(str, str, &flags, enc);
8060  else
8061  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8062 
8063  if (ONIGENC_CASE_MODIFIED&flags) return str;
8064  return Qnil;
8065 }
8066 
8067 
8068 /*
8069  * call-seq:
8070  * downcase(*options) -> string
8071  *
8072  * Returns a string containing the downcased characters in +self+:
8073  *
8074  * s = 'Hello World!' # => "Hello World!"
8075  * s.downcase # => "hello world!"
8076  *
8077  * The casing may be affected by the given +options+;
8078  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8079  *
8080  * Related: String#downcase!, String#upcase, String#upcase!.
8081  *
8082  */
8083 
8084 static VALUE
8085 rb_str_downcase(int argc, VALUE *argv, VALUE str)
8086 {
8087  rb_encoding *enc;
8088  OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
8089  VALUE ret;
8090 
8091  flags = check_case_options(argc, argv, flags);
8092  enc = str_true_enc(str);
8093  if (case_option_single_p(flags, enc, str)) {
8094  ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
8095  str_enc_copy_direct(ret, str);
8096  downcase_single(ret);
8097  }
8098  else if (flags&ONIGENC_CASE_ASCII_ONLY) {
8099  ret = rb_str_new(0, RSTRING_LEN(str));
8100  rb_str_ascii_casemap(str, ret, &flags, enc);
8101  }
8102  else {
8103  ret = rb_str_casemap(str, &flags, enc);
8104  }
8105 
8106  return ret;
8107 }
8108 
8109 
8110 /*
8111  * call-seq:
8112  * capitalize!(*options) -> self or nil
8113  *
8114  * Upcases the first character in +self+;
8115  * downcases the remaining characters;
8116  * returns +self+ if any changes were made, +nil+ otherwise:
8117  *
8118  * s = 'hello World!' # => "hello World!"
8119  * s.capitalize! # => "Hello world!"
8120  * s # => "Hello world!"
8121  * s.capitalize! # => nil
8122  *
8123  * The casing may be affected by the given +options+;
8124  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8125  *
8126  * Related: String#capitalize.
8127  *
8128  */
8129 
8130 static VALUE
8131 rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
8132 {
8133  rb_encoding *enc;
8134  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8135 
8136  flags = check_case_options(argc, argv, flags);
8137  str_modify_keep_cr(str);
8138  enc = str_true_enc(str);
8139  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8140  if (flags&ONIGENC_CASE_ASCII_ONLY)
8141  rb_str_ascii_casemap(str, str, &flags, enc);
8142  else
8143  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8144 
8145  if (ONIGENC_CASE_MODIFIED&flags) return str;
8146  return Qnil;
8147 }
8148 
8149 
8150 /*
8151  * call-seq:
8152  * capitalize(*options) -> string
8153  *
8154  * Returns a string containing the characters in +self+;
8155  * the first character is upcased;
8156  * the remaining characters are downcased:
8157  *
8158  * s = 'hello World!' # => "hello World!"
8159  * s.capitalize # => "Hello world!"
8160  *
8161  * The casing may be affected by the given +options+;
8162  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8163  *
8164  * Related: String#capitalize!.
8165  *
8166  */
8167 
8168 static VALUE
8169 rb_str_capitalize(int argc, VALUE *argv, VALUE str)
8170 {
8171  rb_encoding *enc;
8172  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
8173  VALUE ret;
8174 
8175  flags = check_case_options(argc, argv, flags);
8176  enc = str_true_enc(str);
8177  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
8178  if (flags&ONIGENC_CASE_ASCII_ONLY) {
8179  ret = rb_str_new(0, RSTRING_LEN(str));
8180  rb_str_ascii_casemap(str, ret, &flags, enc);
8181  }
8182  else {
8183  ret = rb_str_casemap(str, &flags, enc);
8184  }
8185  return ret;
8186 }
8187 
8188 
8189 /*
8190  * call-seq:
8191  * swapcase!(*options) -> self or nil
8192  *
8193  * Upcases each lowercase character in +self+;
8194  * downcases each uppercase character;
8195  * returns +self+ if any changes were made, +nil+ otherwise:
8196  *
8197  * s = 'Hello World!' # => "Hello World!"
8198  * s.swapcase! # => "hELLO wORLD!"
8199  * s # => "hELLO wORLD!"
8200  * ''.swapcase! # => nil
8201  *
8202  * The casing may be affected by the given +options+;
8203  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8204  *
8205  * Related: String#swapcase.
8206  *
8207  */
8208 
8209 static VALUE
8210 rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
8211 {
8212  rb_encoding *enc;
8213  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8214 
8215  flags = check_case_options(argc, argv, flags);
8216  str_modify_keep_cr(str);
8217  enc = str_true_enc(str);
8218  if (flags&ONIGENC_CASE_ASCII_ONLY)
8219  rb_str_ascii_casemap(str, str, &flags, enc);
8220  else
8221  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
8222 
8223  if (ONIGENC_CASE_MODIFIED&flags) return str;
8224  return Qnil;
8225 }
8226 
8227 
8228 /*
8229  * call-seq:
8230  * swapcase(*options) -> string
8231  *
8232  * Returns a string containing the characters in +self+, with cases reversed;
8233  * each uppercase character is downcased;
8234  * each lowercase character is upcased:
8235  *
8236  * s = 'Hello World!' # => "Hello World!"
8237  * s.swapcase # => "hELLO wORLD!"
8238  *
8239  * The casing may be affected by the given +options+;
8240  * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
8241  *
8242  * Related: String#swapcase!.
8243  *
8244  */
8245 
8246 static VALUE
8247 rb_str_swapcase(int argc, VALUE *argv, VALUE str)
8248 {
8249  rb_encoding *enc;
8250  OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
8251  VALUE ret;
8252 
8253  flags = check_case_options(argc, argv, flags);
8254  enc = str_true_enc(str);
8255  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
8256  if (flags&ONIGENC_CASE_ASCII_ONLY) {
8257  ret = rb_str_new(0, RSTRING_LEN(str));
8258  rb_str_ascii_casemap(str, ret, &flags, enc);
8259  }
8260  else {
8261  ret = rb_str_casemap(str, &flags, enc);
8262  }
8263  return ret;
8264 }
8265 
/* Pointer to unsigned bytes.  (No use of this typedef is visible in the
 * surrounding functions.) */
8266 typedef unsigned char *USTR;
8267 
/*
 * Cursor over a tr-style character specification ("a-z", "^abc", ...),
 * advanced one code point at a time by trnext().
 */
8268 struct tr {
/* Non-zero while trnext() is expanding a range: set to 1 when a range with
 * distinct ascending end points is read, reset to 0 once the range is used up. */
8269  int gen;
/* now: code point most recently read or generated;
 * max: last code point of the range currently being expanded. */
8270  unsigned int now, max;
/* p: next unread byte of the specification; pend: one past its last byte. */
8271  char *p, *pend;
8272 };
8273 
8274 static unsigned int
8275 trnext(struct tr *t, rb_encoding *enc)
8276 {
8277  int n;
8278 
8279  for (;;) {
8280  nextpart:
8281  if (!t->gen) {
8282  if (t->p == t->pend) return -1;
8283  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
8284  t->p += n;
8285  }
8286  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8287  t->p += n;
8288  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
8289  t->p += n;
8290  if (t->p < t->pend) {
8291  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
8292  t->p += n;
8293  if (t->now > c) {
8294  if (t->now < 0x80 && c < 0x80) {
8296  "invalid range \"%c-%c\" in string transliteration",
8297  t->now, c);
8298  }
8299  else {
8300  rb_raise(rb_eArgError, "invalid range in string transliteration");
8301  }
8302  continue; /* not reached */
8303  }
8304  else if (t->now < c) {
8305  t->gen = 1;
8306  t->max = c;
8307  }
8308  }
8309  }
8310  return t->now;
8311  }
8312  else {
8313  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
8314  if (t->now == t->max) {
8315  t->gen = 0;
8316  goto nextpart;
8317  }
8318  }
8319  if (t->now < t->max) {
8320  return t->now;
8321  }
8322  else {
8323  t->gen = 0;
8324  return t->max;
8325  }
8326  }
8327  }
8328 }
8329 
8330 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
8331 
8332 static VALUE
8333 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
8334 {
8335  const unsigned int errc = -1;
8336  unsigned int trans[256];
8337  rb_encoding *enc, *e1, *e2;
8338  struct tr trsrc, trrepl;
8339  int cflag = 0;
8340  unsigned int c, c0, last = 0;
8341  int modify = 0, i, l;
8342  unsigned char *s, *send;
8343  VALUE hash = 0;
8344  int singlebyte = single_byte_optimizable(str);
8345  int termlen;
8346  int cr;
8347 
8348 #define CHECK_IF_ASCII(c) \
8349  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
8350  (cr = ENC_CODERANGE_VALID) : 0)
8351 
8352  StringValue(src);
8353  StringValue(repl);
8354  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8355  if (RSTRING_LEN(repl) == 0) {
8356  return rb_str_delete_bang(1, &src, str);
8357  }
8358 
8359  cr = ENC_CODERANGE(str);
8360  e1 = rb_enc_check(str, src);
8361  e2 = rb_enc_check(str, repl);
8362  if (e1 == e2) {
8363  enc = e1;
8364  }
8365  else {
8366  enc = rb_enc_check(src, repl);
8367  }
8368  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
8369  if (RSTRING_LEN(src) > 1 &&
8370  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
8371  trsrc.p + l < trsrc.pend) {
8372  cflag = 1;
8373  trsrc.p += l;
8374  }
8375  trrepl.p = RSTRING_PTR(repl);
8376  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
8377  trsrc.gen = trrepl.gen = 0;
8378  trsrc.now = trrepl.now = 0;
8379  trsrc.max = trrepl.max = 0;
8380 
8381  if (cflag) {
8382  for (i=0; i<256; i++) {
8383  trans[i] = 1;
8384  }
8385  while ((c = trnext(&trsrc, enc)) != errc) {
8386  if (c < 256) {
8387  trans[c] = errc;
8388  }
8389  else {
8390  if (!hash) hash = rb_hash_new();
8391  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
8392  }
8393  }
8394  while ((c = trnext(&trrepl, enc)) != errc)
8395  /* retrieve last replacer */;
8396  last = trrepl.now;
8397  for (i=0; i<256; i++) {
8398  if (trans[i] != errc) {
8399  trans[i] = last;
8400  }
8401  }
8402  }
8403  else {
8404  unsigned int r;
8405 
8406  for (i=0; i<256; i++) {
8407  trans[i] = errc;
8408  }
8409  while ((c = trnext(&trsrc, enc)) != errc) {
8410  r = trnext(&trrepl, enc);
8411  if (r == errc) r = trrepl.now;
8412  if (c < 256) {
8413  trans[c] = r;
8414  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8415  }
8416  else {
8417  if (!hash) hash = rb_hash_new();
8418  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8419  }
8420  }
8421  }
8422 
8423  if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8424  cr = ENC_CODERANGE_7BIT;
8425  str_modify_keep_cr(str);
8426  s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8427  termlen = rb_enc_mbminlen(enc);
8428  if (sflag) {
8429  int clen, tlen;
8430  long offset, max = RSTRING_LEN(str);
8431  unsigned int save = -1;
8432  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8433 
8434  while (s < send) {
8435  int may_modify = 0;
8436 
8437  int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8438  if (!MBCLEN_CHARFOUND_P(r)) {
8439  xfree(buf);
8440  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8441  }
8442  clen = MBCLEN_CHARFOUND_LEN(r);
8443  c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8444 
8445  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8446 
8447  s += clen;
8448  if (c < 256) {
8449  c = trans[c];
8450  }
8451  else if (hash) {
8452  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8453  if (NIL_P(tmp)) {
8454  if (cflag) c = last;
8455  else c = errc;
8456  }
8457  else if (cflag) c = errc;
8458  else c = NUM2INT(tmp);
8459  }
8460  else {
8461  c = errc;
8462  }
8463  if (c != (unsigned int)-1) {
8464  if (save == c) {
8465  CHECK_IF_ASCII(c);
8466  continue;
8467  }
8468  save = c;
8469  tlen = rb_enc_codelen(c, enc);
8470  modify = 1;
8471  }
8472  else {
8473  save = -1;
8474  c = c0;
8475  if (enc != e1) may_modify = 1;
8476  }
8477  if ((offset = t - buf) + tlen > max) {
8478  size_t MAYBE_UNUSED(old) = max + termlen;
8479  max = offset + tlen + (send - s);
8480  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8481  t = buf + offset;
8482  }
8483  rb_enc_mbcput(c, t, enc);
8484  if (may_modify && memcmp(s, t, tlen) != 0) {
8485  modify = 1;
8486  }
8487  CHECK_IF_ASCII(c);
8488  t += tlen;
8489  }
8490  if (!STR_EMBED_P(str)) {
8491  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8492  }
8493  TERM_FILL((char *)t, termlen);
8494  RSTRING(str)->as.heap.ptr = (char *)buf;
8495  STR_SET_LEN(str, t - buf);
8496  STR_SET_NOEMBED(str);
8497  RSTRING(str)->as.heap.aux.capa = max;
8498  }
8499  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8500  while (s < send) {
8501  c = (unsigned char)*s;
8502  if (trans[c] != errc) {
8503  if (!cflag) {
8504  c = trans[c];
8505  *s = c;
8506  modify = 1;
8507  }
8508  else {
8509  *s = last;
8510  modify = 1;
8511  }
8512  }
8513  CHECK_IF_ASCII(c);
8514  s++;
8515  }
8516  }
8517  else {
8518  int clen, tlen;
8519  long offset, max = (long)((send - s) * 1.2);
8520  unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8521 
8522  while (s < send) {
8523  int may_modify = 0;
8524 
8525  int r = rb_enc_precise_mbclen((char *)s, (char *)send, e1);
8526  if (!MBCLEN_CHARFOUND_P(r)) {
8527  xfree(buf);
8528  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(e1));
8529  }
8530  clen = MBCLEN_CHARFOUND_LEN(r);
8531  c0 = c = rb_enc_mbc_to_codepoint((char *)s, (char *)send, e1);
8532 
8533  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8534 
8535  if (c < 256) {
8536  c = trans[c];
8537  }
8538  else if (hash) {
8539  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8540  if (NIL_P(tmp)) {
8541  if (cflag) c = last;
8542  else c = errc;
8543  }
8544  else if (cflag) c = errc;
8545  else c = NUM2INT(tmp);
8546  }
8547  else {
8548  c = cflag ? last : errc;
8549  }
8550  if (c != errc) {
8551  tlen = rb_enc_codelen(c, enc);
8552  modify = 1;
8553  }
8554  else {
8555  c = c0;
8556  if (enc != e1) may_modify = 1;
8557  }
8558  if ((offset = t - buf) + tlen > max) {
8559  size_t MAYBE_UNUSED(old) = max + termlen;
8560  max = offset + tlen + (long)((send - s) * 1.2);
8561  SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8562  t = buf + offset;
8563  }
8564  if (s != t) {
8565  rb_enc_mbcput(c, t, enc);
8566  if (may_modify && memcmp(s, t, tlen) != 0) {
8567  modify = 1;
8568  }
8569  }
8570  CHECK_IF_ASCII(c);
8571  s += clen;
8572  t += tlen;
8573  }
8574  if (!STR_EMBED_P(str)) {
8575  ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8576  }
8577  TERM_FILL((char *)t, termlen);
8578  RSTRING(str)->as.heap.ptr = (char *)buf;
8579  STR_SET_LEN(str, t - buf);
8580  STR_SET_NOEMBED(str);
8581  RSTRING(str)->as.heap.aux.capa = max;
8582  }
8583 
8584  if (modify) {
8585  if (cr != ENC_CODERANGE_BROKEN)
8586  ENC_CODERANGE_SET(str, cr);
8587  rb_enc_associate(str, enc);
8588  return str;
8589  }
8590  return Qnil;
8591 }
8592 
8593 
8594 /*
8595  * call-seq:
8596  * tr!(selector, replacements) -> self or nil
8597  *
8598  * Like String#tr, but modifies +self+ in place.
8599  * Returns +self+ if any changes were made, +nil+ otherwise.
8600  *
8601  */
8602 
8603 static VALUE
8604 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8605 {
8606  return tr_trans(str, src, repl, 0);
8607 }
8608 
8609 
8610 /*
8611  * call-seq:
8612  * tr(selector, replacements) -> new_string
8613  *
8614  * Returns a copy of +self+ with each character specified by string +selector+
8615  * translated to the corresponding character in string +replacements+.
8616  * The correspondence is _positional_:
8617  *
8618  * - Each occurrence of the first character specified by +selector+
8619  * is translated to the first character in +replacements+.
8620  * - Each occurrence of the second character specified by +selector+
8621  * is translated to the second character in +replacements+.
8622  * - And so on.
8623  *
8624  * Example:
8625  *
8626  * 'hello'.tr('el', 'ip') #=> "hippo"
8627  *
8628  * If +replacements+ is shorter than +selector+,
8629  * it is implicitly padded with its own last character:
8630  *
8631  * 'hello'.tr('aeiou', '-') # => "h-ll-"
8632  * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8633  *
8634  * Arguments +selector+ and +replacements+ must be valid character selectors
8635  * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8636  * and may use any of its valid forms, including negation, ranges, and escaping:
8637  *
8638  * # Negation.
8639  * 'hello'.tr('^aeiou', '-') # => "-e--o"
8640  * # Ranges.
8641  * 'ibm'.tr('b-z', 'a-z') # => "hal"
8642  * # Escapes.
8643  * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8644  * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8645  * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8646  *
8647  */
8648 
8649 static VALUE
8650 rb_str_tr(VALUE str, VALUE src, VALUE repl)
8651 {
8652  str = str_duplicate(rb_cString, str);
8653  tr_trans(str, src, repl, 0);
8654  return str;
8655 }
8656 
8657 #define TR_TABLE_MAX (UCHAR_MAX+1)
8658 #define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8659 static void
8660 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8661  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8662 {
8663  const unsigned int errc = -1;
8664  char buf[TR_TABLE_MAX];
8665  struct tr tr;
8666  unsigned int c;
8667  VALUE table = 0, ptable = 0;
8668  int i, l, cflag = 0;
8669 
8670  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8671  tr.gen = tr.now = tr.max = 0;
8672 
8673  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8674  cflag = 1;
8675  tr.p += l;
8676  }
8677  if (first) {
8678  for (i=0; i<TR_TABLE_MAX; i++) {
8679  stable[i] = 1;
8680  }
8681  stable[TR_TABLE_MAX] = cflag;
8682  }
8683  else if (stable[TR_TABLE_MAX] && !cflag) {
8684  stable[TR_TABLE_MAX] = 0;
8685  }
8686  for (i=0; i<TR_TABLE_MAX; i++) {
8687  buf[i] = cflag;
8688  }
8689 
8690  while ((c = trnext(&tr, enc)) != errc) {
8691  if (c < TR_TABLE_MAX) {
8692  buf[(unsigned char)c] = !cflag;
8693  }
8694  else {
8695  VALUE key = UINT2NUM(c);
8696 
8697  if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8698  if (cflag) {
8699  ptable = *ctablep;
8700  table = ptable ? ptable : rb_hash_new();
8701  *ctablep = table;
8702  }
8703  else {
8704  table = rb_hash_new();
8705  ptable = *tablep;
8706  *tablep = table;
8707  }
8708  }
8709  if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8710  rb_hash_aset(table, key, Qtrue);
8711  }
8712  }
8713  }
8714  for (i=0; i<TR_TABLE_MAX; i++) {
8715  stable[i] = stable[i] && buf[i];
8716  }
8717  if (!table && !cflag) {
8718  *tablep = 0;
8719  }
8720 }
8721 
8722 
8723 static int
8724 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8725 {
8726  if (c < TR_TABLE_MAX) {
8727  return table[c] != 0;
8728  }
8729  else {
8730  VALUE v = UINT2NUM(c);
8731 
8732  if (del) {
8733  if (!NIL_P(rb_hash_lookup(del, v)) &&
8734  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8735  return TRUE;
8736  }
8737  }
8738  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8739  return FALSE;
8740  }
8741  return table[TR_TABLE_MAX] ? TRUE : FALSE;
8742  }
8743 }
8744 
8745 /*
8746  * call-seq:
8747  * delete!(*selectors) -> self or nil
8748  *
8749  * Like String#delete, but modifies +self+ in place.
8750  * Returns +self+ if any changes were made, +nil+ otherwise.
8751  *
8752  */
8753 
8754 static VALUE
8755 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8756 {
8757  char squeez[TR_TABLE_SIZE];
8758  rb_encoding *enc = 0;
8759  char *s, *send, *t;
8760  VALUE del = 0, nodel = 0;
8761  int modify = 0;
8762  int i, ascompat, cr;
8763 
8764  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8766  for (i=0; i<argc; i++) {
8767  VALUE s = argv[i];
8768 
8769  StringValue(s);
8770  enc = rb_enc_check(str, s);
8771  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8772  }
8773 
8774  str_modify_keep_cr(str);
8775  ascompat = rb_enc_asciicompat(enc);
8776  s = t = RSTRING_PTR(str);
8777  send = RSTRING_END(str);
8778  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8779  while (s < send) {
8780  unsigned int c;
8781  int clen;
8782 
8783  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8784  if (squeez[c]) {
8785  modify = 1;
8786  }
8787  else {
8788  if (t != s) *t = c;
8789  t++;
8790  }
8791  s++;
8792  }
8793  else {
8794  c = rb_enc_codepoint_len(s, send, &clen, enc);
8795 
8796  if (tr_find(c, squeez, del, nodel)) {
8797  modify = 1;
8798  }
8799  else {
8800  if (t != s) rb_enc_mbcput(c, t, enc);
8801  t += clen;
8802  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
8803  }
8804  s += clen;
8805  }
8806  }
8807  TERM_FILL(t, TERM_LEN(str));
8808  STR_SET_LEN(str, t - RSTRING_PTR(str));
8809  ENC_CODERANGE_SET(str, cr);
8810 
8811  if (modify) return str;
8812  return Qnil;
8813 }
8814 
8815 
8816 /*
8817  * call-seq:
8818  * delete(*selectors) -> new_string
8819  *
8820  * Returns a copy of +self+ with characters specified by +selectors+ removed
8821  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8822  *
8823  * "hello".delete "l","lo" #=> "heo"
8824  * "hello".delete "lo" #=> "he"
8825  * "hello".delete "aeiou", "^e" #=> "hell"
8826  * "hello".delete "ej-m" #=> "ho"
8827  *
8828  */
8829 
8830 static VALUE
8831 rb_str_delete(int argc, VALUE *argv, VALUE str)
8832 {
8833  str = str_duplicate(rb_cString, str);
8834  rb_str_delete_bang(argc, argv, str);
8835  return str;
8836 }
8837 
8838 
8839 /*
8840  * call-seq:
8841  * squeeze!(*selectors) -> self or nil
8842  *
8843  * Like String#squeeze, but modifies +self+ in place.
8844  * Returns +self+ if any changes were made, +nil+ otherwise.
8845  */
8846 
8847 static VALUE
8848 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8849 {
8850  char squeez[TR_TABLE_SIZE];
8851  rb_encoding *enc = 0;
8852  VALUE del = 0, nodel = 0;
8853  unsigned char *s, *send, *t;
8854  int i, modify = 0;
8855  int ascompat, singlebyte = single_byte_optimizable(str);
8856  unsigned int save;
8857 
8858  if (argc == 0) {
8859  enc = STR_ENC_GET(str);
8860  }
8861  else {
8862  for (i=0; i<argc; i++) {
8863  VALUE s = argv[i];
8864 
8865  StringValue(s);
8866  enc = rb_enc_check(str, s);
8867  if (singlebyte && !single_byte_optimizable(s))
8868  singlebyte = 0;
8869  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8870  }
8871  }
8872 
8873  str_modify_keep_cr(str);
8874  s = t = (unsigned char *)RSTRING_PTR(str);
8875  if (!s || RSTRING_LEN(str) == 0) return Qnil;
8876  send = (unsigned char *)RSTRING_END(str);
8877  save = -1;
8878  ascompat = rb_enc_asciicompat(enc);
8879 
8880  if (singlebyte) {
8881  while (s < send) {
8882  unsigned int c = *s++;
8883  if (c != save || (argc > 0 && !squeez[c])) {
8884  *t++ = save = c;
8885  }
8886  }
8887  }
8888  else {
8889  while (s < send) {
8890  unsigned int c;
8891  int clen;
8892 
8893  if (ascompat && (c = *s) < 0x80) {
8894  if (c != save || (argc > 0 && !squeez[c])) {
8895  *t++ = save = c;
8896  }
8897  s++;
8898  }
8899  else {
8900  c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8901 
8902  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8903  if (t != s) rb_enc_mbcput(c, t, enc);
8904  save = c;
8905  t += clen;
8906  }
8907  s += clen;
8908  }
8909  }
8910  }
8911 
8912  TERM_FILL((char *)t, TERM_LEN(str));
8913  if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8914  STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8915  modify = 1;
8916  }
8917 
8918  if (modify) return str;
8919  return Qnil;
8920 }
8921 
8922 
8923 /*
8924  * call-seq:
8925  * squeeze(*selectors) -> new_string
8926  *
8927  * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8928  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8929  *
8930  * "Squeezed" means that each multiple-character run of a selected character
8931  * is squeezed down to a single character;
8932  * with no arguments given, squeezes all characters:
8933  *
8934  * "yellow moon".squeeze #=> "yelow mon"
8935  * "  now   is  the".squeeze(" ") #=> " now is the"
8936  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8937  *
8938  */
8939 
8940 static VALUE
8941 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8942 {
8943  str = str_duplicate(rb_cString, str);
8944  rb_str_squeeze_bang(argc, argv, str);
8945  return str;
8946 }
8947 
8948 
8949 /*
8950  * call-seq:
8951  * tr_s!(selector, replacements) -> self or nil
8952  *
8953  * Like String#tr_s, but modifies +self+ in place.
8954  * Returns +self+ if any changes were made, +nil+ otherwise.
8955  *
8956  * Related: String#squeeze!.
8957  */
8958 
8959 static VALUE
8960 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8961 {
8962  return tr_trans(str, src, repl, 1);
8963 }
8964 
8965 
8966 /*
8967  * call-seq:
8968  * tr_s(selector, replacements) -> string
8969  *
8970  * Like String#tr, but also squeezes the modified portions of the translated string;
8971  * returns a new string (translated and squeezed).
8972  *
8973  * 'hello'.tr_s('l', 'r') #=> "hero"
8974  * 'hello'.tr_s('el', '-') #=> "h-o"
8975  * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8976  *
8977  * Related: String#squeeze.
8978  *
8979  */
8980 
8981 static VALUE
8982 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8983 {
8984  str = str_duplicate(rb_cString, str);
8985  tr_trans(str, src, repl, 1);
8986  return str;
8987 }
8988 
8989 
8990 /*
8991  * call-seq:
8992  * count(*selectors) -> integer
8993  *
8994  * Returns the total number of characters in +self+
8995  * that are specified by the given +selectors+
8996  * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8997  *
8998  * a = "hello world"
8999  * a.count "lo" #=> 5
9000  * a.count "lo", "o" #=> 2
9001  * a.count "hello", "^l" #=> 4
9002  * a.count "ej-m" #=> 4
9003  *
9004  * "hello^world".count "\\^aeiou" #=> 4
9005  * "hello-world".count "a\\-eo" #=> 4
9006  *
9007  * c = "hello world\\r\\n"
9008  * c.count "\\" #=> 2
9009  * c.count "\\A" #=> 0
9010  * c.count "X-\\w" #=> 3
9011  */
9012 
9013 static VALUE
9014 rb_str_count(int argc, VALUE *argv, VALUE str)
9015 {
9016  char table[TR_TABLE_SIZE];
9017  rb_encoding *enc = 0;
9018  VALUE del = 0, nodel = 0, tstr;
9019  char *s, *send;
9020  int i;
9021  int ascompat;
9022  size_t n = 0;
9023 
9025 
9026  tstr = argv[0];
9027  StringValue(tstr);
9028  enc = rb_enc_check(str, tstr);
9029  if (argc == 1) {
9030  const char *ptstr;
9031  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
9032  (ptstr = RSTRING_PTR(tstr),
9033  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
9034  !is_broken_string(str)) {
9035  int clen;
9036  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
9037 
9038  s = RSTRING_PTR(str);
9039  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9040  send = RSTRING_END(str);
9041  while (s < send) {
9042  if (*(unsigned char*)s++ == c) n++;
9043  }
9044  return SIZET2NUM(n);
9045  }
9046  }
9047 
9048  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
9049  for (i=1; i<argc; i++) {
9050  tstr = argv[i];
9051  StringValue(tstr);
9052  enc = rb_enc_check(str, tstr);
9053  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
9054  }
9055 
9056  s = RSTRING_PTR(str);
9057  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
9058  send = RSTRING_END(str);
9059  ascompat = rb_enc_asciicompat(enc);
9060  while (s < send) {
9061  unsigned int c;
9062 
9063  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
9064  if (table[c]) {
9065  n++;
9066  }
9067  s++;
9068  }
9069  else {
9070  int clen;
9071  c = rb_enc_codepoint_len(s, send, &clen, enc);
9072  if (tr_find(c, table, del, nodel)) {
9073  n++;
9074  }
9075  s += clen;
9076  }
9077  }
9078 
9079  return SIZET2NUM(n);
9080 }
9081 
9082 static VALUE
9083 rb_fs_check(VALUE val)
9084 {
9085  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
9086  val = rb_check_string_type(val);
9087  if (NIL_P(val)) return 0;
9088  }
9089  return val;
9090 }
9091 
/*
 * Lookup table indexed by byte value: the entry is 1 for bytes 9..13
 * (tab, line feed, vertical tab, form feed, carriage return) and for 32
 * (space), 0 for every other byte.
 */
9092 static const char isspacetable[256] = {
9093  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
9094  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9095  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9096  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9097  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9098  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9099  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9100  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9101  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9102  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9103  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9104  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9105  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9106  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9107  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
9108  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
9109 };
9110 
/* Table-driven whitespace test; the argument is reduced to an unsigned char
 * before it is used as the index into isspacetable. */
9111 #define ascii_isspace(c) isspacetable[(unsigned char)(c)]
9112 
9113 static long
9114 split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
9115 {
9116  if (empty_count >= 0 && len == 0) {
9117  return empty_count + 1;
9118  }
9119  if (empty_count > 0) {
9120  /* make different substrings */
9121  if (result) {
9122  do {
9123  rb_ary_push(result, str_new_empty_String(str));
9124  } while (--empty_count > 0);
9125  }
9126  else {
9127  do {
9128  rb_yield(str_new_empty_String(str));
9129  } while (--empty_count > 0);
9130  }
9131  }
9132  str = rb_str_subseq(str, beg, len);
9133  if (result) {
9134  rb_ary_push(result, str);
9135  }
9136  else {
9137  rb_yield(str);
9138  }
9139  return empty_count;
9140 }
9141 
/*
 * Splitting strategy selected by rb_str_split_m():
 *   SPLIT_TYPE_AWK    - split on runs of whitespace;
 *   SPLIT_TYPE_STRING - split on a literal separator string;
 *   SPLIT_TYPE_REGEXP - split on regexp matches;
 *   SPLIT_TYPE_CHARS  - split into individual characters (empty pattern).
 */
9142 typedef enum {
9143  SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
9144 } split_type_t;
9145 
9146 static split_type_t
9147 literal_split_pattern(VALUE spat, split_type_t default_type)
9148 {
9149  rb_encoding *enc = STR_ENC_GET(spat);
9150  const char *ptr;
9151  long len;
9152  RSTRING_GETMEM(spat, ptr, len);
9153  if (len == 0) {
9154  /* Special case - split into chars */
9155  return SPLIT_TYPE_CHARS;
9156  }
9157  else if (rb_enc_asciicompat(enc)) {
9158  if (len == 1 && ptr[0] == ' ') {
9159  return SPLIT_TYPE_AWK;
9160  }
9161  }
9162  else {
9163  int l;
9164  if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
9165  return SPLIT_TYPE_AWK;
9166  }
9167  }
9168  return default_type;
9169 }
9170 
9171 /*
9172  * call-seq:
9173  * split(field_sep = $;, limit = 0) -> array
9174  * split(field_sep = $;, limit = 0) {|substring| ... } -> self
9175  *
9176  * :include: doc/string/split.rdoc
9177  *
9178  */
9179 
9180 static VALUE
9181 rb_str_split_m(int argc, VALUE *argv, VALUE str)
9182 {
9183  rb_encoding *enc;
9184  VALUE spat;
9185  VALUE limit;
9186  split_type_t split_type;
9187  long beg, end, i = 0, empty_count = -1;
9188  int lim = 0;
9189  VALUE result, tmp;
9190 
9191  result = rb_block_given_p() ? Qfalse : Qnil;
9192  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
9193  lim = NUM2INT(limit);
9194  if (lim <= 0) limit = Qnil;
9195  else if (lim == 1) {
9196  if (RSTRING_LEN(str) == 0)
9197  return result ? rb_ary_new2(0) : str;
9198  tmp = str_duplicate(rb_cString, str);
9199  if (!result) {
9200  rb_yield(tmp);
9201  return str;
9202  }
9203  return rb_ary_new3(1, tmp);
9204  }
9205  i = 1;
9206  }
9207  if (NIL_P(limit) && !lim) empty_count = 0;
9208 
9209  enc = STR_ENC_GET(str);
9210  split_type = SPLIT_TYPE_REGEXP;
9211  if (!NIL_P(spat)) {
9212  spat = get_pat_quoted(spat, 0);
9213  }
9214  else if (NIL_P(spat = rb_fs)) {
9215  split_type = SPLIT_TYPE_AWK;
9216  }
9217  else if (!(spat = rb_fs_check(spat))) {
9218  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
9219  }
9220  else {
9221  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
9222  }
9223  if (split_type != SPLIT_TYPE_AWK) {
9224  switch (BUILTIN_TYPE(spat)) {
9225  case T_REGEXP:
9226  rb_reg_options(spat); /* check if uninitialized */
9227  tmp = RREGEXP_SRC(spat);
9228  split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
9229  if (split_type == SPLIT_TYPE_AWK) {
9230  spat = tmp;
9231  split_type = SPLIT_TYPE_STRING;
9232  }
9233  break;
9234 
9235  case T_STRING:
9236  mustnot_broken(spat);
9237  split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
9238  break;
9239 
9240  default:
9242  }
9243  }
9244 
9245 #define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
9246 
9247  beg = 0;
9248  char *ptr = RSTRING_PTR(str);
9249  char *eptr = RSTRING_END(str);
9250  if (split_type == SPLIT_TYPE_AWK) {
9251  char *bptr = ptr;
9252  int skip = 1;
9253  unsigned int c;
9254 
9255  if (result) result = rb_ary_new();
9256  end = beg;
9257  if (is_ascii_string(str)) {
9258  while (ptr < eptr) {
9259  c = (unsigned char)*ptr++;
9260  if (skip) {
9261  if (ascii_isspace(c)) {
9262  beg = ptr - bptr;
9263  }
9264  else {
9265  end = ptr - bptr;
9266  skip = 0;
9267  if (!NIL_P(limit) && lim <= i) break;
9268  }
9269  }
9270  else if (ascii_isspace(c)) {
9271  SPLIT_STR(beg, end-beg);
9272  skip = 1;
9273  beg = ptr - bptr;
9274  if (!NIL_P(limit)) ++i;
9275  }
9276  else {
9277  end = ptr - bptr;
9278  }
9279  }
9280  }
9281  else {
9282  while (ptr < eptr) {
9283  int n;
9284 
9285  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
9286  ptr += n;
9287  if (skip) {
9288  if (rb_isspace(c)) {
9289  beg = ptr - bptr;
9290  }
9291  else {
9292  end = ptr - bptr;
9293  skip = 0;
9294  if (!NIL_P(limit) && lim <= i) break;
9295  }
9296  }
9297  else if (rb_isspace(c)) {
9298  SPLIT_STR(beg, end-beg);
9299  skip = 1;
9300  beg = ptr - bptr;
9301  if (!NIL_P(limit)) ++i;
9302  }
9303  else {
9304  end = ptr - bptr;
9305  }
9306  }
9307  }
9308  }
9309  else if (split_type == SPLIT_TYPE_STRING) {
9310  char *str_start = ptr;
9311  char *substr_start = ptr;
9312  char *sptr = RSTRING_PTR(spat);
9313  long slen = RSTRING_LEN(spat);
9314 
9315  if (result) result = rb_ary_new();
9316  mustnot_broken(str);
9317  enc = rb_enc_check(str, spat);
9318  while (ptr < eptr &&
9319  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
9320  /* Check we are at the start of a char */
9321  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
9322  if (t != ptr + end) {
9323  ptr = t;
9324  continue;
9325  }
9326  SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
9327  ptr += end + slen;
9328  substr_start = ptr;
9329  if (!NIL_P(limit) && lim <= ++i) break;
9330  }
9331  beg = ptr - str_start;
9332  }
9333  else if (split_type == SPLIT_TYPE_CHARS) {
9334  char *str_start = ptr;
9335  int n;
9336 
9337  if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
9338  mustnot_broken(str);
9339  enc = rb_enc_get(str);
9340  while (ptr < eptr &&
9341  (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
9342  SPLIT_STR(ptr - str_start, n);
9343  ptr += n;
9344  if (!NIL_P(limit) && lim <= ++i) break;
9345  }
9346  beg = ptr - str_start;
9347  }
9348  else {
9349  if (result) result = rb_ary_new();
9350  long len = RSTRING_LEN(str);
9351  long start = beg;
9352  long idx;
9353  int last_null = 0;
9354  struct re_registers *regs;
9355  VALUE match = 0;
9356 
9357  for (; rb_reg_search(spat, str, start, 0) >= 0;
9358  (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
9359  match = rb_backref_get();
9360  if (!result) rb_match_busy(match);
9361  regs = RMATCH_REGS(match);
9362  end = BEG(0);
9363  if (start == end && BEG(0) == END(0)) {
9364  if (!ptr) {
9365  SPLIT_STR(0, 0);
9366  break;
9367  }
9368  else if (last_null == 1) {
9369  SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
9370  beg = start;
9371  }
9372  else {
9373  if (start == len)
9374  start++;
9375  else
9376  start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
9377  last_null = 1;
9378  continue;
9379  }
9380  }
9381  else {
9382  SPLIT_STR(beg, end-beg);
9383  beg = start = END(0);
9384  }
9385  last_null = 0;
9386 
9387  for (idx=1; idx < regs->num_regs; idx++) {
9388  if (BEG(idx) == -1) continue;
9389  SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
9390  }
9391  if (!NIL_P(limit) && lim <= ++i) break;
9392  }
9393  if (match) rb_match_unbusy(match);
9394  }
9395  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
9396  SPLIT_STR(beg, RSTRING_LEN(str)-beg);
9397  }
9398 
9399  return result ? result : str;
9400 }
9401 
9402 VALUE
9403 rb_str_split(VALUE str, const char *sep0)
9404 {
9405  VALUE sep;
9406 
9407  StringValue(str);
9408  sep = rb_str_new_cstr(sep0);
9409  return rb_str_split_m(1, &sep, str);
9410 }
9411 
9412 #define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
9413 
9414 static inline int
9415 enumerator_element(VALUE ary, VALUE e)
9416 {
9417  if (ary) {
9418  rb_ary_push(ary, e);
9419  return 0;
9420  }
9421  else {
9422  rb_yield(e);
9423  return 1;
9424  }
9425 }
9426 
9427 #define ENUM_ELEM(ary, e) enumerator_element(ary, e)
9428 
9429 static const char *
9430 chomp_newline(const char *p, const char *e, rb_encoding *enc)
9431 {
9432  const char *prev = rb_enc_prev_char(p, e, e, enc);
9433  if (rb_enc_is_newline(prev, e, enc)) {
9434  e = prev;
9435  prev = rb_enc_prev_char(p, e, e, enc);
9436  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9437  e = prev;
9438  }
9439  return e;
9440 }
9441 
9442 static VALUE
9443 get_rs(void)
9444 {
9445  VALUE rs = rb_rs;
9446  if (!NIL_P(rs) &&
9447  (!RB_TYPE_P(rs, T_STRING) ||
9448  RSTRING_LEN(rs) != 1 ||
9449  RSTRING_PTR(rs)[0] != '\n')) {
9450  rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9451  }
9452  return rs;
9453 }
9454 
9455 #define rb_rs get_rs()
9456 
/*
 * Shared implementation behind String#each_line and String#lines.
 *
 * argc/argv: an optional record separator followed by an optional keyword
 *            hash, from which only +chomp+ is read.  When no separator
 *            argument is given, rb_rs is used.
 * str:       the receiver.
 * ary:       when non-zero, each line is pushed onto it; when zero, each
 *            line is yielded instead (see ENUM_ELEM).
 *
 * Returns +ary+ when it is non-zero, otherwise the original receiver.
 */
9457 static VALUE
9458 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9459 {
9460  rb_encoding *enc;
9461  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9462  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9463  long pos, len, rslen;
9464  int rsnewline = 0;
9465 
9466  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9467  rs = rb_rs;
9468  if (!NIL_P(opts)) {
9469  static ID keywords[1];
9470  if (!keywords[0]) {
9471  keywords[0] = rb_intern_const("chomp");
9472  }
9473  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
    /* from here on +chomp+ holds a plain C truth value (0 or 1) */
9474  chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9475  }
9476 
    /* nil separator: the whole string is delivered as the only line */
9477  if (NIL_P(rs)) {
9478  if (!ENUM_ELEM(ary, str)) {
9479  return ary;
9480  }
9481  else {
9482  return orig;
9483  }
9484  }
9485 
9486  if (!RSTRING_LEN(str)) goto end;
    /* work on the result of rb_str_new_frozen; +orig+ is what is returned */
9487  str = rb_str_new_frozen(str);
9488  ptr = subptr = RSTRING_PTR(str);
9489  pend = RSTRING_END(str);
9490  len = RSTRING_LEN(str);
9491  StringValue(rs);
9492  rslen = RSTRING_LEN(rs);
9493 
    /* rb_enc_check is only applied when rs is not the default separator object */
9494  if (rs == rb_default_rs)
9495  enc = rb_enc_get(str);
9496  else
9497  enc = rb_enc_check(str, rs);
9498 
9499  if (rslen == 0) {
9500  /* paragraph mode */
9501  int n;
9502  const char *eol = NULL;
9503  subend = subptr;
9504  while (subend < pend) {
9505  long chomp_rslen = 0;
9506  do {
    /* n is the width of a leading '\r' (0 if there is none) */
9507  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9508  n = 0;
9509  rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9510  if (rb_enc_is_newline(subend + n, pend, enc)) {
    /* a second newline directly after the previous one ends the paragraph */
9511  if (eol == subend) break;
9512  subend += rslen;
9513  if (subptr) {
9514  eol = subend;
9515  chomp_rslen = -rslen;
9516  }
9517  }
9518  else {
    /* first non-newline character after a paragraph break starts the next paragraph */
9519  if (!subptr) subptr = subend;
9520  subend += rslen;
9521  }
9522  rslen = 0;
9523  } while (subend < pend);
9524  if (!subptr) break;
9525  if (rslen == 0) chomp_rslen = 0;
9526  line = rb_str_subseq(str, subptr - ptr,
9527  subend - subptr + (chomp ? chomp_rslen : rslen));
    /* ENUM_ELEM returns 1 only when the line was yielded; str_mod_check runs after each yield */
9528  if (ENUM_ELEM(ary, line)) {
9529  str_mod_check(str, ptr, len);
9530  }
9531  subptr = eol = NULL;
9532  }
9533  goto end;
9534  }
9535  else {
9536  rsptr = RSTRING_PTR(rs);
    /* separator is one minimum-length character and that character is a newline */
9537  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9538  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9539  rsnewline = 1;
9540  }
9541  }
9542 
    /* default separator with a non-ASCII-compatible receiver encoding:
     * re-encode a copy of the separator into the receiver's encoding */
9543  if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9544  rs = rb_str_new(rsptr, rslen);
9545  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9546  rsptr = RSTRING_PTR(rs);
9547  rslen = RSTRING_LEN(rs);
9548  }
9549 
9550  while (subptr < pend) {
9551  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9552  if (pos < 0) break;
9553  hit = subptr + pos;
9554  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
    /* the byte match is not at the start of a character: resume from the adjusted position */
9555  if (hit != adjusted) {
9556  subptr = adjusted;
9557  continue;
9558  }
    /* both subend and hit now point just past the separator */
9559  subend = hit += rslen;
9560  if (chomp) {
9561  if (rsnewline) {
9562  subend = chomp_newline(subptr, subend, enc);
9563  }
9564  else {
9565  subend -= rslen;
9566  }
9567  }
9568  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9569  if (ENUM_ELEM(ary, line)) {
9570  str_mod_check(str, ptr, len);
9571  }
9572  subptr = hit;
9573  }
9574 
    /* deliver whatever remains after the last separator found */
9575  if (subptr != pend) {
9576  if (chomp) {
9577  if (rsnewline) {
9578  pend = chomp_newline(subptr, pend, enc);
9579  }
9580  else if (pend - subptr >= rslen &&
9581  memcmp(pend - rslen, rsptr, rslen) == 0) {
9582  pend -= rslen;
9583  }
9584  }
9585  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9586  ENUM_ELEM(ary, line);
9587  RB_GC_GUARD(str);
9588  }
9589 
9590  end:
9591  if (ary)
9592  return ary;
9593  else
9594  return orig;
9595 }
9596 
9597 /*
9598  * call-seq:
9599  * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9600  * each_line(line_sep = $/, chomp: false) -> enumerator
9601  *
9602  * :include: doc/string/each_line.rdoc
9603  *
9604  */
9605 
9606 static VALUE
9607 rb_str_each_line(int argc, VALUE *argv, VALUE str)
9608 {
9609  RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9610  return rb_str_enumerate_lines(argc, argv, str, 0);
9611 }
9612 
9613 /*
9614  * call-seq:
9615  * lines(line_sep = $/, chomp: false) -> array_of_strings
9616  *
9617  * Forms substrings ("lines") of +self+ according to the given arguments
9618  * (see String#each_line for details); returns the lines in an array.
9619  *
9620  */
9621 
9622 static VALUE
9623 rb_str_lines(int argc, VALUE *argv, VALUE str)
9624 {
9625  VALUE ary = WANTARRAY("lines", 0);
9626  return rb_str_enumerate_lines(argc, argv, str, ary);
9627 }
9628 
9629 static VALUE
9630 rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9631 {
9632  return LONG2FIX(RSTRING_LEN(str));
9633 }
9634 
9635 static VALUE
9636 rb_str_enumerate_bytes(VALUE str, VALUE ary)
9637 {
9638  long i;
9639 
9640  for (i=0; i<RSTRING_LEN(str); i++) {
9641  ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9642  }
9643  if (ary)
9644  return ary;
9645  else
9646  return str;
9647 }
9648 
9649 /*
9650  * call-seq:
9651  * each_byte {|byte| ... } -> self
9652  * each_byte -> enumerator
9653  *
9654  * :include: doc/string/each_byte.rdoc
9655  *
9656  */
9657 
9658 static VALUE
9659 rb_str_each_byte(VALUE str)
9660 {
9661  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9662  return rb_str_enumerate_bytes(str, 0);
9663 }
9664 
9665 /*
9666  * call-seq:
9667  * bytes -> array_of_bytes
9668  *
9669  * :include: doc/string/bytes.rdoc
9670  *
9671  */
9672 
9673 static VALUE
9674 rb_str_bytes(VALUE str)
9675 {
9676  VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9677  return rb_str_enumerate_bytes(str, ary);
9678 }
9679 
9680 static VALUE
9681 rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9682 {
9683  return rb_str_length(str);
9684 }
9685 
9686 static VALUE
9687 rb_str_enumerate_chars(VALUE str, VALUE ary)
9688 {
9689  VALUE orig = str;
9690  long i, len, n;
9691  const char *ptr;
9692  rb_encoding *enc;
9693 
9694  str = rb_str_new_frozen(str);
9695  ptr = RSTRING_PTR(str);
9696  len = RSTRING_LEN(str);
9697  enc = rb_enc_get(str);
9698 
9700  for (i = 0; i < len; i += n) {
9701  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9702  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9703  }
9704  }
9705  else {
9706  for (i = 0; i < len; i += n) {
9707  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9708  ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9709  }
9710  }
9711  RB_GC_GUARD(str);
9712  if (ary)
9713  return ary;
9714  else
9715  return orig;
9716 }
9717 
9718 /*
9719  * call-seq:
9720  * each_char {|c| ... } -> self
9721  * each_char -> enumerator
9722  *
9723  * :include: doc/string/each_char.rdoc
9724  *
9725  */
9726 
9727 static VALUE
9728 rb_str_each_char(VALUE str)
9729 {
9730  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9731  return rb_str_enumerate_chars(str, 0);
9732 }
9733 
9734 /*
9735  * call-seq:
9736  * chars -> array_of_characters
9737  *
9738  * :include: doc/string/chars.rdoc
9739  *
9740  */
9741 
9742 static VALUE
9743 rb_str_chars(VALUE str)
9744 {
9745  VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9746  return rb_str_enumerate_chars(str, ary);
9747 }
9748 
9749 static VALUE
9750 rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9751 {
9752  VALUE orig = str;
9753  int n;
9754  unsigned int c;
9755  const char *ptr, *end;
9756  rb_encoding *enc;
9757 
9758  if (single_byte_optimizable(str))
9759  return rb_str_enumerate_bytes(str, ary);
9760 
9761  str = rb_str_new_frozen(str);
9762  ptr = RSTRING_PTR(str);
9763  end = RSTRING_END(str);
9764  enc = STR_ENC_GET(str);
9765 
9766  while (ptr < end) {
9767  c = rb_enc_codepoint_len(ptr, end, &n, enc);
9768  ENUM_ELEM(ary, UINT2NUM(c));
9769  ptr += n;
9770  }
9771  RB_GC_GUARD(str);
9772  if (ary)
9773  return ary;
9774  else
9775  return orig;
9776 }
9777 
9778 /*
9779  * call-seq:
9780  * each_codepoint {|integer| ... } -> self
9781  * each_codepoint -> enumerator
9782  *
9783  * :include: doc/string/each_codepoint.rdoc
9784  *
9785  */
9786 
9787 static VALUE
9788 rb_str_each_codepoint(VALUE str)
9789 {
9790  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9791  return rb_str_enumerate_codepoints(str, 0);
9792 }
9793 
9794 /*
9795  * call-seq:
9796  * codepoints -> array_of_integers
9797  *
9798  * :include: doc/string/codepoints.rdoc
9799  *
9800  */
9801 
9802 static VALUE
9803 rb_str_codepoints(VALUE str)
9804 {
9805  VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9806  return rb_str_enumerate_codepoints(str, ary);
9807 }
9808 
9809 static regex_t *
9810 get_reg_grapheme_cluster(rb_encoding *enc)
9811 {
9812  int encidx = rb_enc_to_index(enc);
9813 
9814  const OnigUChar source_ascii[] = "\\X";
9815  const OnigUChar *source = source_ascii;
9816  size_t source_len = sizeof(source_ascii) - 1;
9817 
9818  switch (encidx) {
9819 #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9820 #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9821 #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9822 #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9823 #define CASE_UTF(e) \
9824  case ENCINDEX_UTF_##e: { \
9825  static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9826  source = source_UTF_##e; \
9827  source_len = sizeof(source_UTF_##e); \
9828  break; \
9829  }
9830  CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9831 #undef CASE_UTF
9832 #undef CHARS_16BE
9833 #undef CHARS_16LE
9834 #undef CHARS_32BE
9835 #undef CHARS_32LE
9836  }
9837 
9838  regex_t *reg_grapheme_cluster;
9839  OnigErrorInfo einfo;
9840  int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9841  ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9842  if (r) {
9843  UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9844  onig_error_code_to_str(message, r, &einfo);
9845  rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9846  }
9847 
9848  return reg_grapheme_cluster;
9849 }
9850 
9851 static regex_t *
9852 get_cached_reg_grapheme_cluster(rb_encoding *enc)
9853 {
9854  int encidx = rb_enc_to_index(enc);
9855  static regex_t *reg_grapheme_cluster_utf8 = NULL;
9856 
9857  if (encidx == rb_utf8_encindex()) {
9858  if (!reg_grapheme_cluster_utf8) {
9859  reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9860  }
9861 
9862  return reg_grapheme_cluster_utf8;
9863  }
9864 
9865  return NULL;
9866 }
9867 
9868 static VALUE
9869 rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9870 {
9871  size_t grapheme_cluster_count = 0;
9872  rb_encoding *enc = get_encoding(str);
9873  const char *ptr, *end;
9874 
9875  if (!rb_enc_unicode_p(enc)) {
9876  return rb_str_length(str);
9877  }
9878 
9879  bool cached_reg_grapheme_cluster = true;
9880  regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9881  if (!reg_grapheme_cluster) {
9882  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9883  cached_reg_grapheme_cluster = false;
9884  }
9885 
9886  ptr = RSTRING_PTR(str);
9887  end = RSTRING_END(str);
9888 
9889  while (ptr < end) {
9890  OnigPosition len = onig_match(reg_grapheme_cluster,
9891  (const OnigUChar *)ptr, (const OnigUChar *)end,
9892  (const OnigUChar *)ptr, NULL, 0);
9893  if (len <= 0) break;
9894  grapheme_cluster_count++;
9895  ptr += len;
9896  }
9897 
9898  if (!cached_reg_grapheme_cluster) {
9899  onig_free(reg_grapheme_cluster);
9900  }
9901 
9902  return SIZET2NUM(grapheme_cluster_count);
9903 }
9904 
9905 static VALUE
9906 rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9907 {
9908  VALUE orig = str;
9909  rb_encoding *enc = get_encoding(str);
9910  const char *ptr0, *ptr, *end;
9911 
9912  if (!rb_enc_unicode_p(enc)) {
9913  return rb_str_enumerate_chars(str, ary);
9914  }
9915 
9916  if (!ary) str = rb_str_new_frozen(str);
9917 
9918  bool cached_reg_grapheme_cluster = true;
9919  regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9920  if (!reg_grapheme_cluster) {
9921  reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9922  cached_reg_grapheme_cluster = false;
9923  }
9924 
9925  ptr0 = ptr = RSTRING_PTR(str);
9926  end = RSTRING_END(str);
9927 
9928  while (ptr < end) {
9929  OnigPosition len = onig_match(reg_grapheme_cluster,
9930  (const OnigUChar *)ptr, (const OnigUChar *)end,
9931  (const OnigUChar *)ptr, NULL, 0);
9932  if (len <= 0) break;
9933  ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9934  ptr += len;
9935  }
9936 
9937  if (!cached_reg_grapheme_cluster) {
9938  onig_free(reg_grapheme_cluster);
9939  }
9940 
9941  RB_GC_GUARD(str);
9942  if (ary)
9943  return ary;
9944  else
9945  return orig;
9946 }
9947 
9948 /*
9949  * call-seq:
9950  * each_grapheme_cluster {|gc| ... } -> self
9951  * each_grapheme_cluster -> enumerator
9952  *
9953  * :include: doc/string/each_grapheme_cluster.rdoc
9954  *
9955  */
9956 
9957 static VALUE
9958 rb_str_each_grapheme_cluster(VALUE str)
9959 {
9960  RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9961  return rb_str_enumerate_grapheme_clusters(str, 0);
9962 }
9963 
9964 /*
9965  * call-seq:
9966  * grapheme_clusters -> array_of_grapheme_clusters
9967  *
9968  * :include: doc/string/grapheme_clusters.rdoc
9969  *
9970  */
9971 
9972 static VALUE
9973 rb_str_grapheme_clusters(VALUE str)
9974 {
9975  VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9976  return rb_str_enumerate_grapheme_clusters(str, ary);
9977 }
9978 
9979 static long
9980 chopped_length(VALUE str)
9981 {
9982  rb_encoding *enc = STR_ENC_GET(str);
9983  const char *p, *p2, *beg, *end;
9984 
9985  beg = RSTRING_PTR(str);
9986  end = beg + RSTRING_LEN(str);
9987  if (beg >= end) return 0;
9988  p = rb_enc_prev_char(beg, end, end, enc);
9989  if (!p) return 0;
9990  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9991  p2 = rb_enc_prev_char(beg, p, end, enc);
9992  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9993  }
9994  return p - beg;
9995 }
9996 
9997 /*
9998  * call-seq:
9999  * chop! -> self or nil
10000  *
10001  * Like String#chop, but modifies +self+ in place;
10002  * returns +nil+ if +self+ is empty, +self+ otherwise.
10003  *
10004  * Related: String#chomp!.
10005  */
10006 
10007 static VALUE
10008 rb_str_chop_bang(VALUE str)
10009 {
10010  str_modify_keep_cr(str);
10011  if (RSTRING_LEN(str) > 0) {
10012  long len;
10013  len = chopped_length(str);
10014  STR_SET_LEN(str, len);
10015  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10016  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10017  ENC_CODERANGE_CLEAR(str);
10018  }
10019  return str;
10020  }
10021  return Qnil;
10022 }
10023 
10024 
10025 /*
10026  * call-seq:
10027  * chop -> new_string
10028  *
10029  * :include: doc/string/chop.rdoc
10030  *
10031  */
10032 
10033 static VALUE
10034 rb_str_chop(VALUE str)
10035 {
10036  return rb_str_subseq(str, 0, chopped_length(str));
10037 }
10038 
10039 static long
10040 smart_chomp(VALUE str, const char *e, const char *p)
10041 {
10042  rb_encoding *enc = rb_enc_get(str);
10043  if (rb_enc_mbminlen(enc) > 1) {
10044  const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10045  if (rb_enc_is_newline(pp, e, enc)) {
10046  e = pp;
10047  }
10048  pp = e - rb_enc_mbminlen(enc);
10049  if (pp >= p) {
10050  pp = rb_enc_left_char_head(p, pp, e, enc);
10051  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10052  e = pp;
10053  }
10054  }
10055  }
10056  else {
10057  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
10058  case '\n':
10059  if (--e > p && *(e-1) == '\r') {
10060  --e;
10061  }
10062  break;
10063  case '\r':
10064  --e;
10065  break;
10066  }
10067  }
10068  return e - p;
10069 }
10070 
/*
 * Returns the byte length +str+ would have after removing a trailing
 * record separator +rs+, or the current length when nothing is removed.
 * Only reads from +str+; no write to it appears in this function.
 *
 * - empty +str+: 0.
 * - +rs+ is rb_default_rs: delegated to smart_chomp.
 * - +rs+ is empty: every trailing "\n" (each optionally preceded by "\r")
 *   is dropped.
 * - +rs+ is longer than +str+: nothing is removed.
 * - +rs+ is a single newline character: delegated to smart_chomp.
 * - otherwise: +rs+ is removed only when it is not a broken string, +str+
 *   ends with exactly those bytes, and they start on a character boundary.
 */
10071 static long
10072 chompped_length(VALUE str, VALUE rs)
10073 {
10074  rb_encoding *enc;
10075  int newline;
10076  char *pp, *e, *rsptr;
10077  long rslen;
10078  char *const p = RSTRING_PTR(str);
10079  long len = RSTRING_LEN(str);
10080 
10081  if (len == 0) return 0;
10082  e = p + len;
10083  if (rs == rb_default_rs) {
10084  return smart_chomp(str, e, p);
10085  }
10086 
10087  enc = rb_enc_get(str);
10088  RSTRING_GETMEM(rs, rsptr, rslen);
    /* empty separator: strip all trailing newlines */
10089  if (rslen == 0) {
10090  if (rb_enc_mbminlen(enc) > 1) {
10091  while (e > p) {
10092  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
10093  if (!rb_enc_is_newline(pp, e, enc)) break;
10094  e = pp;
    /* step back one minimum-length unit and test for a preceding '\r' */
10095  pp -= rb_enc_mbminlen(enc);
10096  if (pp >= p) {
10097  pp = rb_enc_left_char_head(p, pp, e, enc);
10098  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
10099  e = pp;
10100  }
10101  }
10102  }
10103  }
10104  else {
10105  while (e > p && *(e-1) == '\n') {
10106  --e;
10107  if (e > p && *(e-1) == '\r')
10108  --e;
10109  }
10110  }
10111  return e - p;
10112  }
10113  if (rslen > len) return len;
10114 
    /* from here +enc+ is the separator's encoding, until rb_enc_check below */
10115  enc = rb_enc_get(rs);
    /* last byte of the separator; compared against the last byte of str further down */
10116  newline = rsptr[rslen-1];
    /* separator is exactly one minimum-length character */
10117  if (rslen == rb_enc_mbminlen(enc)) {
10118  if (rslen == 1) {
10119  if (newline == '\n')
10120  return smart_chomp(str, e, p);
10121  }
10122  else {
10123  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
10124  return smart_chomp(str, e, p);
10125  }
10126  }
10127 
10128  enc = rb_enc_check(str, rs);
10129  if (is_broken_string(rs)) {
10130  return len;
10131  }
10132  pp = e - rslen;
    /* cheap last-byte test first; memcmp only for separators longer than one byte */
10133  if (p[len-1] == newline &&
10134  (rslen <= 1 ||
10135  memcmp(rsptr, pp, rslen) == 0)) {
10136  if (at_char_boundary(p, pp, e, enc))
10137  return len - rslen;
    /* reached only when the bytes matched but pp is not at a character boundary */
10138  RB_GC_GUARD(rs);
10139  }
10140  return len;
10141 }
10142 
10148 static VALUE
10149 chomp_rs(int argc, const VALUE *argv)
10150 {
10151  rb_check_arity(argc, 0, 1);
10152  if (argc > 0) {
10153  VALUE rs = argv[0];
10154  if (!NIL_P(rs)) StringValue(rs);
10155  return rs;
10156  }
10157  else {
10158  return rb_rs;
10159  }
10160 }
10161 
10162 VALUE
10163 rb_str_chomp_string(VALUE str, VALUE rs)
10164 {
10165  long olen = RSTRING_LEN(str);
10166  long len = chompped_length(str, rs);
10167  if (len >= olen) return Qnil;
10168  str_modify_keep_cr(str);
10169  STR_SET_LEN(str, len);
10170  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10171  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10172  ENC_CODERANGE_CLEAR(str);
10173  }
10174  return str;
10175 }
10176 
10177 /*
10178  * call-seq:
10179  * chomp!(line_sep = $/) -> self or nil
10180  *
10181  * Like String#chomp, but modifies +self+ in place;
10182  * returns +nil+ if no modification made, +self+ otherwise.
10183  *
10184  */
10185 
10186 static VALUE
10187 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
10188 {
10189  VALUE rs;
10190  str_modifiable(str);
10191  if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
10192  rs = chomp_rs(argc, argv);
10193  if (NIL_P(rs)) return Qnil;
10194  return rb_str_chomp_string(str, rs);
10195 }
10196 
10197 
10198 /*
10199  * call-seq:
10200  * chomp(line_sep = $/) -> new_string
10201  *
10202  * :include: doc/string/chomp.rdoc
10203  *
10204  */
10205 
10206 static VALUE
10207 rb_str_chomp(int argc, VALUE *argv, VALUE str)
10208 {
10209  VALUE rs = chomp_rs(argc, argv);
10210  if (NIL_P(rs)) return str_duplicate(rb_cString, str);
10211  return rb_str_subseq(str, 0, chompped_length(str, rs));
10212 }
10213 
10214 static long
10215 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10216 {
10217  const char *const start = s;
10218 
10219  if (!s || s >= e) return 0;
10220 
10221  /* remove spaces at head */
10222  if (single_byte_optimizable(str)) {
10223  while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
10224  }
10225  else {
10226  while (s < e) {
10227  int n;
10228  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
10229 
10230  if (cc && !rb_isspace(cc)) break;
10231  s += n;
10232  }
10233  }
10234  return s - start;
10235 }
10236 
10237 /*
10238  * call-seq:
10239  * lstrip! -> self or nil
10240  *
10241  * Like String#lstrip, except that any modifications are made in +self+;
10242  * returns +self+ if any modifications are made, +nil+ otherwise.
10243  *
10244  * Related: String#rstrip!, String#strip!.
10245  */
10246 
10247 static VALUE
10248 rb_str_lstrip_bang(VALUE str)
10249 {
10250  rb_encoding *enc;
10251  char *start, *s;
10252  long olen, loffset;
10253 
10254  str_modify_keep_cr(str);
10255  enc = STR_ENC_GET(str);
10256  RSTRING_GETMEM(str, start, olen);
10257  loffset = lstrip_offset(str, start, start+olen, enc);
10258  if (loffset > 0) {
10259  long len = olen-loffset;
10260  s = start + loffset;
10261  memmove(start, s, len);
10262  STR_SET_LEN(str, len);
10263  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10264  return str;
10265  }
10266  return Qnil;
10267 }
10268 
10269 
10270 /*
10271  * call-seq:
10272  * lstrip -> new_string
10273  *
10274  * Returns a copy of +self+ with leading whitespace removed;
10275  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10276  *
10277  * whitespace = "\x00\t\n\v\f\r "
10278  * s = whitespace + 'abc' + whitespace
10279  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10280  * s.lstrip # => "abc\u0000\t\n\v\f\r "
10281  *
10282  * Related: String#rstrip, String#strip.
10283  */
10284 
10285 static VALUE
10286 rb_str_lstrip(VALUE str)
10287 {
10288  char *start;
10289  long len, loffset;
10290  RSTRING_GETMEM(str, start, len);
10291  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
10292  if (loffset <= 0) return str_duplicate(rb_cString, str);
10293  return rb_str_subseq(str, loffset, len - loffset);
10294 }
10295 
10296 static long
10297 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
10298 {
10299  const char *t;
10300 
10301  rb_str_check_dummy_enc(enc);
10303  rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
10304  }
10305  if (!s || s >= e) return 0;
10306  t = e;
10307 
10308  /* remove trailing spaces or '\0's */
10309  if (single_byte_optimizable(str)) {
10310  unsigned char c;
10311  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
10312  }
10313  else {
10314  char *tp;
10315 
10316  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
10317  unsigned int c = rb_enc_codepoint(tp, e, enc);
10318  if (c && !rb_isspace(c)) break;
10319  t = tp;
10320  }
10321  }
10322  return e - t;
10323 }
10324 
10325 /*
10326  * call-seq:
10327  * rstrip! -> self or nil
10328  *
10329  * Like String#rstrip, except that any modifications are made in +self+;
10329  * returns +self+ if any modifications are made, +nil+ otherwise.
10331  *
10332  * Related: String#lstrip!, String#strip!.
10333  */
10334 
10335 static VALUE
10336 rb_str_rstrip_bang(VALUE str)
10337 {
10338  rb_encoding *enc;
10339  char *start;
10340  long olen, roffset;
10341 
10342  str_modify_keep_cr(str);
10343  enc = STR_ENC_GET(str);
10344  RSTRING_GETMEM(str, start, olen);
10345  roffset = rstrip_offset(str, start, start+olen, enc);
10346  if (roffset > 0) {
10347  long len = olen - roffset;
10348 
10349  STR_SET_LEN(str, len);
10350  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10351  return str;
10352  }
10353  return Qnil;
10354 }
10355 
10356 
10357 /*
10358  * call-seq:
10359  * rstrip -> new_string
10360  *
10361  * Returns a copy of the receiver with trailing whitespace removed;
10362  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10363  *
10364  * whitespace = "\x00\t\n\v\f\r "
10365  * s = whitespace + 'abc' + whitespace
10366  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10367  * s.rstrip # => "\u0000\t\n\v\f\r abc"
10368  *
10369  * Related: String#lstrip, String#strip.
10370  */
10371 
10372 static VALUE
10373 rb_str_rstrip(VALUE str)
10374 {
10375  rb_encoding *enc;
10376  char *start;
10377  long olen, roffset;
10378 
10379  enc = STR_ENC_GET(str);
10380  RSTRING_GETMEM(str, start, olen);
10381  roffset = rstrip_offset(str, start, start+olen, enc);
10382 
10383  if (roffset <= 0) return str_duplicate(rb_cString, str);
10384  return rb_str_subseq(str, 0, olen-roffset);
10385 }
10386 
10387 
10388 /*
10389  * call-seq:
10390  * strip! -> self or nil
10391  *
10392  * Like String#strip, except that any modifications are made in +self+;
10393  * returns +self+ if any modifications are made, +nil+ otherwise.
10394  *
10395  * Related: String#lstrip!, String#rstrip!.
10396  */
10397 
10398 static VALUE
10399 rb_str_strip_bang(VALUE str)
10400 {
10401  char *start;
10402  long olen, loffset, roffset;
10403  rb_encoding *enc;
10404 
10405  str_modify_keep_cr(str);
10406  enc = STR_ENC_GET(str);
10407  RSTRING_GETMEM(str, start, olen);
10408  loffset = lstrip_offset(str, start, start+olen, enc);
10409  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10410 
10411  if (loffset > 0 || roffset > 0) {
10412  long len = olen-roffset;
10413  if (loffset > 0) {
10414  len -= loffset;
10415  memmove(start, start + loffset, len);
10416  }
10417  STR_SET_LEN(str, len);
10418  TERM_FILL(start+len, rb_enc_mbminlen(enc));
10419  return str;
10420  }
10421  return Qnil;
10422 }
10423 
10424 
10425 /*
10426  * call-seq:
10427  * strip -> new_string
10428  *
10429  * Returns a copy of the receiver with leading and trailing whitespace removed;
10430  * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10431  *
10432  * whitespace = "\x00\t\n\v\f\r "
10433  * s = whitespace + 'abc' + whitespace
10434  * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10435  * s.strip # => "abc"
10436  *
10437  * Related: String#lstrip, String#rstrip.
10438  */
10439 
10440 static VALUE
10441 rb_str_strip(VALUE str)
10442 {
10443  char *start;
10444  long olen, loffset, roffset;
10445  rb_encoding *enc = STR_ENC_GET(str);
10446 
10447  RSTRING_GETMEM(str, start, olen);
10448  loffset = lstrip_offset(str, start, start+olen, enc);
10449  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10450 
10451  if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10452  return rb_str_subseq(str, loffset, olen-loffset-roffset);
10453 }
10454 
10455 static VALUE
10456 scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10457 {
10458  VALUE result = Qnil;
10459  long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10460  if (pos >= 0) {
10461  VALUE match;
10462  struct re_registers *regs;
10463  if (BUILTIN_TYPE(pat) == T_STRING) {
10464  regs = NULL;
10465  end = pos + RSTRING_LEN(pat);
10466  }
10467  else {
10468  match = rb_backref_get();
10469  regs = RMATCH_REGS(match);
10470  pos = BEG(0);
10471  end = END(0);
10472  }
10473 
10474  if (pos == end) {
10475  rb_encoding *enc = STR_ENC_GET(str);
10476  /*
10477  * Always consume at least one character of the input string
10478  */
10479  if (RSTRING_LEN(str) > end)
10480  *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10481  RSTRING_END(str), enc);
10482  else
10483  *start = end + 1;
10484  }
10485  else {
10486  *start = end;
10487  }
10488 
10489  if (!regs || regs->num_regs == 1) {
10490  result = rb_str_subseq(str, pos, end - pos);
10491  return result;
10492  }
10493  else {
10494  result = rb_ary_new2(regs->num_regs);
10495  for (int i = 1; i < regs->num_regs; i++) {
10496  VALUE s = Qnil;
10497  if (BEG(i) >= 0) {
10498  s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10499  }
10500 
10501  rb_ary_push(result, s);
10502  }
10503  }
10504 
10505  RB_GC_GUARD(match);
10506  }
10507 
10508  return result;
10509 }
10510 
10511 
10512 /*
10513  * call-seq:
10514  * scan(string_or_regexp) -> array
10515  * scan(string_or_regexp) {|matches| ... } -> self
10516  *
10517  * Matches a pattern against +self+; the pattern is:
10518  *
10519  * - +string_or_regexp+ itself, if it is a Regexp.
10520  * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10521  *
10522  * Iterates through +self+, generating a collection of matching results:
10523  *
10524  * - If the pattern contains no groups, each result is the
10525  * matched string, <code>$&</code>.
10526  * - If the pattern contains groups, each result is an array
10527  * containing one entry per group.
10528  *
10529  * With no block given, returns an array of the results:
10530  *
10531  * s = 'cruel world'
10532  * s.scan(/\w+/) # => ["cruel", "world"]
10533  * s.scan(/.../) # => ["cru", "el ", "wor"]
10534  * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10535  * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10536  *
10537  * With a block given, calls the block with each result; returns +self+:
10538  *
10539  * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10540  * print "\n"
10541  * s.scan(/(.)(.)/) {|x,y| print y, x }
10542  * print "\n"
10543  *
10544  * Output:
10545  *
10546  * <<cruel>> <<world>>
10547  * rceu lowlr
10548  *
10549  */
10550 
10551 static VALUE
10552 rb_str_scan(VALUE str, VALUE pat)
10553 {
10554  VALUE result;
10555  long start = 0;
10556  long last = -1, prev = 0;
10557  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10558 
10559  pat = get_pat_quoted(pat, 1);
10560  mustnot_broken(str);
10561  if (!rb_block_given_p()) {
10562  VALUE ary = rb_ary_new();
10563 
10564  while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10565  last = prev;
10566  prev = start;
10567  rb_ary_push(ary, result);
10568  }
10569  if (last >= 0) rb_pat_search(pat, str, last, 1);
10570  else rb_backref_set(Qnil);
10571  return ary;
10572  }
10573 
10574  while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10575  last = prev;
10576  prev = start;
10577  rb_yield(result);
10578  str_mod_check(str, p, len);
10579  }
10580  if (last >= 0) rb_pat_search(pat, str, last, 1);
10581  return str;
10582 }
10583 
10584 
10585 /*
10586  * call-seq:
10587  * hex -> integer
10588  *
10589  * Interprets the leading substring of +self+ as a string of hexadecimal digits
10590  * (with an optional sign and an optional <code>0x</code>) and returns the
10591  * corresponding number;
10592  * returns zero if there is no such leading substring:
10593  *
10594  * '0x0a'.hex # => 10
10595  * '-1234'.hex # => -4660
10596  * '0'.hex # => 0
10597  * 'non-numeric'.hex # => 0
10598  *
10599  * Related: String#oct.
10600  *
10601  */
10602 
10603 static VALUE
10604 rb_str_hex(VALUE str)
10605 {
10606  return rb_str_to_inum(str, 16, FALSE);
10607 }
10608 
10609 
10610 /*
10611  * call-seq:
10612  * oct -> integer
10613  *
10614  * Interprets the leading substring of +self+ as a string of octal digits
10615  * (with an optional sign) and returns the corresponding number;
10616  * returns zero if there is no such leading substring:
10617  *
10618  * '123'.oct # => 83
10619  * '-377'.oct # => -255
10620  * '0377non-numeric'.oct # => 255
10621  * 'non-numeric'.oct # => 0
10622  *
10623  * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10624  * see Kernel#Integer.
10625  *
10626  * Related: String#hex.
10627  *
10628  */
10629 
10630 static VALUE
10631 rb_str_oct(VALUE str)
10632 {
10633  return rb_str_to_inum(str, -8, FALSE);
10634 }
10635 
10636 #ifndef HAVE_CRYPT_R
10637 # include "ruby/thread_native.h"
10638 # include "ruby/atomic.h"
10639 
10640 static struct {
10641  rb_nativethread_lock_t lock;
10642 } crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10643 
10644 static void
10645 crypt_mutex_initialize(void)
10646 {
10647 }
10648 #endif
10649 
10650 /*
10651  * call-seq:
10652  * crypt(salt_str) -> new_string
10653  *
10654  * Returns the string generated by calling <code>crypt(3)</code>
10655  * standard library function with <code>str</code> and
10656  * <code>salt_str</code>, in this order, as its arguments. Please do
10657  * not use this method any longer. It is legacy; provided only for
10658  * backward compatibility with ruby scripts in earlier days. It is
10659  * bad to use in contemporary programs for several reasons:
10660  *
10661  * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10662  * run. The generated string lacks data portability.
10663  *
10664  * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10665  * (i.e. silently ends up in unexpected results).
10666  *
10667  * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10668  * thread safe.
10669  *
10670  * * So-called "traditional" usage of <code>crypt(3)</code> is very
10671  * very very weak. According to its manpage, Linux's traditional
10672  * <code>crypt(3)</code> output has only 2**56 variations; too
10673  * easy to brute force today. And this is the default behaviour.
10674  *
10675  * * In order to make things robust some OSes implement so-called
10676  * "modular" usage. To go through, you have to do a complex
10677  * build-up of the <code>salt_str</code> parameter, by hand.
10678  * Failure in generation of a proper salt string tends not to
10679  * yield any errors; typos in parameters are normally not
10680  * detectable.
10681  *
10682  * * For instance, in the following example, the second invocation
10683  * of String#crypt is wrong; it has a typo in "round=" (lacks
10684  * "s"). However the call does not fail and something unexpected
10685  * is generated.
10686  *
10687  * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10688  * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10689  *
10690  * * Even in the "modular" mode, some hash functions are considered
10691  * archaic and no longer recommended at all; for instance module
10692  * <code>$1$</code> is officially abandoned by its author: see
10693  * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10694  * instance module <code>$3$</code> is considered completely
10695  * broken: see the manpage of FreeBSD.
10696  *
10697  * * On some OS such as Mac OS, there is no modular mode. Yet, as
10698  * written above, <code>crypt(3)</code> on Mac OS never fails.
10699  * This means even if you build up a proper salt string it
10700  * generates a traditional DES hash anyways, and there is no way
10701  * for you to be aware of.
10702  *
10703  * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10704  *
10705  * If for some reason you cannot migrate to other secure contemporary
10706  * password hashing algorithms, install the string-crypt gem and
10707  * <code>require 'string/crypt'</code> to continue using it.
10708  */
10709 
10710 static VALUE
10711 rb_str_crypt(VALUE str, VALUE salt)
10712 {
10713 #ifdef HAVE_CRYPT_R
10714  VALUE databuf;
10715  struct crypt_data *data;
10716 # define CRYPT_END() ALLOCV_END(databuf)
10717 #else
10718  extern char *crypt(const char *, const char *);
10719 # define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10720 #endif
10721  VALUE result;
10722  const char *s, *saltp;
10723  char *res;
10724 #ifdef BROKEN_CRYPT
10725  char salt_8bit_clean[3];
10726 #endif
10727 
10728  StringValue(salt);
10729  mustnot_wchar(str);
10730  mustnot_wchar(salt);
10731  s = StringValueCStr(str);
10732  saltp = RSTRING_PTR(salt);
10733  if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10734  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10735  }
10736 
10737 #ifdef BROKEN_CRYPT
10738  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10739  salt_8bit_clean[0] = saltp[0] & 0x7f;
10740  salt_8bit_clean[1] = saltp[1] & 0x7f;
10741  salt_8bit_clean[2] = '\0';
10742  saltp = salt_8bit_clean;
10743  }
10744 #endif
10745 #ifdef HAVE_CRYPT_R
10746  data = ALLOCV(databuf, sizeof(struct crypt_data));
10747 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10748  data->initialized = 0;
10749 # endif
10750  res = crypt_r(s, saltp, data);
10751 #else
10752  crypt_mutex_initialize();
10753  rb_nativethread_lock_lock(&crypt_mutex.lock);
10754  res = crypt(s, saltp);
10755 #endif
10756  if (!res) {
10757  int err = errno;
10758  CRYPT_END();
10759  rb_syserr_fail(err, "crypt");
10760  }
10761  result = rb_str_new_cstr(res);
10762  CRYPT_END();
10763  return result;
10764 }
10765 
10766 
10767 /*
10768  * call-seq:
10769  * ord -> integer
10770  *
10771  * :include: doc/string/ord.rdoc
10772  *
10773  */
10774 
10775 static VALUE
10776 rb_str_ord(VALUE s)
10777 {
10778  unsigned int c;
10779 
10780  c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10781  return UINT2NUM(c);
10782 }
10783 /*
10784  * call-seq:
10785  * sum(n = 16) -> integer
10786  *
10787  * :include: doc/string/sum.rdoc
10788  *
10789  */
10790 
10791 static VALUE
10792 rb_str_sum(int argc, VALUE *argv, VALUE str)
10793 {
10794  int bits = 16;
10795  char *ptr, *p, *pend;
10796  long len;
10797  VALUE sum = INT2FIX(0);
10798  unsigned long sum0 = 0;
10799 
10800  if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10801  bits = 0;
10802  }
10803  ptr = p = RSTRING_PTR(str);
10804  len = RSTRING_LEN(str);
10805  pend = p + len;
10806 
10807  while (p < pend) {
10808  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10809  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10810  str_mod_check(str, ptr, len);
10811  sum0 = 0;
10812  }
10813  sum0 += (unsigned char)*p;
10814  p++;
10815  }
10816 
10817  if (bits == 0) {
10818  if (sum0) {
10819  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10820  }
10821  }
10822  else {
10823  if (sum == INT2FIX(0)) {
10824  if (bits < (int)sizeof(long)*CHAR_BIT) {
10825  sum0 &= (((unsigned long)1)<<bits)-1;
10826  }
10827  sum = LONG2FIX(sum0);
10828  }
10829  else {
10830  VALUE mod;
10831 
10832  if (sum0) {
10833  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10834  }
10835 
10836  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10837  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10838  sum = rb_funcall(sum, '&', 1, mod);
10839  }
10840  }
10841  return sum;
10842 }
10843 
10844 static VALUE
10845 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10846 {
10847  rb_encoding *enc;
10848  VALUE w;
10849  long width, len, flen = 1, fclen = 1;
10850  VALUE res;
10851  char *p;
10852  const char *f = " ";
10853  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10854  VALUE pad;
10855  int singlebyte = 1, cr;
10856  int termlen;
10857 
10858  rb_scan_args(argc, argv, "11", &w, &pad);
10859  enc = STR_ENC_GET(str);
10860  termlen = rb_enc_mbminlen(enc);
10861  width = NUM2LONG(w);
10862  if (argc == 2) {
10863  StringValue(pad);
10864  enc = rb_enc_check(str, pad);
10865  f = RSTRING_PTR(pad);
10866  flen = RSTRING_LEN(pad);
10867  fclen = str_strlen(pad, enc); /* rb_enc_check */
10868  singlebyte = single_byte_optimizable(pad);
10869  if (flen == 0 || fclen == 0) {
10870  rb_raise(rb_eArgError, "zero width padding");
10871  }
10872  }
10873  len = str_strlen(str, enc); /* rb_enc_check */
10874  if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10875  n = width - len;
10876  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10877  rlen = n - llen;
10878  cr = ENC_CODERANGE(str);
10879  if (flen > 1) {
10880  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10881  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10882  }
10883  size = RSTRING_LEN(str);
10884  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10885  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10886  (len += llen2 + rlen2) >= LONG_MAX - size) {
10887  rb_raise(rb_eArgError, "argument too big");
10888  }
10889  len += size;
10890  res = str_enc_new(rb_cString, 0, len, enc);
10891  p = RSTRING_PTR(res);
10892  if (flen <= 1) {
10893  memset(p, *f, llen);
10894  p += llen;
10895  }
10896  else {
10897  while (llen >= fclen) {
10898  memcpy(p,f,flen);
10899  p += flen;
10900  llen -= fclen;
10901  }
10902  if (llen > 0) {
10903  memcpy(p, f, llen2);
10904  p += llen2;
10905  }
10906  }
10907  memcpy(p, RSTRING_PTR(str), size);
10908  p += size;
10909  if (flen <= 1) {
10910  memset(p, *f, rlen);
10911  p += rlen;
10912  }
10913  else {
10914  while (rlen >= fclen) {
10915  memcpy(p,f,flen);
10916  p += flen;
10917  rlen -= fclen;
10918  }
10919  if (rlen > 0) {
10920  memcpy(p, f, rlen2);
10921  p += rlen2;
10922  }
10923  }
10924  TERM_FILL(p, termlen);
10925  STR_SET_LEN(res, p-RSTRING_PTR(res));
10926 
10927  if (argc == 2)
10928  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10929  if (cr != ENC_CODERANGE_BROKEN)
10930  ENC_CODERANGE_SET(res, cr);
10931 
10932  RB_GC_GUARD(pad);
10933  return res;
10934 }
10935 
10936 
10937 /*
10938  * call-seq:
10939  * ljust(size, pad_string = ' ') -> new_string
10940  *
10941  * :include: doc/string/ljust.rdoc
10942  *
10943  * Related: String#rjust, String#center.
10944  *
10945  */
10946 
10947 static VALUE
10948 rb_str_ljust(int argc, VALUE *argv, VALUE str)
10949 {
10950  return rb_str_justify(argc, argv, str, 'l');
10951 }
10952 
10953 /*
10954  * call-seq:
10955  * rjust(size, pad_string = ' ') -> new_string
10956  *
10957  * :include: doc/string/rjust.rdoc
10958  *
10959  * Related: String#ljust, String#center.
10960  *
10961  */
10962 
10963 static VALUE
10964 rb_str_rjust(int argc, VALUE *argv, VALUE str)
10965 {
10966  return rb_str_justify(argc, argv, str, 'r');
10967 }
10968 
10969 
10970 /*
10971  * call-seq:
10972  * center(size, pad_string = ' ') -> new_string
10973  *
10974  * :include: doc/string/center.rdoc
10975  *
10976  * Related: String#ljust, String#rjust.
10977  *
10978  */
10979 
10980 static VALUE
10981 rb_str_center(int argc, VALUE *argv, VALUE str)
10982 {
10983  return rb_str_justify(argc, argv, str, 'c');
10984 }
10985 
10986 /*
10987  * call-seq:
10988  * partition(string_or_regexp) -> [head, match, tail]
10989  *
10990  * :include: doc/string/partition.rdoc
10991  *
10992  */
10993 
10994 static VALUE
10995 rb_str_partition(VALUE str, VALUE sep)
10996 {
10997  long pos;
10998 
10999  sep = get_pat_quoted(sep, 0);
11000  if (RB_TYPE_P(sep, T_REGEXP)) {
11001  if (rb_reg_search(sep, str, 0, 0) < 0) {
11002  goto failed;
11003  }
11004  VALUE match = rb_backref_get();
11005  struct re_registers *regs = RMATCH_REGS(match);
11006 
11007  pos = BEG(0);
11008  sep = rb_str_subseq(str, pos, END(0) - pos);
11009  }
11010  else {
11011  pos = rb_str_index(str, sep, 0);
11012  if (pos < 0) goto failed;
11013  }
11014  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11015  sep,
11016  rb_str_subseq(str, pos+RSTRING_LEN(sep),
11017  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11018 
11019  failed:
11020  return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
11021 }
11022 
11023 /*
11024  * call-seq:
11025  * rpartition(sep) -> [head, match, tail]
11026  *
11027  * :include: doc/string/rpartition.rdoc
11028  *
11029  */
11030 
11031 static VALUE
11032 rb_str_rpartition(VALUE str, VALUE sep)
11033 {
11034  long pos = RSTRING_LEN(str);
11035 
11036  sep = get_pat_quoted(sep, 0);
11037  if (RB_TYPE_P(sep, T_REGEXP)) {
11038  if (rb_reg_search(sep, str, pos, 1) < 0) {
11039  goto failed;
11040  }
11041  VALUE match = rb_backref_get();
11042  struct re_registers *regs = RMATCH_REGS(match);
11043 
11044  pos = BEG(0);
11045  sep = rb_str_subseq(str, pos, END(0) - pos);
11046  }
11047  else {
11048  pos = rb_str_sublen(str, pos);
11049  pos = rb_str_rindex(str, sep, pos);
11050  if (pos < 0) {
11051  goto failed;
11052  }
11053  }
11054 
11055  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
11056  sep,
11057  rb_str_subseq(str, pos+RSTRING_LEN(sep),
11058  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
11059  failed:
11060  return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
11061 }
11062 
11063 /*
11064  * call-seq:
11065  * start_with?(*string_or_regexp) -> true or false
11066  *
11067  * :include: doc/string/start_with_p.rdoc
11068  *
11069  */
11070 
11071 static VALUE
11072 rb_str_start_with(int argc, VALUE *argv, VALUE str)
11073 {
11074  int i;
11075 
11076  for (i=0; i<argc; i++) {
11077  VALUE tmp = argv[i];
11078  if (RB_TYPE_P(tmp, T_REGEXP)) {
11079  if (rb_reg_start_with_p(tmp, str))
11080  return Qtrue;
11081  }
11082  else {
11083  const char *p, *s, *e;
11084  long slen, tlen;
11085  rb_encoding *enc;
11086 
11087  StringValue(tmp);
11088  enc = rb_enc_check(str, tmp);
11089  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11090  if ((slen = RSTRING_LEN(str)) < tlen) continue;
11091  p = RSTRING_PTR(str);
11092  e = p + slen;
11093  s = p + tlen;
11094  if (!at_char_right_boundary(p, s, e, enc))
11095  continue;
11096  if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
11097  return Qtrue;
11098  }
11099  }
11100  return Qfalse;
11101 }
11102 
11103 /*
11104  * call-seq:
11105  * end_with?(*strings) -> true or false
11106  *
11107  * :include: doc/string/end_with_p.rdoc
11108  *
11109  */
11110 
11111 static VALUE
11112 rb_str_end_with(int argc, VALUE *argv, VALUE str)
11113 {
11114  int i;
11115 
11116  for (i=0; i<argc; i++) {
11117  VALUE tmp = argv[i];
11118  const char *p, *s, *e;
11119  long slen, tlen;
11120  rb_encoding *enc;
11121 
11122  StringValue(tmp);
11123  enc = rb_enc_check(str, tmp);
11124  if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
11125  if ((slen = RSTRING_LEN(str)) < tlen) continue;
11126  p = RSTRING_PTR(str);
11127  e = p + slen;
11128  s = e - tlen;
11129  if (!at_char_boundary(p, s, e, enc))
11130  continue;
11131  if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
11132  return Qtrue;
11133  }
11134  return Qfalse;
11135 }
11136 
11146 static long
11147 deleted_prefix_length(VALUE str, VALUE prefix)
11148 {
11149  const char *strptr, *prefixptr;
11150  long olen, prefixlen;
11151  rb_encoding *enc = rb_enc_get(str);
11152 
11153  StringValue(prefix);
11154 
11155  if (!is_broken_string(prefix) ||
11156  !rb_enc_asciicompat(enc) ||
11157  !rb_enc_asciicompat(rb_enc_get(prefix))) {
11158  enc = rb_enc_check(str, prefix);
11159  }
11160 
11161  /* return 0 if not start with prefix */
11162  prefixlen = RSTRING_LEN(prefix);
11163  if (prefixlen <= 0) return 0;
11164  olen = RSTRING_LEN(str);
11165  if (olen < prefixlen) return 0;
11166  strptr = RSTRING_PTR(str);
11167  prefixptr = RSTRING_PTR(prefix);
11168  if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
11169  if (is_broken_string(prefix)) {
11170  if (!is_broken_string(str)) {
11171  /* prefix in a valid string cannot be broken */
11172  return 0;
11173  }
11174  const char *strend = strptr + olen;
11175  const char *after_prefix = strptr + prefixlen;
11176  if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
11177  /* prefix does not end at char-boundary */
11178  return 0;
11179  }
11180  }
11181  /* prefix part in `str` also should be valid. */
11182 
11183  return prefixlen;
11184 }
11185 
11186 /*
11187  * call-seq:
11188  * delete_prefix!(prefix) -> self or nil
11189  *
11190  * Like String#delete_prefix, except that +self+ is modified in place.
11191  * Returns +self+ if the prefix is removed, +nil+ otherwise.
11192  *
11193  */
11194 
11195 static VALUE
11196 rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
11197 {
11198  long prefixlen;
11199  str_modify_keep_cr(str);
11200 
11201  prefixlen = deleted_prefix_length(str, prefix);
11202  if (prefixlen <= 0) return Qnil;
11203 
11204  return rb_str_drop_bytes(str, prefixlen);
11205 }
11206 
11207 /*
11208  * call-seq:
11209  * delete_prefix(prefix) -> new_string
11210  *
11211  * :include: doc/string/delete_prefix.rdoc
11212  *
11213  */
11214 
11215 static VALUE
11216 rb_str_delete_prefix(VALUE str, VALUE prefix)
11217 {
11218  long prefixlen;
11219 
11220  prefixlen = deleted_prefix_length(str, prefix);
11221  if (prefixlen <= 0) return str_duplicate(rb_cString, str);
11222 
11223  return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
11224 }
11225 
11235 static long
11236 deleted_suffix_length(VALUE str, VALUE suffix)
11237 {
11238  const char *strptr, *suffixptr;
11239  long olen, suffixlen;
11240  rb_encoding *enc;
11241 
11242  StringValue(suffix);
11243  if (is_broken_string(suffix)) return 0;
11244  enc = rb_enc_check(str, suffix);
11245 
11246  /* return 0 if not start with suffix */
11247  suffixlen = RSTRING_LEN(suffix);
11248  if (suffixlen <= 0) return 0;
11249  olen = RSTRING_LEN(str);
11250  if (olen < suffixlen) return 0;
11251  strptr = RSTRING_PTR(str);
11252  suffixptr = RSTRING_PTR(suffix);
11253  const char *strend = strptr + olen;
11254  const char *before_suffix = strend - suffixlen;
11255  if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
11256  if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
11257 
11258  return suffixlen;
11259 }
11260 
11261 /*
11262  * call-seq:
11263  * delete_suffix!(suffix) -> self or nil
11264  *
11265  * Like String#delete_suffix, except that +self+ is modified in place.
11266  * Returns +self+ if the suffix is removed, +nil+ otherwise.
11267  *
11268  */
11269 
11270 static VALUE
11271 rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
11272 {
11273  long olen, suffixlen, len;
11274  str_modifiable(str);
11275 
11276  suffixlen = deleted_suffix_length(str, suffix);
11277  if (suffixlen <= 0) return Qnil;
11278 
11279  olen = RSTRING_LEN(str);
11280  str_modify_keep_cr(str);
11281  len = olen - suffixlen;
11282  STR_SET_LEN(str, len);
11283  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
11284  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
11285  ENC_CODERANGE_CLEAR(str);
11286  }
11287  return str;
11288 }
11289 
11290 /*
11291  * call-seq:
11292  * delete_suffix(suffix) -> new_string
11293  *
11294  * :include: doc/string/delete_suffix.rdoc
11295  *
11296  */
11297 
11298 static VALUE
11299 rb_str_delete_suffix(VALUE str, VALUE suffix)
11300 {
11301  long suffixlen;
11302 
11303  suffixlen = deleted_suffix_length(str, suffix);
11304  if (suffixlen <= 0) return str_duplicate(rb_cString, str);
11305 
11306  return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
11307 }
11308 
11309 void
11310 rb_str_setter(VALUE val, ID id, VALUE *var)
11311 {
11312  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
11313  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
11314  }
11315  *var = val;
11316 }
11317 
11318 static void
11319 rb_fs_setter(VALUE val, ID id, VALUE *var)
11320 {
11321  val = rb_fs_check(val);
11322  if (!val) {
11324  "value of %"PRIsVALUE" must be String or Regexp",
11325  rb_id2str(id));
11326  }
11327  if (!NIL_P(val)) {
11328  rb_warn_deprecated("'$;'", NULL);
11329  }
11330  *var = val;
11331 }
11332 
11333 
11334 /*
11335  * call-seq:
11336  * force_encoding(encoding) -> self
11337  *
11338  * :include: doc/string/force_encoding.rdoc
11339  *
11340  */
11341 
11342 static VALUE
11343 rb_str_force_encoding(VALUE str, VALUE enc)
11344 {
11345  str_modifiable(str);
11346 
11347  rb_encoding *encoding = rb_to_encoding(enc);
11348  int idx = rb_enc_to_index(encoding);
11349 
11350  // If the encoding is unchanged, we do nothing.
11351  if (ENCODING_GET(str) == idx) {
11352  return str;
11353  }
11354 
11355  rb_enc_associate_index(str, idx);
11356 
11357  // If the coderange was 7bit and the new encoding is ASCII-compatible
11358  // we can keep the coderange.
11359  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
11360  return str;
11361  }
11362 
11363  ENC_CODERANGE_CLEAR(str);
11364  return str;
11365 }
11366 
11367 /*
11368  * call-seq:
11369  * b -> string
11370  *
11371  * :include: doc/string/b.rdoc
11372  *
11373  */
11374 
11375 static VALUE
11376 rb_str_b(VALUE str)
11377 {
11378  VALUE str2;
11379  if (STR_EMBED_P(str)) {
11380  str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
11381  }
11382  else {
11383  str2 = str_alloc_heap(rb_cString);
11384  }
11385  str_replace_shared_without_enc(str2, str);
11386 
11387  if (rb_enc_asciicompat(STR_ENC_GET(str))) {
11388  // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
11389  // If we know the receiver's code range then we know the result's code range.
11390  int cr = ENC_CODERANGE(str);
11391  switch (cr) {
11392  case ENC_CODERANGE_7BIT:
11394  break;
11395  case ENC_CODERANGE_BROKEN:
11396  case ENC_CODERANGE_VALID:
11398  break;
11399  default:
11400  ENC_CODERANGE_CLEAR(str2);
11401  break;
11402  }
11403  }
11404 
11405  return str2;
11406 }
11407 
11408 /*
11409  * call-seq:
11410  * valid_encoding? -> true or false
11411  *
11412  * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
11413  *
11414  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
11415  * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
11416  * "\x80".force_encoding("UTF-8").valid_encoding? # => false
11417  */
11418 
11419 static VALUE
11420 rb_str_valid_encoding_p(VALUE str)
11421 {
11422  int cr = rb_enc_str_coderange(str);
11423 
11424  return RBOOL(cr != ENC_CODERANGE_BROKEN);
11425 }
11426 
11427 /*
11428  * call-seq:
11429  * ascii_only? -> true or false
11430  *
11431  * Returns +true+ if +self+ contains only ASCII characters,
11432  * +false+ otherwise:
11433  *
11434  * 'abc'.ascii_only? # => true
11435  * "abc\u{6666}".ascii_only? # => false
11436  *
11437  */
11438 
11439 static VALUE
11440 rb_str_is_ascii_only_p(VALUE str)
11441 {
11442  int cr = rb_enc_str_coderange(str);
11443 
11444  return RBOOL(cr == ENC_CODERANGE_7BIT);
11445 }
11446 
11447 VALUE
11449 {
11450  static const char ellipsis[] = "...";
11451  const long ellipsislen = sizeof(ellipsis) - 1;
11452  rb_encoding *const enc = rb_enc_get(str);
11453  const long blen = RSTRING_LEN(str);
11454  const char *const p = RSTRING_PTR(str), *e = p + blen;
11455  VALUE estr, ret = 0;
11456 
11457  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11458  if (len * rb_enc_mbminlen(enc) >= blen ||
11459  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11460  ret = str;
11461  }
11462  else if (len <= ellipsislen ||
11463  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11464  if (rb_enc_asciicompat(enc)) {
11465  ret = rb_str_new(ellipsis, len);
11466  rb_enc_associate(ret, enc);
11467  }
11468  else {
11469  estr = rb_usascii_str_new(ellipsis, len);
11470  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11471  }
11472  }
11473  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11474  rb_str_cat(ret, ellipsis, ellipsislen);
11475  }
11476  else {
11477  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11478  rb_enc_from_encoding(enc), 0, Qnil);
11479  rb_str_append(ret, estr);
11480  }
11481  return ret;
11482 }
11483 
11484 static VALUE
11485 str_compat_and_valid(VALUE str, rb_encoding *enc)
11486 {
11487  int cr;
11488  str = StringValue(str);
11489  cr = rb_enc_str_coderange(str);
11490  if (cr == ENC_CODERANGE_BROKEN) {
11491  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11492  }
11493  else {
11494  rb_encoding *e = STR_ENC_GET(str);
11495  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11496  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11497  rb_enc_inspect_name(enc), rb_enc_inspect_name(e));
11498  }
11499  }
11500  return str;
11501 }
11502 
11503 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11504 
11505 VALUE
11507 {
11508  rb_encoding *enc = STR_ENC_GET(str);
11509  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11510 }
11511 
11512 VALUE
11513 rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11514 {
11515  int cr = ENC_CODERANGE_UNKNOWN;
11516  if (enc == STR_ENC_GET(str)) {
11517  /* cached coderange makes sense only when enc equals the
11518  * actual encoding of str */
11519  cr = ENC_CODERANGE(str);
11520  }
11521  return enc_str_scrub(enc, str, repl, cr);
11522 }
11523 
11524 static VALUE
11525 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11526 {
11527  int encidx;
11528  VALUE buf = Qnil;
11529  const char *rep, *p, *e, *p1, *sp;
11530  long replen = -1;
11531  long slen;
11532 
11533  if (rb_block_given_p()) {
11534  if (!NIL_P(repl))
11535  rb_raise(rb_eArgError, "both of block and replacement given");
11536  replen = 0;
11537  }
11538 
11539  if (ENC_CODERANGE_CLEAN_P(cr))
11540  return Qnil;
11541 
11542  if (!NIL_P(repl)) {
11543  repl = str_compat_and_valid(repl, enc);
11544  }
11545 
11546  if (rb_enc_dummy_p(enc)) {
11547  return Qnil;
11548  }
11549  encidx = rb_enc_to_index(enc);
11550 
11551 #define DEFAULT_REPLACE_CHAR(str) do { \
11552  static const char replace[sizeof(str)-1] = str; \
11553  rep = replace; replen = (int)sizeof(replace); \
11554  } while (0)
11555 
11556  slen = RSTRING_LEN(str);
11557  p = RSTRING_PTR(str);
11558  e = RSTRING_END(str);
11559  p1 = p;
11560  sp = p;
11561 
11562  if (rb_enc_asciicompat(enc)) {
11563  int rep7bit_p;
11564  if (!replen) {
11565  rep = NULL;
11566  rep7bit_p = FALSE;
11567  }
11568  else if (!NIL_P(repl)) {
11569  rep = RSTRING_PTR(repl);
11570  replen = RSTRING_LEN(repl);
11571  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11572  }
11573  else if (encidx == rb_utf8_encindex()) {
11574  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11575  rep7bit_p = FALSE;
11576  }
11577  else {
11578  DEFAULT_REPLACE_CHAR("?");
11579  rep7bit_p = TRUE;
11580  }
11581  cr = ENC_CODERANGE_7BIT;
11582 
11583  p = search_nonascii(p, e);
11584  if (!p) {
11585  p = e;
11586  }
11587  while (p < e) {
11588  int ret = rb_enc_precise_mbclen(p, e, enc);
11589  if (MBCLEN_NEEDMORE_P(ret)) {
11590  break;
11591  }
11592  else if (MBCLEN_CHARFOUND_P(ret)) {
11593  cr = ENC_CODERANGE_VALID;
11594  p += MBCLEN_CHARFOUND_LEN(ret);
11595  }
11596  else if (MBCLEN_INVALID_P(ret)) {
11597  /*
11598  * p1~p: valid ascii/multibyte chars
11599  * p ~e: invalid bytes + unknown bytes
11600  */
11601  long clen = rb_enc_mbmaxlen(enc);
11602  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11603  if (p > p1) {
11604  rb_str_buf_cat(buf, p1, p - p1);
11605  }
11606 
11607  if (e - p < clen) clen = e - p;
11608  if (clen <= 2) {
11609  clen = 1;
11610  }
11611  else {
11612  const char *q = p;
11613  clen--;
11614  for (; clen > 1; clen--) {
11615  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11616  if (MBCLEN_NEEDMORE_P(ret)) break;
11617  if (MBCLEN_INVALID_P(ret)) continue;
11618  UNREACHABLE;
11619  }
11620  }
11621  if (rep) {
11622  rb_str_buf_cat(buf, rep, replen);
11623  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11624  }
11625  else {
11626  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11627  str_mod_check(str, sp, slen);
11628  repl = str_compat_and_valid(repl, enc);
11629  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11630  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11631  cr = ENC_CODERANGE_VALID;
11632  }
11633  p += clen;
11634  p1 = p;
11635  p = search_nonascii(p, e);
11636  if (!p) {
11637  p = e;
11638  break;
11639  }
11640  }
11641  else {
11642  UNREACHABLE;
11643  }
11644  }
11645  if (NIL_P(buf)) {
11646  if (p == e) {
11647  ENC_CODERANGE_SET(str, cr);
11648  return Qnil;
11649  }
11650  buf = rb_str_buf_new(RSTRING_LEN(str));
11651  }
11652  if (p1 < p) {
11653  rb_str_buf_cat(buf, p1, p - p1);
11654  }
11655  if (p < e) {
11656  if (rep) {
11657  rb_str_buf_cat(buf, rep, replen);
11658  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11659  }
11660  else {
11661  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11662  str_mod_check(str, sp, slen);
11663  repl = str_compat_and_valid(repl, enc);
11664  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11665  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
11666  cr = ENC_CODERANGE_VALID;
11667  }
11668  }
11669  }
11670  else {
11671  /* ASCII incompatible */
11672  long mbminlen = rb_enc_mbminlen(enc);
11673  if (!replen) {
11674  rep = NULL;
11675  }
11676  else if (!NIL_P(repl)) {
11677  rep = RSTRING_PTR(repl);
11678  replen = RSTRING_LEN(repl);
11679  }
11680  else if (encidx == ENCINDEX_UTF_16BE) {
11681  DEFAULT_REPLACE_CHAR("\xFF\xFD");
11682  }
11683  else if (encidx == ENCINDEX_UTF_16LE) {
11684  DEFAULT_REPLACE_CHAR("\xFD\xFF");
11685  }
11686  else if (encidx == ENCINDEX_UTF_32BE) {
11687  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11688  }
11689  else if (encidx == ENCINDEX_UTF_32LE) {
11690  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11691  }
11692  else {
11693  DEFAULT_REPLACE_CHAR("?");
11694  }
11695 
11696  while (p < e) {
11697  int ret = rb_enc_precise_mbclen(p, e, enc);
11698  if (MBCLEN_NEEDMORE_P(ret)) {
11699  break;
11700  }
11701  else if (MBCLEN_CHARFOUND_P(ret)) {
11702  p += MBCLEN_CHARFOUND_LEN(ret);
11703  }
11704  else if (MBCLEN_INVALID_P(ret)) {
11705  const char *q = p;
11706  long clen = rb_enc_mbmaxlen(enc);
11707  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11708  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11709 
11710  if (e - p < clen) clen = e - p;
11711  if (clen <= mbminlen * 2) {
11712  clen = mbminlen;
11713  }
11714  else {
11715  clen -= mbminlen;
11716  for (; clen > mbminlen; clen-=mbminlen) {
11717  ret = rb_enc_precise_mbclen(q, q + clen, enc);
11718  if (MBCLEN_NEEDMORE_P(ret)) break;
11719  if (MBCLEN_INVALID_P(ret)) continue;
11720  UNREACHABLE;
11721  }
11722  }
11723  if (rep) {
11724  rb_str_buf_cat(buf, rep, replen);
11725  }
11726  else {
11727  repl = rb_yield(rb_enc_str_new(p, clen, enc));
11728  str_mod_check(str, sp, slen);
11729  repl = str_compat_and_valid(repl, enc);
11730  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11731  }
11732  p += clen;
11733  p1 = p;
11734  }
11735  else {
11736  UNREACHABLE;
11737  }
11738  }
11739  if (NIL_P(buf)) {
11740  if (p == e) {
11742  return Qnil;
11743  }
11744  buf = rb_str_buf_new(RSTRING_LEN(str));
11745  }
11746  if (p1 < p) {
11747  rb_str_buf_cat(buf, p1, p - p1);
11748  }
11749  if (p < e) {
11750  if (rep) {
11751  rb_str_buf_cat(buf, rep, replen);
11752  }
11753  else {
11754  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11755  str_mod_check(str, sp, slen);
11756  repl = str_compat_and_valid(repl, enc);
11757  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11758  }
11759  }
11760  cr = ENC_CODERANGE_VALID;
11761  }
11762  ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11763  return buf;
11764 }
11765 
11766 /*
11767  * call-seq:
11768  * scrub(replacement_string = default_replacement) -> new_string
11769  * scrub{|bytes| ... } -> new_string
11770  *
11771  * :include: doc/string/scrub.rdoc
11772  *
11773  */
11774 static VALUE
11775 str_scrub(int argc, VALUE *argv, VALUE str)
11776 {
11777  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11778  VALUE new = rb_str_scrub(str, repl);
11779  return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11780 }
11781 
11782 /*
11783  * call-seq:
11784  * scrub! -> self
11785  * scrub!(replacement_string = default_replacement) -> self
11786  * scrub!{|bytes| ... } -> self
11787  *
11788  * Like String#scrub, except that any replacements are made in +self+.
11789  *
11790  */
11791 static VALUE
11792 str_scrub_bang(int argc, VALUE *argv, VALUE str)
11793 {
11794  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11795  VALUE new = rb_str_scrub(str, repl);
11796  if (!NIL_P(new)) rb_str_replace(str, new);
11797  return str;
11798 }
11799 
11800 static ID id_normalize; /* ID of "normalize"; assigned in Init_String() */
11801 static ID id_normalized_p; /* ID of "normalized?"; assigned in Init_String() */
11802 static VALUE mUnicodeNormalize; /* UnicodeNormalize module; defined in Init_String() */
11803 
11804 static VALUE
11805 unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11806 {
11807  static int UnicodeNormalizeRequired = 0;
11808  VALUE argv2[2];
11809 
11810  if (!UnicodeNormalizeRequired) {
11811  rb_require("unicode_normalize/normalize.rb");
11812  UnicodeNormalizeRequired = 1;
11813  }
11814  argv2[0] = str;
11815  if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11816  return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11817 }
11818 
11819 /*
11820  * call-seq:
11821  * unicode_normalize(form = :nfc) -> string
11822  *
11823  * Returns a copy of +self+ with
11824  * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11825  *
11826  * Argument +form+ must be one of the following symbols
11827  * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11828  *
11829  * - +:nfc+: Canonical decomposition, followed by canonical composition.
11830  * - +:nfd+: Canonical decomposition.
11831  * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11832  * - +:nfkd+: Compatibility decomposition.
11833  *
11834  * The encoding of +self+ must be one of:
11835  *
11836  * - Encoding::UTF_8
11837  * - Encoding::UTF_16BE
11838  * - Encoding::UTF_16LE
11839  * - Encoding::UTF_32BE
11840  * - Encoding::UTF_32LE
11841  * - Encoding::GB18030
11842  * - Encoding::UCS_2BE
11843  * - Encoding::UCS_4BE
11844  *
11845  * Examples:
11846  *
11847  * "a\u0300".unicode_normalize # => "a"
11848  * "\u00E0".unicode_normalize(:nfd) # => "a "
11849  *
11850  * Related: String#unicode_normalize!, String#unicode_normalized?.
11851  */
11852 static VALUE
11853 rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11854 {
11855  return unicode_normalize_common(argc, argv, str, id_normalize);
11856 }
11857 
11858 /*
11859  * call-seq:
11860  * unicode_normalize!(form = :nfc) -> self
11861  *
11862  * Like String#unicode_normalize, except that the normalization
11863  * is performed on +self+.
11864  *
11865  * Related: String#unicode_normalized?.
11866  *
11867  */
11868 static VALUE
11869 rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11870 {
11871  return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11872 }
11873 
11874 /* call-seq:
11875  * unicode_normalized?(form = :nfc) -> true or false
11876  *
11877  * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11878  * +false+ otherwise.
11879  * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11880  *
11881  * Examples:
11882  *
11883  * "a\u0300".unicode_normalized? # => false
11884  * "a\u0300".unicode_normalized?(:nfd) # => true
11885  * "\u00E0".unicode_normalized? # => true
11886  * "\u00E0".unicode_normalized?(:nfd) # => false
11887  *
11888  *
11889  * Raises an exception if +self+ is not in a Unicode encoding:
11890  *
11891  * s = "\xE0".force_encoding('ISO-8859-1')
11892  * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11893  *
11894  * Related: String#unicode_normalize, String#unicode_normalize!.
11895  *
11896  */
11897 static VALUE
11898 rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11899 {
11900  return unicode_normalize_common(argc, argv, str, id_normalized_p);
11901 }
11902 
11903 /**********************************************************************
11904  * Document-class: Symbol
11905  *
11906  * A +Symbol+ object represents a named identifier inside the Ruby interpreter.
11907  *
11908  * You can create a +Symbol+ object explicitly with:
11909  *
11910  * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11911  *
11912  * The same +Symbol+ object will be
11913  * created for a given name or string for the duration of a program's
11914  * execution, regardless of the context or meaning of that name. Thus
11915  * if <code>Fred</code> is a constant in one context, a method in
11916  * another, and a class in a third, the +Symbol+ <code>:Fred</code>
11917  * will be the same object in all three contexts.
11918  *
11919  * module One
11920  * class Fred
11921  * end
11922  * $f1 = :Fred
11923  * end
11924  * module Two
11925  * Fred = 1
11926  * $f2 = :Fred
11927  * end
11928  * def Fred()
11929  * end
11930  * $f3 = :Fred
11931  * $f1.object_id #=> 2514190
11932  * $f2.object_id #=> 2514190
11933  * $f3.object_id #=> 2514190
11934  *
11935  * Constant, method, and variable names are returned as symbols:
11936  *
11937  * module One
11938  * Two = 2
11939  * def three; 3 end
11940  * @four = 4
11941  * @@five = 5
11942  * $six = 6
11943  * end
11944  * seven = 7
11945  *
11946  * One.constants
11947  * # => [:Two]
11948  * One.instance_methods(true)
11949  * # => [:three]
11950  * One.instance_variables
11951  * # => [:@four]
11952  * One.class_variables
11953  * # => [:@@five]
11954  * global_variables.grep(/six/)
11955  * # => [:$six]
11956  * local_variables
11957  * # => [:seven]
11958  *
11959  * A +Symbol+ object differs from a String object in that
11960  * a +Symbol+ object represents an identifier, while a String object
11961  * represents text or data.
11962  *
11963  * == What's Here
11964  *
11965  * First, what's elsewhere. \Class +Symbol+:
11966  *
11967  * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11968  * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11969  *
11970  * Here, class +Symbol+ provides methods that are useful for:
11971  *
11972  * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11973  * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11974  * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11975  *
11976  * === Methods for Querying
11977  *
11978  * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11979  * - #=~: Returns the index of the first substring in symbol that matches a
11980  * given Regexp or other object; returns +nil+ if no match is found.
11981  * - #[], #slice : Returns a substring of symbol
11982  * determined by a given index, start/length, or range, or string.
11983  * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11984  * - #encoding: Returns the Encoding object that represents the encoding
11985  * of symbol.
11986  * - #end_with?: Returns +true+ if symbol ends with
11987  * any of the given strings.
11988  * - #match: Returns a MatchData object if symbol
11989  * matches a given Regexp; +nil+ otherwise.
11990  * - #match?: Returns +true+ if symbol
11991  * matches a given Regexp; +false+ otherwise.
11992  * - #length, #size: Returns the number of characters in symbol.
11993  * - #start_with?: Returns +true+ if symbol starts with
11994  * any of the given strings.
11995  *
11996  * === Methods for Comparing
11997  *
11998  * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11999  * or larger than symbol.
12000  * - #==, #===: Returns +true+ if a given symbol has the same content and
12001  * encoding.
12002  * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
12003  * symbol is smaller than, equal to, or larger than symbol.
12004  * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
12005  * after Unicode case folding; +false+ otherwise.
12006  *
12007  * === Methods for Converting
12008  *
12009  * - #capitalize: Returns symbol with the first character upcased
12010  * and all other characters downcased.
12011  * - #downcase: Returns symbol with all characters downcased.
12012  * - #inspect: Returns the string representation of +self+ as a symbol literal.
12013  * - #name: Returns the frozen string corresponding to symbol.
12014  * - #succ, #next: Returns the symbol that is the successor to symbol.
12015  * - #swapcase: Returns symbol with all upcase characters downcased
12016  * and all downcase characters upcased.
12017  * - #to_proc: Returns a Proc object which responds to the method named by symbol.
12018  * - #to_s, #id2name: Returns the string corresponding to +self+.
12019  * - #to_sym, #intern: Returns +self+.
12020  * - #upcase: Returns symbol with all characters upcased.
12021  *
12022  */
12023 
12024 
12025 /*
12026  * call-seq:
12027  * symbol == object -> true or false
12028  *
12029  * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
12030  */
12031 
12032 #define sym_equal rb_obj_equal /* registered as Symbol#== and Symbol#=== in Init_String() */
12033 
12034 static int
12035 sym_printable(const char *s, const char *send, rb_encoding *enc)
12036 {
12037  while (s < send) {
12038  int n;
12039  int c = rb_enc_precise_mbclen(s, send, enc);
12040 
12041  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
12042  n = MBCLEN_CHARFOUND_LEN(c);
12043  c = rb_enc_mbc_to_codepoint(s, send, enc);
12044  if (!rb_enc_isprint(c, enc)) return FALSE;
12045  s += n;
12046  }
12047  return TRUE;
12048 }
12049 
12050 int
12051 rb_str_symname_p(VALUE sym)
12052 {
12053  rb_encoding *enc;
12054  const char *ptr;
12055  long len;
12057 
12058  if (resenc == NULL) resenc = rb_default_external_encoding();
12059  enc = STR_ENC_GET(sym);
12060  ptr = RSTRING_PTR(sym);
12061  len = RSTRING_LEN(sym);
12062  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
12063  !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
12064  return FALSE;
12065  }
12066  return TRUE;
12067 }
12068 
12069 VALUE
12070 rb_str_quote_unprintable(VALUE str)
12071 {
12072  rb_encoding *enc;
12073  const char *ptr;
12074  long len;
12075  rb_encoding *resenc;
12076 
12077  Check_Type(str, T_STRING);
12078  resenc = rb_default_internal_encoding();
12079  if (resenc == NULL) resenc = rb_default_external_encoding();
12080  enc = STR_ENC_GET(str);
12081  ptr = RSTRING_PTR(str);
12082  len = RSTRING_LEN(str);
12083  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
12084  !sym_printable(ptr, ptr + len, enc)) {
12085  return rb_str_escape(str);
12086  }
12087  return str;
12088 }
12089 
12090 VALUE
12091 rb_id_quote_unprintable(ID id)
12092 {
12093  VALUE str = rb_id2str(id);
12094  if (!rb_str_symname_p(str)) {
12095  return rb_str_escape(str);
12096  }
12097  return str;
12098 }
12099 
12100 /*
12101  * call-seq:
12102  * inspect -> string
12103  *
12104  * Returns a string representation of +self+ (including the leading colon):
12105  *
12106  * :foo.inspect # => ":foo"
12107  *
12108  * Related: Symbol#to_s, Symbol#name.
12109  *
12110  */
12111 
12112 static VALUE
12113 sym_inspect(VALUE sym)
12114 {
12115  VALUE str = rb_sym2str(sym);
12116  const char *ptr;
12117  long len;
12118  char *dest;
12119 
12120  if (!rb_str_symname_p(str)) {
12121  str = rb_str_inspect(str);
12122  len = RSTRING_LEN(str);
12123  rb_str_resize(str, len + 1);
12124  dest = RSTRING_PTR(str);
12125  memmove(dest + 1, dest, len);
12126  }
12127  else {
12128  rb_encoding *enc = STR_ENC_GET(str);
12129  VALUE orig_str = str;
12130 
12131  len = RSTRING_LEN(orig_str);
12132  str = rb_enc_str_new(0, len + 1, enc);
12133 
12134  // Get data pointer after allocation
12135  ptr = RSTRING_PTR(orig_str);
12136  dest = RSTRING_PTR(str);
12137  memcpy(dest + 1, ptr, len);
12138 
12139  RB_GC_GUARD(orig_str);
12140  }
12141  dest[0] = ':';
12142 
12144 
12145  return str;
12146 }
12147 
12148 VALUE
12150 {
12151  VALUE str = str_new_shared(rb_cString, rb_sym2str(sym));
12152  FL_SET_RAW(str, STR_CHILLED_SYMBOL_TO_S);
12153  return str;
12154 }
12155 
12156 VALUE
12157 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
12158 {
12159  VALUE obj;
12160 
12161  if (argc < 1) {
12162  rb_raise(rb_eArgError, "no receiver given");
12163  }
12164  obj = argv[0];
12165  return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
12166 }
12167 
12168 /*
12169  * call-seq:
12170  * succ
12171  *
12172  * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
12173  *
12174  * :foo.succ # => :fop
12175  *
12176  * Related: String#succ.
12177  */
12178 
12179 static VALUE
12180 sym_succ(VALUE sym)
12181 {
12182  return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
12183 }
12184 
12185 /*
12186  * call-seq:
12187  * symbol <=> object -> -1, 0, +1, or nil
12188  *
12189  * If +object+ is a symbol,
12190  * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
12191  *
12192  * :bar <=> :foo # => -1
12193  * :foo <=> :foo # => 0
12194  * :foo <=> :bar # => 1
12195  *
12196  * Otherwise, returns +nil+:
12197  *
12198  * :foo <=> 'bar' # => nil
12199  *
12200  * Related: String#<=>.
12201  */
12202 
12203 static VALUE
12204 sym_cmp(VALUE sym, VALUE other)
12205 {
12206  if (!SYMBOL_P(other)) {
12207  return Qnil;
12208  }
12209  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
12210 }
12211 
12212 /*
12213  * call-seq:
12214  * casecmp(object) -> -1, 0, 1, or nil
12215  *
12216  * :include: doc/symbol/casecmp.rdoc
12217  *
12218  */
12219 
12220 static VALUE
12221 sym_casecmp(VALUE sym, VALUE other)
12222 {
12223  if (!SYMBOL_P(other)) {
12224  return Qnil;
12225  }
12226  return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
12227 }
12228 
12229 /*
12230  * call-seq:
12231  * casecmp?(object) -> true, false, or nil
12232  *
12233  * :include: doc/symbol/casecmp_p.rdoc
12234  *
12235  */
12236 
12237 static VALUE
12238 sym_casecmp_p(VALUE sym, VALUE other)
12239 {
12240  if (!SYMBOL_P(other)) {
12241  return Qnil;
12242  }
12243  return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
12244 }
12245 
12246 /*
12247  * call-seq:
12248  * symbol =~ object -> integer or nil
12249  *
12250  * Equivalent to <tt>symbol.to_s =~ object</tt>,
12251  * including possible updates to global variables;
12252  * see String#=~.
12253  *
12254  */
12255 
12256 static VALUE
12257 sym_match(VALUE sym, VALUE other)
12258 {
12259  return rb_str_match(rb_sym2str(sym), other);
12260 }
12261 
12262 /*
12263  * call-seq:
12264  * match(pattern, offset = 0) -> matchdata or nil
12265  * match(pattern, offset = 0) {|matchdata| } -> object
12266  *
12267  * Equivalent to <tt>self.to_s.match</tt>,
12268  * including possible updates to global variables;
12269  * see String#match.
12270  *
12271  */
12272 
12273 static VALUE
12274 sym_match_m(int argc, VALUE *argv, VALUE sym)
12275 {
12276  return rb_str_match_m(argc, argv, rb_sym2str(sym));
12277 }
12278 
12279 /*
12280  * call-seq:
12281  * match?(pattern, offset = 0) -> true or false
12282  *
12283  * Equivalent to <tt>sym.to_s.match?</tt>;
12284  * see String#match?.
12285  *
12286  */
12287 
12288 static VALUE
12289 sym_match_m_p(int argc, VALUE *argv, VALUE sym)
12290 {
12291  return rb_str_match_m_p(argc, argv, sym);
12292 }
12293 
12294 /*
12295  * call-seq:
12296  * symbol[index] -> string or nil
12297  * symbol[start, length] -> string or nil
12298  * symbol[range] -> string or nil
12299  * symbol[regexp, capture = 0] -> string or nil
12300  * symbol[substring] -> string or nil
12301  *
12302  * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
12303  *
12304  */
12305 
12306 static VALUE
12307 sym_aref(int argc, VALUE *argv, VALUE sym)
12308 {
12309  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
12310 }
12311 
12312 /*
12313  * call-seq:
12314  * length -> integer
12315  *
12316  * Equivalent to <tt>self.to_s.length</tt>; see String#length.
12317  */
12318 
12319 static VALUE
12320 sym_length(VALUE sym)
12321 {
12322  return rb_str_length(rb_sym2str(sym));
12323 }
12324 
12325 /*
12326  * call-seq:
12327  * empty? -> true or false
12328  *
12329  * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
12330  *
12331  */
12332 
12333 static VALUE
12334 sym_empty(VALUE sym)
12335 {
12336  return rb_str_empty(rb_sym2str(sym));
12337 }
12338 
12339 /*
12340  * call-seq:
12341  * upcase(*options) -> symbol
12342  *
12343  * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
12344  *
12345  * See String#upcase.
12346  *
12347  */
12348 
12349 static VALUE
12350 sym_upcase(int argc, VALUE *argv, VALUE sym)
12351 {
12352  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
12353 }
12354 
12355 /*
12356  * call-seq:
12357  * downcase(*options) -> symbol
12358  *
12359  * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
12360  *
12361  * See String#downcase.
12362  *
12363  * Related: Symbol#upcase.
12364  *
12365  */
12366 
12367 static VALUE
12368 sym_downcase(int argc, VALUE *argv, VALUE sym)
12369 {
12370  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
12371 }
12372 
12373 /*
12374  * call-seq:
12375  * capitalize(*options) -> symbol
12376  *
12377  * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
12378  *
12379  * See String#capitalize.
12380  *
12381  */
12382 
12383 static VALUE
12384 sym_capitalize(int argc, VALUE *argv, VALUE sym)
12385 {
12386  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
12387 }
12388 
12389 /*
12390  * call-seq:
12391  * swapcase(*options) -> symbol
12392  *
12393  * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
12394  *
12395  * See String#swapcase.
12396  *
12397  */
12398 
12399 static VALUE
12400 sym_swapcase(int argc, VALUE *argv, VALUE sym)
12401 {
12402  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
12403 }
12404 
12405 /*
12406  * call-seq:
12407  * start_with?(*string_or_regexp) -> true or false
12408  *
12409  * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
12410  *
12411  */
12412 
12413 static VALUE
12414 sym_start_with(int argc, VALUE *argv, VALUE sym)
12415 {
12416  return rb_str_start_with(argc, argv, rb_sym2str(sym));
12417 }
12418 
12419 /*
12420  * call-seq:
12421  * end_with?(*strings) -> true or false
12422  *
12423  *
12424  * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
12425  *
12426  */
12427 
12428 static VALUE
12429 sym_end_with(int argc, VALUE *argv, VALUE sym)
12430 {
12431  return rb_str_end_with(argc, argv, rb_sym2str(sym));
12432 }
12433 
12434 /*
12435  * call-seq:
12436  * encoding -> encoding
12437  *
12438  * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12439  *
12440  */
12441 
12442 static VALUE
12443 sym_encoding(VALUE sym)
12444 {
12445  return rb_obj_encoding(rb_sym2str(sym));
12446 }
12447 
12448 static VALUE
12449 string_for_symbol(VALUE name)
12450 {
12451  if (!RB_TYPE_P(name, T_STRING)) {
12452  VALUE tmp = rb_check_string_type(name);
12453  if (NIL_P(tmp)) {
12454  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol nor a string",
12455  name);
12456  }
12457  name = tmp;
12458  }
12459  return name;
12460 }
12461 
12462 ID
12464 {
12465  if (SYMBOL_P(name)) {
12466  return SYM2ID(name);
12467  }
12468  name = string_for_symbol(name);
12469  return rb_intern_str(name);
12470 }
12471 
12472 VALUE
12474 {
12475  if (SYMBOL_P(name)) {
12476  return name;
12477  }
12478  name = string_for_symbol(name);
12479  return rb_str_intern(name);
12480 }
12481 
12482 /*
12483  * call-seq:
12484  * Symbol.all_symbols -> array_of_symbols
12485  *
12486  * Returns an array of all symbols currently in Ruby's symbol table:
12487  *
12488  * Symbol.all_symbols.size # => 9334
12489  * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12490  *
12491  */
12492 
12493 static VALUE
12494 sym_all_symbols(VALUE _)
12495 {
12496  return rb_sym_all_symbols();
12497 }
12498 
12499 VALUE
12501 {
12502  return rb_fstring(str);
12503 }
12504 
12505 VALUE
12506 rb_interned_str(const char *ptr, long len)
12507 {
12508  struct RString fake_str;
12509  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
12510 }
12511 
12512 VALUE
12514 {
12515  return rb_interned_str(ptr, strlen(ptr));
12516 }
12517 
12518 VALUE
12519 rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12520 {
12521  if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12522  rb_enc_autoload(enc);
12523  }
12524 
12525  struct RString fake_str;
12526  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
12527 }
12528 
12529 VALUE
12530 rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
12531 {
12532  if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12533  rb_enc_autoload(enc);
12534  }
12535 
12536  struct RString fake_str;
12537  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true);
12538 }
12539 
12540 VALUE
12542 {
12543  return rb_enc_interned_str(ptr, strlen(ptr), enc);
12544 }
12545 
#if USE_YJIT
/*
 * Appends +codepoint+ to +str+.
 *
 * Fast path: when +str+ is a binary (ASCII-8BIT) string and the code is in
 * the range [0, 0xff), a single raw byte is appended.  Every other case
 * falls through to rb_str_concat().
 */
void
rb_yjit_str_concat_codepoint(VALUE str, VALUE codepoint)
{
    /* Appending a raw byte is only done for strings in the binary encoding. */
    if (RB_LIKELY(ENCODING_GET_INLINED(str) == rb_ascii8bit_encindex())) {
        ssize_t code = RB_NUM2SSIZE(codepoint);

        if (RB_LIKELY(code >= 0 && code < 0xff)) {
            rb_str_buf_cat_byte(str, (char) code);
            return;
        }
    }

    rb_str_concat(str, codepoint);
}
#endif
12562 
12563 void
12564 Init_String(void)
12565 {
12566  rb_cString = rb_define_class("String", rb_cObject);
12567  RUBY_ASSERT(rb_vm_fstring_table());
12568  st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12570  rb_define_alloc_func(rb_cString, empty_str_alloc);
12571  rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12572  rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12573  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12574  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12575  rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12578  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12579  rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12580  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12581  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12584  rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12585  rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12586  rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12587  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12588  rb_define_method(rb_cString, "length", rb_str_length, 0);
12590  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12591  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12592  rb_define_method(rb_cString, "=~", rb_str_match, 1);
12593  rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12594  rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12596  rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12598  rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12599  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12600  rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12601  rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12602  rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12603  rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12604  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
12605  rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12606  rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12607  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12608  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12609  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12610  rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12611  rb_define_method(rb_cString, "scrub", str_scrub, -1);
12612  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12613  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
12614  rb_define_method(rb_cString, "+@", str_uplus, 0);
12615  rb_define_method(rb_cString, "-@", str_uminus, 0);
12616  rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12617  rb_define_alias(rb_cString, "dedup", "-@");
12618 
12619  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12620  rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12621  rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12622  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12623  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
12625  rb_define_method(rb_cString, "undump", str_undump, 0);
12626 
12627  sym_ascii = ID2SYM(rb_intern_const("ascii"));
12628  sym_turkic = ID2SYM(rb_intern_const("turkic"));
12629  sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12630  sym_fold = ID2SYM(rb_intern_const("fold"));
12631 
12632  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12633  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12634  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12635  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12636 
12637  rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12638  rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12639  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12640  rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12641 
12642  rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12643  rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12644  rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12645  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12646  rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12647  rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12648  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12649  rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12650  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12651  rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12652  rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12653  rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
12655  rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12656  rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12657  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12658  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12659  rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12660 
12661  rb_define_method(rb_cString, "include?", rb_str_include, 1);
12662  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12663  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12664 
12665  rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12666 
12667  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12668  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12669  rb_define_method(rb_cString, "center", rb_str_center, -1);
12670 
12671  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12672  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12673  rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12674  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12675  rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12676  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12677  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12678  rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12679  rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12680 
12681  rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12682  rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12683  rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12684  rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12685  rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12686  rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12687  rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12688  rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12689  rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12690 
12691  rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12692  rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12693  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12694  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12695  rb_define_method(rb_cString, "count", rb_str_count, -1);
12696 
12697  rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12698  rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12699  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12700  rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12701 
12702  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12703  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12704  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12705  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12706  rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12707 
12708  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12709 
12710  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12711  rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12712 
12713  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12714  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12715 
12716  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12717  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12718  rb_define_method(rb_cString, "b", rb_str_b, 0);
12719  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12720  rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12721 
12722  /* define UnicodeNormalize module here so that we don't have to look it up */
12723  mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12724  id_normalize = rb_intern_const("normalize");
12725  id_normalized_p = rb_intern_const("normalized?");
12726 
12727  rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12728  rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12729  rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12730 
12731  rb_fs = Qnil;
12732  rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12733  rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12735 
12736  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12740  rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12741 
12742  rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12743  rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12744  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12745  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12746  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12747  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12748 
12749  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12750  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12751  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12752  rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12753 
12754  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12755  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12756  rb_define_method(rb_cSymbol, "length", sym_length, 0);
12757  rb_define_method(rb_cSymbol, "size", sym_length, 0);
12758  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12759  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12760  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12761 
12762  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12763  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12764  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12765  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12766 
12767  rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12768  rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12769 
12770  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12771 }
#define RUBY_ASSERT_ALWAYS(expr,...)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition: assert.h:199
#define RUBY_ASSERT_BUILTIN_TYPE(obj, type)
A variant of RUBY_ASSERT that asserts when either RUBY_DEBUG or built-in type of obj is type.
Definition: assert.h:291
#define RUBY_ASSERT(...)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition: assert.h:219
Atomic operations.
#define RB_LIKELY(x)
Asserts that the given Boolean expression likely holds.
Definition: assume.h:43
#define RB_UNLIKELY(x)
Asserts that the given Boolean expression likely doesn't hold.
Definition: assume.h:50
@ RUBY_ENC_CODERANGE_7BIT
The object holds 0 to 127 inclusive and nothing else.
Definition: coderange.h:39
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition: coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition: ctype.h:395
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition: ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition: ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition: ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition: ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition: sprintf.c:1198
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition: fl_type.h:883
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition: fl_type.h:469
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition: fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition: class.c:1187
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:980
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition: class.c:1095
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:2345
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition: class.c:2166
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition: class.c:2635
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:2142
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition: eval.c:916
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition: class.c:2424
#define TYPE(_)
Old name of rb_type.
Definition: value_type.h:108
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition: encoding.h:106
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition: value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition: fl_type.h:134
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition: fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition: memory.h:399
#define ISSPACE
Old name of rb_isspace.
Definition: ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition: value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition: coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition: coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:137
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition: assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define T_BIGNUM
Old name of RUBY_T_BIGNUM.
Definition: value_type.h:57
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition: value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition: assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition: symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition: coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition: globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition: size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition: fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition: encoding.h:109
#define LONG2FIX
Old name of RB_INT2FIX.
Definition: long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition: ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition: coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition: memory.h:396
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:394
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:517
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition: fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition: fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition: array.h:658
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition: encoding.h:67
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition: long.h:50
#define FL_ANY_RAW
Old name of RB_FL_ANY_RAW.
Definition: fl_type.h:126
#define ISALPHA
Old name of rb_isalpha.
Definition: ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition: encoding.h:518
#define ISASCII
Old name of rb_isascii.
Definition: ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition: ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition: st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition: encoding.h:519
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition: fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition: long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition: util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define ALLOCV_N
Old name of RB_ALLOCV_N.
Definition: memory.h:400
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:516
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition: fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition: double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition: ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition: value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition: encoding.h:68
#define FL_TEST
Old name of RB_FL_TEST.
Definition: fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition: fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition: encoding.h:108
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition: coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition: fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition: int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition: encoding.h:110
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition: symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition: coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition: fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition: fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition: value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition: encoding.h:69
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition: error.c:476
void rb_raise(VALUE exc_class, const char *fmt,...)
Exception entry point.
Definition: error.c:3635
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:676
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition: error.c:3748
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:1089
VALUE rb_eRangeError
RangeError exception.
Definition: error.c:1412
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1408
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition: error.c:3687
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition: error.c:1415
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1406
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1409
VALUE rb_eIndexError
IndexError exception.
Definition: error.c:1410
VALUE rb_ensure(VALUE(*b_proc)(VALUE), VALUE data1, VALUE(*e_proc)(VALUE), VALUE data2)
An equivalent to ensure clause.
Definition: eval.c:1045
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition: error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition: object.c:669
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition: object.c:2093
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
Definition: object.c:2111
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition: object.c:1272
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition: object.c:3479
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:247
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition: object.c:576
VALUE rb_cSymbol
Symbol class.
Definition: string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition: object.c:179
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition: object.c:1260
VALUE rb_mComparable
Comparable module.
Definition: compar.c:19
VALUE rb_cString
String class.
Definition: string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:3188
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition: gc.h:603
Encoding-related APIs.
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1523
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1589
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:197
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1191
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:920
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1022
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1487
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes required to represent the passed code point using the passed encoding.
Definition: encoding.c:1241
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:683
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1149
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1481
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1173
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1469
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1140
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1227
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:638
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1676
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:191
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:704
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:323
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:986
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
Definition: encoding.c:1463
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1062
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:182
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:768
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1475
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:662
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:571
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:643
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:402
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:447
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1179
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:994
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1028
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition: encoding.h:99
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:591
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:417
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:726
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:432
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:619
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1203
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1493
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1537
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1285
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition: string.c:2926
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:900
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition: string.c:1150
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition: string.c:1169
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition: string.c:12519
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition: re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition: string.c:2249
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition: string.c:3610
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1098
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:1068
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition: string.c:1390
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition: string.c:1291
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition: string.c:919
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1163
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition: string.c:12541
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:784
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition: symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2914
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2651
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition: vm_eval.c:1099
VALUE rb_funcallv(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcall(), except it takes the method arguments as a C array.
Definition: vm_eval.c:1058
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition: vm_eval.c:1186
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition: gc.h:479
void rb_gc_register_address(VALUE *valptr)
Inform the garbage collector that the global or static variable pointed by valptr stores a live Ruby ...
Definition: gc.c:2927
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:1008
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:741
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many rooms of objects it should allocate.
Definition: array.c:735
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
Definition: array.c:1378
VALUE rb_ary_freeze(VALUE obj)
Freeze an array, preventing further modifications.
Definition: array.c:642
VALUE rb_ary_new_from_args(long n,...)
Constructs an array from the passed objects.
Definition: array.c:747
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Identical to rb_cstr2inum(), except it takes Ruby's strings instead of C's.
Definition: bignum.c:4308
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition: enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition: enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition: error.h:35
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:284
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1864
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2073
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2893
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Identical to rb_hash_aref(), except it always returns RUBY_Qnil for misshits.
Definition: hash.c:2099
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1475
VALUE rb_rs
The record separator character for inputs, or the $/.
Definition: io.c:205
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition: string.c:669
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition: io.c:207
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition: vm.c:1826
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current process.
Definition: symbol.c:1042
void rb_backref_set(VALUE md)
Updates $~.
Definition: vm.c:1832
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition: range.c:1842
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition: re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition: re.c:4198
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition: re.c:3695
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition: re.c:1489
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition: re.c:1905
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition: string.c:12500
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition: string.c:1677
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition: string.c:1455
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition: string.c:2400
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition: string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition: string.h:939
VALUE rb_utf8_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "UTF-8" encoding.
Definition: string.c:1062
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition: string.c:3675
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition: string.c:1366
VALUE rb_utf8_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "UTF-8" encoding.
Definition: string.c:1092
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition: string.c:12149
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition: string.c:2472
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition: string.c:1342
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1671
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition: string.c:2954
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition: string.c:5267
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition: string.c:4044
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character offsets.
Definition: string.c:3051
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:11448
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition: random.c:1752
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1713
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition: string.c:1132
VALUE rb_str_buf_cat(VALUE, const char *, long)
Just another name of rb_str_cat.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:954
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1461
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1916
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2640
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition: string.c:4030
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition: string.c:3443
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition: string.c:2338
VALUE rb_str_resurrect(VALUE str)
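I guess there is no use case of this function in extension libraries, but this is a routine identical to rb_str_dup(), except it always creates an instance of rb_cString regardless of the given object's class.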
Definition: string.c:1934
VALUE rb_usascii_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "US ASCII" encoding.
Definition: string.c:1056
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition: string.c:6475
VALUE rb_usascii_str_new_cstr(const char *ptr)
Identical to rb_str_new_cstr(), except it generates a string of "US ASCII" encoding.
Definition: string.c:1086
VALUE rb_str_buf_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition: string.c:3059
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition: string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition: string.c:12513
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1372
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition: string.c:3641
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition: string.c:3001
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition: string.c:4146
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3267
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition: string.c:7196
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition: string.c:2692
VALUE rb_str_buf_new_cstr(const char *ptr)
This is a rb_str_buf_new() + rb_str_buf_cat() combo.
Definition: string.c:1659
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition: string.c:12506
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition: string.c:4100
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition: string.c:3917
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition: string.c:4075
#define rb_strlen_lit(str)
Length of a string literal.
Definition: string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition: string.c:3617
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition: string.c:3176
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition: string.c:5777
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:1050
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition: string.c:11506
VALUE rb_str_dup_frozen(VALUE)
Just another name of rb_str_new_frozen.
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition: string.c:1627
VALUE rb_locale_str_new_cstr(const char *ptr)
Identical to rb_locale_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1360
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition: string.c:2850
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition: string.c:3148
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition: string.c:3250
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1074
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3315
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition: string.c:1144
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2648
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:7310
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition: string.c:1354
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1643
VALUE rb_external_str_new_cstr(const char *ptr)
Identical to rb_external_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1348
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition: string.c:2352
VALUE rb_str_cat_cstr(VALUE dst, const char *src)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:3453
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string by the given number of bytes.
Definition: string.c:5695
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition: string.c:9403
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition: string.c:1138
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition: symbol.c:878
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition: string.c:1775
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1871
VALUE rb_ivar_defined(VALUE obj, ID name)
Queries if the instance variable is defined at the object.
Definition: variable.c:1888
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2960
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition: vm_method.c:1291
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:277
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:823
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition: symbol.c:970
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition: string.c:12473
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: string.c:12463
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition: symbol.c:829
VALUE rb_id2str(ID id)
Identical to rb_id2name(), except it returns a frozen Ruby String instead of a C String.
Definition: symbol.c:986
void rb_define_hooked_variable(const char *name, VALUE *var, rb_gvar_getter_t *getter, rb_gvar_setter_t *setter)
Identical to rb_define_virtual_variable(), but can also specify a storage.
Definition: variable.c:719
int capa
Designed capacity of the buffer.
Definition: io.h:11
char * ptr
Pointer to the underlying memory region, of at least capa bytes.
Definition: io.h:2
int off
Offset inside of ptr.
Definition: io.h:5
int len
Length of the buffer.
Definition: io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition: re.c:1844
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition: re.c:3479
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition: re.c:4442
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition: sprintf.c:215
VALUE rb_yield(VALUE val)
Yields the block.
Definition: vm_eval.c:1354
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:367
#define ALLOCA_N(type, n)
Definition: memory.h:287
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition: memory.h:355
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:162
VALUE type(ANYARGS)
ANYARGS-ed function type.
Definition: cxxanyargs.hpp:56
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except for the return type.
Definition: rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition: rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition: rbasic.h:150
#define RBASIC(obj)
Convenient casting macro.
Definition: rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:67
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition: rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition: rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition: string.c:1384
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:442
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:416
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except for the return type.
Definition: rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition: rstring.h:488
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition: string.c:2722
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition: string.c:2827
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition: string.c:2711
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:367
#define RSTRING(obj)
Convenient casting macro.
Definition: rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition: string.c:1378
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition: string.c:1704
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition: rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes a C string instead of a Ruby string.
Definition: load.c:1416
#define errno
Ractor-aware version of errno.
Definition: ruby.h:388
#define RB_NUM2SSIZE
Converts an instance of rb_cInteger into C's ssize_t.
Definition: size_t.h:49
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition: stdarg.h:35
VALUE flags
Per-object flags.
Definition: rbasic.h:75
Ruby's String.
Definition: rstring.h:196
union RString::@52 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition: rstring.h:199
long len
Length of the string, not including terminating NUL character.
Definition: rstring.h:206
struct RString::@52::@54 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition: rstring.h:240
struct RString::@52::@53 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:200
Definition: st.h:79
Definition: string.c:8268
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition: thread.c:298
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static enum ruby_value_type rb_type(VALUE obj)
Identical to RB_BUILTIN_TYPE(), except it can also accept special constants.
Definition: value_type.h:225
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the predicate fails.
Definition: value_type.h:433
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:376
ruby_value_type
C-level type of an object.
Definition: value_type.h:113
void ruby_xfree(void *ptr)
Deallocates a storage instance.
Definition: gc.c:4594