12#include "ruby/internal/config.h"
19#include "internal/encoding.h"
20#include "internal/error.h"
21#include "internal/hash.h"
22#include "internal/imemo.h"
23#include "internal/re.h"
24#include "internal/string.h"
25#include "internal/object.h"
26#include "internal/ractor.h"
27#include "internal/variable.h"
32#include "ractor_core.h"
36typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
37#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
39#define BEG(no) (regs->beg[(no)])
40#define END(no) (regs->end[(no)])
43static const char casetable[] = {
44 '\000',
'\001',
'\002',
'\003',
'\004',
'\005',
'\006',
'\007',
45 '\010',
'\011',
'\012',
'\013',
'\014',
'\015',
'\016',
'\017',
46 '\020',
'\021',
'\022',
'\023',
'\024',
'\025',
'\026',
'\027',
47 '\030',
'\031',
'\032',
'\033',
'\034',
'\035',
'\036',
'\037',
49 '\040',
'\041',
'\042',
'\043',
'\044',
'\045',
'\046',
'\047',
51 '\050',
'\051',
'\052',
'\053',
'\054',
'\055',
'\056',
'\057',
53 '\060',
'\061',
'\062',
'\063',
'\064',
'\065',
'\066',
'\067',
55 '\070',
'\071',
'\072',
'\073',
'\074',
'\075',
'\076',
'\077',
57 '\100',
'\141',
'\142',
'\143',
'\144',
'\145',
'\146',
'\147',
59 '\150',
'\151',
'\152',
'\153',
'\154',
'\155',
'\156',
'\157',
61 '\160',
'\161',
'\162',
'\163',
'\164',
'\165',
'\166',
'\167',
63 '\170',
'\171',
'\172',
'\133',
'\134',
'\135',
'\136',
'\137',
65 '\140',
'\141',
'\142',
'\143',
'\144',
'\145',
'\146',
'\147',
67 '\150',
'\151',
'\152',
'\153',
'\154',
'\155',
'\156',
'\157',
69 '\160',
'\161',
'\162',
'\163',
'\164',
'\165',
'\166',
'\167',
71 '\170',
'\171',
'\172',
'\173',
'\174',
'\175',
'\176',
'\177',
72 '\200',
'\201',
'\202',
'\203',
'\204',
'\205',
'\206',
'\207',
73 '\210',
'\211',
'\212',
'\213',
'\214',
'\215',
'\216',
'\217',
74 '\220',
'\221',
'\222',
'\223',
'\224',
'\225',
'\226',
'\227',
75 '\230',
'\231',
'\232',
'\233',
'\234',
'\235',
'\236',
'\237',
76 '\240',
'\241',
'\242',
'\243',
'\244',
'\245',
'\246',
'\247',
77 '\250',
'\251',
'\252',
'\253',
'\254',
'\255',
'\256',
'\257',
78 '\260',
'\261',
'\262',
'\263',
'\264',
'\265',
'\266',
'\267',
79 '\270',
'\271',
'\272',
'\273',
'\274',
'\275',
'\276',
'\277',
80 '\300',
'\301',
'\302',
'\303',
'\304',
'\305',
'\306',
'\307',
81 '\310',
'\311',
'\312',
'\313',
'\314',
'\315',
'\316',
'\317',
82 '\320',
'\321',
'\322',
'\323',
'\324',
'\325',
'\326',
'\327',
83 '\330',
'\331',
'\332',
'\333',
'\334',
'\335',
'\336',
'\337',
84 '\340',
'\341',
'\342',
'\343',
'\344',
'\345',
'\346',
'\347',
85 '\350',
'\351',
'\352',
'\353',
'\354',
'\355',
'\356',
'\357',
86 '\360',
'\361',
'\362',
'\363',
'\364',
'\365',
'\366',
'\367',
87 '\370',
'\371',
'\372',
'\373',
'\374',
'\375',
'\376',
'\377',
90# error >>> "You lose. You will need a translation table for your character set." <<<
94rb_hrtime_t rb_reg_match_time_limit = 0;
97rb_memcicmp(
const void *x,
const void *y,
long len)
99 const unsigned char *p1 = x, *p2 = y;
103 if ((tmp = casetable[(
unsigned)*p1++] - casetable[(
unsigned)*p2++]))
111rb_memsearch_ss(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
113 const unsigned char *y;
115 if ((y = memmem(ys, n, xs, m)) != NULL)
122rb_memsearch_ss(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
124 const unsigned char *x = xs, *xe = xs + m;
125 const unsigned char *y = ys, *ye = ys + n;
126#define VALUE_MAX ((VALUE)~(VALUE)0)
130 rb_bug(
"!!too long pattern string!!");
132 if (!(y = memchr(y, *x, n - m + 1)))
136 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
156rb_memsearch_qs(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
158 const unsigned char *x = xs, *xe = xs + m;
159 const unsigned char *y = ys;
160 VALUE i, qstable[256];
163 for (i = 0; i < 256; ++i)
166 qstable[*x] = xe - x;
168 for (; y + m <= ys + n; y += *(qstable + y[m])) {
169 if (*xs == *y && memcmp(xs, y, m) == 0)
175static inline unsigned int
176rb_memsearch_qs_utf8_hash(
const unsigned char *x)
178 register const unsigned int mix = 8353;
179 register unsigned int h = *x;
204 return (
unsigned char)h;
208rb_memsearch_qs_utf8(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
210 const unsigned char *x = xs, *xe = xs + m;
211 const unsigned char *y = ys;
212 VALUE i, qstable[512];
215 for (i = 0; i < 512; ++i) {
218 for (; x < xe; ++x) {
219 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
222 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
223 if (*xs == *y && memcmp(xs, y, m) == 0)
230rb_memsearch_with_char_size(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n,
int char_size)
232 const unsigned char *x = xs, x0 = *xs, *y = ys;
234 for (n -= m; n >= 0; n -= char_size, y += char_size) {
235 if (x0 == *y && memcmp(x+1, y+1, m-1) == 0)
242rb_memsearch_wchar(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
244 return rb_memsearch_with_char_size(xs, m, ys, n, 2);
248rb_memsearch_qchar(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
250 return rb_memsearch_with_char_size(xs, m, ys, n, 4);
256 const unsigned char *x = x0, *y = y0;
258 if (m > n)
return -1;
260 return memcmp(x0, y0, m) == 0 ? 0 : -1;
266 const unsigned char *ys = memchr(y, *x, n);
273 else if (LIKELY(rb_enc_mbminlen(enc) == 1)) {
275 return rb_memsearch_ss(x0, m, y0, n);
277 else if (enc == rb_utf8_encoding()){
278 return rb_memsearch_qs_utf8(x0, m, y0, n);
281 else if (LIKELY(rb_enc_mbminlen(enc) == 2)) {
282 return rb_memsearch_wchar(x0, m, y0, n);
284 else if (LIKELY(rb_enc_mbminlen(enc) == 4)) {
285 return rb_memsearch_qchar(x0, m, y0, n);
287 return rb_memsearch_qs(x0, m, y0, n);
290#define REG_ENCODING_NONE FL_USER6
292#define KCODE_FIXED FL_USER4
301 val = ONIG_OPTION_IGNORECASE;
304 val = ONIG_OPTION_EXTEND;
307 val = ONIG_OPTION_MULTILINE;
316enum { OPTBUF_SIZE = 4 };
319option_to_str(
char str[OPTBUF_SIZE],
int options)
322 if (options & ONIG_OPTION_MULTILINE) *p++ =
'm';
323 if (options & ONIG_OPTION_IGNORECASE) *p++ =
'i';
324 if (options & ONIG_OPTION_EXTEND) *p++ =
'x';
336 *kcode = rb_ascii8bit_encindex();
337 return (*option = ARG_ENCODING_NONE);
339 *kcode = ENCINDEX_EUC_JP;
342 *kcode = ENCINDEX_Windows_31J;
345 *kcode = rb_utf8_encindex();
349 return (*option = char_to_option(c));
351 *option = ARG_ENCODING_FIXED;
356rb_reg_check(
VALUE re)
364rb_reg_expr_str(
VALUE str,
const char *s,
long len,
367 const char *p, *pend;
372 p = s; pend = p +
len;
376 c = rb_enc_ascget(p, pend, &clen, enc);
379 p += mbclen(p, pend, enc);
403 int unicode_p = rb_enc_unicode_p(enc);
406 c = rb_enc_ascget(p, pend, &clen, enc);
407 if (c ==
'\\' && p+clen < pend) {
408 int n = clen + mbclen(p+clen, pend, enc);
414 clen = rb_enc_precise_mbclen(p, pend, enc);
416 c = (
unsigned char)*p;
421 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
422 rb_str_buf_cat_escaped_char(str, c, unicode_p);
429 else if (c == term) {
437 else if (!rb_enc_isspace(c, enc)) {
441 snprintf(b,
sizeof(b),
"\\x%02X", c);
457 rb_encoding *resenc = rb_default_internal_encoding();
458 if (resenc == NULL) resenc = rb_default_external_encoding();
460 if (re && rb_enc_asciicompat(enc)) {
461 rb_enc_copy(str, re);
464 rb_enc_associate(str, rb_usascii_encoding());
468 rb_reg_expr_str(str, RSTRING_PTR(src_str), RSTRING_LEN(src_str), enc, resenc,
'/');
473 char opts[OPTBUF_SIZE];
475 if (*option_to_str(opts,
RREGEXP_PTR(re)->options))
477 if (
RBASIC(re)->flags & REG_ENCODING_NONE)
503rb_reg_source(
VALUE re)
524rb_reg_inspect(
VALUE re)
529 return rb_reg_desc(re);
532static VALUE rb_reg_str_with_term(
VALUE re,
int term);
564 return rb_reg_str_with_term(re,
'/');
568rb_reg_str_with_term(
VALUE re,
int term)
571 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
573 char optbuf[OPTBUF_SIZE + 1];
578 rb_enc_copy(str, re);
581 const UChar *ptr = (UChar *)RSTRING_PTR(src_str);
582 long len = RSTRING_LEN(src_str);
584 if (
len >= 4 && ptr[0] ==
'(' && ptr[1] ==
'?') {
587 if ((
len -= 2) > 0) {
589 opt = char_to_option((
int )*ptr);
599 if (
len > 1 && *ptr ==
'-') {
603 opt = char_to_option((
int )*ptr);
618 if (*ptr ==
':' && ptr[
len-1] ==
')') {
625 err = onig_new(&rp, ptr, ptr +
len, options,
626 enc, OnigDefaultSyntax, NULL);
639 if ((options & embeddable) != embeddable) {
641 option_to_str(optbuf + 1, ~options);
646 if (rb_enc_asciicompat(enc)) {
647 rb_reg_expr_str(str, (
char*)ptr,
len, enc, NULL, term);
655 rb_enc_associate(str, rb_usascii_encoding());
659 s = RSTRING_PTR(str);
665 rb_str_resize(str, RSTRING_LEN(str) - n);
667 rb_reg_expr_str(str, (
char*)ptr,
len, enc, NULL, term);
670 rb_enc_copy(str, re);
677NORETURN(
static void rb_reg_raise(
const char *err,
VALUE re));
680rb_reg_raise(
const char *err,
VALUE re)
682 VALUE desc = rb_reg_desc(re);
688rb_enc_reg_error_desc(
const char *s,
long len,
rb_encoding *enc,
int options,
const char *err)
690 char opts[OPTBUF_SIZE + 1];
692 rb_encoding *resenc = rb_default_internal_encoding();
693 if (resenc == NULL) resenc = rb_default_external_encoding();
695 rb_enc_associate(desc, enc);
697 rb_reg_expr_str(desc, s,
len, enc, resenc,
'/');
699 option_to_str(opts + 1, options);
704NORETURN(
static void rb_enc_reg_raise(
const char *s,
long len,
rb_encoding *enc,
int options,
const char *err));
707rb_enc_reg_raise(
const char *s,
long len,
rb_encoding *enc,
int options,
const char *err)
713rb_reg_error_desc(
VALUE str,
int options,
const char *err)
715 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
716 rb_enc_get(str), options, err);
719NORETURN(
static void rb_reg_raise_str(
VALUE str,
int options,
const char *err));
722rb_reg_raise_str(
VALUE str,
int options,
const char *err)
742rb_reg_casefold_p(
VALUE re)
745 return RBOOL(
RREGEXP_PTR(re)->options & ONIG_OPTION_IGNORECASE);
787rb_reg_options_m(
VALUE re)
794reg_names_iter(
const OnigUChar *name,
const OnigUChar *name_end,
795 int back_num,
int *back_refs,
OnigRegex regex,
void *arg)
798 rb_ary_push(ary, rb_enc_str_new((
const char *)name, name_end-name, regex->enc));
816rb_reg_names(
VALUE re)
821 onig_foreach_name(
RREGEXP_PTR(re), reg_names_iter, (
void*)ary);
826reg_named_captures_iter(
const OnigUChar *name,
const OnigUChar *name_end,
827 int back_num,
int *back_refs,
OnigRegex regex,
void *arg)
833 for (i = 0; i < back_num; i++)
836 rb_hash_aset(hash,
rb_str_new((
const char*)name, name_end-name),ary);
860rb_reg_named_captures(
VALUE re)
863 VALUE hash = rb_hash_new_with_size(onig_number_of_names(reg));
864 onig_foreach_name(reg, reg_named_captures_iter, (
void*)hash);
869onig_new_with_source(
regex_t** reg,
const UChar* pattern,
const UChar* pattern_end,
871 OnigErrorInfo* einfo,
const char *sourcefile,
int sourceline)
876 if (IS_NULL(*reg))
return ONIGERR_MEMORY;
878 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
881 r = onig_compile_ruby(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
891make_regexp(
const char *s,
long len,
rb_encoding *enc,
int flags, onig_errmsg_buffer err,
892 const char *sourcefile,
int sourceline)
905 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s +
len), flags,
906 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
908 onig_error_code_to_str((UChar*)err, r, &einfo);
967match_alloc(
VALUE klass)
971 NEWOBJ_OF(match,
struct RMatch, klass, flags, alloc_size, 0);
984 if (to->allocated)
return 0;
987 if (to->allocated)
return 0;
988 return ONIGERR_MEMORY;
997pair_byte_cmp(
const void *pair1,
const void *pair2)
999 long diff = ((
pair_t*)pair1)->byte_pos - ((
pair_t*)pair2)->byte_pos;
1000#if SIZEOF_LONG > SIZEOF_INT
1001 return diff ? diff > 0 ? 1 : -1 : 0;
1008update_char_offset(
VALUE match)
1012 int i, num_regs, num_pos;
1022 num_regs = rm->
regs.num_regs;
1029 enc = rb_enc_get(
RMATCH(match)->str);
1031 for (i = 0; i < num_regs; i++) {
1040 for (i = 0; i < num_regs; i++) {
1043 pairs[num_pos++].byte_pos = BEG(i);
1044 pairs[num_pos++].byte_pos = END(i);
1046 qsort(pairs, num_pos,
sizeof(
pair_t), pair_byte_cmp);
1048 s = p = RSTRING_PTR(
RMATCH(match)->str);
1050 for (i = 0; i < num_pos; i++) {
1051 q = s + pairs[i].byte_pos;
1053 pairs[i].char_pos = c;
1057 for (i = 0; i < num_regs; i++) {
1065 key.byte_pos = BEG(i);
1066 found = bsearch(&key, pairs, num_pos,
sizeof(
pair_t), pair_byte_cmp);
1069 key.byte_pos = END(i);
1070 found = bsearch(&key, pairs, num_pos,
sizeof(
pair_t), pair_byte_cmp);
1076match_check(
VALUE match)
1078 if (!
RMATCH(match)->regexp) {
1095 rm = RMATCH_EXT(obj);
1099 if (RMATCH_EXT(orig)->char_offset_num_allocated) {
1125match_regexp(
VALUE match)
1129 regexp =
RMATCH(match)->regexp;
1130 if (
NIL_P(regexp)) {
1160match_names(
VALUE match)
1165 return rb_reg_names(
RMATCH(match)->regexp);
1181match_size(
VALUE match)
1187static int name_to_backref_number(
struct re_registers *,
VALUE,
const char*,
const char*);
1188NORETURN(
static void name_to_backref_error(
VALUE name));
1191name_to_backref_error(
VALUE name)
1193 rb_raise(
rb_eIndexError,
"undefined group name reference: % "PRIsVALUE,
1200 if (i < 0 || regs->num_regs <= i)
1205match_backref_number(
VALUE match,
VALUE backref)
1222 num = name_to_backref_number(regs, regexp, name, name + RSTRING_LEN(backref));
1225 name_to_backref_error(backref);
1234 return match_backref_number(match, backref);
1249 int i = match_backref_number(match, n);
1253 backref_number_check(regs, i);
1258 update_char_offset(match);
1260 LONG2NUM(RMATCH_EXT(match)->char_offset[i].end));
1284 int i = match_backref_number(match, n);
1288 backref_number_check(regs, i);
1308 int i = match_backref_number(match, n);
1312 backref_number_check(regs, i);
1332 int i = match_backref_number(match, n);
1336 backref_number_check(regs, i);
1356 int i = match_backref_number(match, n);
1360 backref_number_check(regs, i);
1365 update_char_offset(match);
1366 return LONG2NUM(RMATCH_EXT(match)->char_offset[i].beg);
1382 int i = match_backref_number(match, n);
1386 backref_number_check(regs, i);
1391 update_char_offset(match);
1392 return LONG2NUM(RMATCH_EXT(match)->char_offset[i].end);
1424 int i = match_backref_number(match, n);
1427 backref_number_check(regs, i);
1429 long start = BEG(i), end = END(i);
1468 int i = match_backref_number(match, n);
1472 backref_number_check(regs, i);
1477 update_char_offset(match);
1479 &RMATCH_EXT(match)->char_offset[i];
1483#define MATCH_BUSY FL_USER2
1488 FL_SET(match, MATCH_BUSY);
1492rb_match_unbusy(
VALUE match)
1498rb_match_count(
VALUE match)
1501 if (
NIL_P(match))
return -1;
1503 if (!regs)
return -1;
1504 return regs->num_regs;
1515 int err = onig_region_resize(&rmatch->
regs, 1);
1516 if (err) rb_memerror();
1517 rmatch->
regs.beg[0] = pos;
1518 rmatch->
regs.end[0] = pos +
len;
1522rb_backref_set_string(
VALUE string,
long pos,
long len)
1528 match_set_string(match,
string, pos,
len);
1563rb_reg_fixed_encoding_p(
VALUE re)
1565 return RBOOL(
FL_TEST(re, KCODE_FIXED));
1569rb_reg_preprocess(
const char *p,
const char *end,
rb_encoding *enc,
1570 rb_encoding **fixed_enc, onig_errmsg_buffer err,
int options);
1578 "incompatible encoding regexp match (%s regexp with %s string)",
1579 rb_enc_inspect_name(rb_enc_get(re)),
1580 rb_enc_inspect_name(rb_enc_get(
str)));
1597 int cr = str_coderange(
str);
1600 rb_raise(rb_eArgError,
1601 "invalid byte sequence in %s",
1602 rb_enc_name(rb_enc_get(
str)));
1606 enc = rb_enc_get(
str);
1613 else if (!rb_enc_asciicompat(enc)) {
1614 reg_enc_error(re,
str);
1616 else if (rb_reg_fixed_encoding_p(re)) {
1619 reg_enc_error(re,
str);
1623 else if (warn && (
RBASIC(re)->flags & REG_ENCODING_NONE) &&
1624 enc != rb_ascii8bit_encoding() &&
1626 rb_warn(
"historical binary regexp match /.../n against %s string",
1642 if (reg->enc == enc)
return reg;
1647 const char *pattern = RSTRING_PTR(src_str);
1649 onig_errmsg_buffer err =
"";
1650 unescaped = rb_reg_preprocess(
1651 pattern, pattern + RSTRING_LEN(src_str), enc,
1652 &fixed_enc, err, 0);
1654 if (
NIL_P(unescaped)) {
1655 rb_raise(rb_eArgError,
"regexp preprocess failed: %s", err);
1659 rb_hrtime_t timelimit = reg->timelimit;
1666 if (ruby_single_main_ractor &&
RREGEXP(re)->usecnt == 0) {
1668 r = onig_new_without_alloc(&tmp_reg, (UChar *)ptr, (UChar *)(ptr +
len),
1670 OnigDefaultSyntax, &einfo);
1674 onig_free_body(&tmp_reg);
1677 onig_free_body(reg);
1683 r = onig_new(®, (UChar *)ptr, (UChar *)(ptr +
len),
1685 OnigDefaultSyntax, &einfo);
1689 onig_error_code_to_str((UChar*)err, r, &einfo);
1690 rb_reg_raise(err, re);
1693 reg->timelimit = timelimit;
1708 if (!tmpreg)
RREGEXP(re)->usecnt++;
1710 OnigPosition result = match(reg,
str, regs, args);
1712 if (!tmpreg)
RREGEXP(re)->usecnt--;
1718 onig_region_free(regs, 0);
1723 case ONIGERR_TIMEOUT:
1724 rb_raise(rb_eRegexpTimeoutError,
"regexp match timeout");
1726 onig_errmsg_buffer err =
"";
1727 onig_error_code_to_str((UChar*)err, (
int)result);
1728 rb_reg_raise(err, re);
1743 enc = rb_reg_prepare_enc(re,
str, 0);
1749 range = RSTRING_LEN(
str) - pos;
1752 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(
str)) {
1753 string = (UChar*)RSTRING_PTR(
str);
1756 p = onigenc_get_right_adjust_char_head(enc,
string,
string + pos,
string + RSTRING_LEN(
str));
1759 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,
string,
string + pos,
string + RSTRING_LEN(
str));
1783 (UChar *)(ptr +
len),
1784 (UChar *)(ptr + args->pos),
1785 (UChar *)(ptr + args->range),
1792rb_reg_search_set_match(
VALUE re,
VALUE str,
long pos,
int reverse,
int set_backref_str,
VALUE *set_match)
1794 long len = RSTRING_LEN(str);
1795 if (pos >
len || pos < 0) {
1802 .range = reverse ? 0 :
len,
1806 OnigPosition result =
rb_reg_onig_match(re, str, reg_onig_search, &args, ®s);
1808 if (result == ONIG_MISMATCH) {
1810 return ONIG_MISMATCH;
1830 onig_region_free(&RMATCH_EXT(match)->regs,
false);
1836 if (set_backref_str) {
1850 if (set_match) *set_match = match;
1856rb_reg_search0(
VALUE re,
VALUE str,
long pos,
int reverse,
int set_backref_str,
VALUE *match)
1858 return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, match);
1864 return rb_reg_search_set_match(re, str, pos, reverse, 1, NULL);
1877 (UChar *)(ptr +
len),
1912 if (nth >= regs->num_regs) {
1916 nth += regs->num_regs;
1917 if (nth <= 0)
return Qnil;
1919 return RBOOL(BEG(nth) != -1);
1926 long start, end,
len;
1932 if (nth >= regs->num_regs) {
1936 nth += regs->num_regs;
1937 if (nth <= 0)
return Qnil;
1940 if (start == -1)
return Qnil;
1980 if (BEG(0) == -1)
return Qnil;
2014 if (BEG(0) == -1)
return Qnil;
2015 str =
RMATCH(match)->str;
2022match_last_index(
VALUE match)
2027 if (
NIL_P(match))
return -1;
2030 if (BEG(0) == -1)
return -1;
2032 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
2040 int i = match_last_index(match);
2041 if (i <= 0)
return Qnil;
2047rb_reg_last_defined(
VALUE match)
2049 int i = match_last_index(match);
2050 if (i < 0)
return Qnil;
2055last_match_getter(
ID _x,
VALUE *_y)
2061prematch_getter(
ID _x,
VALUE *_y)
2067postmatch_getter(
ID _x,
VALUE *_y)
2073last_paren_match_getter(
ID _x,
VALUE *_y)
2079match_array(
VALUE match,
int start)
2089 target =
RMATCH(match)->str;
2091 for (i=start; i<regs->num_regs; i++) {
2092 if (regs->beg[i] == -1) {
2119match_to_a(
VALUE match)
2121 return match_array(match, 0);
2141match_captures(
VALUE match)
2143 return match_array(match, 1);
2147name_to_backref_number(
struct re_registers *regs,
VALUE regexp,
const char* name,
const char* name_end)
2149 if (
NIL_P(regexp))
return -1;
2150 return onig_name_to_backref_number(
RREGEXP_PTR(regexp),
2151 (
const unsigned char *)name, (
const unsigned char *)name_end, regs);
2154#define NAME_TO_NUMBER(regs, re, name, name_ptr, name_end) \
2156 !rb_enc_compatible(RREGEXP_SRC(re), (name)) ? 0 : \
2157 name_to_backref_number((regs), (re), (name_ptr), (name_end)))
2170 num = NAME_TO_NUMBER(regs, re, name,
2173 name_to_backref_error(name);
2179match_ary_subseq(
VALUE match,
long beg,
long len,
VALUE result)
2182 long j, end = olen < beg+
len ? olen : beg+
len;
2184 if (
len == 0)
return result;
2186 for (j = beg; j < end; j++) {
2189 if (beg +
len > j) {
2210 return match_ary_subseq(match, beg,
len, result);
2253match_aref(
int argc,
VALUE *argv,
VALUE match)
2260 if (
NIL_P(length)) {
2265 int num = namev_to_backref_number(
RMATCH_REGS(match),
RMATCH(match)->regexp, idx);
2270 return match_ary_aref(match, idx,
Qnil);
2283 if (beg < 0)
return Qnil;
2285 else if (beg > num_regs) {
2288 if (beg+
len > num_regs) {
2289 len = num_regs - beg;
2291 return match_ary_subseq(match, beg,
len,
Qnil);
2322match_values_at(
int argc,
VALUE *argv,
VALUE match)
2330 for (i=0; i<argc; i++) {
2335 int num = namev_to_backref_number(
RMATCH_REGS(match),
RMATCH(match)->regexp, argv[i]);
2340 match_ary_aref(match, argv[i], result);
2367match_to_s(
VALUE match)
2376match_named_captures_iter(
const OnigUChar *name,
const OnigUChar *name_end,
2377 int back_num,
int *back_refs,
OnigRegex regex,
void *arg)
2379 struct MEMO *memo = MEMO_CAST(arg);
2380 VALUE hash = memo->v1;
2381 VALUE match = memo->v2;
2382 long symbolize = memo->u3.state;
2384 VALUE key = rb_enc_str_new((
const char *)name, name_end-name, regex->enc);
2386 if (symbolize > 0) {
2395 for (i = 0; i < back_num; i++) {
2398 rb_hash_aset(hash, key, value);
2404 rb_hash_aset(hash, key,
Qnil);
2443match_named_captures(
int argc,
VALUE *argv,
VALUE match)
2450 return rb_hash_new();
2453 VALUE symbolize_names = 0;
2458 static ID keyword_ids[1];
2460 VALUE symbolize_names_val;
2462 if (!keyword_ids[0]) {
2465 rb_get_kwargs(opt, keyword_ids, 0, 1, &symbolize_names_val);
2466 if (!UNDEF_P(symbolize_names_val) &&
RTEST(symbolize_names_val)) {
2467 symbolize_names = 1;
2471 hash = rb_hash_new();
2472 memo = rb_imemo_memo_new(hash, match, symbolize_names);
2474 onig_foreach_name(
RREGEXP(
RMATCH(match)->regexp)->ptr, match_named_captures_iter, (
void*)memo);
2496match_deconstruct_keys(
VALUE match,
VALUE keys)
2504 return rb_hash_new_with_size(0);
2508 h = rb_hash_new_with_size(onig_number_of_names(
RREGEXP_PTR(
RMATCH(match)->regexp)));
2511 memo = rb_imemo_memo_new(h, match, 1);
2513 onig_foreach_name(
RREGEXP_PTR(
RMATCH(match)->regexp), match_named_captures_iter, (
void*)memo);
2521 return rb_hash_new_with_size(0);
2562match_string(
VALUE match)
2565 return RMATCH(match)->str;
2574match_inspect_name_iter(
const OnigUChar *name,
const OnigUChar *name_end,
2575 int back_num,
int *back_refs,
OnigRegex regex,
void *arg0)
2580 for (i = 0; i < back_num; i++) {
2581 arg[back_refs[i]].name = name;
2582 arg[back_refs[i]].len = name_end - name;
2609match_inspect(
VALUE match)
2615 int num_regs = regs->num_regs;
2620 return rb_sprintf(
"#<%"PRIsVALUE
":%p>", cname, (
void*)match);
2622 else if (
NIL_P(regexp)) {
2623 return rb_sprintf(
"#<%"PRIsVALUE
": %"PRIsVALUE
">",
2631 match_inspect_name_iter, names);
2636 for (i = 0; i < num_regs; i++) {
2643 rb_str_catf(str,
"%d", i);
2661read_escaped_byte(
const char **pp,
const char *end, onig_errmsg_buffer err)
2663 const char *p = *pp;
2665 int meta_prefix = 0, ctrl_prefix = 0;
2668 if (p == end || *p++ !=
'\\') {
2669 errcpy(err,
"too short escaped multibyte character");
2675 errcpy(err,
"too short escape sequence");
2679 case '\\': code =
'\\';
break;
2680 case 'n': code =
'\n';
break;
2681 case 't': code =
'\t';
break;
2682 case 'r': code =
'\r';
break;
2683 case 'f': code =
'\f';
break;
2684 case 'v': code =
'\013';
break;
2685 case 'a': code =
'\007';
break;
2686 case 'e': code =
'\033';
break;
2689 case '0':
case '1':
case '2':
case '3':
2690 case '4':
case '5':
case '6':
case '7':
2699 errcpy(err,
"invalid hex escape");
2707 errcpy(err,
"duplicate meta escape");
2711 if (p+1 < end && *p++ ==
'-' && (*p & 0x80) == 0) {
2721 errcpy(err,
"too short meta escape");
2725 if (p == end || *p++ !=
'-') {
2726 errcpy(err,
"too short control escape");
2731 errcpy(err,
"duplicate control escape");
2735 if (p < end && (*p & 0x80) == 0) {
2745 errcpy(err,
"too short control escape");
2749 errcpy(err,
"unexpected escape sequence");
2752 if (code < 0 || 0xff < code) {
2753 errcpy(err,
"invalid escape code");
2767unescape_escaped_nonascii(
const char **pp,
const char *end,
rb_encoding *enc,
2770 const char *p = *pp;
2772 unsigned char *area =
ALLOCA_N(
unsigned char, chmaxlen);
2773 char *chbuf = (
char *)area;
2778 memset(chbuf, 0, chmaxlen);
2780 byte = read_escaped_byte(&p, end, err);
2785 area[chlen++] = byte;
2786 while (chlen < chmaxlen &&
2788 byte = read_escaped_byte(&p, end, err);
2792 area[chlen++] = byte;
2795 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
2797 errcpy(err,
"invalid multibyte escape");
2800 if (1 < chlen || (area[0] & 0x80)) {
2805 else if (*encp != enc) {
2806 errcpy(err,
"escaped non ASCII character in UTF-8 regexp");
2812 snprintf(escbuf,
sizeof(escbuf),
"\\x%02X", area[0]&0xff);
2820check_unicode_range(
unsigned long code, onig_errmsg_buffer err)
2822 if ((0xd800 <= code && code <= 0xdfff) ||
2824 errcpy(err,
"invalid Unicode range");
2831append_utf8(
unsigned long uv,
2834 if (check_unicode_range(uv, err) != 0)
2838 snprintf(escbuf,
sizeof(escbuf),
"\\x%02X", (
int)uv);
2848 *encp = rb_utf8_encoding();
2849 else if (*encp != rb_utf8_encoding()) {
2850 errcpy(err,
"UTF-8 character in non UTF-8 regexp");
2858unescape_unicode_list(
const char **pp,
const char *end,
2861 const char *p = *pp;
2862 int has_unicode = 0;
2866 while (p < end &&
ISSPACE(*p)) p++;
2869 code = ruby_scan_hex(p, end-p, &
len);
2873 errcpy(err,
"invalid Unicode range");
2877 if (append_utf8(code, buf, encp, err) != 0)
2881 while (p < end &&
ISSPACE(*p)) p++;
2884 if (has_unicode == 0) {
2885 errcpy(err,
"invalid Unicode list");
2895unescape_unicode_bmp(
const char **pp,
const char *end,
2898 const char *p = *pp;
2903 errcpy(err,
"invalid Unicode escape");
2906 code = ruby_scan_hex(p, 4, &
len);
2908 errcpy(err,
"invalid Unicode escape");
2911 if (append_utf8(code, buf, encp, err) != 0)
2918unescape_nonascii0(
const char **pp,
const char *end,
rb_encoding *enc,
2920 onig_errmsg_buffer err,
int options,
int recurse)
2922 const char *p = *pp;
2925 int in_char_class = 0;
2927 int extended_mode = options & ONIG_OPTION_EXTEND;
2931 int chlen = rb_enc_precise_mbclen(p, end, enc);
2934 errcpy(err,
"invalid multibyte character");
2938 if (1 < chlen || (*p & 0x80)) {
2944 else if (*encp != enc) {
2945 errcpy(err,
"non ASCII character in UTF-8 regexp");
2954 errcpy(err,
"too short escape sequence");
2957 chlen = rb_enc_precise_mbclen(p, end, enc);
2959 goto invalid_multibyte;
2968 case '1':
case '2':
case '3':
2969 case '4':
case '5':
case '6':
case '7':
2971 size_t len = end-(p-1), octlen;
2972 if (ruby_scan_oct(p-1,
len < 3 ?
len : 3, &octlen) <= 0177) {
2988 if (rb_is_usascii_enc(enc)) {
2989 const char *pbeg = p;
2990 int byte = read_escaped_byte(&p, end, err);
2991 if (
byte == -1)
return -1;
2996 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
3003 errcpy(err,
"too short escape sequence");
3009 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
3011 if (p == end || *p++ !=
'}') {
3012 errcpy(err,
"invalid Unicode list");
3019 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
3041 if (extended_mode && !in_char_class) {
3043 while ((p < end) && ((c = *p++) !=
'\n')) {
3044 if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) {
3057 if (in_char_class) {
3064 if (!in_char_class && recurse) {
3065 if (--parens == 0) {
3072 if (!in_char_class && p + 1 < end && *p ==
'?') {
3073 if (*(p+1) ==
'#') {
3075 const char *orig_p = p;
3078 while (cont && (p < end)) {
3081 if (!(c & 0x80))
break;
3082 if (!*encp && enc == rb_utf8_encoding()) {
3088 chlen = rb_enc_precise_mbclen(p, end, enc);
3090 goto invalid_multibyte;
3111 int local_extend = 0;
3118 for (s = p+1; s < end; s++) {
3121 local_extend = invert ? -1 : 1;
3128 if (local_extend == 0 ||
3129 (local_extend == -1 && !extended_mode) ||
3130 (local_extend == 1 && extended_mode)) {
3137 int local_options = options;
3138 if (local_extend == 1) {
3139 local_options |= ONIG_OPTION_EXTEND;
3142 local_options &= ~ONIG_OPTION_EXTEND;
3146 int ret = unescape_nonascii0(&p, end, enc, buf, encp,
3149 if (ret < 0)
return ret;
3154 extended_mode = local_extend == 1;
3171 else if (!in_char_class && recurse) {
3189unescape_nonascii(
const char *p,
const char *end,
rb_encoding *enc,
3191 onig_errmsg_buffer err,
int options)
3193 return unescape_nonascii0(&p, end, enc, buf, encp, has_property,
3198rb_reg_preprocess(
const char *p,
const char *end,
rb_encoding *enc,
3199 rb_encoding **fixed_enc, onig_errmsg_buffer err,
int options)
3202 int has_property = 0;
3206 if (rb_enc_asciicompat(enc))
3210 rb_enc_associate(buf, enc);
3213 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err, options) != 0)
3216 if (has_property && !*fixed_enc) {
3221 rb_enc_associate(buf, *fixed_enc);
3228rb_reg_check_preprocess(
VALUE str)
3231 onig_errmsg_buffer err =
"";
3237 p = RSTRING_PTR(str);
3238 end = p + RSTRING_LEN(str);
3239 enc = rb_enc_get(str);
3241 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err, 0);
3245 return rb_reg_error_desc(str, 0, err);
3251rb_reg_preprocess_dregexp(
VALUE ary,
int options)
3255 onig_errmsg_buffer err =
"";
3261 rb_raise(rb_eArgError,
"no arguments given");
3270 src_enc = rb_enc_get(str);
3271 if (options & ARG_ENCODING_NONE &&
3272 src_enc != ascii8bit) {
3274 rb_raise(
rb_eRegexpError,
"/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
3276 src_enc = ascii8bit;
3280 p = RSTRING_PTR(str);
3281 end = p + RSTRING_LEN(str);
3283 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err, options);
3286 rb_raise(rb_eArgError,
"%s", err);
3288 if (fixed_enc != 0) {
3289 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
3290 rb_raise(
rb_eRegexpError,
"encoding mismatch in dynamic regexp : %s and %s",
3291 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
3293 regexp_enc = fixed_enc;
3302 rb_enc_associate(result, regexp_enc);
3309rb_reg_initialize_check(
VALUE obj)
3311 rb_check_frozen(obj);
3319 int options, onig_errmsg_buffer err,
3320 const char *sourcefile,
int sourceline)
3327 rb_reg_initialize_check(obj);
3329 if (rb_enc_dummy_p(enc)) {
3330 errcpy(err,
"can't make regexp with dummy encoding");
3334 unescaped = rb_reg_preprocess(s, s+
len, enc, &fixed_enc, err, options);
3335 if (
NIL_P(unescaped))
3339 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
3340 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
3341 errcpy(err,
"incompatible character encoding");
3344 if (fixed_enc != a_enc) {
3345 options |= ARG_ENCODING_FIXED;
3349 else if (!(options & ARG_ENCODING_FIXED)) {
3350 enc = rb_usascii_encoding();
3353 rb_enc_associate((
VALUE)re, enc);
3354 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
3357 if (options & ARG_ENCODING_NONE) {
3361 re->
ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
3362 options & ARG_REG_OPTION_MASK, err,
3363 sourcefile, sourceline);
3364 if (!re->
ptr)
return -1;
3374 if (regenc != enc) {
3376 str = rb_enc_associate(dup, enc = regenc);
3378 str = rb_fstring(str);
3383rb_reg_initialize_str(
VALUE obj,
VALUE str,
int options, onig_errmsg_buffer err,
3384 const char *sourcefile,
int sourceline)
3387 rb_encoding *str_enc = rb_enc_get(str), *enc = str_enc;
3388 if (options & ARG_ENCODING_NONE) {
3390 if (enc != ascii8bit) {
3392 errcpy(err,
"/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
3398 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
3399 options, err, sourcefile, sourceline);
3400 if (ret == 0) reg_set_source(obj, str, str_enc);
3405rb_reg_s_alloc(
VALUE klass)
3425 return rb_reg_init_str(rb_reg_alloc(), s, options);
3429rb_reg_init_str(
VALUE re,
VALUE s,
int options)
3431 onig_errmsg_buffer err =
"";
3433 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
3434 rb_reg_raise_str(s, options, err);
3443 onig_errmsg_buffer err =
"";
3445 if (rb_reg_initialize(re, RSTRING_PTR(s), RSTRING_LEN(s),
3446 enc, options, err, NULL, 0) != 0) {
3447 rb_reg_raise_str(s, options, err);
3449 reg_set_source(re, s, enc);
3455rb_reg_new_ary(
VALUE ary,
int opt)
3465 VALUE re = rb_reg_alloc();
3466 onig_errmsg_buffer err =
"";
3468 if (rb_reg_initialize(re, s,
len, enc, options, err, NULL, 0) != 0) {
3469 rb_enc_reg_raise(s,
len, enc, options, err);
3483rb_reg_compile(
VALUE str,
int options,
const char *sourcefile,
int sourceline)
3485 VALUE re = rb_reg_alloc();
3486 onig_errmsg_buffer err =
"";
3489 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
3490 rb_set_errinfo(rb_reg_error_desc(str, options, err));
3497static VALUE reg_cache;
3502 if (rb_ractor_main_p()) {
3505 && memcmp(
RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
3515static st_index_t reg_hash(
VALUE re);
3527rb_reg_hash(
VALUE re)
3529 st_index_t hashval = reg_hash(re);
3562 if (re1 == re2)
return Qtrue;
3564 rb_reg_check(re1); rb_reg_check(re2);
3584match_hash(
VALUE match)
3591 hashval =
rb_hash_uint(hashval, reg_hash(match_regexp(match)));
3614 if (match1 == match2)
return Qtrue;
3618 if (!rb_reg_equal(match_regexp(match1), match_regexp(match2)))
return Qfalse;
3621 if (regs1->num_regs != regs2->num_regs)
return Qfalse;
3622 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs *
sizeof(*regs1->beg)))
return Qfalse;
3623 if (memcmp(regs1->end, regs2->end, regs1->num_regs *
sizeof(*regs1->end)))
return Qfalse;
3628reg_operand(
VALUE s,
int check)
3650 *strp = str = reg_operand(str, TRUE);
3661 return rb_reg_search_set_match(re, str, pos, 0, 1, set_match);
3723 long pos = reg_match_pos(re, &str, 0, NULL);
3724 if (pos < 0)
return Qnil;
3754 str = reg_operand(str, FALSE);
3760 return RBOOL(start >= 0);
3837rb_reg_match_m(
int argc,
VALUE *argv,
VALUE re)
3842 if (
rb_scan_args(argc, argv,
"11", &str, &initpos) == 2) {
3849 pos = reg_match_pos(re, &str, pos, &result);
3878rb_reg_match_m_p(
int argc,
VALUE *argv,
VALUE re)
3881 return rb_reg_match_p(re, argv[0], pos);
3892 if (pos < 0)
return Qfalse;
3898 pos = beg - RSTRING_PTR(str);
3904 .range = RSTRING_LEN(str),
3917str_to_option(
VALUE str)
3923 if (
NIL_P(str))
return -1;
3925 for (
long i = 0; i <
len; ++i) {
3926 int f = char_to_option(ptr[i]);
3928 rb_raise(rb_eArgError,
"unknown regexp option: %"PRIsVALUE, str);
3936set_timeout(rb_hrtime_t *hrt,
VALUE timeout)
3938 double timeout_d =
NIL_P(timeout) ? 0.0 :
NUM2DBL(timeout);
3939 if (!
NIL_P(timeout) && timeout_d <= 0) {
3940 rb_raise(rb_eArgError,
"invalid timeout: %"PRIsVALUE, timeout);
3942 double2hrtime(hrt, timeout_d);
3951 rb_reg_initialize_check(copy);
3952 if ((r = onig_reg_copy(&re,
RREGEXP_PTR(orig))) != 0) {
3959 rb_enc_copy(copy, orig);
4030rb_reg_initialize_m(
int argc,
VALUE *argv,
VALUE self)
4033 VALUE re = reg_extract_args(argc, argv, &args);
4042 set_timeout(&
RREGEXP_PTR(self)->timelimit, args.timeout);
4057 args->timeout =
Qnil;
4058 if (!
NIL_P(kwargs)) {
4059 static ID keywords[1];
4080 else if ((f = str_to_option(opts)) >= 0) flags = f;
4081 else if (rb_bool_expected(opts,
"ignorecase", FALSE))
4082 flags = ONIG_OPTION_IGNORECASE;
4088 args->flags = flags;
4095 if (enc && rb_enc_get(str) != enc)
4096 rb_reg_init_str_enc(self, str, enc, flags);
4098 rb_reg_init_str(self, str, flags);
4111 s = RSTRING_PTR(str);
4112 send = s + RSTRING_LEN(str);
4114 c = rb_enc_ascget(s, send, &clen, enc);
4116 s += mbclen(s, send, enc);
4120 case '[':
case ']':
case '{':
case '}':
4121 case '(':
case ')':
case '|':
case '-':
4122 case '*':
case '.':
case '\\':
4123 case '?':
case '+':
case '^':
case '$':
4125 case '\t':
case '\f':
case '\v':
case '\n':
case '\r':
4132 rb_enc_associate(tmp, rb_usascii_encoding());
4139 rb_enc_associate(tmp, rb_usascii_encoding());
4142 rb_enc_copy(tmp, str);
4144 t = RSTRING_PTR(tmp);
4146 const char *p = RSTRING_PTR(str);
4147 memcpy(t, p, s - p);
4151 c = rb_enc_ascget(s, send, &clen, enc);
4153 int n = mbclen(s, send, enc);
4161 case '[':
case ']':
case '{':
case '}':
4162 case '(':
case ')':
case '|':
case '-':
4163 case '*':
case '.':
case '\\':
4164 case '?':
case '+':
case '^':
case '$':
4166 t += rb_enc_mbcput(
'\\', t, enc);
4169 t += rb_enc_mbcput(
'\\', t, enc);
4170 t += rb_enc_mbcput(
' ', t, enc);
4173 t += rb_enc_mbcput(
'\\', t, enc);
4174 t += rb_enc_mbcput(
't', t, enc);
4177 t += rb_enc_mbcput(
'\\', t, enc);
4178 t += rb_enc_mbcput(
'n', t, enc);
4181 t += rb_enc_mbcput(
'\\', t, enc);
4182 t += rb_enc_mbcput(
'r', t, enc);
4185 t += rb_enc_mbcput(
'\\', t, enc);
4186 t += rb_enc_mbcput(
'f', t, enc);
4189 t += rb_enc_mbcput(
'\\', t, enc);
4190 t += rb_enc_mbcput(
'v', t, enc);
4193 t += rb_enc_mbcput(c, t, enc);
4195 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
4228 options =
RREGEXP_PTR(re)->options & ARG_REG_OPTION_MASK;
4229 if (
RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
4230 if (
RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
4235rb_check_regexp_type(
VALUE re)
4261 return rb_check_regexp_type(re);
4274 else if (argc == 1) {
4276 VALUE re = rb_check_regexp_type(arg);
4281 quoted = rb_reg_s_quote(
Qnil, arg);
4290 int has_asciionly = 0;
4294 for (i = 0; i < argc; i++) {
4301 v = rb_check_regexp_type(e);
4304 if (!rb_enc_asciicompat(enc)) {
4305 if (!has_ascii_incompat)
4306 has_ascii_incompat = enc;
4307 else if (has_ascii_incompat != enc)
4308 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4309 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
4311 else if (rb_reg_fixed_encoding_p(v)) {
4312 if (!has_ascii_compat_fixed)
4313 has_ascii_compat_fixed = enc;
4314 else if (has_ascii_compat_fixed != enc)
4315 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4316 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
4321 v = rb_reg_str_with_term(v, -1);
4326 enc = rb_enc_get(e);
4327 if (!rb_enc_asciicompat(enc)) {
4328 if (!has_ascii_incompat)
4329 has_ascii_incompat = enc;
4330 else if (has_ascii_incompat != enc)
4331 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4332 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
4338 if (!has_ascii_compat_fixed)
4339 has_ascii_compat_fixed = enc;
4340 else if (has_ascii_compat_fixed != enc)
4341 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4342 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
4344 v = rb_reg_s_quote(
Qnil, e);
4346 if (has_ascii_incompat) {
4347 if (has_asciionly) {
4348 rb_raise(rb_eArgError,
"ASCII incompatible encoding: %s",
4349 rb_enc_name(has_ascii_incompat));
4351 if (has_ascii_compat_fixed) {
4352 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4353 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
4358 rb_enc_copy(source, v);
4363 if (has_ascii_incompat) {
4364 result_enc = has_ascii_incompat;
4366 else if (has_ascii_compat_fixed) {
4367 result_enc = has_ascii_compat_fixed;
4370 result_enc = rb_ascii8bit_encoding();
4373 rb_enc_associate(source, result_enc);
4418 return rb_reg_s_union(self, v);
4420 return rb_reg_s_union(self, args);
4445rb_reg_s_linear_time_p(
int argc,
VALUE *argv,
VALUE self)
4448 VALUE re = reg_extract_args(argc, argv, &args);
4451 re =
reg_init_args(rb_reg_alloc(), args.str, args.enc, args.flags);
4454 return RBOOL(onig_check_linear_time(
RREGEXP_PTR(re)));
4463 return reg_copy(copy, re);
4474 int acompat = rb_enc_asciicompat(str_enc);
4476#define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
4483 int c = ASCGET(s, e, &clen);
4487 s += mbclen(s, e, str_enc);
4493 if (c !=
'\\' || s == e)
continue;
4500 c = ASCGET(s, e, &clen);
4502 s += mbclen(s, e, str_enc);
4511 case '1':
case '2':
case '3':
case '4':
4512 case '5':
case '6':
case '7':
case '8':
case '9':
4513 if (!
NIL_P(regexp) && onig_noname_group_capture_is_active(
RREGEXP_PTR(regexp))) {
4522 if (s < e && ASCGET(s, e, &clen) ==
'<') {
4523 char *name, *name_end;
4525 name_end = name = s + clen;
4526 while (name_end < e) {
4527 c = ASCGET(name_end, e, &clen);
4528 if (c ==
'>')
break;
4529 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
4533 (
long)(name_end - name));
4534 if ((no = NAME_TO_NUMBER(regs, regexp, n, name, name_end)) < 1) {
4535 name_to_backref_error(n);
4537 p = s = name_end + clen;
4562 no = regs->num_regs-1;
4563 while (BEG(no) == -1 && no > 0) no--;
4564 if (no == 0)
continue;
4577 if (no >= regs->num_regs)
continue;
4578 if (BEG(no) == -1)
continue;
4583 if (!val)
return str;
4592ignorecase_getter(
ID _x,
VALUE *_y)
4615get_LAST_MATCH_INFO(
ID _x,
VALUE *_y)
4617 return match_getter();
4668rb_reg_s_last_match(
int argc,
VALUE *argv,
VALUE _)
4674 n = match_backref_number(match, argv[0]);
4677 return match_getter();
4681re_warn(
const char *s)
4688rb_reg_timeout_p(
regex_t *reg,
void *end_time_)
4690 rb_hrtime_t *end_time = (rb_hrtime_t *)end_time_;
4692 if (*end_time == 0) {
4696 rb_hrtime_t timelimit = reg->timelimit;
4700 timelimit = rb_reg_match_time_limit;
4704 *end_time = rb_hrtime_add(timelimit, rb_hrtime_now());
4708 *end_time = RB_HRTIME_MAX;
4712 if (*end_time < rb_hrtime_now()) {
4730rb_reg_s_timeout_get(
VALUE dummy)
4732 double d = hrtime2double(rb_reg_match_time_limit);
4733 if (d == 0.0)
return Qnil;
4751rb_reg_s_timeout_set(
VALUE dummy,
VALUE timeout)
4753 rb_ractor_ensure_main_ractor(
"can not access Regexp.timeout from non-main Ractors");
4755 set_timeout(&rb_reg_match_time_limit, timeout);
4776rb_reg_timeout_get(
VALUE re)
4779 double d = hrtime2double(
RREGEXP_PTR(re)->timelimit);
4780 if (d == 0.0)
return Qnil;
4807 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
4808 onig_set_warn_func(re_warn);
4809 onig_set_verb_warn_func(re_warn);
4817 rb_gvar_ractor_local(
"$~");
4818 rb_gvar_ractor_local(
"$&");
4819 rb_gvar_ractor_local(
"$`");
4820 rb_gvar_ractor_local(
"$'");
4821 rb_gvar_ractor_local(
"$+");
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Undefines a method, so that klass no longer responds to name.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define rb_str_new2
Old name of rb_str_new_cstr.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define REALLOC_N
Old name of RB_REALLOC_N.
#define OBJ_INIT_COPY(obj, orig)
Old name of RB_OBJ_INIT_COPY.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define rb_str_buf_new2
Old name of rb_str_buf_new_cstr.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define FIX2INT
Old name of RB_FIX2INT.
#define NUM2DBL
Old name of rb_num2dbl.
#define rb_str_new3
Old name of rb_str_new_shared.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define rb_exc_new3
Old name of rb_exc_new_str.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define NUM2INT
Old name of RB_NUM2INT.
#define INT2NUM
Old name of RB_INT2NUM.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define T_ARRAY
Old name of RUBY_T_ARRAY.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define T_SYMBOL
Old name of RUBY_T_SYMBOL.
#define DBL2NUM
Old name of rb_float_new.
#define T_MATCH
Old name of RUBY_T_MATCH.
#define FL_TEST
Old name of RB_FL_TEST.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define scan_oct(s, l, e)
Old name of ruby_scan_oct.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define rb_str_new4
Old name of rb_str_new_frozen.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
VALUE rb_eStandardError
StandardError exception.
VALUE rb_eRegexpError
RegexpError exception.
#define ruby_verbose
This variable controls whether the interpreter is in verbose mode (the $VERBOSE setting).
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
void rb_warn(const char *fmt,...)
Identical to rb_warning(), except it reports unless $VERBOSE is nil.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_obj_reveal(VALUE obj, VALUE klass)
Make a hidden object visible again.
VALUE rb_check_convert_type(VALUE val, int type, const char *name, const char *mid)
Identical to rb_convert_type(), except it returns RUBY_Qnil instead of raising exceptions, in case of conversion failures.
VALUE rb_cObject
Object class.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_class_new_instance(int argc, const VALUE *argv, VALUE klass)
Allocates, then initialises an instance of the given class.
VALUE rb_cMatch
MatchData class.
VALUE rb_obj_hide(VALUE obj)
Make the object invisible from Ruby code.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords, if any, to the #initialize method.
VALUE rb_cRegexp
Regexp class.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
int rb_char_to_option_kcode(int c, int *option, int *kcode)
Converts a character option to its encoding.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
VALUE rb_enc_reg_new(const char *ptr, long len, rb_encoding *enc, int opts)
Identical to rb_reg_new(), except it additionally takes an encoding.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
#define RGENGC_WB_PROTECTED_MATCH
This is a compile-time flag to enable/disable write barrier for struct RMatch.
#define RGENGC_WB_PROTECTED_REGEXP
This is a compile-time flag to enable/disable write barrier for struct RRegexp.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many elements' worth of room it should allocate.
VALUE rb_ary_resize(VALUE ary, long len)
Expands or shrinks the passed array to the passed length.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
int rb_uv_to_utf8(char buf[6], unsigned long uv)
Encodes a Unicode codepoint into its UTF-8 representation.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_lastline_get(void)
Queries the last line, or the $_.
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_last_match(VALUE md)
This just returns the argument, stringified.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
VALUE rb_reg_match_post(VALUE md)
The portion of the original string after the given match.
VALUE rb_reg_nth_defined(int n, VALUE md)
Identical to rb_reg_nth_match(), except it just returns Boolean.
VALUE rb_reg_match_pre(VALUE md)
The portion of the original string before the given match.
VALUE rb_reg_new_str(VALUE src, int opts)
Identical to rb_reg_new(), except it takes the expression in Ruby's string instead of C's.
VALUE rb_reg_match_last(VALUE md)
The portion of the original string that was captured by the very last matched group.
VALUE rb_reg_match2(VALUE re)
Identical to rb_reg_match(), except it matches against rb_lastline_get() (or, the $_).
VALUE rb_reg_new(const char *src, long len, int opts)
Creates a new Regular expression.
#define rb_hash_uint(h, i)
Just another name of st_hash_uint.
#define rb_hash_end(h)
Just another name of st_hash_end.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character offsets.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
st_index_t rb_hash_start(st_index_t i)
Starts a series of hashing.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string to be a NUL-terminated ASCII string.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method, if any.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_class_path(VALUE mod)
Identical to rb_mod_name(), except it returns #<Class: ...> style inspection for anonymous modules.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
regex_t * rb_reg_prepare_re(VALUE re, VALUE str)
Exercises various checks and preprocesses so that the given regular expression can be applied to the given string.
long rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int dir)
Tell us if this is a wrong idea, but it seems this function has no usage at all.
OnigPosition rb_reg_onig_match(VALUE re, VALUE str, OnigPosition(*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), void *args, struct re_registers *regs)
Runs a regular expression match using function match.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_quote(VALUE str)
Escapes any characters that would have special meaning in a regular expression.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
int rb_reg_region_copy(struct re_registers *dst, const struct re_registers *src)
Duplicates a match data.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_virtual_variable(const char *q, type *w, void_type *e)
Defines a function-backed global variable.
#define RARRAY_LEN
Just another name of rb_array_len.
#define RARRAY_AREF(a, i)
#define RBASIC(obj)
Convenient casting macro.
#define RMATCH(obj)
Convenient casting macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
#define RREGEXP(obj)
Convenient casting macro.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define RREGEXP_PTR(obj)
Convenient accessor macro.
static long RREGEXP_SRC_LEN(VALUE rexp)
Convenient getter function.
static char * RREGEXP_SRC_PTR(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
Regular expression execution context.
VALUE regexp
The expression of this match.
VALUE str
The target string that the match was made against.
Ruby's regular expression.
struct RBasic basic
Basic part, including flags and class.
const VALUE src
Source code of this expression.
unsigned long usecnt
Reference count.
struct re_pattern_buffer * ptr
The pattern buffer.
struct rmatch_offset * char_offset
Capture group offsets, in C array.
int char_offset_num_allocated
Number of rmatch_offset that ::rmatch::char_offset holds.
struct re_registers regs
"Registers" of a match.
Represents the region of a capture group.
long beg
Beginning of a group.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
#define SIZEOF_VALUE
Identical to sizeof(VALUE), except it is a macro that can also be used inside of preprocessor directives.
uintptr_t VALUE
Type that represents a Ruby object.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.