12#include "ruby/internal/config.h"
19#include "internal/encoding.h"
20#include "internal/hash.h"
21#include "internal/imemo.h"
22#include "internal/re.h"
23#include "internal/string.h"
24#include "internal/object.h"
25#include "internal/ractor.h"
26#include "internal/variable.h"
31#include "ractor_core.h"
35typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
36#define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
38#define BEG(no) (regs->beg[(no)])
39#define END(no) (regs->end[(no)])
42static const char casetable[] = {
43 '\000',
'\001',
'\002',
'\003',
'\004',
'\005',
'\006',
'\007',
44 '\010',
'\011',
'\012',
'\013',
'\014',
'\015',
'\016',
'\017',
45 '\020',
'\021',
'\022',
'\023',
'\024',
'\025',
'\026',
'\027',
46 '\030',
'\031',
'\032',
'\033',
'\034',
'\035',
'\036',
'\037',
48 '\040',
'\041',
'\042',
'\043',
'\044',
'\045',
'\046',
'\047',
50 '\050',
'\051',
'\052',
'\053',
'\054',
'\055',
'\056',
'\057',
52 '\060',
'\061',
'\062',
'\063',
'\064',
'\065',
'\066',
'\067',
54 '\070',
'\071',
'\072',
'\073',
'\074',
'\075',
'\076',
'\077',
56 '\100',
'\141',
'\142',
'\143',
'\144',
'\145',
'\146',
'\147',
58 '\150',
'\151',
'\152',
'\153',
'\154',
'\155',
'\156',
'\157',
60 '\160',
'\161',
'\162',
'\163',
'\164',
'\165',
'\166',
'\167',
62 '\170',
'\171',
'\172',
'\133',
'\134',
'\135',
'\136',
'\137',
64 '\140',
'\141',
'\142',
'\143',
'\144',
'\145',
'\146',
'\147',
66 '\150',
'\151',
'\152',
'\153',
'\154',
'\155',
'\156',
'\157',
68 '\160',
'\161',
'\162',
'\163',
'\164',
'\165',
'\166',
'\167',
70 '\170',
'\171',
'\172',
'\173',
'\174',
'\175',
'\176',
'\177',
71 '\200',
'\201',
'\202',
'\203',
'\204',
'\205',
'\206',
'\207',
72 '\210',
'\211',
'\212',
'\213',
'\214',
'\215',
'\216',
'\217',
73 '\220',
'\221',
'\222',
'\223',
'\224',
'\225',
'\226',
'\227',
74 '\230',
'\231',
'\232',
'\233',
'\234',
'\235',
'\236',
'\237',
75 '\240',
'\241',
'\242',
'\243',
'\244',
'\245',
'\246',
'\247',
76 '\250',
'\251',
'\252',
'\253',
'\254',
'\255',
'\256',
'\257',
77 '\260',
'\261',
'\262',
'\263',
'\264',
'\265',
'\266',
'\267',
78 '\270',
'\271',
'\272',
'\273',
'\274',
'\275',
'\276',
'\277',
79 '\300',
'\301',
'\302',
'\303',
'\304',
'\305',
'\306',
'\307',
80 '\310',
'\311',
'\312',
'\313',
'\314',
'\315',
'\316',
'\317',
81 '\320',
'\321',
'\322',
'\323',
'\324',
'\325',
'\326',
'\327',
82 '\330',
'\331',
'\332',
'\333',
'\334',
'\335',
'\336',
'\337',
83 '\340',
'\341',
'\342',
'\343',
'\344',
'\345',
'\346',
'\347',
84 '\350',
'\351',
'\352',
'\353',
'\354',
'\355',
'\356',
'\357',
85 '\360',
'\361',
'\362',
'\363',
'\364',
'\365',
'\366',
'\367',
86 '\370',
'\371',
'\372',
'\373',
'\374',
'\375',
'\376',
'\377',
89# error >>> "You lose. You will need a translation table for your character set." <<<
93rb_hrtime_t rb_reg_match_time_limit = 0;
96rb_memcicmp(
const void *x,
const void *y,
long len)
98 const unsigned char *p1 = x, *p2 = y;
102 if ((tmp = casetable[(
unsigned)*p1++] - casetable[(
unsigned)*p2++]))
110rb_memsearch_ss(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
112 const unsigned char *y;
114 if ((y = memmem(ys, n, xs, m)) != NULL)
121rb_memsearch_ss(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
123 const unsigned char *x = xs, *xe = xs + m;
124 const unsigned char *y = ys, *ye = ys + n;
125#define VALUE_MAX ((VALUE)~(VALUE)0)
129 rb_bug(
"!!too long pattern string!!");
131 if (!(y = memchr(y, *x, n - m + 1)))
135 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
155rb_memsearch_qs(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
157 const unsigned char *x = xs, *xe = xs + m;
158 const unsigned char *y = ys;
159 VALUE i, qstable[256];
162 for (i = 0; i < 256; ++i)
165 qstable[*x] = xe - x;
167 for (; y + m <= ys + n; y += *(qstable + y[m])) {
168 if (*xs == *y && memcmp(xs, y, m) == 0)
174static inline unsigned int
175rb_memsearch_qs_utf8_hash(
const unsigned char *x)
177 register const unsigned int mix = 8353;
178 register unsigned int h = *x;
203 return (
unsigned char)h;
207rb_memsearch_qs_utf8(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
209 const unsigned char *x = xs, *xe = xs + m;
210 const unsigned char *y = ys;
211 VALUE i, qstable[512];
214 for (i = 0; i < 512; ++i) {
217 for (; x < xe; ++x) {
218 qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
221 for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
222 if (*xs == *y && memcmp(xs, y, m) == 0)
229rb_memsearch_with_char_size(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n,
int char_size)
231 const unsigned char *x = xs, x0 = *xs, *y = ys;
233 for (n -= m; n >= 0; n -= char_size, y += char_size) {
234 if (x0 == *y && memcmp(x+1, y+1, m-1) == 0)
241rb_memsearch_wchar(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
243 return rb_memsearch_with_char_size(xs, m, ys, n, 2);
247rb_memsearch_qchar(
const unsigned char *xs,
long m,
const unsigned char *ys,
long n)
249 return rb_memsearch_with_char_size(xs, m, ys, n, 4);
255 const unsigned char *x = x0, *y = y0;
257 if (m > n)
return -1;
259 return memcmp(x0, y0, m) == 0 ? 0 : -1;
265 const unsigned char *ys = memchr(y, *x, n);
272 else if (LIKELY(rb_enc_mbminlen(enc) == 1)) {
274 return rb_memsearch_ss(x0, m, y0, n);
276 else if (enc == rb_utf8_encoding()){
277 return rb_memsearch_qs_utf8(x0, m, y0, n);
280 else if (LIKELY(rb_enc_mbminlen(enc) == 2)) {
281 return rb_memsearch_wchar(x0, m, y0, n);
283 else if (LIKELY(rb_enc_mbminlen(enc) == 4)) {
284 return rb_memsearch_qchar(x0, m, y0, n);
286 return rb_memsearch_qs(x0, m, y0, n);
289#define REG_ENCODING_NONE FL_USER6
291#define KCODE_FIXED FL_USER4
293#define ARG_REG_OPTION_MASK \
294 (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
295#define ARG_ENCODING_FIXED 16
296#define ARG_ENCODING_NONE 32
305 val = ONIG_OPTION_IGNORECASE;
308 val = ONIG_OPTION_EXTEND;
311 val = ONIG_OPTION_MULTILINE;
320enum { OPTBUF_SIZE = 4 };
323option_to_str(
char str[OPTBUF_SIZE],
int options)
326 if (options & ONIG_OPTION_MULTILINE) *p++ =
'm';
327 if (options & ONIG_OPTION_IGNORECASE) *p++ =
'i';
328 if (options & ONIG_OPTION_EXTEND) *p++ =
'x';
340 *kcode = rb_ascii8bit_encindex();
341 return (*option = ARG_ENCODING_NONE);
343 *kcode = ENCINDEX_EUC_JP;
346 *kcode = ENCINDEX_Windows_31J;
349 *kcode = rb_utf8_encindex();
353 return (*option = char_to_option(c));
355 *option = ARG_ENCODING_FIXED;
360rb_reg_check(
VALUE re)
368rb_reg_expr_str(
VALUE str,
const char *s,
long len,
371 const char *p, *pend;
376 p = s; pend = p +
len;
380 c = rb_enc_ascget(p, pend, &clen, enc);
383 p += mbclen(p, pend, enc);
407 int unicode_p = rb_enc_unicode_p(enc);
410 c = rb_enc_ascget(p, pend, &clen, enc);
411 if (c ==
'\\' && p+clen < pend) {
412 int n = clen + mbclen(p+clen, pend, enc);
418 clen = rb_enc_precise_mbclen(p, pend, enc);
420 c = (
unsigned char)*p;
425 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
426 rb_str_buf_cat_escaped_char(str, c, unicode_p);
433 else if (c == term) {
441 else if (!rb_enc_isspace(c, enc)) {
445 snprintf(b,
sizeof(b),
"\\x%02X", c);
461 rb_encoding *resenc = rb_default_internal_encoding();
462 if (resenc == NULL) resenc = rb_default_external_encoding();
464 if (re && rb_enc_asciicompat(enc)) {
465 rb_enc_copy(str, re);
468 rb_enc_associate(str, rb_usascii_encoding());
472 rb_reg_expr_str(str, RSTRING_PTR(src_str), RSTRING_LEN(src_str), enc, resenc,
'/');
477 char opts[OPTBUF_SIZE];
479 if (*option_to_str(opts,
RREGEXP_PTR(re)->options))
481 if (
RBASIC(re)->flags & REG_ENCODING_NONE)
507rb_reg_source(
VALUE re)
528rb_reg_inspect(
VALUE re)
533 return rb_reg_desc(re);
536static VALUE rb_reg_str_with_term(
VALUE re,
int term);
568 return rb_reg_str_with_term(re,
'/');
572rb_reg_str_with_term(
VALUE re,
int term)
575 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
577 char optbuf[OPTBUF_SIZE + 1];
582 rb_enc_copy(str, re);
585 const UChar *ptr = (UChar *)RSTRING_PTR(src_str);
586 long len = RSTRING_LEN(src_str);
588 if (
len >= 4 && ptr[0] ==
'(' && ptr[1] ==
'?') {
591 if ((
len -= 2) > 0) {
593 opt = char_to_option((
int )*ptr);
603 if (
len > 1 && *ptr ==
'-') {
607 opt = char_to_option((
int )*ptr);
622 if (*ptr ==
':' && ptr[
len-1] ==
')') {
629 err = onig_new(&rp, ptr, ptr +
len, options,
630 enc, OnigDefaultSyntax, NULL);
643 if ((options & embeddable) != embeddable) {
645 option_to_str(optbuf + 1, ~options);
650 if (rb_enc_asciicompat(enc)) {
651 rb_reg_expr_str(str, (
char*)ptr,
len, enc, NULL, term);
659 rb_enc_associate(str, rb_usascii_encoding());
663 s = RSTRING_PTR(str);
669 rb_str_resize(str, RSTRING_LEN(str) - n);
671 rb_reg_expr_str(str, (
char*)ptr,
len, enc, NULL, term);
674 rb_enc_copy(str, re);
681NORETURN(
static void rb_reg_raise(
const char *err,
VALUE re));
684rb_reg_raise(
const char *err,
VALUE re)
686 VALUE desc = rb_reg_desc(re);
692rb_enc_reg_error_desc(
const char *s,
long len,
rb_encoding *enc,
int options,
const char *err)
694 char opts[OPTBUF_SIZE + 1];
696 rb_encoding *resenc = rb_default_internal_encoding();
697 if (resenc == NULL) resenc = rb_default_external_encoding();
699 rb_enc_associate(desc, enc);
701 rb_reg_expr_str(desc, s,
len, enc, resenc,
'/');
703 option_to_str(opts + 1, options);
708NORETURN(
static void rb_enc_reg_raise(
const char *s,
long len,
rb_encoding *enc,
int options,
const char *err));
711rb_enc_reg_raise(
const char *s,
long len,
rb_encoding *enc,
int options,
const char *err)
717rb_reg_error_desc(
VALUE str,
int options,
const char *err)
719 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
720 rb_enc_get(str), options, err);
723NORETURN(
static void rb_reg_raise_str(
VALUE str,
int options,
const char *err));
726rb_reg_raise_str(
VALUE str,
int options,
const char *err)
746rb_reg_casefold_p(
VALUE re)
749 return RBOOL(
RREGEXP_PTR(re)->options & ONIG_OPTION_IGNORECASE);
791rb_reg_options_m(
VALUE re)
798reg_names_iter(
const OnigUChar *name,
const OnigUChar *name_end,
799 int back_num,
int *back_refs,
OnigRegex regex,
void *arg)
802 rb_ary_push(ary, rb_enc_str_new((
const char *)name, name_end-name, regex->enc));
820rb_reg_names(
VALUE re)
825 onig_foreach_name(
RREGEXP_PTR(re), reg_names_iter, (
void*)ary);
830reg_named_captures_iter(
const OnigUChar *name,
const OnigUChar *name_end,
831 int back_num,
int *back_refs,
OnigRegex regex,
void *arg)
837 for (i = 0; i < back_num; i++)
840 rb_hash_aset(hash,
rb_str_new((
const char*)name, name_end-name),ary);
864rb_reg_named_captures(
VALUE re)
867 VALUE hash = rb_hash_new_with_size(onig_number_of_names(reg));
868 onig_foreach_name(reg, reg_named_captures_iter, (
void*)hash);
873onig_new_with_source(
regex_t** reg,
const UChar* pattern,
const UChar* pattern_end,
875 OnigErrorInfo* einfo,
const char *sourcefile,
int sourceline)
880 if (IS_NULL(*reg))
return ONIGERR_MEMORY;
882 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
885 r = onig_compile_ruby(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
895make_regexp(
const char *s,
long len,
rb_encoding *enc,
int flags, onig_errmsg_buffer err,
896 const char *sourcefile,
int sourceline)
909 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s +
len), flags,
910 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
912 onig_error_code_to_str((UChar*)err, r, &einfo);
971match_alloc(
VALUE klass)
975 NEWOBJ_OF(match,
struct RMatch, klass, flags, alloc_size, 0);
988 if (to->allocated)
return 0;
991 if (to->allocated)
return 0;
992 return ONIGERR_MEMORY;
1001pair_byte_cmp(
const void *pair1,
const void *pair2)
1003 long diff = ((
pair_t*)pair1)->byte_pos - ((
pair_t*)pair2)->byte_pos;
1004#if SIZEOF_LONG > SIZEOF_INT
1005 return diff ? diff > 0 ? 1 : -1 : 0;
1012update_char_offset(
VALUE match)
1016 int i, num_regs, num_pos;
1026 num_regs = rm->
regs.num_regs;
1033 enc = rb_enc_get(
RMATCH(match)->str);
1035 for (i = 0; i < num_regs; i++) {
1044 for (i = 0; i < num_regs; i++) {
1047 pairs[num_pos++].byte_pos = BEG(i);
1048 pairs[num_pos++].byte_pos = END(i);
1050 qsort(pairs, num_pos,
sizeof(
pair_t), pair_byte_cmp);
1052 s = p = RSTRING_PTR(
RMATCH(match)->str);
1054 for (i = 0; i < num_pos; i++) {
1055 q = s + pairs[i].byte_pos;
1057 pairs[i].char_pos = c;
1061 for (i = 0; i < num_regs; i++) {
1069 key.byte_pos = BEG(i);
1070 found = bsearch(&key, pairs, num_pos,
sizeof(
pair_t), pair_byte_cmp);
1073 key.byte_pos = END(i);
1074 found = bsearch(&key, pairs, num_pos,
sizeof(
pair_t), pair_byte_cmp);
1080match_check(
VALUE match)
1082 if (!
RMATCH(match)->regexp) {
1099 rm = RMATCH_EXT(obj);
1103 if (RMATCH_EXT(orig)->char_offset_num_allocated) {
1129match_regexp(
VALUE match)
1133 regexp =
RMATCH(match)->regexp;
1134 if (
NIL_P(regexp)) {
1164match_names(
VALUE match)
1169 return rb_reg_names(
RMATCH(match)->regexp);
1185match_size(
VALUE match)
1191static int name_to_backref_number(
struct re_registers *,
VALUE,
const char*,
const char*);
1192NORETURN(
static void name_to_backref_error(
VALUE name));
1195name_to_backref_error(
VALUE name)
1197 rb_raise(
rb_eIndexError,
"undefined group name reference: % "PRIsVALUE,
1204 if (i < 0 || regs->num_regs <= i)
1209match_backref_number(
VALUE match,
VALUE backref)
1226 num = name_to_backref_number(regs, regexp, name, name + RSTRING_LEN(backref));
1229 name_to_backref_error(backref);
1238 return match_backref_number(match, backref);
1253 int i = match_backref_number(match, n);
1257 backref_number_check(regs, i);
1262 update_char_offset(match);
1264 LONG2NUM(RMATCH_EXT(match)->char_offset[i].end));
1288 int i = match_backref_number(match, n);
1292 backref_number_check(regs, i);
1312 int i = match_backref_number(match, n);
1316 backref_number_check(regs, i);
1336 int i = match_backref_number(match, n);
1340 backref_number_check(regs, i);
1360 int i = match_backref_number(match, n);
1364 backref_number_check(regs, i);
1369 update_char_offset(match);
1370 return LONG2NUM(RMATCH_EXT(match)->char_offset[i].beg);
1386 int i = match_backref_number(match, n);
1390 backref_number_check(regs, i);
1395 update_char_offset(match);
1396 return LONG2NUM(RMATCH_EXT(match)->char_offset[i].end);
1428 int i = match_backref_number(match, n);
1431 backref_number_check(regs, i);
1433 long start = BEG(i), end = END(i);
1472 int i = match_backref_number(match, n);
1476 backref_number_check(regs, i);
1481 update_char_offset(match);
1483 &RMATCH_EXT(match)->char_offset[i];
1487#define MATCH_BUSY FL_USER2
1492 FL_SET(match, MATCH_BUSY);
1496rb_match_unbusy(
VALUE match)
1502rb_match_count(
VALUE match)
1505 if (
NIL_P(match))
return -1;
1507 if (!regs)
return -1;
1508 return regs->num_regs;
1519 int err = onig_region_resize(&rmatch->
regs, 1);
1520 if (err) rb_memerror();
1521 rmatch->
regs.beg[0] = pos;
1522 rmatch->
regs.end[0] = pos +
len;
1526rb_backref_set_string(
VALUE string,
long pos,
long len)
1532 match_set_string(match,
string, pos,
len);
1567rb_reg_fixed_encoding_p(
VALUE re)
1569 return RBOOL(
FL_TEST(re, KCODE_FIXED));
1573rb_reg_preprocess(
const char *p,
const char *end,
rb_encoding *enc,
1574 rb_encoding **fixed_enc, onig_errmsg_buffer err,
int options);
1582 "incompatible encoding regexp match (%s regexp with %s string)",
1583 rb_enc_inspect_name(rb_enc_get(re)),
1584 rb_enc_inspect_name(rb_enc_get(
str)));
1601 int cr = str_coderange(
str);
1604 rb_raise(rb_eArgError,
1605 "invalid byte sequence in %s",
1606 rb_enc_name(rb_enc_get(
str)));
1610 enc = rb_enc_get(
str);
1617 else if (!rb_enc_asciicompat(enc)) {
1618 reg_enc_error(re,
str);
1620 else if (rb_reg_fixed_encoding_p(re)) {
1623 reg_enc_error(re,
str);
1627 else if (warn && (
RBASIC(re)->flags & REG_ENCODING_NONE) &&
1628 enc != rb_ascii8bit_encoding() &&
1630 rb_warn(
"historical binary regexp match /.../n against %s string",
1646 if (reg->enc == enc)
return reg;
1651 const char *pattern = RSTRING_PTR(src_str);
1653 onig_errmsg_buffer err =
"";
1654 unescaped = rb_reg_preprocess(
1655 pattern, pattern + RSTRING_LEN(src_str), enc,
1656 &fixed_enc, err, 0);
1658 if (
NIL_P(unescaped)) {
1659 rb_raise(rb_eArgError,
"regexp preprocess failed: %s", err);
1663 rb_hrtime_t timelimit = reg->timelimit;
1670 if (ruby_single_main_ractor &&
RREGEXP(re)->usecnt == 0) {
1672 r = onig_new_without_alloc(&tmp_reg, (UChar *)ptr, (UChar *)(ptr +
len),
1674 OnigDefaultSyntax, &einfo);
1678 onig_free_body(&tmp_reg);
1681 onig_free_body(reg);
1687 r = onig_new(®, (UChar *)ptr, (UChar *)(ptr +
len),
1689 OnigDefaultSyntax, &einfo);
1693 onig_error_code_to_str((UChar*)err, r, &einfo);
1694 rb_reg_raise(err, re);
1697 reg->timelimit = timelimit;
1712 if (!tmpreg)
RREGEXP(re)->usecnt++;
1714 OnigPosition result = match(reg,
str, regs, args);
1716 if (!tmpreg)
RREGEXP(re)->usecnt--;
1722 onig_region_free(regs, 0);
1727 case ONIGERR_TIMEOUT:
1728 rb_raise(rb_eRegexpTimeoutError,
"regexp match timeout");
1730 onig_errmsg_buffer err =
"";
1731 onig_error_code_to_str((UChar*)err, (
int)result);
1732 rb_reg_raise(err, re);
1747 enc = rb_reg_prepare_enc(re,
str, 0);
1753 range = RSTRING_LEN(
str) - pos;
1756 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(
str)) {
1757 string = (UChar*)RSTRING_PTR(
str);
1760 p = onigenc_get_right_adjust_char_head(enc,
string,
string + pos,
string + RSTRING_LEN(
str));
1763 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,
string,
string + pos,
string + RSTRING_LEN(
str));
1787 (UChar *)(ptr +
len),
1788 (UChar *)(ptr + args->pos),
1789 (UChar *)(ptr + args->range),
1796rb_reg_search_set_match(
VALUE re,
VALUE str,
long pos,
int reverse,
int set_backref_str,
VALUE *set_match)
1798 long len = RSTRING_LEN(str);
1799 if (pos >
len || pos < 0) {
1806 .range = reverse ? 0 :
len,
1810 OnigPosition result =
rb_reg_onig_match(re, str, reg_onig_search, &args, ®s);
1812 if (result == ONIG_MISMATCH) {
1814 return ONIG_MISMATCH;
1834 onig_region_free(&RMATCH_EXT(match)->regs,
false);
1840 if (set_backref_str) {
1854 if (set_match) *set_match = match;
1860rb_reg_search0(
VALUE re,
VALUE str,
long pos,
int reverse,
int set_backref_str,
VALUE *match)
1862 return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, match);
1868 return rb_reg_search_set_match(re, str, pos, reverse, 1, NULL);
1881 (UChar *)(ptr +
len),
1916 if (nth >= regs->num_regs) {
1920 nth += regs->num_regs;
1921 if (nth <= 0)
return Qnil;
1923 return RBOOL(BEG(nth) != -1);
1930 long start, end,
len;
1936 if (nth >= regs->num_regs) {
1940 nth += regs->num_regs;
1941 if (nth <= 0)
return Qnil;
1944 if (start == -1)
return Qnil;
1984 if (BEG(0) == -1)
return Qnil;
2018 if (BEG(0) == -1)
return Qnil;
2019 str =
RMATCH(match)->str;
2026match_last_index(
VALUE match)
2031 if (
NIL_P(match))
return -1;
2034 if (BEG(0) == -1)
return -1;
2036 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
2044 int i = match_last_index(match);
2045 if (i <= 0)
return Qnil;
2051rb_reg_last_defined(
VALUE match)
2053 int i = match_last_index(match);
2054 if (i < 0)
return Qnil;
2059last_match_getter(
ID _x,
VALUE *_y)
2065prematch_getter(
ID _x,
VALUE *_y)
2071postmatch_getter(
ID _x,
VALUE *_y)
2077last_paren_match_getter(
ID _x,
VALUE *_y)
2083match_array(
VALUE match,
int start)
2093 target =
RMATCH(match)->str;
2095 for (i=start; i<regs->num_regs; i++) {
2096 if (regs->beg[i] == -1) {
2123match_to_a(
VALUE match)
2125 return match_array(match, 0);
2145match_captures(
VALUE match)
2147 return match_array(match, 1);
2151name_to_backref_number(
struct re_registers *regs,
VALUE regexp,
const char* name,
const char* name_end)
2153 if (
NIL_P(regexp))
return -1;
2154 return onig_name_to_backref_number(
RREGEXP_PTR(regexp),
2155 (
const unsigned char *)name, (
const unsigned char *)name_end, regs);
2158#define NAME_TO_NUMBER(regs, re, name, name_ptr, name_end) \
2160 !rb_enc_compatible(RREGEXP_SRC(re), (name)) ? 0 : \
2161 name_to_backref_number((regs), (re), (name_ptr), (name_end)))
2174 num = NAME_TO_NUMBER(regs, re, name,
2177 name_to_backref_error(name);
2183match_ary_subseq(
VALUE match,
long beg,
long len,
VALUE result)
2186 long j, end = olen < beg+
len ? olen : beg+
len;
2188 if (
len == 0)
return result;
2190 for (j = beg; j < end; j++) {
2193 if (beg +
len > j) {
2214 return match_ary_subseq(match, beg,
len, result);
2257match_aref(
int argc,
VALUE *argv,
VALUE match)
2264 if (
NIL_P(length)) {
2269 int num = namev_to_backref_number(
RMATCH_REGS(match),
RMATCH(match)->regexp, idx);
2274 return match_ary_aref(match, idx,
Qnil);
2287 if (beg < 0)
return Qnil;
2289 else if (beg > num_regs) {
2292 if (beg+
len > num_regs) {
2293 len = num_regs - beg;
2295 return match_ary_subseq(match, beg,
len,
Qnil);
2326match_values_at(
int argc,
VALUE *argv,
VALUE match)
2334 for (i=0; i<argc; i++) {
2339 int num = namev_to_backref_number(
RMATCH_REGS(match),
RMATCH(match)->regexp, argv[i]);
2344 match_ary_aref(match, argv[i], result);
2371match_to_s(
VALUE match)
2380match_named_captures_iter(
const OnigUChar *name,
const OnigUChar *name_end,
2381 int back_num,
int *back_refs,
OnigRegex regex,
void *arg)
2383 struct MEMO *memo = MEMO_CAST(arg);
2384 VALUE hash = memo->v1;
2385 VALUE match = memo->v2;
2386 long symbolize = memo->u3.state;
2388 VALUE key = rb_enc_str_new((
const char *)name, name_end-name, regex->enc);
2390 if (symbolize > 0) {
2399 for (i = 0; i < back_num; i++) {
2402 rb_hash_aset(hash, key, value);
2408 rb_hash_aset(hash, key,
Qnil);
2447match_named_captures(
int argc,
VALUE *argv,
VALUE match)
2454 return rb_hash_new();
2457 VALUE symbolize_names = 0;
2462 static ID keyword_ids[1];
2464 VALUE symbolize_names_val;
2466 if (!keyword_ids[0]) {
2469 rb_get_kwargs(opt, keyword_ids, 0, 1, &symbolize_names_val);
2470 if (!UNDEF_P(symbolize_names_val) &&
RTEST(symbolize_names_val)) {
2471 symbolize_names = 1;
2475 hash = rb_hash_new();
2476 memo = MEMO_NEW(hash, match, symbolize_names);
2478 onig_foreach_name(
RREGEXP(
RMATCH(match)->regexp)->ptr, match_named_captures_iter, (
void*)memo);
2500match_deconstruct_keys(
VALUE match,
VALUE keys)
2508 return rb_hash_new_with_size(0);
2512 h = rb_hash_new_with_size(onig_number_of_names(
RREGEXP_PTR(
RMATCH(match)->regexp)));
2515 memo = MEMO_NEW(h, match, 1);
2517 onig_foreach_name(
RREGEXP_PTR(
RMATCH(match)->regexp), match_named_captures_iter, (
void*)memo);
2525 return rb_hash_new_with_size(0);
2566match_string(
VALUE match)
2569 return RMATCH(match)->str;
2578match_inspect_name_iter(
const OnigUChar *name,
const OnigUChar *name_end,
2579 int back_num,
int *back_refs,
OnigRegex regex,
void *arg0)
2584 for (i = 0; i < back_num; i++) {
2585 arg[back_refs[i]].name = name;
2586 arg[back_refs[i]].len = name_end - name;
2613match_inspect(
VALUE match)
2619 int num_regs = regs->num_regs;
2624 return rb_sprintf(
"#<%"PRIsVALUE
":%p>", cname, (
void*)match);
2626 else if (
NIL_P(regexp)) {
2627 return rb_sprintf(
"#<%"PRIsVALUE
": %"PRIsVALUE
">",
2635 match_inspect_name_iter, names);
2640 for (i = 0; i < num_regs; i++) {
2647 rb_str_catf(str,
"%d", i);
2665read_escaped_byte(
const char **pp,
const char *end, onig_errmsg_buffer err)
2667 const char *p = *pp;
2669 int meta_prefix = 0, ctrl_prefix = 0;
2672 if (p == end || *p++ !=
'\\') {
2673 errcpy(err,
"too short escaped multibyte character");
2679 errcpy(err,
"too short escape sequence");
2683 case '\\': code =
'\\';
break;
2684 case 'n': code =
'\n';
break;
2685 case 't': code =
'\t';
break;
2686 case 'r': code =
'\r';
break;
2687 case 'f': code =
'\f';
break;
2688 case 'v': code =
'\013';
break;
2689 case 'a': code =
'\007';
break;
2690 case 'e': code =
'\033';
break;
2693 case '0':
case '1':
case '2':
case '3':
2694 case '4':
case '5':
case '6':
case '7':
2703 errcpy(err,
"invalid hex escape");
2711 errcpy(err,
"duplicate meta escape");
2715 if (p+1 < end && *p++ ==
'-' && (*p & 0x80) == 0) {
2725 errcpy(err,
"too short meta escape");
2729 if (p == end || *p++ !=
'-') {
2730 errcpy(err,
"too short control escape");
2735 errcpy(err,
"duplicate control escape");
2739 if (p < end && (*p & 0x80) == 0) {
2749 errcpy(err,
"too short control escape");
2753 errcpy(err,
"unexpected escape sequence");
2756 if (code < 0 || 0xff < code) {
2757 errcpy(err,
"invalid escape code");
2771unescape_escaped_nonascii(
const char **pp,
const char *end,
rb_encoding *enc,
2774 const char *p = *pp;
2776 unsigned char *area =
ALLOCA_N(
unsigned char, chmaxlen);
2777 char *chbuf = (
char *)area;
2782 memset(chbuf, 0, chmaxlen);
2784 byte = read_escaped_byte(&p, end, err);
2789 area[chlen++] = byte;
2790 while (chlen < chmaxlen &&
2792 byte = read_escaped_byte(&p, end, err);
2796 area[chlen++] = byte;
2799 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
2801 errcpy(err,
"invalid multibyte escape");
2804 if (1 < chlen || (area[0] & 0x80)) {
2809 else if (*encp != enc) {
2810 errcpy(err,
"escaped non ASCII character in UTF-8 regexp");
2816 snprintf(escbuf,
sizeof(escbuf),
"\\x%02X", area[0]&0xff);
2824check_unicode_range(
unsigned long code, onig_errmsg_buffer err)
2826 if ((0xd800 <= code && code <= 0xdfff) ||
2828 errcpy(err,
"invalid Unicode range");
2835append_utf8(
unsigned long uv,
2838 if (check_unicode_range(uv, err) != 0)
2842 snprintf(escbuf,
sizeof(escbuf),
"\\x%02X", (
int)uv);
2852 *encp = rb_utf8_encoding();
2853 else if (*encp != rb_utf8_encoding()) {
2854 errcpy(err,
"UTF-8 character in non UTF-8 regexp");
2862unescape_unicode_list(
const char **pp,
const char *end,
2865 const char *p = *pp;
2866 int has_unicode = 0;
2870 while (p < end &&
ISSPACE(*p)) p++;
2873 code = ruby_scan_hex(p, end-p, &
len);
2877 errcpy(err,
"invalid Unicode range");
2881 if (append_utf8(code, buf, encp, err) != 0)
2885 while (p < end &&
ISSPACE(*p)) p++;
2888 if (has_unicode == 0) {
2889 errcpy(err,
"invalid Unicode list");
2899unescape_unicode_bmp(
const char **pp,
const char *end,
2902 const char *p = *pp;
2907 errcpy(err,
"invalid Unicode escape");
2910 code = ruby_scan_hex(p, 4, &
len);
2912 errcpy(err,
"invalid Unicode escape");
2915 if (append_utf8(code, buf, encp, err) != 0)
2922unescape_nonascii0(
const char **pp,
const char *end,
rb_encoding *enc,
2924 onig_errmsg_buffer err,
int options,
int recurse)
2926 const char *p = *pp;
2929 int in_char_class = 0;
2931 int extended_mode = options & ONIG_OPTION_EXTEND;
2935 int chlen = rb_enc_precise_mbclen(p, end, enc);
2938 errcpy(err,
"invalid multibyte character");
2942 if (1 < chlen || (*p & 0x80)) {
2948 else if (*encp != enc) {
2949 errcpy(err,
"non ASCII character in UTF-8 regexp");
2958 errcpy(err,
"too short escape sequence");
2961 chlen = rb_enc_precise_mbclen(p, end, enc);
2963 goto invalid_multibyte;
2972 case '1':
case '2':
case '3':
2973 case '4':
case '5':
case '6':
case '7':
2975 size_t len = end-(p-1), octlen;
2976 if (ruby_scan_oct(p-1,
len < 3 ?
len : 3, &octlen) <= 0177) {
2992 if (rb_is_usascii_enc(enc)) {
2993 const char *pbeg = p;
2994 int byte = read_escaped_byte(&p, end, err);
2995 if (
byte == -1)
return -1;
3000 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
3007 errcpy(err,
"too short escape sequence");
3013 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
3015 if (p == end || *p++ !=
'}') {
3016 errcpy(err,
"invalid Unicode list");
3023 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
3045 if (extended_mode && !in_char_class) {
3047 while ((p < end) && ((c = *p++) !=
'\n')) {
3048 if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) {
3061 if (in_char_class) {
3068 if (!in_char_class && recurse) {
3069 if (--parens == 0) {
3076 if (!in_char_class && p + 1 < end && *p ==
'?') {
3077 if (*(p+1) ==
'#') {
3079 const char *orig_p = p;
3082 while (cont && (p < end)) {
3085 if (!(c & 0x80))
break;
3086 if (!*encp && enc == rb_utf8_encoding()) {
3092 chlen = rb_enc_precise_mbclen(p, end, enc);
3094 goto invalid_multibyte;
3115 int local_extend = 0;
3122 for (s = p+1; s < end; s++) {
3125 local_extend = invert ? -1 : 1;
3132 if (local_extend == 0 ||
3133 (local_extend == -1 && !extended_mode) ||
3134 (local_extend == 1 && extended_mode)) {
3141 int local_options = options;
3142 if (local_extend == 1) {
3143 local_options |= ONIG_OPTION_EXTEND;
3146 local_options &= ~ONIG_OPTION_EXTEND;
3150 int ret = unescape_nonascii0(&p, end, enc, buf, encp,
3153 if (ret < 0)
return ret;
3158 extended_mode = local_extend == 1;
3175 else if (!in_char_class && recurse) {
3193unescape_nonascii(
const char *p,
const char *end,
rb_encoding *enc,
3195 onig_errmsg_buffer err,
int options)
3197 return unescape_nonascii0(&p, end, enc, buf, encp, has_property,
3202rb_reg_preprocess(
const char *p,
const char *end,
rb_encoding *enc,
3203 rb_encoding **fixed_enc, onig_errmsg_buffer err,
int options)
3206 int has_property = 0;
3210 if (rb_enc_asciicompat(enc))
3214 rb_enc_associate(buf, enc);
3217 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err, options) != 0)
3220 if (has_property && !*fixed_enc) {
3225 rb_enc_associate(buf, *fixed_enc);
3232rb_reg_check_preprocess(
VALUE str)
3235 onig_errmsg_buffer err =
"";
3241 p = RSTRING_PTR(str);
3242 end = p + RSTRING_LEN(str);
3243 enc = rb_enc_get(str);
3245 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err, 0);
3249 return rb_reg_error_desc(str, 0, err);
3255rb_reg_preprocess_dregexp(
VALUE ary,
int options)
3259 onig_errmsg_buffer err =
"";
3265 rb_raise(rb_eArgError,
"no arguments given");
3274 src_enc = rb_enc_get(str);
3275 if (options & ARG_ENCODING_NONE &&
3276 src_enc != ascii8bit) {
3278 rb_raise(
rb_eRegexpError,
"/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
3280 src_enc = ascii8bit;
3284 p = RSTRING_PTR(str);
3285 end = p + RSTRING_LEN(str);
3287 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err, options);
3290 rb_raise(rb_eArgError,
"%s", err);
3292 if (fixed_enc != 0) {
3293 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
3294 rb_raise(
rb_eRegexpError,
"encoding mismatch in dynamic regexp : %s and %s",
3295 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
3297 regexp_enc = fixed_enc;
3306 rb_enc_associate(result, regexp_enc);
3313rb_reg_initialize_check(
VALUE obj)
3315 rb_check_frozen(obj);
3323 int options, onig_errmsg_buffer err,
3324 const char *sourcefile,
int sourceline)
3331 rb_reg_initialize_check(obj);
3333 if (rb_enc_dummy_p(enc)) {
3334 errcpy(err,
"can't make regexp with dummy encoding");
3338 unescaped = rb_reg_preprocess(s, s+
len, enc, &fixed_enc, err, options);
3339 if (
NIL_P(unescaped))
3343 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
3344 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
3345 errcpy(err,
"incompatible character encoding");
3348 if (fixed_enc != a_enc) {
3349 options |= ARG_ENCODING_FIXED;
3353 else if (!(options & ARG_ENCODING_FIXED)) {
3354 enc = rb_usascii_encoding();
3357 rb_enc_associate((
VALUE)re, enc);
3358 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
3361 if (options & ARG_ENCODING_NONE) {
3365 re->
ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
3366 options & ARG_REG_OPTION_MASK, err,
3367 sourcefile, sourceline);
3368 if (!re->
ptr)
return -1;
3377 if (regenc != enc) {
3378 str = rb_enc_associate(
rb_str_dup(str), enc = regenc);
3384rb_reg_initialize_str(
VALUE obj,
VALUE str,
int options, onig_errmsg_buffer err,
3385 const char *sourcefile,
int sourceline)
3388 rb_encoding *str_enc = rb_enc_get(str), *enc = str_enc;
3389 if (options & ARG_ENCODING_NONE) {
3391 if (enc != ascii8bit) {
3393 errcpy(err,
"/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
3399 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
3400 options, err, sourcefile, sourceline);
3401 if (ret == 0) reg_set_source(obj, str, str_enc);
3406rb_reg_s_alloc(
VALUE klass)
3426 return rb_reg_init_str(rb_reg_alloc(), s, options);
3430rb_reg_init_str(
VALUE re,
VALUE s,
int options)
3432 onig_errmsg_buffer err =
"";
3434 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
3435 rb_reg_raise_str(s, options, err);
3444 onig_errmsg_buffer err =
"";
3446 if (rb_reg_initialize(re, RSTRING_PTR(s), RSTRING_LEN(s),
3447 enc, options, err, NULL, 0) != 0) {
3448 rb_reg_raise_str(s, options, err);
3450 reg_set_source(re, s, enc);
3456rb_reg_new_ary(
VALUE ary,
int opt)
3466 VALUE re = rb_reg_alloc();
3467 onig_errmsg_buffer err =
"";
3469 if (rb_reg_initialize(re, s,
len, enc, options, err, NULL, 0) != 0) {
3470 rb_enc_reg_raise(s,
len, enc, options, err);
3484rb_reg_compile(
VALUE str,
int options,
const char *sourcefile,
int sourceline)
3486 VALUE re = rb_reg_alloc();
3487 onig_errmsg_buffer err =
"";
3490 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
3491 rb_set_errinfo(rb_reg_error_desc(str, options, err));
3498static VALUE reg_cache;
3503 if (rb_ractor_main_p()) {
3506 && memcmp(
RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
3516static st_index_t reg_hash(
VALUE re);
3528rb_reg_hash(
VALUE re)
3530 st_index_t hashval = reg_hash(re);
3563 if (re1 == re2)
return Qtrue;
3565 rb_reg_check(re1); rb_reg_check(re2);
3585match_hash(
VALUE match)
3592 hashval =
rb_hash_uint(hashval, reg_hash(match_regexp(match)));
3615 if (match1 == match2)
return Qtrue;
3619 if (!rb_reg_equal(match_regexp(match1), match_regexp(match2)))
return Qfalse;
3622 if (regs1->num_regs != regs2->num_regs)
return Qfalse;
3623 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs *
sizeof(*regs1->beg)))
return Qfalse;
3624 if (memcmp(regs1->end, regs2->end, regs1->num_regs *
sizeof(*regs1->end)))
return Qfalse;
3629reg_operand(
VALUE s,
int check)
3651 *strp = str = reg_operand(str, TRUE);
3662 return rb_reg_search_set_match(re, str, pos, 0, 1, set_match);
3724 long pos = reg_match_pos(re, &str, 0, NULL);
3725 if (pos < 0)
return Qnil;
3755 str = reg_operand(str, FALSE);
3761 return RBOOL(start >= 0);
3838rb_reg_match_m(
int argc,
VALUE *argv,
VALUE re)
3843 if (
rb_scan_args(argc, argv,
"11", &str, &initpos) == 2) {
3850 pos = reg_match_pos(re, &str, pos, &result);
3879rb_reg_match_m_p(
int argc,
VALUE *argv,
VALUE re)
3882 return rb_reg_match_p(re, argv[0], pos);
3893 if (pos < 0)
return Qfalse;
3899 pos = beg - RSTRING_PTR(str);
3905 .range = RSTRING_LEN(str),
3918str_to_option(
VALUE str)
3924 if (
NIL_P(str))
return -1;
3926 for (
long i = 0; i <
len; ++i) {
3927 int f = char_to_option(ptr[i]);
3929 rb_raise(rb_eArgError,
"unknown regexp option: %"PRIsVALUE, str);
3937set_timeout(rb_hrtime_t *hrt,
VALUE timeout)
3939 double timeout_d =
NIL_P(timeout) ? 0.0 :
NUM2DBL(timeout);
3940 if (!
NIL_P(timeout) && timeout_d <= 0) {
3941 rb_raise(rb_eArgError,
"invalid timeout: %"PRIsVALUE, timeout);
3943 double2hrtime(hrt, timeout_d);
3952 rb_reg_initialize_check(copy);
3953 if ((r = onig_reg_copy(&re,
RREGEXP_PTR(orig))) != 0) {
3960 rb_enc_copy(copy, orig);
3975void rb_warn_deprecated_to_remove(
const char *removal,
const char *fmt,
const char *suggest, ...);
4032rb_reg_initialize_m(
int argc,
VALUE *argv,
VALUE self)
4035 VALUE re = reg_extract_args(argc, argv, &args);
4044 set_timeout(&
RREGEXP_PTR(self)->timelimit, args.timeout);
4059 args->timeout =
Qnil;
4060 if (!
NIL_P(kwargs)) {
4061 static ID keywords[1];
4082 else if ((f = str_to_option(opts)) >= 0) flags = f;
4083 else if (rb_bool_expected(opts,
"ignorecase", FALSE))
4084 flags = ONIG_OPTION_IGNORECASE;
4090 args->flags = flags;
4097 if (enc && rb_enc_get(str) != enc)
4098 rb_reg_init_str_enc(self, str, enc, flags);
4100 rb_reg_init_str(self, str, flags);
4113 s = RSTRING_PTR(str);
4114 send = s + RSTRING_LEN(str);
4116 c = rb_enc_ascget(s, send, &clen, enc);
4118 s += mbclen(s, send, enc);
4122 case '[':
case ']':
case '{':
case '}':
4123 case '(':
case ')':
case '|':
case '-':
4124 case '*':
case '.':
case '\\':
4125 case '?':
case '+':
case '^':
case '$':
4127 case '\t':
case '\f':
case '\v':
case '\n':
case '\r':
4134 rb_enc_associate(tmp, rb_usascii_encoding());
4141 rb_enc_associate(tmp, rb_usascii_encoding());
4144 rb_enc_copy(tmp, str);
4146 t = RSTRING_PTR(tmp);
4148 const char *p = RSTRING_PTR(str);
4149 memcpy(t, p, s - p);
4153 c = rb_enc_ascget(s, send, &clen, enc);
4155 int n = mbclen(s, send, enc);
4163 case '[':
case ']':
case '{':
case '}':
4164 case '(':
case ')':
case '|':
case '-':
4165 case '*':
case '.':
case '\\':
4166 case '?':
case '+':
case '^':
case '$':
4168 t += rb_enc_mbcput(
'\\', t, enc);
4171 t += rb_enc_mbcput(
'\\', t, enc);
4172 t += rb_enc_mbcput(
' ', t, enc);
4175 t += rb_enc_mbcput(
'\\', t, enc);
4176 t += rb_enc_mbcput(
't', t, enc);
4179 t += rb_enc_mbcput(
'\\', t, enc);
4180 t += rb_enc_mbcput(
'n', t, enc);
4183 t += rb_enc_mbcput(
'\\', t, enc);
4184 t += rb_enc_mbcput(
'r', t, enc);
4187 t += rb_enc_mbcput(
'\\', t, enc);
4188 t += rb_enc_mbcput(
'f', t, enc);
4191 t += rb_enc_mbcput(
'\\', t, enc);
4192 t += rb_enc_mbcput(
'v', t, enc);
4195 t += rb_enc_mbcput(c, t, enc);
4197 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
4230 options =
RREGEXP_PTR(re)->options & ARG_REG_OPTION_MASK;
4231 if (
RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
4232 if (
RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
4237rb_check_regexp_type(
VALUE re)
4263 return rb_check_regexp_type(re);
4276 else if (argc == 1) {
4278 VALUE re = rb_check_regexp_type(arg);
4283 quoted = rb_reg_s_quote(
Qnil, arg);
4292 int has_asciionly = 0;
4296 for (i = 0; i < argc; i++) {
4303 v = rb_check_regexp_type(e);
4306 if (!rb_enc_asciicompat(enc)) {
4307 if (!has_ascii_incompat)
4308 has_ascii_incompat = enc;
4309 else if (has_ascii_incompat != enc)
4310 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4311 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
4313 else if (rb_reg_fixed_encoding_p(v)) {
4314 if (!has_ascii_compat_fixed)
4315 has_ascii_compat_fixed = enc;
4316 else if (has_ascii_compat_fixed != enc)
4317 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4318 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
4323 v = rb_reg_str_with_term(v, -1);
4328 enc = rb_enc_get(e);
4329 if (!rb_enc_asciicompat(enc)) {
4330 if (!has_ascii_incompat)
4331 has_ascii_incompat = enc;
4332 else if (has_ascii_incompat != enc)
4333 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4334 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
4340 if (!has_ascii_compat_fixed)
4341 has_ascii_compat_fixed = enc;
4342 else if (has_ascii_compat_fixed != enc)
4343 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4344 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
4346 v = rb_reg_s_quote(
Qnil, e);
4348 if (has_ascii_incompat) {
4349 if (has_asciionly) {
4350 rb_raise(rb_eArgError,
"ASCII incompatible encoding: %s",
4351 rb_enc_name(has_ascii_incompat));
4353 if (has_ascii_compat_fixed) {
4354 rb_raise(rb_eArgError,
"incompatible encodings: %s and %s",
4355 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
4360 rb_enc_copy(source, v);
4365 if (has_ascii_incompat) {
4366 result_enc = has_ascii_incompat;
4368 else if (has_ascii_compat_fixed) {
4369 result_enc = has_ascii_compat_fixed;
4372 result_enc = rb_ascii8bit_encoding();
4375 rb_enc_associate(source, result_enc);
4420 return rb_reg_s_union(self, v);
4422 return rb_reg_s_union(self, args);
4447rb_reg_s_linear_time_p(
int argc,
VALUE *argv,
VALUE self)
4450 VALUE re = reg_extract_args(argc, argv, &args);
4453 re =
reg_init_args(rb_reg_alloc(), args.str, args.enc, args.flags);
4456 return RBOOL(onig_check_linear_time(
RREGEXP_PTR(re)));
4465 return reg_copy(copy, re);
4476 int acompat = rb_enc_asciicompat(str_enc);
4478#define ASCGET(s,e,cl) (acompat ? (*(cl)=1,ISASCII((s)[0])?(s)[0]:-1) : rb_enc_ascget((s), (e), (cl), str_enc))
4485 int c = ASCGET(s, e, &clen);
4489 s += mbclen(s, e, str_enc);
4495 if (c !=
'\\' || s == e)
continue;
4502 c = ASCGET(s, e, &clen);
4504 s += mbclen(s, e, str_enc);
4513 case '1':
case '2':
case '3':
case '4':
4514 case '5':
case '6':
case '7':
case '8':
case '9':
4515 if (!
NIL_P(regexp) && onig_noname_group_capture_is_active(
RREGEXP_PTR(regexp))) {
4524 if (s < e && ASCGET(s, e, &clen) ==
'<') {
4525 char *name, *name_end;
4527 name_end = name = s + clen;
4528 while (name_end < e) {
4529 c = ASCGET(name_end, e, &clen);
4530 if (c ==
'>')
break;
4531 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
4535 (
long)(name_end - name));
4536 if ((no = NAME_TO_NUMBER(regs, regexp, n, name, name_end)) < 1) {
4537 name_to_backref_error(n);
4539 p = s = name_end + clen;
4564 no = regs->num_regs-1;
4565 while (BEG(no) == -1 && no > 0) no--;
4566 if (no == 0)
continue;
4579 if (no >= regs->num_regs)
continue;
4580 if (BEG(no) == -1)
continue;
4585 if (!val)
return str;
4594ignorecase_getter(
ID _x,
VALUE *_y)
4617get_LAST_MATCH_INFO(
ID _x,
VALUE *_y)
4619 return match_getter();
4670rb_reg_s_last_match(
int argc,
VALUE *argv,
VALUE _)
4676 n = match_backref_number(match, argv[0]);
4679 return match_getter();
4683re_warn(
const char *s)
4690rb_reg_timeout_p(
regex_t *reg,
void *end_time_)
4692 rb_hrtime_t *end_time = (rb_hrtime_t *)end_time_;
4694 if (*end_time == 0) {
4698 rb_hrtime_t timelimit = reg->timelimit;
4702 timelimit = rb_reg_match_time_limit;
4706 *end_time = rb_hrtime_add(timelimit, rb_hrtime_now());
4710 *end_time = RB_HRTIME_MAX;
4714 if (*end_time < rb_hrtime_now()) {
4732rb_reg_s_timeout_get(
VALUE dummy)
4734 double d = hrtime2double(rb_reg_match_time_limit);
4735 if (d == 0.0)
return Qnil;
4753rb_reg_s_timeout_set(
VALUE dummy,
VALUE timeout)
4755 rb_ractor_ensure_main_ractor(
"can not access Regexp.timeout from non-main Ractors");
4757 set_timeout(&rb_reg_match_time_limit, timeout);
4778rb_reg_timeout_get(
VALUE re)
4781 double d = hrtime2double(
RREGEXP_PTR(re)->timelimit);
4782 if (d == 0.0)
return Qnil;
4809 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
4810 onig_set_warn_func(re_warn);
4811 onig_set_verb_warn_func(re_warn);
4819 rb_gvar_ractor_local(
"$~");
4820 rb_gvar_ractor_local(
"$&");
4821 rb_gvar_ractor_local(
"$`");
4822 rb_gvar_ractor_local(
"$'");
4823 rb_gvar_ractor_local(
"$+");
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Undefines the method called name in klass.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define rb_str_new2
Old name of rb_str_new_cstr.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define REALLOC_N
Old name of RB_REALLOC_N.
#define OBJ_INIT_COPY(obj, orig)
Old name of RB_OBJ_INIT_COPY.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define rb_str_buf_new2
Old name of rb_str_buf_new_cstr.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define FIX2INT
Old name of RB_FIX2INT.
#define NUM2DBL
Old name of rb_num2dbl.
#define rb_str_new3
Old name of rb_str_new_shared.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define rb_exc_new3
Old name of rb_exc_new_str.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define NUM2INT
Old name of RB_NUM2INT.
#define INT2NUM
Old name of RB_INT2NUM.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define T_ARRAY
Old name of RUBY_T_ARRAY.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define T_SYMBOL
Old name of RUBY_T_SYMBOL.
#define DBL2NUM
Old name of rb_float_new.
#define T_MATCH
Old name of RUBY_T_MATCH.
#define FL_TEST
Old name of RB_FL_TEST.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define scan_oct(s, l, e)
Old name of ruby_scan_oct.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define rb_str_new4
Old name of rb_str_new_frozen.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
VALUE rb_eStandardError
StandardError exception.
VALUE rb_eRegexpError
RegexpError exception.
#define ruby_verbose
This variable controls whether the interpreter is in verbose mode (the value of $VERBOSE).
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
void rb_warn(const char *fmt,...)
Identical to rb_warning(), except it reports unless $VERBOSE is nil.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_obj_reveal(VALUE obj, VALUE klass)
Make a hidden object visible again.
VALUE rb_check_convert_type(VALUE val, int type, const char *name, const char *mid)
Identical to rb_convert_type(), except it returns RUBY_Qnil instead of raising exceptions,...
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_class_new_instance(int argc, const VALUE *argv, VALUE klass)
Allocates, then initialises an instance of the given class.
VALUE rb_cMatch
MatchData class.
VALUE rb_obj_hide(VALUE obj)
Make the object invisible from Ruby code.
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
VALUE rb_cRegexp
Regexp class.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
int rb_char_to_option_kcode(int c, int *option, int *kcode)
Converts a character option to its encoding.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
VALUE rb_enc_reg_new(const char *ptr, long len, rb_encoding *enc, int opts)
Identical to rb_reg_new(), except it additionally takes an encoding.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
#define RGENGC_WB_PROTECTED_MATCH
This is a compile-time flag to enable/disable write barrier for struct RMatch.
#define RGENGC_WB_PROTECTED_REGEXP
This is a compile-time flag to enable/disable write barrier for struct RRegexp.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new_capa(long capa)
Identical to rb_ary_new(), except it additionally specifies how many slots for objects it should allocate.
VALUE rb_ary_resize(VALUE ary, long len)
Expands or shrinks the passed array to the passed length.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
int rb_uv_to_utf8(char buf[6], unsigned long uv)
Encodes a Unicode codepoint into its UTF-8 representation.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_lastline_get(void)
Queries the last line, or the $_.
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_last_match(VALUE md)
This just returns the argument, stringified.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
VALUE rb_reg_match_post(VALUE md)
The portion of the original string after the given match.
VALUE rb_reg_nth_defined(int n, VALUE md)
Identical to rb_reg_nth_match(), except it just returns Boolean.
VALUE rb_reg_match_pre(VALUE md)
The portion of the original string before the given match.
VALUE rb_reg_new_str(VALUE src, int opts)
Identical to rb_reg_new(), except it takes the expression in Ruby's string instead of C's.
VALUE rb_reg_match_last(VALUE md)
The portion of the original string that was captured by the very last group that matched.
VALUE rb_reg_match2(VALUE re)
Identical to rb_reg_match(), except it matches against rb_lastline_get() (or, the $_).
VALUE rb_reg_new(const char *src, long len, int opts)
Creates a new Regular expression.
#define rb_hash_uint(h, i)
Just another name of st_hash_uint.
#define rb_hash_end(h)
Just another name of st_hash_end.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
st_index_t rb_hash_start(st_index_t i)
Starts a series of hashing.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_class_path(VALUE mod)
Identical to rb_mod_name(), except it returns #<Class: ...> style inspection for anonymous modules.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
int len
Length of the buffer.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
regex_t * rb_reg_prepare_re(VALUE re, VALUE str)
Exercises various checks and preprocesses so that the given regular expression can be applied to the ...
long rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int dir)
Tell us if this is a wrong idea, but it seems this function has no usage at all.
OnigPosition rb_reg_onig_match(VALUE re, VALUE str, OnigPosition(*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), void *args, struct re_registers *regs)
Runs a regular expression match using function match.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_quote(VALUE str)
Escapes any characters that would have special meaning in a regular expression.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
int rb_reg_region_copy(struct re_registers *dst, const struct re_registers *src)
Duplicates a match data.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_virtual_variable(const char *q, type *w, void_type *e)
Define a function-backended global variable.
#define RARRAY_LEN
Just another name of rb_array_len.
#define RARRAY_AREF(a, i)
#define RBASIC(obj)
Convenient casting macro.
#define RMATCH(obj)
Convenient casting macro.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
#define RREGEXP(obj)
Convenient casting macro.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
#define RREGEXP_PTR(obj)
Convenient accessor macro.
static long RREGEXP_SRC_LEN(VALUE rexp)
Convenient getter function.
static char * RREGEXP_SRC_PTR(VALUE rexp)
Convenient getter function.
#define StringValue(v)
Ensures that the parameter object is a String.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
struct RMatch
Regular expression execution context.
VALUE regexp
The expression of this match.
VALUE str
The target string that the match was made against.
struct RRegexp
Ruby's regular expression.
struct RBasic basic
Basic part, including flags and class.
const VALUE src
Source code of this expression.
unsigned long usecnt
Reference count.
struct re_pattern_buffer * ptr
The pattern buffer.
struct rmatch_offset * char_offset
Capture group offsets, in C array.
int char_offset_num_allocated
Number of rmatch_offset that ::rmatch::char_offset holds.
struct re_registers regs
"Registers" of a match.
struct rmatch_offset
Represents the region of a capture group.
long beg
Beginning of a group.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
#define SIZEOF_VALUE
Identical to sizeof(VALUE), except it is a macro that can also be used inside of preprocessor directi...
uintptr_t VALUE
Type that represents a Ruby object.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.