Ruby 4.1.0dev (2026-04-17 revision 11e3c78b61da705c783dd12fb7f158c0d256ede0)
regcomp.c (11e3c78b61da705c783dd12fb7f158c0d256ede0)
1/**********************************************************************
2 regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include "regparse.h"
32
33OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
34
35extern OnigCaseFoldType
36onig_get_default_case_fold_flag(void)
37{
38 return OnigDefaultCaseFoldFlag;
39}
40
41extern int
42onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
43{
44 OnigDefaultCaseFoldFlag = case_fold_flag;
45 return 0;
46}
47
48
49#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
50static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
51#endif
52
53#if 0
54static UChar*
55str_dup(UChar* s, UChar* end)
56{
57 ptrdiff_t len = end - s;
58
59 if (len > 0) {
60 UChar* r = (UChar* )xmalloc(len + 1);
61 CHECK_NULL_RETURN(r);
62 xmemcpy(r, s, len);
63 r[len] = (UChar )0;
64 return r;
65 }
66 else return NULL;
67}
68#endif
69
70static void
71swap_node(Node* a, Node* b)
72{
73 Node c;
74 c = *a; *a = *b; *b = c;
75
76 if (NTYPE(a) == NT_STR) {
77 StrNode* sn = NSTR(a);
78 if (sn->capa == 0) {
79 size_t len = sn->end - sn->s;
80 sn->s = sn->buf;
81 sn->end = sn->s + len;
82 }
83 }
84
85 if (NTYPE(b) == NT_STR) {
86 StrNode* sn = NSTR(b);
87 if (sn->capa == 0) {
88 size_t len = sn->end - sn->s;
89 sn->s = sn->buf;
90 sn->end = sn->s + len;
91 }
92 }
93}
94
95static OnigDistance
96distance_add(OnigDistance d1, OnigDistance d2)
97{
98 if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
99 return ONIG_INFINITE_DISTANCE;
100 else {
101 if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
102 else return ONIG_INFINITE_DISTANCE;
103 }
104}
105
106static OnigDistance
107distance_multiply(OnigDistance d, int m)
108{
109 if (m == 0) return 0;
110
111 if (d < ONIG_INFINITE_DISTANCE / m)
112 return d * m;
113 else
114 return ONIG_INFINITE_DISTANCE;
115}
116
117static int
118bitset_is_empty(BitSetRef bs)
119{
120 int i;
121 for (i = 0; i < BITSET_SIZE; i++) {
122 if (bs[i] != 0) return 0;
123 }
124 return 1;
125}
126
127#ifdef ONIG_DEBUG
128static int
129bitset_on_num(BitSetRef bs)
130{
131 int i, n;
132
133 n = 0;
134 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
135 if (BITSET_AT(bs, i)) n++;
136 }
137 return n;
138}
139#endif
140
141// Attempt to right size allocated buffers for a regex post compile
142static void
143onig_reg_resize(regex_t *reg)
144{
145 do {
146 if (!reg->used) {
147 xfree(reg->p);
148 reg->alloc = 0;
149 reg->p = 0;
150 }
151 else if (reg->alloc > reg->used) {
152 unsigned char *new_ptr = xrealloc(reg->p, reg->used);
153 // Skip the right size optimization if memory allocation fails
154 if (new_ptr) {
155 reg->alloc = reg->used;
156 reg->p = new_ptr;
157 }
158 }
159 } while ((reg = reg->chain) != 0);
160}
161
162extern int
163onig_bbuf_init(BBuf* buf, OnigDistance size)
164{
165 if (size <= 0) {
166 size = 0;
167 buf->p = NULL;
168 }
169 else {
170 buf->p = (UChar* )xmalloc(size);
171 if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
172 }
173
174 buf->alloc = (unsigned int )size;
175 buf->used = 0;
176 return 0;
177}
178
179
180#ifdef USE_SUBEXP_CALL
181
182static int
183unset_addr_list_init(UnsetAddrList* uslist, int size)
184{
185 UnsetAddr* p;
186
187 p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
188 CHECK_NULL_RETURN_MEMERR(p);
189 uslist->num = 0;
190 uslist->alloc = size;
191 uslist->us = p;
192 return 0;
193}
194
195static void
196unset_addr_list_end(UnsetAddrList* uslist)
197{
198 xfree(uslist->us);
199}
200
201static int
202unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
203{
204 UnsetAddr* p;
205 int size;
206
207 if (uslist->num >= uslist->alloc) {
208 size = uslist->alloc * 2;
209 p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
210 CHECK_NULL_RETURN_MEMERR(p);
211 uslist->alloc = size;
212 uslist->us = p;
213 }
214
215 uslist->us[uslist->num].offset = offset;
216 uslist->us[uslist->num].target = node;
217 uslist->num++;
218 return 0;
219}
220#endif /* USE_SUBEXP_CALL */
221
222
223static int
224add_opcode(regex_t* reg, int opcode)
225{
226 BBUF_ADD1(reg, opcode);
227 return 0;
228}
229
230#ifdef USE_COMBINATION_EXPLOSION_CHECK
231static int
232add_state_check_num(regex_t* reg, int num)
233{
234 StateCheckNumType n = (StateCheckNumType )num;
235
236 BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
237 return 0;
238}
239#endif
240
241static int
242add_rel_addr(regex_t* reg, int addr)
243{
244 RelAddrType ra = (RelAddrType )addr;
245
246 BBUF_ADD(reg, &ra, SIZE_RELADDR);
247 return 0;
248}
249
250static int
251add_abs_addr(regex_t* reg, int addr)
252{
253 AbsAddrType ra = (AbsAddrType )addr;
254
255 BBUF_ADD(reg, &ra, SIZE_ABSADDR);
256 return 0;
257}
258
259static int
260add_length(regex_t* reg, OnigDistance len)
261{
262 LengthType l = (LengthType )len;
263
264 BBUF_ADD(reg, &l, SIZE_LENGTH);
265 return 0;
266}
267
268static int
269add_mem_num(regex_t* reg, int num)
270{
271 MemNumType n = (MemNumType )num;
272
273 BBUF_ADD(reg, &n, SIZE_MEMNUM);
274 return 0;
275}
276
277#if 0
278static int
279add_pointer(regex_t* reg, void* addr)
280{
281 PointerType ptr = (PointerType )addr;
282
283 BBUF_ADD(reg, &ptr, SIZE_POINTER);
284 return 0;
285}
286#endif
287
288static int
289add_option(regex_t* reg, OnigOptionType option)
290{
291 BBUF_ADD(reg, &option, SIZE_OPTION);
292 return 0;
293}
294
295static int
296add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
297{
298 int r;
299
300 r = add_opcode(reg, opcode);
301 if (r) return r;
302 r = add_rel_addr(reg, addr);
303 return r;
304}
305
306static int
307add_bytes(regex_t* reg, UChar* bytes, OnigDistance len)
308{
309 BBUF_ADD(reg, bytes, len);
310 return 0;
311}
312
313static int
314add_bitset(regex_t* reg, BitSetRef bs)
315{
316 BBUF_ADD(reg, bs, SIZE_BITSET);
317 return 0;
318}
319
320static int
321add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
322{
323 int r;
324
325 r = add_opcode(reg, opcode);
326 if (r) return r;
327 r = add_option(reg, option);
328 return r;
329}
330
331static int compile_length_tree(Node* node, regex_t* reg);
332static int compile_tree(Node* node, regex_t* reg);
333
334
335#define IS_NEED_STR_LEN_OP_EXACT(op) \
336 ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
337 (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
338
339static int
340select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case)
341{
342 int op;
343 OnigDistance str_len = roomof(byte_len, mb_len);
344
345 if (ignore_case) {
346 switch (str_len) {
347 case 1: op = OP_EXACT1_IC; break;
348 default: op = OP_EXACTN_IC; break;
349 }
350 }
351 else {
352 switch (mb_len) {
353 case 1:
354 switch (str_len) {
355 case 1: op = OP_EXACT1; break;
356 case 2: op = OP_EXACT2; break;
357 case 3: op = OP_EXACT3; break;
358 case 4: op = OP_EXACT4; break;
359 case 5: op = OP_EXACT5; break;
360 default: op = OP_EXACTN; break;
361 }
362 break;
363
364 case 2:
365 switch (str_len) {
366 case 1: op = OP_EXACTMB2N1; break;
367 case 2: op = OP_EXACTMB2N2; break;
368 case 3: op = OP_EXACTMB2N3; break;
369 default: op = OP_EXACTMB2N; break;
370 }
371 break;
372
373 case 3:
374 op = OP_EXACTMB3N;
375 break;
376
377 default:
378 op = OP_EXACTMBN;
379 break;
380 }
381 }
382 return op;
383}
384
385static int
386compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
387{
388 int r;
389 int saved_num_null_check = reg->num_null_check;
390
391 if (empty_info != 0) {
392 r = add_opcode(reg, OP_NULL_CHECK_START);
393 if (r) return r;
394 r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
395 if (r) return r;
396 reg->num_null_check++;
397 }
398
399 r = compile_tree(node, reg);
400 if (r) return r;
401
402 if (empty_info != 0) {
403 if (empty_info == NQ_TARGET_IS_EMPTY)
404 r = add_opcode(reg, OP_NULL_CHECK_END);
405 else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
406 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
407 else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
408 r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
409
410 if (r) return r;
411 r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
412 }
413 return r;
414}
415
416#ifdef USE_SUBEXP_CALL
417static int
418compile_call(CallNode* node, regex_t* reg)
419{
420 int r;
421
422 r = add_opcode(reg, OP_CALL);
423 if (r) return r;
424 r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
425 node->target);
426 if (r) return r;
427 r = add_abs_addr(reg, 0 /*dummy addr.*/);
428 return r;
429}
430#endif
431
432static int
433compile_tree_n_times(Node* node, int n, regex_t* reg)
434{
435 int i, r;
436
437 for (i = 0; i < n; i++) {
438 r = compile_tree(node, reg);
439 if (r) return r;
440 }
441 return 0;
442}
443
444static int
445add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len,
446 regex_t* reg ARG_UNUSED, int ignore_case)
447{
448 int len;
449 int op = select_str_opcode(mb_len, byte_len, ignore_case);
450
451 len = SIZE_OPCODE;
452
453 if (op == OP_EXACTMBN) len += SIZE_LENGTH;
454 if (IS_NEED_STR_LEN_OP_EXACT(op))
455 len += SIZE_LENGTH;
456
457 len += (int )byte_len;
458 return len;
459}
460
461static int
462add_compile_string(UChar* s, int mb_len, OnigDistance byte_len,
463 regex_t* reg, int ignore_case)
464{
465 int op = select_str_opcode(mb_len, byte_len, ignore_case);
466 add_opcode(reg, op);
467
468 if (op == OP_EXACTMBN)
469 add_length(reg, mb_len);
470
471 if (IS_NEED_STR_LEN_OP_EXACT(op)) {
472 if (op == OP_EXACTN_IC)
473 add_length(reg, byte_len);
474 else
475 add_length(reg, byte_len / mb_len);
476 }
477
478 add_bytes(reg, s, byte_len);
479 return 0;
480}
481
482
483static int
484compile_length_string_node(Node* node, regex_t* reg)
485{
486 int rlen, r, len, prev_len, blen, ambig;
487 OnigEncoding enc = reg->enc;
488 UChar *p, *prev;
489 StrNode* sn;
490
491 sn = NSTR(node);
492 if (sn->end <= sn->s)
493 return 0;
494
495 ambig = NSTRING_IS_AMBIG(node);
496
497 p = prev = sn->s;
498 prev_len = enclen(enc, p, sn->end);
499 p += prev_len;
500 blen = prev_len;
501 rlen = 0;
502
503 for (; p < sn->end; ) {
504 len = enclen(enc, p, sn->end);
505 if (len == prev_len || ambig) {
506 blen += len;
507 }
508 else {
509 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
510 rlen += r;
511 prev = p;
512 blen = len;
513 prev_len = len;
514 }
515 p += len;
516 }
517 r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
518 rlen += r;
519 return rlen;
520}
521
522static int
523compile_length_string_raw_node(StrNode* sn, regex_t* reg)
524{
525 if (sn->end <= sn->s)
526 return 0;
527
528 return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
529}
530
531static int
532compile_string_node(Node* node, regex_t* reg)
533{
534 int r, len, prev_len, blen, ambig;
535 OnigEncoding enc = reg->enc;
536 UChar *p, *prev, *end;
537 StrNode* sn;
538
539 sn = NSTR(node);
540 if (sn->end <= sn->s)
541 return 0;
542
543 end = sn->end;
544 ambig = NSTRING_IS_AMBIG(node);
545
546 p = prev = sn->s;
547 prev_len = enclen(enc, p, end);
548 p += prev_len;
549 blen = prev_len;
550
551 for (; p < end; ) {
552 len = enclen(enc, p, end);
553 if (len == prev_len || ambig) {
554 blen += len;
555 }
556 else {
557 r = add_compile_string(prev, prev_len, blen, reg, ambig);
558 if (r) return r;
559
560 prev = p;
561 blen = len;
562 prev_len = len;
563 }
564
565 p += len;
566 }
567 return add_compile_string(prev, prev_len, blen, reg, ambig);
568}
569
570static int
571compile_string_raw_node(StrNode* sn, regex_t* reg)
572{
573 if (sn->end <= sn->s)
574 return 0;
575
576 return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
577}
578
579static int
580add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
581{
582#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
583 add_length(reg, mbuf->used);
584 return add_bytes(reg, mbuf->p, mbuf->used);
585#else
586 int r, pad_size;
587 UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
588
589 GET_ALIGNMENT_PAD_SIZE(p, pad_size);
590 add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
591 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
592
593 r = add_bytes(reg, mbuf->p, mbuf->used);
594
595 /* padding for return value from compile_length_cclass_node() to be fix. */
596 pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
597 if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
598 return r;
599#endif
600}
601
602static int
603compile_length_cclass_node(CClassNode* cc, regex_t* reg)
604{
605 int len;
606
607 if (IS_NULL(cc->mbuf)) {
608 len = SIZE_OPCODE + SIZE_BITSET;
609 }
610 else {
611 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
612 len = SIZE_OPCODE;
613 }
614 else {
615 len = SIZE_OPCODE + SIZE_BITSET;
616 }
617#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
618 len += SIZE_LENGTH + cc->mbuf->used;
619#else
620 len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
621#endif
622 }
623
624 return len;
625}
626
627static int
628compile_cclass_node(CClassNode* cc, regex_t* reg)
629{
630 int r;
631
632 if (IS_NULL(cc->mbuf)) {
633 if (IS_NCCLASS_NOT(cc))
634 add_opcode(reg, OP_CCLASS_NOT);
635 else
636 add_opcode(reg, OP_CCLASS);
637
638 r = add_bitset(reg, cc->bs);
639 }
640 else {
641 if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
642 if (IS_NCCLASS_NOT(cc))
643 add_opcode(reg, OP_CCLASS_MB_NOT);
644 else
645 add_opcode(reg, OP_CCLASS_MB);
646
647 r = add_multi_byte_cclass(cc->mbuf, reg);
648 }
649 else {
650 if (IS_NCCLASS_NOT(cc))
651 add_opcode(reg, OP_CCLASS_MIX_NOT);
652 else
653 add_opcode(reg, OP_CCLASS_MIX);
654
655 r = add_bitset(reg, cc->bs);
656 if (r) return r;
657 r = add_multi_byte_cclass(cc->mbuf, reg);
658 }
659 }
660
661 return r;
662}
663
664static int
665entry_repeat_range(regex_t* reg, int id, int lower, int upper)
666{
667#define REPEAT_RANGE_ALLOC 4
668
670
671 if (reg->repeat_range_alloc == 0) {
672 p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
673 CHECK_NULL_RETURN_MEMERR(p);
674 reg->repeat_range = p;
675 reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
676 }
677 else if (reg->repeat_range_alloc <= id) {
678 int n;
679 n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
680 p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
681 sizeof(OnigRepeatRange) * n);
682 CHECK_NULL_RETURN_MEMERR(p);
683 reg->repeat_range = p;
684 reg->repeat_range_alloc = n;
685 }
686 else {
687 p = reg->repeat_range;
688 }
689
690 p[id].lower = lower;
691 p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
692 return 0;
693}
694
695static int
696compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
697 regex_t* reg)
698{
699 int r;
700 int num_repeat = reg->num_repeat;
701
702 r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
703 if (r) return r;
704 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
705 reg->num_repeat++;
706 if (r) return r;
707 r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
708 if (r) return r;
709
710 r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
711 if (r) return r;
712
713 r = compile_tree_empty_check(qn->target, reg, empty_info);
714 if (r) return r;
715
716 if (
717#ifdef USE_SUBEXP_CALL
718 reg->num_call > 0 ||
719#endif
720 IS_QUANTIFIER_IN_REPEAT(qn)) {
721 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
722 }
723 else {
724 r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
725 }
726 if (r) return r;
727 r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
728 return r;
729}
730
731static int
732is_anychar_star_quantifier(QtfrNode* qn)
733{
734 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
735 NTYPE(qn->target) == NT_CANY)
736 return 1;
737 else
738 return 0;
739}
740
741#define QUANTIFIER_EXPAND_LIMIT_SIZE 50
742#define CKN_ON (ckn > 0)
743
744#ifdef USE_COMBINATION_EXPLOSION_CHECK
745
746static int
747compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
748{
749 int len, mod_tlen, cklen;
750 int ckn;
751 int infinite = IS_REPEAT_INFINITE(qn->upper);
752 int empty_info = qn->target_empty_info;
753 int tlen = compile_length_tree(qn->target, reg);
754
755 if (tlen < 0) return tlen;
756
757 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
758
759 cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
760
761 /* anychar repeat */
762 if (NTYPE(qn->target) == NT_CANY) {
763 if (qn->greedy && infinite) {
764 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
765 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
766 else
767 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
768 }
769 }
770
771 if (empty_info != 0)
772 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
773 else
774 mod_tlen = tlen;
775
776 if (infinite && qn->lower <= 1) {
777 if (qn->greedy) {
778 if (qn->lower == 1)
779 len = SIZE_OP_JUMP;
780 else
781 len = 0;
782
783 len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
784 }
785 else {
786 if (qn->lower == 0)
787 len = SIZE_OP_JUMP;
788 else
789 len = 0;
790
791 len += mod_tlen + SIZE_OP_PUSH + cklen;
792 }
793 }
794 else if (qn->upper == 0) {
795 if (qn->is_referred != 0) /* /(?<n>..){0}/ */
796 len = SIZE_OP_JUMP + tlen;
797 else
798 len = 0;
799 }
800 else if (qn->upper == 1 && qn->greedy) {
801 if (qn->lower == 0) {
802 if (CKN_ON) {
803 len = SIZE_OP_STATE_CHECK_PUSH + tlen;
804 }
805 else {
806 len = SIZE_OP_PUSH + tlen;
807 }
808 }
809 else {
810 len = tlen;
811 }
812 }
813 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
814 len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
815 }
816 else {
817 len = SIZE_OP_REPEAT_INC
818 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
819 if (CKN_ON)
820 len += SIZE_OP_STATE_CHECK;
821 }
822
823 return len;
824}
825
826static int
827compile_quantifier_node(QtfrNode* qn, regex_t* reg)
828{
829 int r, mod_tlen;
830 int ckn;
831 int infinite = IS_REPEAT_INFINITE(qn->upper);
832 int empty_info = qn->target_empty_info;
833 int tlen = compile_length_tree(qn->target, reg);
834
835 if (tlen < 0) return tlen;
836
837 ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
838
839 if (is_anychar_star_quantifier(qn)) {
840 r = compile_tree_n_times(qn->target, qn->lower, reg);
841 if (r) return r;
842 if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
843 if (IS_MULTILINE(reg->options))
844 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
845 else
846 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
847 if (r) return r;
848 if (CKN_ON) {
849 r = add_state_check_num(reg, ckn);
850 if (r) return r;
851 }
852
853 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
854 }
855 else {
856 if (IS_MULTILINE(reg->options)) {
857 r = add_opcode(reg, (CKN_ON ?
858 OP_STATE_CHECK_ANYCHAR_ML_STAR
859 : OP_ANYCHAR_ML_STAR));
860 }
861 else {
862 r = add_opcode(reg, (CKN_ON ?
863 OP_STATE_CHECK_ANYCHAR_STAR
864 : OP_ANYCHAR_STAR));
865 }
866 if (r) return r;
867 if (CKN_ON)
868 r = add_state_check_num(reg, ckn);
869
870 return r;
871 }
872 }
873
874 if (empty_info != 0)
875 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
876 else
877 mod_tlen = tlen;
878
879 if (infinite && qn->lower <= 1) {
880 if (qn->greedy) {
881 if (qn->lower == 1) {
882 r = add_opcode_rel_addr(reg, OP_JUMP,
883 (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
884 if (r) return r;
885 }
886
887 if (CKN_ON) {
888 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
889 if (r) return r;
890 r = add_state_check_num(reg, ckn);
891 if (r) return r;
892 r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
893 }
894 else {
895 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
896 }
897 if (r) return r;
898 r = compile_tree_empty_check(qn->target, reg, empty_info);
899 if (r) return r;
900 r = add_opcode_rel_addr(reg, OP_JUMP,
901 -(mod_tlen + (int )SIZE_OP_JUMP
902 + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
903 }
904 else {
905 if (qn->lower == 0) {
906 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
907 if (r) return r;
908 }
909 r = compile_tree_empty_check(qn->target, reg, empty_info);
910 if (r) return r;
911 if (CKN_ON) {
912 r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
913 if (r) return r;
914 r = add_state_check_num(reg, ckn);
915 if (r) return r;
916 r = add_rel_addr(reg,
917 -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
918 }
919 else
920 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
921 }
922 }
923 else if (qn->upper == 0) {
924 if (qn->is_referred != 0) { /* /(?<n>..){0}/ */
925 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
926 if (r) return r;
927 r = compile_tree(qn->target, reg);
928 }
929 else
930 r = 0;
931 }
932 else if (qn->upper == 1 && qn->greedy) {
933 if (qn->lower == 0) {
934 if (CKN_ON) {
935 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
936 if (r) return r;
937 r = add_state_check_num(reg, ckn);
938 if (r) return r;
939 r = add_rel_addr(reg, tlen);
940 }
941 else {
942 r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
943 }
944 if (r) return r;
945 }
946
947 r = compile_tree(qn->target, reg);
948 }
949 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
950 if (CKN_ON) {
951 r = add_opcode(reg, OP_STATE_CHECK_PUSH);
952 if (r) return r;
953 r = add_state_check_num(reg, ckn);
954 if (r) return r;
955 r = add_rel_addr(reg, SIZE_OP_JUMP);
956 }
957 else {
958 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
959 }
960
961 if (r) return r;
962 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
963 if (r) return r;
964 r = compile_tree(qn->target, reg);
965 }
966 else {
967 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
968 if (CKN_ON) {
969 if (r) return r;
970 r = add_opcode(reg, OP_STATE_CHECK);
971 if (r) return r;
972 r = add_state_check_num(reg, ckn);
973 }
974 }
975 return r;
976}
977
978#else /* USE_COMBINATION_EXPLOSION_CHECK */
979
980static int
981compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
982{
983 int len, mod_tlen;
984 int infinite = IS_REPEAT_INFINITE(qn->upper);
985 int empty_info = qn->target_empty_info;
986 int tlen = compile_length_tree(qn->target, reg);
987
988 if (tlen < 0) return tlen;
989
990 /* anychar repeat */
991 if (NTYPE(qn->target) == NT_CANY) {
992 if (qn->greedy && infinite) {
993 if (IS_NOT_NULL(qn->next_head_exact))
994 return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
995 else
996 return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
997 }
998 }
999
1000 if (empty_info != 0)
1001 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1002 else
1003 mod_tlen = tlen;
1004
1005 if (infinite &&
1006 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1007 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1008 len = SIZE_OP_JUMP;
1009 }
1010 else {
1011 len = tlen * qn->lower;
1012 }
1013
1014 if (qn->greedy) {
1015#ifdef USE_OP_PUSH_OR_JUMP_EXACT
1016 if (IS_NOT_NULL(qn->head_exact))
1017 len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
1018 else
1019#endif
1020 if (IS_NOT_NULL(qn->next_head_exact))
1021 len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
1022 else
1023 len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
1024 }
1025 else
1026 len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
1027 }
1028 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1029 len = SIZE_OP_JUMP + tlen;
1030 }
1031 else if (!infinite && qn->greedy &&
1032 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1033 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1034 len = tlen * qn->lower;
1035 len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
1036 }
1037 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1038 len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
1039 }
1040 else {
1041 len = SIZE_OP_REPEAT_INC
1042 + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
1043 }
1044
1045 return len;
1046}
1047
1048static int
1049compile_quantifier_node(QtfrNode* qn, regex_t* reg)
1050{
1051 int i, r, mod_tlen;
1052 int infinite = IS_REPEAT_INFINITE(qn->upper);
1053 int empty_info = qn->target_empty_info;
1054 int tlen = compile_length_tree(qn->target, reg);
1055
1056 if (tlen < 0) return tlen;
1057
1058 if (is_anychar_star_quantifier(qn)) {
1059 r = compile_tree_n_times(qn->target, qn->lower, reg);
1060 if (r) return r;
1061 if (IS_NOT_NULL(qn->next_head_exact)) {
1062 if (IS_MULTILINE(reg->options))
1063 r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
1064 else
1065 r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
1066 if (r) return r;
1067 return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1068 }
1069 else {
1070 if (IS_MULTILINE(reg->options))
1071 return add_opcode(reg, OP_ANYCHAR_ML_STAR);
1072 else
1073 return add_opcode(reg, OP_ANYCHAR_STAR);
1074 }
1075 }
1076
1077 if (empty_info != 0)
1078 mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
1079 else
1080 mod_tlen = tlen;
1081
1082 if (infinite &&
1083 (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1084 if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
1085 if (qn->greedy) {
1086#ifdef USE_OP_PUSH_OR_JUMP_EXACT
1087 if (IS_NOT_NULL(qn->head_exact))
1088 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
1089 else
1090#endif
1091 if (IS_NOT_NULL(qn->next_head_exact))
1092 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
1093 else
1094 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
1095 }
1096 else {
1097 r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
1098 }
1099 if (r) return r;
1100 }
1101 else {
1102 r = compile_tree_n_times(qn->target, qn->lower, reg);
1103 if (r) return r;
1104 }
1105
1106 if (qn->greedy) {
1107#ifdef USE_OP_PUSH_OR_JUMP_EXACT
1108 if (IS_NOT_NULL(qn->head_exact)) {
1109 r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
1110 mod_tlen + SIZE_OP_JUMP);
1111 if (r) return r;
1112 add_bytes(reg, NSTR(qn->head_exact)->s, 1);
1113 r = compile_tree_empty_check(qn->target, reg, empty_info);
1114 if (r) return r;
1115 r = add_opcode_rel_addr(reg, OP_JUMP,
1116 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
1117 }
1118 else
1119#endif
1120 if (IS_NOT_NULL(qn->next_head_exact)) {
1121 r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
1122 mod_tlen + SIZE_OP_JUMP);
1123 if (r) return r;
1124 add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
1125 r = compile_tree_empty_check(qn->target, reg, empty_info);
1126 if (r) return r;
1127 r = add_opcode_rel_addr(reg, OP_JUMP,
1128 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
1129 }
1130 else {
1131 r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
1132 if (r) return r;
1133 r = compile_tree_empty_check(qn->target, reg, empty_info);
1134 if (r) return r;
1135 r = add_opcode_rel_addr(reg, OP_JUMP,
1136 -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
1137 }
1138 }
1139 else {
1140 r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
1141 if (r) return r;
1142 r = compile_tree_empty_check(qn->target, reg, empty_info);
1143 if (r) return r;
1144 r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
1145 }
1146 }
1147 else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
1148 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1149 if (r) return r;
1150 r = compile_tree(qn->target, reg);
1151 }
1152 else if (!infinite && qn->greedy &&
1153 (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
1154 <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
1155 int n = qn->upper - qn->lower;
1156
1157 r = compile_tree_n_times(qn->target, qn->lower, reg);
1158 if (r) return r;
1159
1160 for (i = 0; i < n; i++) {
1161 r = add_opcode_rel_addr(reg, OP_PUSH,
1162 (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
1163 if (r) return r;
1164 r = compile_tree(qn->target, reg);
1165 if (r) return r;
1166 }
1167 }
1168 else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
1169 r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
1170 if (r) return r;
1171 r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
1172 if (r) return r;
1173 r = compile_tree(qn->target, reg);
1174 }
1175 else {
1176 r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
1177 }
1178 return r;
1179}
1180#endif /* USE_COMBINATION_EXPLOSION_CHECK */
1181
1182static int
1183compile_length_option_node(EncloseNode* node, regex_t* reg)
1184{
1185 int tlen;
1186 OnigOptionType prev = reg->options;
1187
1188 reg->options = node->option;
1189 tlen = compile_length_tree(node->target, reg);
1190 reg->options = prev;
1191
1192 if (tlen < 0) return tlen;
1193
1194 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1195 return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
1196 + tlen + SIZE_OP_SET_OPTION;
1197 }
1198 else
1199 return tlen;
1200}
1201
1202static int
1203compile_option_node(EncloseNode* node, regex_t* reg)
1204{
1205 int r;
1206 OnigOptionType prev = reg->options;
1207
1208 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1209 r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
1210 if (r) return r;
1211 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1212 if (r) return r;
1213 r = add_opcode(reg, OP_FAIL);
1214 if (r) return r;
1215 }
1216
1217 reg->options = node->option;
1218 r = compile_tree(node->target, reg);
1219 reg->options = prev;
1220
1221 if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
1222 if (r) return r;
1223 r = add_opcode_option(reg, OP_SET_OPTION, prev);
1224 }
1225 return r;
1226}
1227
1228static int
1229compile_length_enclose_node(EncloseNode* node, regex_t* reg)
1230{
1231 int len;
1232 int tlen;
1233
1234 if (node->type == ENCLOSE_OPTION)
1235 return compile_length_option_node(node, reg);
1236
1237 if (node->target) {
1238 tlen = compile_length_tree(node->target, reg);
1239 if (tlen < 0) return tlen;
1240 }
1241 else
1242 tlen = 0;
1243
1244 switch (node->type) {
1245 case ENCLOSE_MEMORY:
1246#ifdef USE_SUBEXP_CALL
1247 if (IS_ENCLOSE_CALLED(node)) {
1248 len = SIZE_OP_MEMORY_START_PUSH + tlen
1249 + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
1250 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1251 len += (IS_ENCLOSE_RECURSION(node)
1252 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1253 else
1254 len += (IS_ENCLOSE_RECURSION(node)
1255 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1256 }
1257 else if (IS_ENCLOSE_RECURSION(node)) {
1258 len = SIZE_OP_MEMORY_START_PUSH;
1259 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1260 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
1261 }
1262 else
1263#endif
1264 {
1265 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1266 len = SIZE_OP_MEMORY_START_PUSH;
1267 else
1268 len = SIZE_OP_MEMORY_START;
1269
1270 len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
1271 ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
1272 }
1273 break;
1274
1275 case ENCLOSE_STOP_BACKTRACK:
1276 /* Disable POP_STOP_BT optimization for simple repeat under the match cache */
1277 /* optimization because the match cache optimization pushes an extra item to */
1278 /* the stack and it breaks the assumption for this optimization. */
1279#ifndef USE_MATCH_CACHE
1280 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1281 QtfrNode* qn = NQTFR(node->target);
1282 tlen = compile_length_tree(qn->target, reg);
1283 if (tlen < 0) return tlen;
1284
1285 len = tlen * qn->lower
1286 + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
1287 }
1288 else {
1289#endif
1290 len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
1291#ifndef USE_MATCH_CACHE
1292 }
1293#endif
1294 break;
1295
1296 case ENCLOSE_CONDITION:
1297 len = SIZE_OP_CONDITION;
1298 if (NTYPE(node->target) == NT_ALT) {
1299 Node* x = node->target;
1300
1301 tlen = compile_length_tree(NCAR(x), reg); /* yes-node */
1302 if (tlen < 0) return tlen;
1303 len += tlen + SIZE_OP_JUMP;
1304 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1305 x = NCDR(x);
1306 tlen = compile_length_tree(NCAR(x), reg); /* no-node */
1307 if (tlen < 0) return tlen;
1308 len += tlen;
1309 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1310 }
1311 else {
1312 return ONIGERR_PARSER_BUG;
1313 }
1314 break;
1315
1316 case ENCLOSE_ABSENT:
1317 len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END;
1318 break;
1319
1320 default:
1321 return ONIGERR_TYPE_BUG;
1322 break;
1323 }
1324
1325 return len;
1326}
1327
1328static int get_char_length_tree(Node* node, regex_t* reg, int* len);
1329
1330static int
1331compile_enclose_node(EncloseNode* node, regex_t* reg)
1332{
1333 int r, len;
1334
1335 if (node->type == ENCLOSE_OPTION)
1336 return compile_option_node(node, reg);
1337
1338 switch (node->type) {
1339 case ENCLOSE_MEMORY:
1340#ifdef USE_SUBEXP_CALL
1341 if (IS_ENCLOSE_CALLED(node)) {
1342 r = add_opcode(reg, OP_CALL);
1343 if (r) return r;
1344 node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
1345 node->state |= NST_ADDR_FIXED;
1346 r = add_abs_addr(reg, (int )node->call_addr);
1347 if (r) return r;
1348 len = compile_length_tree(node->target, reg);
1349 len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
1350 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1351 len += (IS_ENCLOSE_RECURSION(node)
1352 ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
1353 else
1354 len += (IS_ENCLOSE_RECURSION(node)
1355 ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
1356
1357 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1358 if (r) return r;
1359 }
1360#endif
1361 if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
1362 r = add_opcode(reg, OP_MEMORY_START_PUSH);
1363 else
1364 r = add_opcode(reg, OP_MEMORY_START);
1365 if (r) return r;
1366 r = add_mem_num(reg, node->regnum);
1367 if (r) return r;
1368 r = compile_tree(node->target, reg);
1369 if (r) return r;
1370#ifdef USE_SUBEXP_CALL
1371 if (IS_ENCLOSE_CALLED(node)) {
1372 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1373 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1374 ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
1375 else
1376 r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
1377 ? OP_MEMORY_END_REC : OP_MEMORY_END));
1378
1379 if (r) return r;
1380 r = add_mem_num(reg, node->regnum);
1381 if (r) return r;
1382 r = add_opcode(reg, OP_RETURN);
1383 }
1384 else if (IS_ENCLOSE_RECURSION(node)) {
1385 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1386 r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
1387 else
1388 r = add_opcode(reg, OP_MEMORY_END_REC);
1389 if (r) return r;
1390 r = add_mem_num(reg, node->regnum);
1391 }
1392 else
1393#endif
1394 {
1395 if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
1396 r = add_opcode(reg, OP_MEMORY_END_PUSH);
1397 else
1398 r = add_opcode(reg, OP_MEMORY_END);
1399 if (r) return r;
1400 r = add_mem_num(reg, node->regnum);
1401 }
1402 break;
1403
1404 case ENCLOSE_STOP_BACKTRACK:
1405 /* Disable POP_STOP_BT optimization for simple repeat under the match cache */
1406 /* optimization because the match cache optimization pushes an extra item to */
1407 /* the stack and it breaks the assumption for this optimization. */
1408#ifndef USE_MATCH_CACHE
1409 if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
1410 QtfrNode* qn = NQTFR(node->target);
1411 r = compile_tree_n_times(qn->target, qn->lower, reg);
1412 if (r) return r;
1413
1414 len = compile_length_tree(qn->target, reg);
1415 if (len < 0) return len;
1416
1417 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
1418 if (r) return r;
1419 r = compile_tree(qn->target, reg);
1420 if (r) return r;
1421 r = add_opcode(reg, OP_POP);
1422 if (r) return r;
1423 r = add_opcode_rel_addr(reg, OP_JUMP,
1424 -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
1425 }
1426 else {
1427#endif
1428 r = add_opcode(reg, OP_PUSH_STOP_BT);
1429 if (r) return r;
1430 r = compile_tree(node->target, reg);
1431 if (r) return r;
1432 r = add_opcode(reg, OP_POP_STOP_BT);
1433#ifndef USE_MATCH_CACHE
1434 }
1435#endif
1436 break;
1437
1438 case ENCLOSE_CONDITION:
1439 r = add_opcode(reg, OP_CONDITION);
1440 if (r) return r;
1441 r = add_mem_num(reg, node->regnum);
1442 if (r) return r;
1443
1444 if (NTYPE(node->target) == NT_ALT) {
1445 Node* x = node->target;
1446 int len2;
1447
1448 len = compile_length_tree(NCAR(x), reg); /* yes-node */
1449 if (len < 0) return len;
1450 if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
1451 x = NCDR(x);
1452 len2 = compile_length_tree(NCAR(x), reg); /* no-node */
1453 if (len2 < 0) return len2;
1454 if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
1455
1456 x = node->target;
1457 r = add_rel_addr(reg, len + SIZE_OP_JUMP);
1458 if (r) return r;
1459 r = compile_tree(NCAR(x), reg); /* yes-node */
1460 if (r) return r;
1461 r = add_opcode_rel_addr(reg, OP_JUMP, len2);
1462 if (r) return r;
1463 x = NCDR(x);
1464 r = compile_tree(NCAR(x), reg); /* no-node */
1465 }
1466 else {
1467 return ONIGERR_PARSER_BUG;
1468 }
1469 break;
1470
1471 case ENCLOSE_ABSENT:
1472 len = compile_length_tree(node->target, reg);
1473 if (len < 0) return len;
1474
1475 r = add_opcode(reg, OP_PUSH_ABSENT_POS);
1476 if (r) return r;
1477 r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END);
1478 if (r) return r;
1479 r = compile_tree(node->target, reg);
1480 if (r) return r;
1481 r = add_opcode(reg, OP_ABSENT_END);
1482 break;
1483
1484 default:
1485 return ONIGERR_TYPE_BUG;
1486 break;
1487 }
1488
1489 return r;
1490}
1491
1492static int
1493compile_length_anchor_node(AnchorNode* node, regex_t* reg)
1494{
1495 int len;
1496 int tlen = 0;
1497
1498 if (node->target) {
1499 tlen = compile_length_tree(node->target, reg);
1500 if (tlen < 0) return tlen;
1501 }
1502
1503 switch (node->type) {
1504 case ANCHOR_PREC_READ:
1505 len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
1506 break;
1507 case ANCHOR_PREC_READ_NOT:
1508 len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
1509 break;
1510 case ANCHOR_LOOK_BEHIND:
1511 len = SIZE_OP_LOOK_BEHIND + tlen;
1512 break;
1513 case ANCHOR_LOOK_BEHIND_NOT:
1514 len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
1515 break;
1516
1517 default:
1518 len = SIZE_OPCODE;
1519 break;
1520 }
1521
1522 return len;
1523}
1524
1525static int
1526compile_anchor_node(AnchorNode* node, regex_t* reg)
1527{
1528 int r, len;
1529
1530 switch (node->type) {
1531 case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
1532 case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
1533 case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
1534 case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
1535 case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
1536 case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
1537
1538 case ANCHOR_WORD_BOUND:
1539 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND);
1540 else r = add_opcode(reg, OP_WORD_BOUND);
1541 break;
1542 case ANCHOR_NOT_WORD_BOUND:
1543 if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND);
1544 else r = add_opcode(reg, OP_NOT_WORD_BOUND);
1545 break;
1546#ifdef USE_WORD_BEGIN_END
1547 case ANCHOR_WORD_BEGIN:
1548 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN);
1549 else r = add_opcode(reg, OP_WORD_BEGIN);
1550 break;
1551 case ANCHOR_WORD_END:
1552 if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END);
1553 else r = add_opcode(reg, OP_WORD_END);
1554 break;
1555#endif
1556 case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break;
1557
1558 case ANCHOR_PREC_READ:
1559 r = add_opcode(reg, OP_PUSH_POS);
1560 if (r) return r;
1561 r = compile_tree(node->target, reg);
1562 if (r) return r;
1563 r = add_opcode(reg, OP_POP_POS);
1564 break;
1565
1566 case ANCHOR_PREC_READ_NOT:
1567 len = compile_length_tree(node->target, reg);
1568 if (len < 0) return len;
1569 r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
1570 if (r) return r;
1571 r = compile_tree(node->target, reg);
1572 if (r) return r;
1573 r = add_opcode(reg, OP_FAIL_POS);
1574 break;
1575
1576 case ANCHOR_LOOK_BEHIND:
1577 {
1578 int n;
1579 r = add_opcode(reg, OP_LOOK_BEHIND);
1580 if (r) return r;
1581 if (node->char_len < 0) {
1582 r = get_char_length_tree(node->target, reg, &n);
1583 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1584 }
1585 else
1586 n = node->char_len;
1587 r = add_length(reg, n);
1588 if (r) return r;
1589 r = compile_tree(node->target, reg);
1590 }
1591 break;
1592
1593 case ANCHOR_LOOK_BEHIND_NOT:
1594 {
1595 int n;
1596 len = compile_length_tree(node->target, reg);
1597 r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
1598 len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
1599 if (r) return r;
1600 if (node->char_len < 0) {
1601 r = get_char_length_tree(node->target, reg, &n);
1602 if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
1603 }
1604 else
1605 n = node->char_len;
1606 r = add_length(reg, n);
1607 if (r) return r;
1608 r = compile_tree(node->target, reg);
1609 if (r) return r;
1610 r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
1611 }
1612 break;
1613
1614 default:
1615 return ONIGERR_TYPE_BUG;
1616 break;
1617 }
1618
1619 return r;
1620}
1621
1622static int
1623compile_length_tree(Node* node, regex_t* reg)
1624{
1625 int len, type, r;
1626
1627 type = NTYPE(node);
1628 switch (type) {
1629 case NT_LIST:
1630 len = 0;
1631 do {
1632 r = compile_length_tree(NCAR(node), reg);
1633 if (r < 0) return r;
1634 len += r;
1635 } while (IS_NOT_NULL(node = NCDR(node)));
1636 r = len;
1637 break;
1638
1639 case NT_ALT:
1640 {
1641 int n = 0;
1642 len = 0;
1643 do {
1644 r = compile_length_tree(NCAR(node), reg);
1645 if (r < 0) return r;
1646 len += r;
1647 n++;
1648 } while (IS_NOT_NULL(node = NCDR(node)));
1649 r = len;
1650 r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
1651 }
1652 break;
1653
1654 case NT_STR:
1655 if (NSTRING_IS_RAW(node))
1656 r = compile_length_string_raw_node(NSTR(node), reg);
1657 else
1658 r = compile_length_string_node(node, reg);
1659 break;
1660
1661 case NT_CCLASS:
1662 r = compile_length_cclass_node(NCCLASS(node), reg);
1663 break;
1664
1665 case NT_CTYPE:
1666 case NT_CANY:
1667 r = SIZE_OPCODE;
1668 break;
1669
1670 case NT_BREF:
1671 {
1672 BRefNode* br = NBREF(node);
1673
1674#ifdef USE_BACKREF_WITH_LEVEL
1675 if (IS_BACKREF_NEST_LEVEL(br)) {
1676 r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
1677 SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1678 }
1679 else
1680#endif
1681 if (br->back_num == 1) {
1682 r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
1683 ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
1684 }
1685 else {
1686 r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
1687 }
1688 }
1689 break;
1690
1691#ifdef USE_SUBEXP_CALL
1692 case NT_CALL:
1693 r = SIZE_OP_CALL;
1694 break;
1695#endif
1696
1697 case NT_QTFR:
1698 r = compile_length_quantifier_node(NQTFR(node), reg);
1699 break;
1700
1701 case NT_ENCLOSE:
1702 r = compile_length_enclose_node(NENCLOSE(node), reg);
1703 break;
1704
1705 case NT_ANCHOR:
1706 r = compile_length_anchor_node(NANCHOR(node), reg);
1707 break;
1708
1709 default:
1710 return ONIGERR_TYPE_BUG;
1711 break;
1712 }
1713
1714 return r;
1715}
1716
1717static int
1718compile_tree(Node* node, regex_t* reg)
1719{
1720 int n, type, len, pos, r = 0;
1721
1722 type = NTYPE(node);
1723 switch (type) {
1724 case NT_LIST:
1725 do {
1726 r = compile_tree(NCAR(node), reg);
1727 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1728 break;
1729
1730 case NT_ALT:
1731 {
1732 Node* x = node;
1733 len = 0;
1734 do {
1735 len += compile_length_tree(NCAR(x), reg);
1736 if (NCDR(x) != NULL) {
1737 len += SIZE_OP_PUSH + SIZE_OP_JUMP;
1738 }
1739 } while (IS_NOT_NULL(x = NCDR(x)));
1740 pos = reg->used + len; /* goal position */
1741
1742 do {
1743 len = compile_length_tree(NCAR(node), reg);
1744 if (IS_NOT_NULL(NCDR(node))) {
1745 r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
1746 if (r) break;
1747 }
1748 r = compile_tree(NCAR(node), reg);
1749 if (r) break;
1750 if (IS_NOT_NULL(NCDR(node))) {
1751 len = pos - (reg->used + SIZE_OP_JUMP);
1752 r = add_opcode_rel_addr(reg, OP_JUMP, len);
1753 if (r) break;
1754 }
1755 } while (IS_NOT_NULL(node = NCDR(node)));
1756 }
1757 break;
1758
1759 case NT_STR:
1760 if (NSTRING_IS_RAW(node))
1761 r = compile_string_raw_node(NSTR(node), reg);
1762 else
1763 r = compile_string_node(node, reg);
1764 break;
1765
1766 case NT_CCLASS:
1767 r = compile_cclass_node(NCCLASS(node), reg);
1768 break;
1769
1770 case NT_CTYPE:
1771 {
1772 int op;
1773
1774 switch (NCTYPE(node)->ctype) {
1775 case ONIGENC_CTYPE_WORD:
1776 if (NCTYPE(node)->ascii_range != 0) {
1777 if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD;
1778 else op = OP_ASCII_WORD;
1779 }
1780 else {
1781 if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
1782 else op = OP_WORD;
1783 }
1784 break;
1785 default:
1786 return ONIGERR_TYPE_BUG;
1787 break;
1788 }
1789 r = add_opcode(reg, op);
1790 }
1791 break;
1792
1793 case NT_CANY:
1794 if (IS_MULTILINE(reg->options))
1795 r = add_opcode(reg, OP_ANYCHAR_ML);
1796 else
1797 r = add_opcode(reg, OP_ANYCHAR);
1798 break;
1799
1800 case NT_BREF:
1801 {
1802 BRefNode* br = NBREF(node);
1803
1804#ifdef USE_BACKREF_WITH_LEVEL
1805 if (IS_BACKREF_NEST_LEVEL(br)) {
1806 r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
1807 if (r) return r;
1808 r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
1809 if (r) return r;
1810 r = add_length(reg, br->nest_level);
1811 if (r) return r;
1812
1813 goto add_bacref_mems;
1814 }
1815 else
1816#endif
1817 if (br->back_num == 1) {
1818 n = br->back_static[0];
1819 if (IS_IGNORECASE(reg->options)) {
1820 r = add_opcode(reg, OP_BACKREFN_IC);
1821 if (r) return r;
1822 r = add_mem_num(reg, n);
1823 }
1824 else {
1825 switch (n) {
1826 case 1: r = add_opcode(reg, OP_BACKREF1); break;
1827 case 2: r = add_opcode(reg, OP_BACKREF2); break;
1828 default:
1829 r = add_opcode(reg, OP_BACKREFN);
1830 if (r) return r;
1831 r = add_mem_num(reg, n);
1832 break;
1833 }
1834 }
1835 }
1836 else {
1837 int i;
1838 int* p;
1839
1840 if (IS_IGNORECASE(reg->options)) {
1841 r = add_opcode(reg, OP_BACKREF_MULTI_IC);
1842 }
1843 else {
1844 r = add_opcode(reg, OP_BACKREF_MULTI);
1845 }
1846 if (r) return r;
1847
1848#ifdef USE_BACKREF_WITH_LEVEL
1849 add_bacref_mems:
1850#endif
1851 r = add_length(reg, br->back_num);
1852 if (r) return r;
1853 p = BACKREFS_P(br);
1854 for (i = br->back_num - 1; i >= 0; i--) {
1855 r = add_mem_num(reg, p[i]);
1856 if (r) return r;
1857 }
1858 }
1859 }
1860 break;
1861
1862#ifdef USE_SUBEXP_CALL
1863 case NT_CALL:
1864 r = compile_call(NCALL(node), reg);
1865 break;
1866#endif
1867
1868 case NT_QTFR:
1869 r = compile_quantifier_node(NQTFR(node), reg);
1870 break;
1871
1872 case NT_ENCLOSE:
1873 r = compile_enclose_node(NENCLOSE(node), reg);
1874 break;
1875
1876 case NT_ANCHOR:
1877 r = compile_anchor_node(NANCHOR(node), reg);
1878 break;
1879
1880 default:
1881#ifdef ONIG_DEBUG
1882 fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
1883#endif
1884 break;
1885 }
1886
1887 return r;
1888}
1889
1890#ifdef USE_NAMED_GROUP
1891
1892static int
1893noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
1894{
1895 int r = 0;
1896 Node* node = *plink;
1897
1898 switch (NTYPE(node)) {
1899 case NT_LIST:
1900 case NT_ALT:
1901 do {
1902 r = noname_disable_map(&(NCAR(node)), map, counter);
1903 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1904 break;
1905
1906 case NT_QTFR:
1907 {
1908 Node** ptarget = &(NQTFR(node)->target);
1909 Node* old = *ptarget;
1910 r = noname_disable_map(ptarget, map, counter);
1911 if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
1912 onig_reduce_nested_quantifier(node, *ptarget);
1913 }
1914 }
1915 break;
1916
1917 case NT_ENCLOSE:
1918 {
1919 EncloseNode* en = NENCLOSE(node);
1920 if (en->type == ENCLOSE_MEMORY) {
1921 if (IS_ENCLOSE_NAMED_GROUP(en)) {
1922 (*counter)++;
1923 map[en->regnum].new_val = *counter;
1924 en->regnum = *counter;
1925 }
1926 else if (en->regnum != 0) {
1927 *plink = en->target;
1928 en->target = NULL_NODE;
1929 onig_node_free(node);
1930 r = noname_disable_map(plink, map, counter);
1931 break;
1932 }
1933 }
1934 r = noname_disable_map(&(en->target), map, counter);
1935 }
1936 break;
1937
1938 case NT_ANCHOR:
1939 if (NANCHOR(node)->target)
1940 r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
1941 break;
1942
1943 default:
1944 break;
1945 }
1946
1947 return r;
1948}
1949
1950static int
1951renumber_node_backref(Node* node, GroupNumRemap* map, const int num_mem)
1952{
1953 int i, pos, n, old_num;
1954 int *backs;
1955 BRefNode* bn = NBREF(node);
1956
1957 if (! IS_BACKREF_NAME_REF(bn))
1958 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
1959
1960 old_num = bn->back_num;
1961 if (IS_NULL(bn->back_dynamic))
1962 backs = bn->back_static;
1963 else
1964 backs = bn->back_dynamic;
1965
1966 for (i = 0, pos = 0; i < old_num; i++) {
1967 if (backs[i] > num_mem) return ONIGERR_INVALID_BACKREF;
1968 n = map[backs[i]].new_val;
1969 if (n > 0) {
1970 backs[pos] = n;
1971 pos++;
1972 }
1973 }
1974
1975 bn->back_num = pos;
1976 return 0;
1977}
1978
1979static int
1980renumber_by_map(Node* node, GroupNumRemap* map, const int num_mem)
1981{
1982 int r = 0;
1983
1984 switch (NTYPE(node)) {
1985 case NT_LIST:
1986 case NT_ALT:
1987 do {
1988 r = renumber_by_map(NCAR(node), map, num_mem);
1989 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
1990 break;
1991 case NT_QTFR:
1992 r = renumber_by_map(NQTFR(node)->target, map, num_mem);
1993 break;
1994 case NT_ENCLOSE:
1995 {
1996 EncloseNode* en = NENCLOSE(node);
1997 if (en->type == ENCLOSE_CONDITION) {
1998 if (en->regnum > num_mem) return ONIGERR_INVALID_BACKREF;
1999 en->regnum = map[en->regnum].new_val;
2000 }
2001 r = renumber_by_map(en->target, map, num_mem);
2002 }
2003 break;
2004
2005 case NT_BREF:
2006 r = renumber_node_backref(node, map, num_mem);
2007 break;
2008
2009 case NT_ANCHOR:
2010 if (NANCHOR(node)->target)
2011 r = renumber_by_map(NANCHOR(node)->target, map, num_mem);
2012 break;
2013
2014 default:
2015 break;
2016 }
2017
2018 return r;
2019}
2020
2021static int
2022numbered_ref_check(Node* node)
2023{
2024 int r = 0;
2025
2026 switch (NTYPE(node)) {
2027 case NT_LIST:
2028 case NT_ALT:
2029 do {
2030 r = numbered_ref_check(NCAR(node));
2031 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2032 break;
2033 case NT_QTFR:
2034 r = numbered_ref_check(NQTFR(node)->target);
2035 break;
2036 case NT_ENCLOSE:
2037 r = numbered_ref_check(NENCLOSE(node)->target);
2038 break;
2039
2040 case NT_BREF:
2041 if (! IS_BACKREF_NAME_REF(NBREF(node)))
2042 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
2043 break;
2044
2045 case NT_ANCHOR:
2046 if (NANCHOR(node)->target)
2047 r = numbered_ref_check(NANCHOR(node)->target);
2048 break;
2049
2050 default:
2051 break;
2052 }
2053
2054 return r;
2055}
2056
2057static int
2058disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
2059{
2060 int r, i, pos, counter;
2061 BitStatusType loc;
2062 GroupNumRemap* map;
2063
2064 map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
2065 CHECK_NULL_RETURN_MEMERR(map);
2066 for (i = 1; i <= env->num_mem; i++) {
2067 map[i].new_val = 0;
2068 }
2069 counter = 0;
2070 r = noname_disable_map(root, map, &counter);
2071 if (r != 0) return r;
2072
2073 r = renumber_by_map(*root, map, env->num_mem);
2074 if (r != 0) return r;
2075
2076 for (i = 1, pos = 1; i <= env->num_mem; i++) {
2077 if (map[i].new_val > 0) {
2078 SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
2079 pos++;
2080 }
2081 }
2082
2083 loc = env->capture_history;
2084 BIT_STATUS_CLEAR(env->capture_history);
2085 for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
2086 if (BIT_STATUS_AT(loc, i)) {
2087 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
2088 }
2089 }
2090
2091 env->num_mem = env->num_named;
2092 reg->num_mem = env->num_named;
2093
2094 return onig_renumber_name_table(reg, map);
2095}
2096#endif /* USE_NAMED_GROUP */
2097
2098#ifdef USE_SUBEXP_CALL
2099static int
2100unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
2101{
2102 int i, offset;
2103 EncloseNode* en;
2104 AbsAddrType addr;
2105
2106 for (i = 0; i < uslist->num; i++) {
2107 en = NENCLOSE(uslist->us[i].target);
2108 if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
2109 addr = en->call_addr;
2110 offset = uslist->us[i].offset;
2111
2112 BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
2113 }
2114 return 0;
2115}
2116#endif
2117
2118#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
2119static int
2120quantifiers_memory_node_info(Node* node)
2121{
2122 int r = 0;
2123
2124 switch (NTYPE(node)) {
2125 case NT_LIST:
2126 case NT_ALT:
2127 {
2128 int v;
2129 do {
2130 v = quantifiers_memory_node_info(NCAR(node));
2131 if (v > r) r = v;
2132 } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
2133 }
2134 break;
2135
2136# ifdef USE_SUBEXP_CALL
2137 case NT_CALL:
2138 if (IS_CALL_RECURSION(NCALL(node))) {
2139 return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
2140 }
2141 else
2142 r = quantifiers_memory_node_info(NCALL(node)->target);
2143 break;
2144# endif
2145
2146 case NT_QTFR:
2147 {
2148 QtfrNode* qn = NQTFR(node);
2149 if (qn->upper != 0) {
2150 r = quantifiers_memory_node_info(qn->target);
2151 }
2152 }
2153 break;
2154
2155 case NT_ENCLOSE:
2156 {
2157 EncloseNode* en = NENCLOSE(node);
2158 switch (en->type) {
2159 case ENCLOSE_MEMORY:
2160 return NQ_TARGET_IS_EMPTY_MEM;
2161 break;
2162
2163 case ENCLOSE_OPTION:
2164 case ENCLOSE_STOP_BACKTRACK:
2165 case ENCLOSE_CONDITION:
2166 case ENCLOSE_ABSENT:
2167 r = quantifiers_memory_node_info(en->target);
2168 break;
2169 default:
2170 break;
2171 }
2172 }
2173 break;
2174
2175 case NT_BREF:
2176 case NT_STR:
2177 case NT_CTYPE:
2178 case NT_CCLASS:
2179 case NT_CANY:
2180 case NT_ANCHOR:
2181 default:
2182 break;
2183 }
2184
2185 return r;
2186}
2187#endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
2188
2189static int
2190get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
2191{
2192 OnigDistance tmin;
2193 int r = 0;
2194
2195 *min = 0;
2196 switch (NTYPE(node)) {
2197 case NT_BREF:
2198 {
2199 int i;
2200 int* backs;
2201 Node** nodes = SCANENV_MEM_NODES(env);
2202 BRefNode* br = NBREF(node);
2203 if (br->state & NST_RECURSION) break;
2204
2205 backs = BACKREFS_P(br);
2206 if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2207 r = get_min_match_length(nodes[backs[0]], min, env);
2208 if (r != 0) break;
2209 for (i = 1; i < br->back_num; i++) {
2210 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2211 r = get_min_match_length(nodes[backs[i]], &tmin, env);
2212 if (r != 0) break;
2213 if (*min > tmin) *min = tmin;
2214 }
2215 }
2216 break;
2217
2218#ifdef USE_SUBEXP_CALL
2219 case NT_CALL:
2220 if (IS_CALL_RECURSION(NCALL(node))) {
2221 EncloseNode* en = NENCLOSE(NCALL(node)->target);
2222 if (IS_ENCLOSE_MIN_FIXED(en))
2223 *min = en->min_len;
2224 }
2225 else
2226 r = get_min_match_length(NCALL(node)->target, min, env);
2227 break;
2228#endif
2229
2230 case NT_LIST:
2231 do {
2232 r = get_min_match_length(NCAR(node), &tmin, env);
2233 if (r == 0) *min += tmin;
2234 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2235 break;
2236
2237 case NT_ALT:
2238 {
2239 Node *x, *y;
2240 y = node;
2241 do {
2242 x = NCAR(y);
2243 r = get_min_match_length(x, &tmin, env);
2244 if (r != 0) break;
2245 if (y == node) *min = tmin;
2246 else if (*min > tmin) *min = tmin;
2247 } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
2248 }
2249 break;
2250
2251 case NT_STR:
2252 {
2253 StrNode* sn = NSTR(node);
2254 *min = sn->end - sn->s;
2255 }
2256 break;
2257
2258 case NT_CTYPE:
2259 *min = 1;
2260 break;
2261
2262 case NT_CCLASS:
2263 case NT_CANY:
2264 *min = 1;
2265 break;
2266
2267 case NT_QTFR:
2268 {
2269 QtfrNode* qn = NQTFR(node);
2270
2271 if (qn->lower > 0) {
2272 r = get_min_match_length(qn->target, min, env);
2273 if (r == 0)
2274 *min = distance_multiply(*min, qn->lower);
2275 }
2276 }
2277 break;
2278
2279 case NT_ENCLOSE:
2280 {
2281 EncloseNode* en = NENCLOSE(node);
2282 switch (en->type) {
2283 case ENCLOSE_MEMORY:
2284 if (IS_ENCLOSE_MIN_FIXED(en))
2285 *min = en->min_len;
2286 else {
2287 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2288 *min = 0; /* recursive */
2289 else {
2290 SET_ENCLOSE_STATUS(node, NST_MARK1);
2291 r = get_min_match_length(en->target, min, env);
2292 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2293 if (r == 0) {
2294 en->min_len = *min;
2295 SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
2296 }
2297 }
2298 }
2299 break;
2300
2301 case ENCLOSE_OPTION:
2302 case ENCLOSE_STOP_BACKTRACK:
2303 case ENCLOSE_CONDITION:
2304 r = get_min_match_length(en->target, min, env);
2305 break;
2306
2307 case ENCLOSE_ABSENT:
2308 break;
2309 }
2310 }
2311 break;
2312
2313 case NT_ANCHOR:
2314 default:
2315 break;
2316 }
2317
2318 return r;
2319}
2320
2321static int
2322get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
2323{
2324 OnigDistance tmax;
2325 int r = 0;
2326
2327 *max = 0;
2328 switch (NTYPE(node)) {
2329 case NT_LIST:
2330 do {
2331 r = get_max_match_length(NCAR(node), &tmax, env);
2332 if (r == 0)
2333 *max = distance_add(*max, tmax);
2334 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2335 break;
2336
2337 case NT_ALT:
2338 do {
2339 r = get_max_match_length(NCAR(node), &tmax, env);
2340 if (r == 0 && *max < tmax) *max = tmax;
2341 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2342 break;
2343
2344 case NT_STR:
2345 {
2346 StrNode* sn = NSTR(node);
2347 *max = sn->end - sn->s;
2348 }
2349 break;
2350
2351 case NT_CTYPE:
2352 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2353 break;
2354
2355 case NT_CCLASS:
2356 case NT_CANY:
2357 *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
2358 break;
2359
2360 case NT_BREF:
2361 {
2362 int i;
2363 int* backs;
2364 Node** nodes = SCANENV_MEM_NODES(env);
2365 BRefNode* br = NBREF(node);
2366 if (br->state & NST_RECURSION) {
2367 *max = ONIG_INFINITE_DISTANCE;
2368 break;
2369 }
2370 backs = BACKREFS_P(br);
2371 for (i = 0; i < br->back_num; i++) {
2372 if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
2373 r = get_max_match_length(nodes[backs[i]], &tmax, env);
2374 if (r != 0) break;
2375 if (*max < tmax) *max = tmax;
2376 }
2377 }
2378 break;
2379
2380#ifdef USE_SUBEXP_CALL
2381 case NT_CALL:
2382 if (! IS_CALL_RECURSION(NCALL(node)))
2383 r = get_max_match_length(NCALL(node)->target, max, env);
2384 else
2385 *max = ONIG_INFINITE_DISTANCE;
2386 break;
2387#endif
2388
2389 case NT_QTFR:
2390 {
2391 QtfrNode* qn = NQTFR(node);
2392
2393 if (qn->upper != 0) {
2394 r = get_max_match_length(qn->target, max, env);
2395 if (r == 0 && *max != 0) {
2396 if (! IS_REPEAT_INFINITE(qn->upper))
2397 *max = distance_multiply(*max, qn->upper);
2398 else
2399 *max = ONIG_INFINITE_DISTANCE;
2400 }
2401 }
2402 }
2403 break;
2404
2405 case NT_ENCLOSE:
2406 {
2407 EncloseNode* en = NENCLOSE(node);
2408 switch (en->type) {
2409 case ENCLOSE_MEMORY:
2410 if (IS_ENCLOSE_MAX_FIXED(en))
2411 *max = en->max_len;
2412 else {
2413 if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2414 *max = ONIG_INFINITE_DISTANCE;
2415 else {
2416 SET_ENCLOSE_STATUS(node, NST_MARK1);
2417 r = get_max_match_length(en->target, max, env);
2418 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
2419 if (r == 0) {
2420 en->max_len = *max;
2421 SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
2422 }
2423 }
2424 }
2425 break;
2426
2427 case ENCLOSE_OPTION:
2428 case ENCLOSE_STOP_BACKTRACK:
2429 case ENCLOSE_CONDITION:
2430 r = get_max_match_length(en->target, max, env);
2431 break;
2432
2433 case ENCLOSE_ABSENT:
2434 break;
2435 }
2436 }
2437 break;
2438
2439 case NT_ANCHOR:
2440 default:
2441 break;
2442 }
2443
2444 return r;
2445}
2446
2447#define GET_CHAR_LEN_VARLEN -1
2448#define GET_CHAR_LEN_TOP_ALT_VARLEN -2
2449
2450/* fixed size pattern node only */
2451static int
2452get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
2453{
2454 int tlen;
2455 int r = 0;
2456
2457 level++;
2458 *len = 0;
2459 switch (NTYPE(node)) {
2460 case NT_LIST:
2461 do {
2462 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2463 if (r == 0)
2464 *len = (int )distance_add(*len, tlen);
2465 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2466 break;
2467
2468 case NT_ALT:
2469 {
2470 int tlen2;
2471 int varlen = 0;
2472
2473 r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
2474 while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
2475 r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
2476 if (r == 0) {
2477 if (tlen != tlen2)
2478 varlen = 1;
2479 }
2480 }
2481 if (r == 0) {
2482 if (varlen != 0) {
2483 if (level == 1)
2484 r = GET_CHAR_LEN_TOP_ALT_VARLEN;
2485 else
2486 r = GET_CHAR_LEN_VARLEN;
2487 }
2488 else
2489 *len = tlen;
2490 }
2491 }
2492 break;
2493
2494 case NT_STR:
2495 {
2496 StrNode* sn = NSTR(node);
2497 UChar *s = sn->s;
2498 while (s < sn->end) {
2499 s += enclen(reg->enc, s, sn->end);
2500 (*len)++;
2501 }
2502 }
2503 break;
2504
2505 case NT_QTFR:
2506 {
2507 QtfrNode* qn = NQTFR(node);
2508 if (qn->lower == qn->upper) {
2509 r = get_char_length_tree1(qn->target, reg, &tlen, level);
2510 if (r == 0)
2511 *len = (int )distance_multiply(tlen, qn->lower);
2512 }
2513 else
2514 r = GET_CHAR_LEN_VARLEN;
2515 }
2516 break;
2517
2518#ifdef USE_SUBEXP_CALL
2519 case NT_CALL:
2520 if (! IS_CALL_RECURSION(NCALL(node)))
2521 r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
2522 else
2523 r = GET_CHAR_LEN_VARLEN;
2524 break;
2525#endif
2526
2527 case NT_CTYPE:
2528 *len = 1;
2529 break;
2530
2531 case NT_CCLASS:
2532 case NT_CANY:
2533 *len = 1;
2534 break;
2535
2536 case NT_ENCLOSE:
2537 {
2538 EncloseNode* en = NENCLOSE(node);
2539 switch (en->type) {
2540 case ENCLOSE_MEMORY:
2541#ifdef USE_SUBEXP_CALL
2542 if (IS_ENCLOSE_CLEN_FIXED(en))
2543 *len = en->char_len;
2544 else {
2545 r = get_char_length_tree1(en->target, reg, len, level);
2546 if (r == 0) {
2547 en->char_len = *len;
2548 SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
2549 }
2550 }
2551 break;
2552#endif
2553 case ENCLOSE_OPTION:
2554 case ENCLOSE_STOP_BACKTRACK:
2555 case ENCLOSE_CONDITION:
2556 r = get_char_length_tree1(en->target, reg, len, level);
2557 break;
2558 case ENCLOSE_ABSENT:
2559 default:
2560 break;
2561 }
2562 }
2563 break;
2564
2565 case NT_ANCHOR:
2566 break;
2567
2568 default:
2569 r = GET_CHAR_LEN_VARLEN;
2570 break;
2571 }
2572
2573 return r;
2574}
2575
2576static int
2577get_char_length_tree(Node* node, regex_t* reg, int* len)
2578{
2579 return get_char_length_tree1(node, reg, len, 0);
2580}
2581
2582/* x is not included y ==> 1 : 0 */
2583static int
2584is_not_included(Node* x, Node* y, regex_t* reg)
2585{
2586 int i;
2587 OnigDistance len;
2588 OnigCodePoint code;
2589 UChar *p;
2590 int ytype;
2591
2592 retry:
2593 ytype = NTYPE(y);
2594 switch (NTYPE(x)) {
2595 case NT_CTYPE:
2596 {
2597 switch (ytype) {
2598 case NT_CTYPE:
2599 if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
2600 NCTYPE(y)->not != NCTYPE(x)->not &&
2601 NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range)
2602 return 1;
2603 else
2604 return 0;
2605 break;
2606
2607 case NT_CCLASS:
2608 swap:
2609 {
2610 Node* tmp;
2611 tmp = x; x = y; y = tmp;
2612 goto retry;
2613 }
2614 break;
2615
2616 case NT_STR:
2617 goto swap;
2618 break;
2619
2620 default:
2621 break;
2622 }
2623 }
2624 break;
2625
2626 case NT_CCLASS:
2627 {
2628 CClassNode* xc = NCCLASS(x);
2629 switch (ytype) {
2630 case NT_CTYPE:
2631 switch (NCTYPE(y)->ctype) {
2632 case ONIGENC_CTYPE_WORD:
2633 if (NCTYPE(y)->not == 0) {
2634 if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
2635 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2636 if (BITSET_AT(xc->bs, i)) {
2637 if (NCTYPE(y)->ascii_range) {
2638 if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
2639 }
2640 else {
2641 if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
2642 }
2643 }
2644 }
2645 return 1;
2646 }
2647 return 0;
2648 }
2649 else {
2650 if (IS_NOT_NULL(xc->mbuf)) return 0;
2651 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2652 int is_word;
2653 if (NCTYPE(y)->ascii_range)
2654 is_word = IS_CODE_SB_WORD(reg->enc, i);
2655 else
2656 is_word = ONIGENC_IS_CODE_WORD(reg->enc, i);
2657 if (! is_word) {
2658 if (!IS_NCCLASS_NOT(xc)) {
2659 if (BITSET_AT(xc->bs, i))
2660 return 0;
2661 }
2662 else {
2663 if (! BITSET_AT(xc->bs, i))
2664 return 0;
2665 }
2666 }
2667 }
2668 return 1;
2669 }
2670 break;
2671
2672 default:
2673 break;
2674 }
2675 break;
2676
2677 case NT_CCLASS:
2678 {
2679 int v;
2680 CClassNode* yc = NCCLASS(y);
2681
2682 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
2683 v = BITSET_AT(xc->bs, i);
2684 if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
2685 (v == 0 && IS_NCCLASS_NOT(xc))) {
2686 v = BITSET_AT(yc->bs, i);
2687 if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
2688 (v == 0 && IS_NCCLASS_NOT(yc)))
2689 return 0;
2690 }
2691 }
2692 if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
2693 (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
2694 return 1;
2695 return 0;
2696 }
2697 break;
2698
2699 case NT_STR:
2700 goto swap;
2701 break;
2702
2703 default:
2704 break;
2705 }
2706 }
2707 break;
2708
2709 case NT_STR:
2710 {
2711 StrNode* xs = NSTR(x);
2712 if (NSTRING_LEN(x) == 0)
2713 break;
2714
2715 switch (ytype) {
2716 case NT_CTYPE:
2717 switch (NCTYPE(y)->ctype) {
2718 case ONIGENC_CTYPE_WORD:
2719 if (NCTYPE(y)->ascii_range) {
2720 if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end))
2721 return NCTYPE(y)->not;
2722 else
2723 return !(NCTYPE(y)->not);
2724 }
2725 else {
2726 if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
2727 return NCTYPE(y)->not;
2728 else
2729 return !(NCTYPE(y)->not);
2730 }
2731 break;
2732 default:
2733 break;
2734 }
2735 break;
2736
2737 case NT_CCLASS:
2738 {
2739 CClassNode* cc = NCCLASS(y);
2740
2741 code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
2742 xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
2743 return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
2744 }
2745 break;
2746
2747 case NT_STR:
2748 {
2749 UChar *q;
2750 StrNode* ys = NSTR(y);
2751 len = NSTRING_LEN(x);
2752 if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
2753 if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
2754 /* tiny version */
2755 return 0;
2756 }
2757 else {
2758 for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) {
2759 if (*p != *q) return 1;
2760 }
2761 }
2762 }
2763 break;
2764
2765 default:
2766 break;
2767 }
2768 }
2769 break;
2770
2771 default:
2772 break;
2773 }
2774
2775 return 0;
2776}
2777
2778static Node*
2779get_head_value_node(Node* node, int exact, regex_t* reg)
2780{
2781 Node* n = NULL_NODE;
2782
2783 switch (NTYPE(node)) {
2784 case NT_BREF:
2785 case NT_ALT:
2786 case NT_CANY:
2787#ifdef USE_SUBEXP_CALL
2788 case NT_CALL:
2789#endif
2790 break;
2791
2792 case NT_CTYPE:
2793 case NT_CCLASS:
2794 if (exact == 0) {
2795 n = node;
2796 }
2797 break;
2798
2799 case NT_LIST:
2800 n = get_head_value_node(NCAR(node), exact, reg);
2801 break;
2802
2803 case NT_STR:
2804 {
2805 StrNode* sn = NSTR(node);
2806 if (sn->end <= sn->s)
2807 break;
2808
2809 if (exact == 0 ||
2810 NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options)) {
2811 n = node;
2812 }
2813 }
2814 break;
2815
2816 case NT_QTFR:
2817 {
2818 QtfrNode* qn = NQTFR(node);
2819 if (qn->lower > 0) {
2820#ifdef USE_OP_PUSH_OR_JUMP_EXACT
2821 if (IS_NOT_NULL(qn->head_exact))
2822 n = qn->head_exact;
2823 else
2824#endif
2825 n = get_head_value_node(qn->target, exact, reg);
2826 }
2827 }
2828 break;
2829
2830 case NT_ENCLOSE:
2831 {
2832 EncloseNode* en = NENCLOSE(node);
2833 switch (en->type) {
2834 case ENCLOSE_OPTION:
2835 {
2836 OnigOptionType options = reg->options;
2837
2838 reg->options = NENCLOSE(node)->option;
2839 n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
2840 reg->options = options;
2841 }
2842 break;
2843
2844 case ENCLOSE_MEMORY:
2845 case ENCLOSE_STOP_BACKTRACK:
2846 case ENCLOSE_CONDITION:
2847 n = get_head_value_node(en->target, exact, reg);
2848 break;
2849
2850 case ENCLOSE_ABSENT:
2851 break;
2852 }
2853 }
2854 break;
2855
2856 case NT_ANCHOR:
2857 if (NANCHOR(node)->type == ANCHOR_PREC_READ)
2858 n = get_head_value_node(NANCHOR(node)->target, exact, reg);
2859 break;
2860
2861 default:
2862 break;
2863 }
2864
2865 return n;
2866}
2867
2868static int
2869check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
2870{
2871 int type, r = 0;
2872
2873 type = NTYPE(node);
2874 if ((NTYPE2BIT(type) & type_mask) == 0)
2875 return 1;
2876
2877 switch (type) {
2878 case NT_LIST:
2879 case NT_ALT:
2880 do {
2881 r = check_type_tree(NCAR(node), type_mask, enclose_mask,
2882 anchor_mask);
2883 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
2884 break;
2885
2886 case NT_QTFR:
2887 r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
2888 anchor_mask);
2889 break;
2890
2891 case NT_ENCLOSE:
2892 {
2893 EncloseNode* en = NENCLOSE(node);
2894 if ((en->type & enclose_mask) == 0)
2895 return 1;
2896
2897 r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
2898 }
2899 break;
2900
2901 case NT_ANCHOR:
2902 type = NANCHOR(node)->type;
2903 if ((type & anchor_mask) == 0)
2904 return 1;
2905
2906 if (NANCHOR(node)->target)
2907 r = check_type_tree(NANCHOR(node)->target,
2908 type_mask, enclose_mask, anchor_mask);
2909 break;
2910
2911 default:
2912 break;
2913 }
2914 return r;
2915}
2916
2917#ifdef USE_SUBEXP_CALL
2918
2919# define RECURSION_EXIST 1
2920# define RECURSION_INFINITE 2
2921
2922static int
2923subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
2924{
2925 int type;
2926 int r = 0;
2927
2928 type = NTYPE(node);
2929 switch (type) {
2930 case NT_LIST:
2931 {
2932 Node *x;
2933 OnigDistance min;
2934 int ret;
2935
2936 x = node;
2937 do {
2938 ret = subexp_inf_recursive_check(NCAR(x), env, head);
2939 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2940 r |= ret;
2941 if (head) {
2942 ret = get_min_match_length(NCAR(x), &min, env);
2943 if (ret != 0) return ret;
2944 if (min != 0) head = 0;
2945 }
2946 } while (IS_NOT_NULL(x = NCDR(x)));
2947 }
2948 break;
2949
2950 case NT_ALT:
2951 {
2952 int ret;
2953 r = RECURSION_EXIST;
2954 do {
2955 ret = subexp_inf_recursive_check(NCAR(node), env, head);
2956 if (ret < 0 || ret == RECURSION_INFINITE) return ret;
2957 r &= ret;
2958 } while (IS_NOT_NULL(node = NCDR(node)));
2959 }
2960 break;
2961
2962 case NT_QTFR:
2963 r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
2964 if (r == RECURSION_EXIST) {
2965 if (NQTFR(node)->lower == 0) r = 0;
2966 }
2967 break;
2968
2969 case NT_ANCHOR:
2970 {
2971 AnchorNode* an = NANCHOR(node);
2972 switch (an->type) {
2973 case ANCHOR_PREC_READ:
2974 case ANCHOR_PREC_READ_NOT:
2975 case ANCHOR_LOOK_BEHIND:
2976 case ANCHOR_LOOK_BEHIND_NOT:
2977 r = subexp_inf_recursive_check(an->target, env, head);
2978 break;
2979 }
2980 }
2981 break;
2982
2983 case NT_CALL:
2984 r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
2985 break;
2986
2987 case NT_ENCLOSE:
2988 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
2989 return 0;
2990 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
2991 return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
2992 else {
2993 SET_ENCLOSE_STATUS(node, NST_MARK2);
2994 r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
2995 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
2996 }
2997 break;
2998
2999 default:
3000 break;
3001 }
3002
3003 return r;
3004}
3005
3006static int
3007subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
3008{
3009 int type;
3010 int r = 0;
3011
3012 type = NTYPE(node);
3013 switch (type) {
3014 case NT_LIST:
3015 case NT_ALT:
3016 do {
3017 r = subexp_inf_recursive_check_trav(NCAR(node), env);
3018 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3019 break;
3020
3021 case NT_QTFR:
3022 r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
3023 break;
3024
3025 case NT_ANCHOR:
3026 {
3027 AnchorNode* an = NANCHOR(node);
3028 switch (an->type) {
3029 case ANCHOR_PREC_READ:
3030 case ANCHOR_PREC_READ_NOT:
3031 case ANCHOR_LOOK_BEHIND:
3032 case ANCHOR_LOOK_BEHIND_NOT:
3033 r = subexp_inf_recursive_check_trav(an->target, env);
3034 break;
3035 }
3036 }
3037 break;
3038
3039 case NT_ENCLOSE:
3040 {
3041 EncloseNode* en = NENCLOSE(node);
3042
3043 if (IS_ENCLOSE_RECURSION(en)) {
3044 SET_ENCLOSE_STATUS(node, NST_MARK1);
3045 r = subexp_inf_recursive_check(en->target, env, 1);
3046 if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
3047 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3048 }
3049 r = subexp_inf_recursive_check_trav(en->target, env);
3050 }
3051
3052 break;
3053
3054 default:
3055 break;
3056 }
3057
3058 return r;
3059}
3060
3061static int
3062subexp_recursive_check(Node* node)
3063{
3064 int r = 0;
3065
3066 switch (NTYPE(node)) {
3067 case NT_LIST:
3068 case NT_ALT:
3069 do {
3070 r |= subexp_recursive_check(NCAR(node));
3071 } while (IS_NOT_NULL(node = NCDR(node)));
3072 break;
3073
3074 case NT_QTFR:
3075 r = subexp_recursive_check(NQTFR(node)->target);
3076 break;
3077
3078 case NT_ANCHOR:
3079 {
3080 AnchorNode* an = NANCHOR(node);
3081 switch (an->type) {
3082 case ANCHOR_PREC_READ:
3083 case ANCHOR_PREC_READ_NOT:
3084 case ANCHOR_LOOK_BEHIND:
3085 case ANCHOR_LOOK_BEHIND_NOT:
3086 r = subexp_recursive_check(an->target);
3087 break;
3088 }
3089 }
3090 break;
3091
3092 case NT_CALL:
3093 r = subexp_recursive_check(NCALL(node)->target);
3094 if (r != 0) SET_CALL_RECURSION(node);
3095 break;
3096
3097 case NT_ENCLOSE:
3098 if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
3099 return 0;
3100 else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
3101 return 1; /* recursion */
3102 else {
3103 SET_ENCLOSE_STATUS(node, NST_MARK2);
3104 r = subexp_recursive_check(NENCLOSE(node)->target);
3105 CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
3106 }
3107 break;
3108
3109 default:
3110 break;
3111 }
3112
3113 return r;
3114}
3115
3116
3117static int
3118subexp_recursive_check_trav(Node* node, ScanEnv* env)
3119{
3120# define FOUND_CALLED_NODE 1
3121
3122 int type;
3123 int r = 0;
3124
3125 type = NTYPE(node);
3126 switch (type) {
3127 case NT_LIST:
3128 case NT_ALT:
3129 {
3130 int ret;
3131 do {
3132 ret = subexp_recursive_check_trav(NCAR(node), env);
3133 if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
3134 else if (ret < 0) return ret;
3135 } while (IS_NOT_NULL(node = NCDR(node)));
3136 }
3137 break;
3138
3139 case NT_QTFR:
3140 r = subexp_recursive_check_trav(NQTFR(node)->target, env);
3141 if (NQTFR(node)->upper == 0) {
3142 if (r == FOUND_CALLED_NODE)
3143 NQTFR(node)->is_referred = 1;
3144 }
3145 break;
3146
3147 case NT_ANCHOR:
3148 {
3149 AnchorNode* an = NANCHOR(node);
3150 switch (an->type) {
3151 case ANCHOR_PREC_READ:
3152 case ANCHOR_PREC_READ_NOT:
3153 case ANCHOR_LOOK_BEHIND:
3154 case ANCHOR_LOOK_BEHIND_NOT:
3155 r = subexp_recursive_check_trav(an->target, env);
3156 break;
3157 }
3158 }
3159 break;
3160
3161 case NT_ENCLOSE:
3162 {
3163 EncloseNode* en = NENCLOSE(node);
3164
3165 if (! IS_ENCLOSE_RECURSION(en)) {
3166 if (IS_ENCLOSE_CALLED(en)) {
3167 SET_ENCLOSE_STATUS(node, NST_MARK1);
3168 r = subexp_recursive_check(en->target);
3169 if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
3170 CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
3171 }
3172 }
3173 r = subexp_recursive_check_trav(en->target, env);
3174 if (IS_ENCLOSE_CALLED(en))
3175 r |= FOUND_CALLED_NODE;
3176 }
3177 break;
3178
3179 default:
3180 break;
3181 }
3182
3183 return r;
3184}
3185
3186static int
3187setup_subexp_call(Node* node, ScanEnv* env)
3188{
3189 int type;
3190 int r = 0;
3191
3192 type = NTYPE(node);
3193 switch (type) {
3194 case NT_LIST:
3195 do {
3196 r = setup_subexp_call(NCAR(node), env);
3197 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3198 break;
3199
3200 case NT_ALT:
3201 do {
3202 r = setup_subexp_call(NCAR(node), env);
3203 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3204 break;
3205
3206 case NT_QTFR:
3207 r = setup_subexp_call(NQTFR(node)->target, env);
3208 break;
3209 case NT_ENCLOSE:
3210 r = setup_subexp_call(NENCLOSE(node)->target, env);
3211 break;
3212
3213 case NT_CALL:
3214 {
3215 CallNode* cn = NCALL(node);
3216 Node** nodes = SCANENV_MEM_NODES(env);
3217
3218 if (cn->group_num != 0) {
3219 int gnum = cn->group_num;
3220
3221# ifdef USE_NAMED_GROUP
3222 if (env->num_named > 0 &&
3223 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
3224 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
3225 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
3226 }
3227# endif
3228 if (gnum > env->num_mem) {
3229 onig_scan_env_set_error_string(env,
3230 ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
3231 return ONIGERR_UNDEFINED_GROUP_REFERENCE;
3232 }
3233
3234# ifdef USE_NAMED_GROUP
3235 set_call_attr:
3236# endif
3237 cn->target = nodes[cn->group_num];
3238 if (IS_NULL(cn->target)) {
3239 onig_scan_env_set_error_string(env,
3240 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3241 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3242 }
3243 SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
3244 BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
3245 cn->unset_addr_list = env->unset_addr_list;
3246 }
3247# ifdef USE_NAMED_GROUP
3248# ifdef USE_PERL_SUBEXP_CALL
3249 else if (cn->name == cn->name_end) {
3250 goto set_call_attr;
3251 }
3252# endif
3253 else {
3254 int *refs;
3255
3256 int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
3257 &refs);
3258 if (n <= 0) {
3259 onig_scan_env_set_error_string(env,
3260 ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
3261 return ONIGERR_UNDEFINED_NAME_REFERENCE;
3262 }
3263 else if (n > 1 &&
3264 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) {
3265 onig_scan_env_set_error_string(env,
3266 ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
3267 return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
3268 }
3269 else {
3270 cn->group_num = refs[0];
3271 goto set_call_attr;
3272 }
3273 }
3274# endif
3275 }
3276 break;
3277
3278 case NT_ANCHOR:
3279 {
3280 AnchorNode* an = NANCHOR(node);
3281
3282 switch (an->type) {
3283 case ANCHOR_PREC_READ:
3284 case ANCHOR_PREC_READ_NOT:
3285 case ANCHOR_LOOK_BEHIND:
3286 case ANCHOR_LOOK_BEHIND_NOT:
3287 r = setup_subexp_call(an->target, env);
3288 break;
3289 }
3290 }
3291 break;
3292
3293 default:
3294 break;
3295 }
3296
3297 return r;
3298}
3299#endif
3300
3301#define IN_ALT (1<<0)
3302#define IN_NOT (1<<1)
3303#define IN_REPEAT (1<<2)
3304#define IN_VAR_REPEAT (1<<3)
3305#define IN_CALL (1<<4)
3306#define IN_RECCALL (1<<5)
3307#define IN_LOOK_BEHIND (1<<6)
3308
3309/* divide different length alternatives in look-behind.
3310 (?<=A|B) ==> (?<=A)|(?<=B)
3311 (?<!A|B) ==> (?<!A)(?<!B)
3312*/
3313static int
3314divide_look_behind_alternatives(Node* node)
3315{
3316 Node *head, *np, *insert_node;
3317 AnchorNode* an = NANCHOR(node);
3318 int anc_type = an->type;
3319
3320 head = an->target;
3321 np = NCAR(head);
3322 swap_node(node, head);
3323 NCAR(node) = head;
3324 NANCHOR(head)->target = np;
3325
3326 np = node;
3327 while ((np = NCDR(np)) != NULL_NODE) {
3328 insert_node = onig_node_new_anchor(anc_type);
3329 CHECK_NULL_RETURN_MEMERR(insert_node);
3330 NANCHOR(insert_node)->target = NCAR(np);
3331 NCAR(np) = insert_node;
3332 }
3333
3334 if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
3335 np = node;
3336 do {
3337 SET_NTYPE(np, NT_LIST); /* alt -> list */
3338 } while ((np = NCDR(np)) != NULL_NODE);
3339 }
3340 return 0;
3341}
3342
3343static int
3344setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
3345{
3346 int r, len;
3347 AnchorNode* an = NANCHOR(node);
3348
3349 r = get_char_length_tree(an->target, reg, &len);
3350 if (r == 0)
3351 an->char_len = len;
3352 else if (r == GET_CHAR_LEN_VARLEN)
3353 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3354 else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
3355 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
3356 r = divide_look_behind_alternatives(node);
3357 else
3358 r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
3359 }
3360
3361 return r;
3362}
3363
3364static int
3365next_setup(Node* node, Node* next_node, regex_t* reg)
3366{
3367 int type;
3368
3369 retry:
3370 type = NTYPE(node);
3371 if (type == NT_QTFR) {
3372 QtfrNode* qn = NQTFR(node);
3373 if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
3374#ifdef USE_QTFR_PEEK_NEXT
3375 Node* n = get_head_value_node(next_node, 1, reg);
3376 /* '\0': for UTF-16BE etc... */
3377 if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
3378 qn->next_head_exact = n;
3379 }
3380#endif
3381 /* automatic possessification a*b ==> (?>a*)b */
3382 if (qn->lower <= 1) {
3383 int ttype = NTYPE(qn->target);
3384 if (IS_NODE_TYPE_SIMPLE(ttype)) {
3385 Node *x, *y;
3386 x = get_head_value_node(qn->target, 0, reg);
3387 if (IS_NOT_NULL(x)) {
3388 y = get_head_value_node(next_node, 0, reg);
3389 if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
3390 Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
3391 CHECK_NULL_RETURN_MEMERR(en);
3392 SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
3393 swap_node(node, en);
3394 NENCLOSE(node)->target = en;
3395 }
3396 }
3397 }
3398 }
3399 }
3400 }
3401 else if (type == NT_ENCLOSE) {
3402 EncloseNode* en = NENCLOSE(node);
3403 if (en->type == ENCLOSE_MEMORY && !IS_ENCLOSE_CALLED(en)) {
3404 node = en->target;
3405 goto retry;
3406 }
3407 }
3408 return 0;
3409}
3410
3411
3412static int
3413update_string_node_case_fold(regex_t* reg, Node *node)
3414{
3415 UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
3416 UChar *sbuf, *ebuf, *sp;
3417 int r, i, len;
3418 OnigDistance sbuf_size;
3419 StrNode* sn = NSTR(node);
3420
3421 end = sn->end;
3422 sbuf_size = (end - sn->s) * 2;
3423 sbuf = (UChar* )xmalloc(sbuf_size);
3424 CHECK_NULL_RETURN_MEMERR(sbuf);
3425 ebuf = sbuf + sbuf_size;
3426
3427 sp = sbuf;
3428 p = sn->s;
3429 while (p < end) {
3430 len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
3431 for (i = 0; i < len; i++) {
3432 if (sp >= ebuf) {
3433 UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2);
3434 if (IS_NULL(p)) {
3435 xfree(sbuf);
3436 return ONIGERR_MEMORY;
3437 }
3438 sbuf = p;
3439 sp = sbuf + sbuf_size;
3440 sbuf_size *= 2;
3441 ebuf = sbuf + sbuf_size;
3442 }
3443
3444 *sp++ = buf[i];
3445 }
3446 }
3447
3448 r = onig_node_str_set(node, sbuf, sp);
3449
3450 xfree(sbuf);
3451 return r;
3452}
3453
3454static int
3455expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
3456 regex_t* reg)
3457{
3458 int r;
3459 Node *node;
3460
3461 node = onig_node_new_str(s, end);
3462 if (IS_NULL(node)) return ONIGERR_MEMORY;
3463
3464 r = update_string_node_case_fold(reg, node);
3465 if (r != 0) {
3466 onig_node_free(node);
3467 return r;
3468 }
3469
3470 NSTRING_SET_AMBIG(node);
3471 NSTRING_SET_DONT_GET_OPT_INFO(node);
3472 *rnode = node;
3473 return 0;
3474}
3475
3476static int
3477is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[],
3478 int slen)
3479{
3480 int i;
3481
3482 for (i = 0; i < item_num; i++) {
3483 if (items[i].byte_len != slen) {
3484 return 1;
3485 }
3486 if (items[i].code_len != 1) {
3487 return 1;
3488 }
3489 }
3490 return 0;
3491}
3492
3493static int
3494expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
3495 UChar *p, int slen, UChar *end,
3496 regex_t* reg, Node **rnode)
3497{
3498 int r, i, j, len, varlen;
3499 Node *anode, *var_anode, *snode, *xnode, *an;
3500 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
3501
3502 *rnode = var_anode = NULL_NODE;
3503
3504 varlen = 0;
3505 for (i = 0; i < item_num; i++) {
3506 if (items[i].byte_len != slen) {
3507 varlen = 1;
3508 break;
3509 }
3510 }
3511
3512 if (varlen != 0) {
3513 *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3514 if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
3515
3516 xnode = onig_node_new_list(NULL, NULL);
3517 if (IS_NULL(xnode)) goto mem_err;
3518 NCAR(var_anode) = xnode;
3519
3520 anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3521 if (IS_NULL(anode)) goto mem_err;
3522 NCAR(xnode) = anode;
3523 }
3524 else {
3525 *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
3526 if (IS_NULL(anode)) return ONIGERR_MEMORY;
3527 }
3528
3529 snode = onig_node_new_str(p, p + slen);
3530 if (IS_NULL(snode)) goto mem_err;
3531
3532 NCAR(anode) = snode;
3533
3534 for (i = 0; i < item_num; i++) {
3535 snode = onig_node_new_str(NULL, NULL);
3536 if (IS_NULL(snode)) goto mem_err;
3537
3538 for (j = 0; j < items[i].code_len; j++) {
3539 len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
3540 if (len < 0) {
3541 r = len;
3542 goto mem_err2;
3543 }
3544
3545 r = onig_node_str_cat(snode, buf, buf + len);
3546 if (r != 0) goto mem_err2;
3547 }
3548
3549 an = onig_node_new_alt(NULL_NODE, NULL_NODE);
3550 if (IS_NULL(an)) {
3551 goto mem_err2;
3552 }
3553
3554 if (items[i].byte_len != slen) {
3555 Node *rem;
3556 UChar *q = p + items[i].byte_len;
3557
3558 if (q < end) {
3559 r = expand_case_fold_make_rem_string(&rem, q, end, reg);
3560 if (r != 0) {
3561 onig_node_free(an);
3562 goto mem_err2;
3563 }
3564
3565 xnode = onig_node_list_add(NULL_NODE, snode);
3566 if (IS_NULL(xnode)) {
3567 onig_node_free(an);
3568 onig_node_free(rem);
3569 goto mem_err2;
3570 }
3571 if (IS_NULL(onig_node_list_add(xnode, rem))) {
3572 onig_node_free(an);
3573 onig_node_free(xnode);
3574 onig_node_free(rem);
3575 goto mem_err;
3576 }
3577
3578 NCAR(an) = xnode;
3579 }
3580 else {
3581 NCAR(an) = snode;
3582 }
3583
3584 NCDR(var_anode) = an;
3585 var_anode = an;
3586 }
3587 else {
3588 NCAR(an) = snode;
3589 NCDR(anode) = an;
3590 anode = an;
3591 }
3592 }
3593
3594 return varlen;
3595
3596 mem_err2:
3597 onig_node_free(snode);
3598
3599 mem_err:
3600 onig_node_free(*rnode);
3601
3602 return ONIGERR_MEMORY;
3603}
3604
3605#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
3606
3607static int
3608expand_case_fold_string(Node* node, regex_t* reg, int state)
3609{
3610 int r, n, len, alt_num;
3611 int varlen = 0;
3612 int is_in_look_behind;
3613 UChar *start, *end, *p;
3614 Node *top_root, *root, *snode, *prev_node;
3615 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3616 StrNode* sn;
3617
3618 if (NSTRING_IS_AMBIG(node)) return 0;
3619
3620 sn = NSTR(node);
3621
3622 start = sn->s;
3623 end = sn->end;
3624 if (start >= end) return 0;
3625
3626 is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
3627
3628 r = 0;
3629 top_root = root = prev_node = snode = NULL_NODE;
3630 alt_num = 1;
3631 p = start;
3632 while (p < end) {
3633 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
3634 p, end, items);
3635 if (n < 0) {
3636 r = n;
3637 goto err;
3638 }
3639
3640 len = enclen(reg->enc, p, end);
3641
3642 varlen = is_case_fold_variable_len(n, items, len);
3643 if (n == 0 || varlen == 0 || is_in_look_behind) {
3644 if (IS_NULL(snode)) {
3645 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3646 onig_node_free(top_root);
3647 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3648 if (IS_NULL(root)) {
3649 onig_node_free(prev_node);
3650 goto mem_err;
3651 }
3652 }
3653
3654 prev_node = snode = onig_node_new_str(NULL, NULL);
3655 if (IS_NULL(snode)) goto mem_err;
3656 if (IS_NOT_NULL(root)) {
3657 if (IS_NULL(onig_node_list_add(root, snode))) {
3658 onig_node_free(snode);
3659 goto mem_err;
3660 }
3661 }
3662 }
3663
3664 r = onig_node_str_cat(snode, p, p + len);
3665 if (r != 0) goto err;
3666 }
3667 else {
3668 alt_num *= (n + 1);
3669 if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
3670
3671 if (IS_NOT_NULL(snode)) {
3672 r = update_string_node_case_fold(reg, snode);
3673 if (r == 0) {
3674 NSTRING_SET_AMBIG(snode);
3675 }
3676 }
3677 if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
3678 onig_node_free(top_root);
3679 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3680 if (IS_NULL(root)) {
3681 onig_node_free(prev_node);
3682 goto mem_err;
3683 }
3684 }
3685
3686 r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
3687 if (r < 0) goto mem_err;
3688 if (r == 1) {
3689 if (IS_NULL(root)) {
3690 top_root = prev_node;
3691 }
3692 else {
3693 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3694 onig_node_free(prev_node);
3695 goto mem_err;
3696 }
3697 }
3698
3699 root = NCAR(prev_node);
3700 }
3701 else { /* r == 0 */
3702 if (IS_NOT_NULL(root)) {
3703 if (IS_NULL(onig_node_list_add(root, prev_node))) {
3704 onig_node_free(prev_node);
3705 goto mem_err;
3706 }
3707 }
3708 }
3709
3710 snode = NULL_NODE;
3711 }
3712
3713 p += len;
3714 }
3715 if (IS_NOT_NULL(snode)) {
3716 r = update_string_node_case_fold(reg, snode);
3717 if (r == 0) {
3718 NSTRING_SET_AMBIG(snode);
3719 }
3720 }
3721
3722 if (p < end) {
3723 Node *srem;
3724
3725 r = expand_case_fold_make_rem_string(&srem, p, end, reg);
3726 if (r != 0) goto mem_err;
3727
3728 if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
3729 onig_node_free(top_root);
3730 top_root = root = onig_node_list_add(NULL_NODE, prev_node);
3731 if (IS_NULL(root)) {
3732 onig_node_free(srem);
3733 onig_node_free(prev_node);
3734 goto mem_err;
3735 }
3736 }
3737
3738 if (IS_NULL(root)) {
3739 prev_node = srem;
3740 }
3741 else {
3742 if (IS_NULL(onig_node_list_add(root, srem))) {
3743 onig_node_free(srem);
3744 goto mem_err;
3745 }
3746 }
3747 }
3748
3749 /* ending */
3750 top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
3751 swap_node(node, top_root);
3752 onig_node_free(top_root);
3753 return 0;
3754
3755 mem_err:
3756 r = ONIGERR_MEMORY;
3757
3758 err:
3759 onig_node_free(top_root);
3760 return r;
3761}
3762
3763
3764#ifdef USE_COMBINATION_EXPLOSION_CHECK
3765
3766# define CEC_THRES_NUM_BIG_REPEAT 512
3767# define CEC_INFINITE_NUM 0x7fffffff
3768
3769# define CEC_IN_INFINITE_REPEAT (1<<0)
3770# define CEC_IN_FINITE_REPEAT (1<<1)
3771# define CEC_CONT_BIG_REPEAT (1<<2)
3772
3773static int
3774setup_comb_exp_check(Node* node, int state, ScanEnv* env)
3775{
3776 int type;
3777 int r = state;
3778
3779 type = NTYPE(node);
3780 switch (type) {
3781 case NT_LIST:
3782 {
3783 do {
3784 r = setup_comb_exp_check(NCAR(node), r, env);
3785 } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
3786 }
3787 break;
3788
3789 case NT_ALT:
3790 {
3791 int ret;
3792 do {
3793 ret = setup_comb_exp_check(NCAR(node), state, env);
3794 r |= ret;
3795 } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
3796 }
3797 break;
3798
3799 case NT_QTFR:
3800 {
3801 int child_state = state;
3802 int add_state = 0;
3803 QtfrNode* qn = NQTFR(node);
3804 Node* target = qn->target;
3805 int var_num;
3806
3807 if (! IS_REPEAT_INFINITE(qn->upper)) {
3808 if (qn->upper > 1) {
3809 /* {0,1}, {1,1} are allowed */
3810 child_state |= CEC_IN_FINITE_REPEAT;
3811
3812 /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
3813 if (env->backrefed_mem == 0) {
3814 if (NTYPE(qn->target) == NT_ENCLOSE) {
3815 EncloseNode* en = NENCLOSE(qn->target);
3816 if (en->type == ENCLOSE_MEMORY) {
3817 if (NTYPE(en->target) == NT_QTFR) {
3818 QtfrNode* q = NQTFR(en->target);
3819 if (IS_REPEAT_INFINITE(q->upper)
3820 && q->greedy == qn->greedy) {
3821 qn->upper = (qn->lower == 0 ? 1 : qn->lower);
3822 if (qn->upper == 1)
3823 child_state = state;
3824 }
3825 }
3826 }
3827 }
3828 }
3829 }
3830 }
3831
3832 if (state & CEC_IN_FINITE_REPEAT) {
3833 qn->comb_exp_check_num = -1;
3834 }
3835 else {
3836 if (IS_REPEAT_INFINITE(qn->upper)) {
3837 var_num = CEC_INFINITE_NUM;
3838 child_state |= CEC_IN_INFINITE_REPEAT;
3839 }
3840 else {
3841 var_num = qn->upper - qn->lower;
3842 }
3843
3844 if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
3845 add_state |= CEC_CONT_BIG_REPEAT;
3846
3847 if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
3848 ((state & CEC_CONT_BIG_REPEAT) != 0 &&
3849 var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
3850 if (qn->comb_exp_check_num == 0) {
3851 env->num_comb_exp_check++;
3852 qn->comb_exp_check_num = env->num_comb_exp_check;
3853 if (env->curr_max_regnum > env->comb_exp_max_regnum)
3854 env->comb_exp_max_regnum = env->curr_max_regnum;
3855 }
3856 }
3857 }
3858
3859 r = setup_comb_exp_check(target, child_state, env);
3860 r |= add_state;
3861 }
3862 break;
3863
3864 case NT_ENCLOSE:
3865 {
3866 EncloseNode* en = NENCLOSE(node);
3867
3868 switch (en->type) {
3869 case ENCLOSE_MEMORY:
3870 {
3871 if (env->curr_max_regnum < en->regnum)
3872 env->curr_max_regnum = en->regnum;
3873
3874 r = setup_comb_exp_check(en->target, state, env);
3875 }
3876 break;
3877
3878 default:
3879 r = setup_comb_exp_check(en->target, state, env);
3880 break;
3881 }
3882 }
3883 break;
3884
3885# ifdef USE_SUBEXP_CALL
3886 case NT_CALL:
3887 if (IS_CALL_RECURSION(NCALL(node)))
3888 env->has_recursion = 1;
3889 else
3890 r = setup_comb_exp_check(NCALL(node)->target, state, env);
3891 break;
3892# endif
3893
3894 default:
3895 break;
3896 }
3897
3898 return r;
3899}
3900#endif
3901
3902/* setup_tree does the following work.
3903 1. check empty loop. (set qn->target_empty_info)
3904 2. expand ignore-case in char class.
3905 3. set memory status bit flags. (reg->mem_stats)
3906 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
3907 5. find invalid patterns in look-behind.
3908 6. expand repeated string.
3909 */
3910static int
3911setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
3912{
3913 int type;
3914 int r = 0;
3915
3916restart:
3917 type = NTYPE(node);
3918 switch (type) {
3919 case NT_LIST:
3920 {
3921 Node* prev = NULL_NODE;
3922 do {
3923 r = setup_tree(NCAR(node), reg, state, env);
3924 if (IS_NOT_NULL(prev) && r == 0) {
3925 r = next_setup(prev, NCAR(node), reg);
3926 }
3927 prev = NCAR(node);
3928 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3929 }
3930 break;
3931
3932 case NT_ALT:
3933 do {
3934 r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
3935 } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
3936 break;
3937
3938 case NT_CCLASS:
3939 break;
3940
3941 case NT_STR:
3942 if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3943 r = expand_case_fold_string(node, reg, state);
3944 }
3945 break;
3946
3947 case NT_CTYPE:
3948 case NT_CANY:
3949 break;
3950
3951#ifdef USE_SUBEXP_CALL
3952 case NT_CALL:
3953 break;
3954#endif
3955
3956 case NT_BREF:
3957 {
3958 int i;
3959 int* p;
3960 Node** nodes = SCANENV_MEM_NODES(env);
3961 BRefNode* br = NBREF(node);
3962 p = BACKREFS_P(br);
3963 for (i = 0; i < br->back_num; i++) {
3964 if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
3965 BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
3966 BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
3967#ifdef USE_BACKREF_WITH_LEVEL
3968 if (IS_BACKREF_NEST_LEVEL(br)) {
3969 BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
3970 }
3971#endif
3972 SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
3973 }
3974 }
3975 break;
3976
3977 case NT_QTFR:
3978 {
3979 OnigDistance d;
3980 QtfrNode* qn = NQTFR(node);
3981 Node* target = qn->target;
3982
3983 if ((state & IN_REPEAT) != 0) {
3984 qn->state |= NST_IN_REPEAT;
3985 }
3986
3987 if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
3988 r = get_min_match_length(target, &d, env);
3989 if (r) break;
3990 if (d == 0) {
3991 qn->target_empty_info = NQ_TARGET_IS_EMPTY;
3992#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
3993 r = quantifiers_memory_node_info(target);
3994 if (r < 0) break;
3995 if (r > 0) {
3996 qn->target_empty_info = r;
3997 }
3998#endif
3999#if 0
4000 r = get_max_match_length(target, &d, env);
4001 if (r == 0 && d == 0) {
4002 /* ()* ==> ()?, ()+ ==> () */
4003 qn->upper = 1;
4004 if (qn->lower > 1) qn->lower = 1;
4005 if (NTYPE(target) == NT_STR) {
4006 qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
4007 }
4008 }
4009#endif
4010 }
4011 }
4012
4013 state |= IN_REPEAT;
4014 if (qn->lower != qn->upper)
4015 state |= IN_VAR_REPEAT;
4016 r = setup_tree(target, reg, state, env);
4017 if (r) break;
4018
4019 /* expand string */
4020#define EXPAND_STRING_MAX_LENGTH 100
4021 if (NTYPE(target) == NT_STR) {
4022 if (qn->lower > 1) {
4023 int i, n = qn->lower;
4024 OnigDistance len = NSTRING_LEN(target);
4025 StrNode* sn = NSTR(target);
4026 Node* np;
4027
4028 np = onig_node_new_str(sn->s, sn->end);
4029 if (IS_NULL(np)) return ONIGERR_MEMORY;
4030 NSTR(np)->flag = sn->flag;
4031
4032 for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) {
4033 r = onig_node_str_cat(np, sn->s, sn->end);
4034 if (r) {
4035 onig_node_free(np);
4036 return r;
4037 }
4038 }
4039 if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) {
4040 Node *np1, *np2;
4041
4042 qn->lower -= i;
4043 if (! IS_REPEAT_INFINITE(qn->upper))
4044 qn->upper -= i;
4045
4046 np1 = onig_node_new_list(np, NULL);
4047 if (IS_NULL(np1)) {
4048 onig_node_free(np);
4049 return ONIGERR_MEMORY;
4050 }
4051 swap_node(np1, node);
4052 np2 = onig_node_list_add(node, np1);
4053 if (IS_NULL(np2)) {
4054 onig_node_free(np1);
4055 return ONIGERR_MEMORY;
4056 }
4057 }
4058 else {
4059 swap_node(np, node);
4060 onig_node_free(np);
4061 }
4062 break; /* break case NT_QTFR: */
4063 }
4064 }
4065
4066#ifdef USE_OP_PUSH_OR_JUMP_EXACT
4067 if (qn->greedy && (qn->target_empty_info != 0)) {
4068 if (NTYPE(target) == NT_QTFR) {
4069 QtfrNode* tqn = NQTFR(target);
4070 if (IS_NOT_NULL(tqn->head_exact)) {
4071 qn->head_exact = tqn->head_exact;
4072 tqn->head_exact = NULL;
4073 }
4074 }
4075 else {
4076 qn->head_exact = get_head_value_node(qn->target, 1, reg);
4077 }
4078 }
4079#endif
4080 }
4081 break;
4082
4083 case NT_ENCLOSE:
4084 {
4085 EncloseNode* en = NENCLOSE(node);
4086
4087 switch (en->type) {
4088 case ENCLOSE_OPTION:
4089 {
4090 OnigOptionType options = reg->options;
4091 reg->options = NENCLOSE(node)->option;
4092 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4093 reg->options = options;
4094 }
4095 break;
4096
4097 case ENCLOSE_MEMORY:
4098 if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
4099 BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
4100 /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
4101 }
4102 if (IS_ENCLOSE_CALLED(en))
4103 state |= IN_CALL;
4104 if (IS_ENCLOSE_RECURSION(en))
4105 state |= IN_RECCALL;
4106 else if ((state & IN_RECCALL) != 0)
4107 SET_CALL_RECURSION(node);
4108 r = setup_tree(en->target, reg, state, env);
4109 break;
4110
4111 case ENCLOSE_STOP_BACKTRACK:
4112 {
4113 Node* target = en->target;
4114 r = setup_tree(target, reg, state, env);
4115 if (NTYPE(target) == NT_QTFR) {
4116 QtfrNode* tqn = NQTFR(target);
4117 if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
4118 tqn->greedy != 0) { /* (?>a*), a*+ etc... */
4119 int qtype = NTYPE(tqn->target);
4120 if (IS_NODE_TYPE_SIMPLE(qtype))
4121 SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
4122 }
4123 }
4124 }
4125 break;
4126
4127 case ENCLOSE_CONDITION:
4128#ifdef USE_NAMED_GROUP
4129 if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) &&
4130 env->num_named > 0 &&
4131 IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
4132 !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
4133 return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
4134 }
4135#endif
4136 if (NENCLOSE(node)->regnum > env->num_mem)
4137 return ONIGERR_INVALID_BACKREF;
4138 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4139 break;
4140
4141 case ENCLOSE_ABSENT:
4142 r = setup_tree(NENCLOSE(node)->target, reg, state, env);
4143 break;
4144 }
4145 }
4146 break;
4147
4148 case NT_ANCHOR:
4149 {
4150 AnchorNode* an = NANCHOR(node);
4151
4152 switch (an->type) {
4153 case ANCHOR_PREC_READ:
4154 r = setup_tree(an->target, reg, state, env);
4155 break;
4156 case ANCHOR_PREC_READ_NOT:
4157 r = setup_tree(an->target, reg, (state | IN_NOT), env);
4158 break;
4159
4160/* allowed node types in look-behind */
4161#define ALLOWED_TYPE_IN_LB \
4162 ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
4163 BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
4164
4165#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
4166#define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
4167
4168#define ALLOWED_ANCHOR_IN_LB \
4169( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4170 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4171 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4172 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4173#define ALLOWED_ANCHOR_IN_LB_NOT \
4174( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
4175 ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
4176 ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
4177 ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
4178
4179 case ANCHOR_LOOK_BEHIND:
4180 {
4181 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4182 ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
4183 if (r < 0) return r;
4184 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4185 if (NTYPE(node) != NT_ANCHOR) goto restart;
4186 r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
4187 if (r != 0) return r;
4188 r = setup_look_behind(node, reg, env);
4189 }
4190 break;
4191
4192 case ANCHOR_LOOK_BEHIND_NOT:
4193 {
4194 r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
4195 ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
4196 if (r < 0) return r;
4197 if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
4198 if (NTYPE(node) != NT_ANCHOR) goto restart;
4199 r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
4200 env);
4201 if (r != 0) return r;
4202 r = setup_look_behind(node, reg, env);
4203 }
4204 break;
4205 }
4206 }
4207 break;
4208
4209 default:
4210 break;
4211 }
4212
4213 return r;
4214}
4215
4216/* set skip map for Sunday's quick search */
4217static int
4218set_bm_skip(UChar* s, UChar* end, regex_t* reg,
4219 UChar skip[], int ignore_case)
4220{
4221 OnigDistance i, len;
4222 int clen, flen, n, j, k;
4223 UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
4224 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4225 OnigEncoding enc = reg->enc;
4226
4227 len = end - s;
4228 if (len >= ONIG_CHAR_TABLE_SIZE) {
4229 /* This should not happen. */
4230 return ONIGERR_TYPE_BUG;
4231 }
4232
4233 if (ignore_case) {
4234 for (i = 0; i < len; i += clen) {
4235 p = s + i;
4236 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4237 p, end, items);
4238 clen = enclen(enc, p, end);
4239 if (p + clen > end)
4240 clen = (int )(end - p);
4241
4242 for (j = 0; j < n; j++) {
4243 if ((items[j].code_len != 1) || (items[j].byte_len != clen)) {
4244 /* Different length isn't supported. Stop optimization at here. */
4245 end = p;
4246 goto endcheck;
4247 }
4248 flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf);
4249 if (flen != clen) {
4250 /* Different length isn't supported. Stop optimization at here. */
4251 end = p;
4252 goto endcheck;
4253 }
4254 }
4255 }
4256endcheck:
4257 len = end - s;
4258 }
4259
4260 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
4261 skip[i] = (UChar )(len + 1);
4262 n = 0;
4263 for (i = 0; i < len; i += clen) {
4264 p = s + i;
4265 if (ignore_case)
4266 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
4267 p, end, items);
4268 clen = enclen(enc, p, end);
4269 if (p + clen > end)
4270 clen = (int )(end - p);
4271
4272 for (j = 0; j < clen; j++) {
4273 skip[s[i + j]] = (UChar )(len - i - j);
4274 for (k = 0; k < n; k++) {
4275 ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
4276 skip[buf[j]] = (UChar )(len - i - j);
4277 }
4278 }
4279 }
4280
4281 return (int )len;
4282}
4283
4284typedef struct {
4285 OnigDistance min; /* min byte length */
4286 OnigDistance max; /* max byte length */
4287} MinMaxLen;
4288
4289typedef struct {
4290 MinMaxLen mmd;
4291 OnigEncoding enc;
4292 OnigOptionType options;
4293 OnigCaseFoldType case_fold_flag;
4294 ScanEnv* scan_env;
4295} OptEnv;
4296
4297typedef struct {
4298 int left_anchor;
4299 int right_anchor;
4300} OptAncInfo;
4301
4302typedef struct {
4303 MinMaxLen mmd; /* info position */
4304 OptAncInfo anc;
4305
4306 int reach_end;
4307 int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */
4308 int len;
4309 UChar s[OPT_EXACT_MAXLEN];
4310} OptExactInfo;
4311
4312typedef struct {
4313 MinMaxLen mmd; /* info position */
4314 OptAncInfo anc;
4315
4316 int value; /* weighted value */
4317 UChar map[ONIG_CHAR_TABLE_SIZE];
4318} OptMapInfo;
4319
4320typedef struct {
4321 MinMaxLen len;
4322
4323 OptAncInfo anc;
4324 OptExactInfo exb; /* boundary */
4325 OptExactInfo exm; /* middle */
4326 OptExactInfo expr; /* prec read (?=...) */
4327
4328 OptMapInfo map; /* boundary */
4329} NodeOptInfo;
4330
4331
4332static int
4333map_position_value(OnigEncoding enc, int i)
4334{
4335 static const short int ByteValTable[] = {
4336 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
4337 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
4338 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
4339 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
4340 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4341 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
4342 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4343 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
4344 };
4345
4346 if (i < numberof(ByteValTable)) {
4347 if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
4348 return 20;
4349 else
4350 return (int )ByteValTable[i];
4351 }
4352 else
4353 return 4; /* Take it easy. */
4354}
4355
4356static int
4357distance_value(MinMaxLen* mm)
4358{
4359 /* 1000 / (min-max-dist + 1) */
4360 static const short int dist_vals[] = {
4361 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
4362 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
4363 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
4364 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
4365 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
4366 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
4367 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
4368 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
4369 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
4370 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
4371 };
4372
4373 OnigDistance d;
4374
4375 if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
4376
4377 d = mm->max - mm->min;
4378 if (d < numberof(dist_vals))
4379 /* return dist_vals[d] * 16 / (mm->min + 12); */
4380 return (int )dist_vals[d];
4381 else
4382 return 1;
4383}
4384
4385static int
4386comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
4387{
4388 if (v2 <= 0) return -1;
4389 if (v1 <= 0) return 1;
4390
4391 v1 *= distance_value(d1);
4392 v2 *= distance_value(d2);
4393
4394 if (v2 > v1) return 1;
4395 if (v2 < v1) return -1;
4396
4397 if (d2->min < d1->min) return 1;
4398 if (d2->min > d1->min) return -1;
4399 return 0;
4400}
4401
4402static int
4403is_equal_mml(MinMaxLen* a, MinMaxLen* b)
4404{
4405 return (a->min == b->min && a->max == b->max) ? 1 : 0;
4406}
4407
4408
4409static void
4410set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
4411{
4412 mml->min = min;
4413 mml->max = max;
4414}
4415
4416static void
4417clear_mml(MinMaxLen* mml)
4418{
4419 mml->min = mml->max = 0;
4420}
4421
4422static void
4423copy_mml(MinMaxLen* to, MinMaxLen* from)
4424{
4425 to->min = from->min;
4426 to->max = from->max;
4427}
4428
4429static void
4430add_mml(MinMaxLen* to, MinMaxLen* from)
4431{
4432 to->min = distance_add(to->min, from->min);
4433 to->max = distance_add(to->max, from->max);
4434}
4435
4436#if 0
4437static void
4438add_len_mml(MinMaxLen* to, OnigDistance len)
4439{
4440 to->min = distance_add(to->min, len);
4441 to->max = distance_add(to->max, len);
4442}
4443#endif
4444
4445static void
4446alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
4447{
4448 if (to->min > from->min) to->min = from->min;
4449 if (to->max < from->max) to->max = from->max;
4450}
4451
4452static void
4453copy_opt_env(OptEnv* to, OptEnv* from)
4454{
4455 *to = *from;
4456}
4457
4458static void
4459clear_opt_anc_info(OptAncInfo* anc)
4460{
4461 anc->left_anchor = 0;
4462 anc->right_anchor = 0;
4463}
4464
4465static void
4466copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
4467{
4468 *to = *from;
4469}
4470
4471static void
4472concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
4473 OnigDistance left_len, OnigDistance right_len)
4474{
4475 clear_opt_anc_info(to);
4476
4477 to->left_anchor = left->left_anchor;
4478 if (left_len == 0) {
4479 to->left_anchor |= right->left_anchor;
4480 }
4481
4482 to->right_anchor = right->right_anchor;
4483 if (right_len == 0) {
4484 to->right_anchor |= left->right_anchor;
4485 }
4486 else {
4487 to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
4488 }
4489}
4490
4491static int
4492is_left_anchor(int anc)
4493{
4494 if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
4495 anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
4496 anc == ANCHOR_PREC_READ_NOT)
4497 return 0;
4498
4499 return 1;
4500}
4501
4502static int
4503is_set_opt_anc_info(OptAncInfo* to, int anc)
4504{
4505 if ((to->left_anchor & anc) != 0) return 1;
4506
4507 return ((to->right_anchor & anc) != 0 ? 1 : 0);
4508}
4509
4510static void
4511add_opt_anc_info(OptAncInfo* to, int anc)
4512{
4513 if (is_left_anchor(anc))
4514 to->left_anchor |= anc;
4515 else
4516 to->right_anchor |= anc;
4517}
4518
4519static void
4520remove_opt_anc_info(OptAncInfo* to, int anc)
4521{
4522 if (is_left_anchor(anc))
4523 to->left_anchor &= ~anc;
4524 else
4525 to->right_anchor &= ~anc;
4526}
4527
4528static void
4529alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
4530{
4531 to->left_anchor &= add->left_anchor;
4532 to->right_anchor &= add->right_anchor;
4533}
4534
4535static int
4536is_full_opt_exact_info(OptExactInfo* ex)
4537{
4538 return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
4539}
4540
4541static void
4542clear_opt_exact_info(OptExactInfo* ex)
4543{
4544 clear_mml(&ex->mmd);
4545 clear_opt_anc_info(&ex->anc);
4546 ex->reach_end = 0;
4547 ex->ignore_case = -1; /* unset */
4548 ex->len = 0;
4549 ex->s[0] = '\0';
4550}
4551
4552static void
4553copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
4554{
4555 *to = *from;
4556}
4557
4558static void
4559concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
4560{
4561 int i, j, len;
4562 UChar *p, *end;
4563 OptAncInfo tanc;
4564
4565 if (to->ignore_case < 0)
4566 to->ignore_case = add->ignore_case;
4567 else if (to->ignore_case != add->ignore_case)
4568 return ; /* avoid */
4569
4570 p = add->s;
4571 end = p + add->len;
4572 for (i = to->len; p < end; ) {
4573 len = enclen(enc, p, end);
4574 if (i + len > OPT_EXACT_MAXLEN) break;
4575 for (j = 0; j < len && p < end; j++)
4576 to->s[i++] = *p++;
4577 }
4578
4579 to->len = i;
4580 to->reach_end = (p == end ? add->reach_end : 0);
4581
4582 concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
4583 if (! to->reach_end) tanc.right_anchor = 0;
4584 copy_opt_anc_info(&to->anc, &tanc);
4585}
4586
4587static void
4588concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
4589 int raw ARG_UNUSED, OnigEncoding enc)
4590{
4591 int i, j, len;
4592 UChar *p;
4593
4594 for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
4595 len = enclen(enc, p, end);
4596 if (i + len > OPT_EXACT_MAXLEN) break;
4597 for (j = 0; j < len && p < end; j++)
4598 to->s[i++] = *p++;
4599 }
4600
4601 to->len = i;
4602}
4603
4604static void
4605alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
4606{
4607 int i, j, len;
4608
4609 if (add->len == 0 || to->len == 0) {
4610 clear_opt_exact_info(to);
4611 return ;
4612 }
4613
4614 if (! is_equal_mml(&to->mmd, &add->mmd)) {
4615 clear_opt_exact_info(to);
4616 return ;
4617 }
4618
4619 for (i = 0; i < to->len && i < add->len; ) {
4620 if (to->s[i] != add->s[i]) break;
4621 len = enclen(env->enc, to->s + i, to->s + to->len);
4622
4623 for (j = 1; j < len; j++) {
4624 if (to->s[i+j] != add->s[i+j]) break;
4625 }
4626 if (j < len) break;
4627 i += len;
4628 }
4629
4630 if (! add->reach_end || i < add->len || i < to->len) {
4631 to->reach_end = 0;
4632 }
4633 to->len = i;
4634 if (to->ignore_case < 0)
4635 to->ignore_case = add->ignore_case;
4636 else if (add->ignore_case >= 0)
4637 to->ignore_case |= add->ignore_case;
4638
4639 alt_merge_opt_anc_info(&to->anc, &add->anc);
4640 if (! to->reach_end) to->anc.right_anchor = 0;
4641}
4642
4643static void
4644select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
4645{
4646 int v1, v2;
4647
4648 v1 = now->len;
4649 v2 = alt->len;
4650
4651 if (v2 == 0) {
4652 return ;
4653 }
4654 else if (v1 == 0) {
4655 copy_opt_exact_info(now, alt);
4656 return ;
4657 }
4658 else if (v1 <= 2 && v2 <= 2) {
4659 /* ByteValTable[x] is big value --> low price */
4660 v2 = map_position_value(enc, now->s[0]);
4661 v1 = map_position_value(enc, alt->s[0]);
4662
4663 if (now->len > 1) v1 += 5;
4664 if (alt->len > 1) v2 += 5;
4665 }
4666
4667 if (now->ignore_case <= 0) v1 *= 2;
4668 if (alt->ignore_case <= 0) v2 *= 2;
4669
4670 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4671 copy_opt_exact_info(now, alt);
4672}
4673
4674static void
4675clear_opt_map_info(OptMapInfo* map)
4676{
4677 static const OptMapInfo clean_info = {
4678 {0, 0}, {0, 0}, 0,
4679 {
4680 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4681 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4683 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4684 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4685 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4687 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4689 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4690 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4692 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4693 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4694 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4695 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
4696 }
4697 };
4698
4699 xmemcpy(map, &clean_info, sizeof(OptMapInfo));
4700}
4701
4702static void
4703copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
4704{
4705 *to = *from;
4706}
4707
4708static void
4709add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
4710{
4711 if (map->map[c] == 0) {
4712 map->map[c] = 1;
4713 map->value += map_position_value(enc, c);
4714 }
4715}
4716
4717static int
4718add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
4719 OnigEncoding enc, OnigCaseFoldType case_fold_flag)
4720{
4721 OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
4722 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
4723 int i, n;
4724
4725 add_char_opt_map_info(map, p[0], enc);
4726
4727 case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
4728 n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
4729 if (n < 0) return n;
4730
4731 for (i = 0; i < n; i++) {
4732 ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
4733 add_char_opt_map_info(map, buf[0], enc);
4734 }
4735
4736 return 0;
4737}
4738
4739static void
4740select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
4741{
4742 const int z = 1<<15; /* 32768: something big value */
4743
4744 int v1, v2;
4745
4746 if (alt->value == 0) return ;
4747 if (now->value == 0) {
4748 copy_opt_map_info(now, alt);
4749 return ;
4750 }
4751
4752 v1 = z / now->value;
4753 v2 = z / alt->value;
4754 if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
4755 copy_opt_map_info(now, alt);
4756}
4757
4758static int
4759comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
4760{
4761#define COMP_EM_BASE 20
4762 int ve, vm;
4763
4764 if (m->value <= 0) return -1;
4765
4766 ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2);
4767 vm = COMP_EM_BASE * 5 * 2 / m->value;
4768 return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
4769}
4770
4771static void
4772alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
4773{
4774 int i, val;
4775
4776 /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
4777 if (to->value == 0) return ;
4778 if (add->value == 0 || to->mmd.max < add->mmd.min) {
4779 clear_opt_map_info(to);
4780 return ;
4781 }
4782
4783 alt_merge_mml(&to->mmd, &add->mmd);
4784
4785 val = 0;
4786 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
4787 if (add->map[i])
4788 to->map[i] = 1;
4789
4790 if (to->map[i])
4791 val += map_position_value(enc, i);
4792 }
4793 to->value = val;
4794
4795 alt_merge_opt_anc_info(&to->anc, &add->anc);
4796}
4797
4798static void
4799set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
4800{
4801 copy_mml(&(opt->exb.mmd), mmd);
4802 copy_mml(&(opt->expr.mmd), mmd);
4803 copy_mml(&(opt->map.mmd), mmd);
4804}
4805
4806static void
4807clear_node_opt_info(NodeOptInfo* opt)
4808{
4809 clear_mml(&opt->len);
4810 clear_opt_anc_info(&opt->anc);
4811 clear_opt_exact_info(&opt->exb);
4812 clear_opt_exact_info(&opt->exm);
4813 clear_opt_exact_info(&opt->expr);
4814 clear_opt_map_info(&opt->map);
4815}
4816
4817static void
4818copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
4819{
4820 *to = *from;
4821}
4822
4823static void
4824concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
4825{
4826 int exb_reach, exm_reach;
4827 OptAncInfo tanc;
4828
4829 concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
4830 copy_opt_anc_info(&to->anc, &tanc);
4831
4832 if (add->exb.len > 0 && to->len.max == 0) {
4833 concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
4834 to->len.max, add->len.max);
4835 copy_opt_anc_info(&add->exb.anc, &tanc);
4836 }
4837
4838 if (add->map.value > 0 && to->len.max == 0) {
4839 if (add->map.mmd.max == 0)
4840 add->map.anc.left_anchor |= to->anc.left_anchor;
4841 }
4842
4843 exb_reach = to->exb.reach_end;
4844 exm_reach = to->exm.reach_end;
4845
4846 if (add->len.max != 0)
4847 to->exb.reach_end = to->exm.reach_end = 0;
4848
4849 if (add->exb.len > 0) {
4850 if (exb_reach) {
4851 concat_opt_exact_info(&to->exb, &add->exb, enc);
4852 clear_opt_exact_info(&add->exb);
4853 }
4854 else if (exm_reach) {
4855 concat_opt_exact_info(&to->exm, &add->exb, enc);
4856 clear_opt_exact_info(&add->exb);
4857 }
4858 }
4859 select_opt_exact_info(enc, &to->exm, &add->exb);
4860 select_opt_exact_info(enc, &to->exm, &add->exm);
4861
4862 if (to->expr.len > 0) {
4863 if (add->len.max > 0) {
4864 if (to->expr.len > (int )add->len.max)
4865 to->expr.len = (int )add->len.max;
4866
4867 if (to->expr.mmd.max == 0)
4868 select_opt_exact_info(enc, &to->exb, &to->expr);
4869 else
4870 select_opt_exact_info(enc, &to->exm, &to->expr);
4871 }
4872 }
4873 else if (add->expr.len > 0) {
4874 copy_opt_exact_info(&to->expr, &add->expr);
4875 }
4876
4877 select_opt_map_info(&to->map, &add->map);
4878
4879 add_mml(&to->len, &add->len);
4880}
4881
4882static void
4883alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
4884{
4885 alt_merge_opt_anc_info (&to->anc, &add->anc);
4886 alt_merge_opt_exact_info(&to->exb, &add->exb, env);
4887 alt_merge_opt_exact_info(&to->exm, &add->exm, env);
4888 alt_merge_opt_exact_info(&to->expr, &add->expr, env);
4889 alt_merge_opt_map_info(env->enc, &to->map, &add->map);
4890
4891 alt_merge_mml(&to->len, &add->len);
4892}
4893
4894
4895#define MAX_NODE_OPT_INFO_REF_COUNT 5
4896
4897static int
4898optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
4899{
4900 int type;
4901 int r = 0;
4902
4903 clear_node_opt_info(opt);
4904 set_bound_node_opt_info(opt, &env->mmd);
4905
4906 type = NTYPE(node);
4907 switch (type) {
4908 case NT_LIST:
4909 {
4910 OptEnv nenv;
4911 NodeOptInfo nopt;
4912 Node* nd = node;
4913
4914 copy_opt_env(&nenv, env);
4915 do {
4916 r = optimize_node_left(NCAR(nd), &nopt, &nenv);
4917 if (r == 0) {
4918 add_mml(&nenv.mmd, &nopt.len);
4919 concat_left_node_opt_info(env->enc, opt, &nopt);
4920 }
4921 } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
4922 }
4923 break;
4924
4925 case NT_ALT:
4926 {
4927 NodeOptInfo nopt;
4928 Node* nd = node;
4929
4930 do {
4931 r = optimize_node_left(NCAR(nd), &nopt, env);
4932 if (r == 0) {
4933 if (nd == node) copy_node_opt_info(opt, &nopt);
4934 else alt_merge_node_opt_info(opt, &nopt, env);
4935 }
4936 } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
4937 }
4938 break;
4939
4940 case NT_STR:
4941 {
4942 StrNode* sn = NSTR(node);
4943 OnigDistance slen = sn->end - sn->s;
4944 int is_raw = NSTRING_IS_RAW(node);
4945
4946 if (! NSTRING_IS_AMBIG(node)) {
4947 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4948 is_raw, env->enc);
4949 opt->exb.ignore_case = 0;
4950 if (slen > 0) {
4951 add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
4952 }
4953 set_mml(&opt->len, slen, slen);
4954 }
4955 else {
4956 OnigDistance max;
4957
4958 if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
4959 int n = onigenc_strlen(env->enc, sn->s, sn->end);
4960 max = (OnigDistance )ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n;
4961 }
4962 else {
4963 concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
4964 is_raw, env->enc);
4965 opt->exb.ignore_case = 1;
4966
4967 if (slen > 0) {
4968 r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
4969 env->enc, env->case_fold_flag);
4970 if (r != 0) break;
4971 }
4972
4973 max = slen;
4974 }
4975
4976 set_mml(&opt->len, slen, max);
4977 }
4978
4979 if ((OnigDistance )opt->exb.len == slen)
4980 opt->exb.reach_end = 1;
4981 }
4982 break;
4983
4984 case NT_CCLASS:
4985 {
4986 int i, z;
4987 CClassNode* cc = NCCLASS(node);
4988
4989 /* no need to check ignore case. (set in setup_tree()) */
4990
4991 if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
4992 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
4993 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
4994
4995 set_mml(&opt->len, min, max);
4996 }
4997 else {
4998 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
4999 z = BITSET_AT(cc->bs, i);
5000 if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
5001 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5002 }
5003 }
5004 set_mml(&opt->len, 1, 1);
5005 }
5006 }
5007 break;
5008
5009 case NT_CTYPE:
5010 {
5011 int i, min, max;
5012 int maxcode;
5013
5014 max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5015
5016 if (max == 1) {
5017 min = 1;
5018
5019 maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
5020 switch (NCTYPE(node)->ctype) {
5021 case ONIGENC_CTYPE_WORD:
5022 if (NCTYPE(node)->not != 0) {
5023 for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
5024 if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) {
5025 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5026 }
5027 }
5028 }
5029 else {
5030 for (i = 0; i < maxcode; i++) {
5031 if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
5032 add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
5033 }
5034 }
5035 }
5036 break;
5037 }
5038 }
5039 else {
5040 min = ONIGENC_MBC_MINLEN(env->enc);
5041 }
5042 set_mml(&opt->len, min, max);
5043 }
5044 break;
5045
5046 case NT_CANY:
5047 {
5048 OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
5049 OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
5050 set_mml(&opt->len, min, max);
5051 }
5052 break;
5053
5054 case NT_ANCHOR:
5055 switch (NANCHOR(node)->type) {
5056 case ANCHOR_BEGIN_BUF:
5057 case ANCHOR_BEGIN_POSITION:
5058 case ANCHOR_BEGIN_LINE:
5059 case ANCHOR_END_BUF:
5060 case ANCHOR_SEMI_END_BUF:
5061 case ANCHOR_END_LINE:
5062 case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */
5063 case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */
5064 add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
5065 break;
5066
5067 case ANCHOR_PREC_READ:
5068 {
5069 NodeOptInfo nopt;
5070
5071 r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
5072 if (r == 0) {
5073 if (nopt.exb.len > 0)
5074 copy_opt_exact_info(&opt->expr, &nopt.exb);
5075 else if (nopt.exm.len > 0)
5076 copy_opt_exact_info(&opt->expr, &nopt.exm);
5077
5078 opt->expr.reach_end = 0;
5079
5080 if (nopt.map.value > 0)
5081 copy_opt_map_info(&opt->map, &nopt.map);
5082 }
5083 }
5084 break;
5085
5086 case ANCHOR_LOOK_BEHIND_NOT:
5087 break;
5088 }
5089 break;
5090
5091 case NT_BREF:
5092 {
5093 int i;
5094 int* backs;
5095 OnigDistance min, max, tmin, tmax;
5096 Node** nodes = SCANENV_MEM_NODES(env->scan_env);
5097 BRefNode* br = NBREF(node);
5098
5099 if (br->state & NST_RECURSION) {
5100 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5101 break;
5102 }
5103 backs = BACKREFS_P(br);
5104 r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
5105 if (r != 0) break;
5106 r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
5107 if (r != 0) break;
5108 for (i = 1; i < br->back_num; i++) {
5109 r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
5110 if (r != 0) break;
5111 r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
5112 if (r != 0) break;
5113 if (min > tmin) min = tmin;
5114 if (max < tmax) max = tmax;
5115 }
5116 if (r == 0) set_mml(&opt->len, min, max);
5117 }
5118 break;
5119
5120#ifdef USE_SUBEXP_CALL
5121 case NT_CALL:
5122 if (IS_CALL_RECURSION(NCALL(node)))
5123 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5124 else {
5125 OnigOptionType save = env->options;
5126 env->options = NENCLOSE(NCALL(node)->target)->option;
5127 r = optimize_node_left(NCALL(node)->target, opt, env);
5128 env->options = save;
5129 }
5130 break;
5131#endif
5132
5133 case NT_QTFR:
5134 {
5135 int i;
5136 OnigDistance min, max;
5137 NodeOptInfo nopt;
5138 QtfrNode* qn = NQTFR(node);
5139
5140 r = optimize_node_left(qn->target, &nopt, env);
5141 if (r) break;
5142
5143 if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
5144 if (env->mmd.max == 0 &&
5145 NTYPE(qn->target) == NT_CANY && qn->greedy) {
5146 if (IS_MULTILINE(env->options))
5147 /* implicit anchor: /.*a/ ==> /\A.*a/ */
5148 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
5149 else
5150 add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
5151 }
5152 }
5153 else {
5154 if (qn->lower > 0) {
5155 copy_node_opt_info(opt, &nopt);
5156 if (nopt.exb.len > 0) {
5157 if (nopt.exb.reach_end) {
5158 for (i = 2; i <= qn->lower &&
5159 ! is_full_opt_exact_info(&opt->exb); i++) {
5160 concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
5161 }
5162 if (i < qn->lower) {
5163 opt->exb.reach_end = 0;
5164 }
5165 }
5166 }
5167
5168 if (qn->lower != qn->upper) {
5169 opt->exb.reach_end = 0;
5170 opt->exm.reach_end = 0;
5171 }
5172 if (qn->lower > 1)
5173 opt->exm.reach_end = 0;
5174 }
5175 }
5176
5177 min = distance_multiply(nopt.len.min, qn->lower);
5178 if (IS_REPEAT_INFINITE(qn->upper))
5179 max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
5180 else
5181 max = distance_multiply(nopt.len.max, qn->upper);
5182
5183 set_mml(&opt->len, min, max);
5184 }
5185 break;
5186
5187 case NT_ENCLOSE:
5188 {
5189 EncloseNode* en = NENCLOSE(node);
5190
5191 switch (en->type) {
5192 case ENCLOSE_OPTION:
5193 {
5194 OnigOptionType save = env->options;
5195
5196 env->options = en->option;
5197 r = optimize_node_left(en->target, opt, env);
5198 env->options = save;
5199 }
5200 break;
5201
5202 case ENCLOSE_MEMORY:
5203#ifdef USE_SUBEXP_CALL
5204 en->opt_count++;
5205 if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
5206 OnigDistance min, max;
5207
5208 min = 0;
5209 max = ONIG_INFINITE_DISTANCE;
5210 if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
5211 if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
5212 set_mml(&opt->len, min, max);
5213 }
5214 else
5215#endif
5216 {
5217 r = optimize_node_left(en->target, opt, env);
5218
5219 if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
5220 if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
5221 remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
5222 }
5223 }
5224 break;
5225
5226 case ENCLOSE_STOP_BACKTRACK:
5227 case ENCLOSE_CONDITION:
5228 r = optimize_node_left(en->target, opt, env);
5229 break;
5230
5231 case ENCLOSE_ABSENT:
5232 set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
5233 break;
5234 }
5235 }
5236 break;
5237
5238 default:
5239#ifdef ONIG_DEBUG
5240 fprintf(stderr, "optimize_node_left: undefined node type %d\n",
5241 NTYPE(node));
5242#endif
5243 r = ONIGERR_TYPE_BUG;
5244 break;
5245 }
5246
5247 return r;
5248}
5249
5250static int
5251set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
5252{
5253 int allow_reverse;
5254
5255 if (e->len == 0) return 0;
5256
5257 reg->exact = (UChar* )xmalloc(e->len);
5258 CHECK_NULL_RETURN_MEMERR(reg->exact);
5259 xmemcpy(reg->exact, e->s, e->len);
5260 reg->exact_end = reg->exact + e->len;
5261
5262 allow_reverse =
5263 ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
5264
5265 if (e->ignore_case > 0) {
5266 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5267 int orig_len = e->len;
5268 e->len = set_bm_skip(reg->exact, reg->exact_end, reg,
5269 reg->map, 1);
5270 if (e->len >= 3) {
5271 reg->exact_end = reg->exact + e->len;
5272 reg->optimize = (allow_reverse != 0
5273 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
5274 }
5275 else {
5276 /* Even if BM skip table can't be built (e.g., pattern starts with
5277 's' or 'k' which have multi-byte case fold variants), we should
5278 still use EXACT_IC optimization with the original pattern.
5279 Without this fallback, patterns like /slackware/i have no
5280 optimization at all, causing severe performance regression
5281 especially with non-ASCII strings. See [Bug #21824] */
5282 e->len = orig_len; /* Restore original length for EXACT_IC */
5283 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5284 }
5285 }
5286 else {
5287 reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
5288 }
5289 }
5290 else {
5291 if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
5292 set_bm_skip(reg->exact, reg->exact_end, reg,
5293 reg->map, 0);
5294 reg->optimize = (allow_reverse != 0
5295 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
5296 }
5297 else {
5298 reg->optimize = ONIG_OPTIMIZE_EXACT;
5299 }
5300 }
5301
5302 reg->dmin = e->mmd.min;
5303 reg->dmax = e->mmd.max;
5304
5305 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5306 reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact));
5307 }
5308
5309 return 0;
5310}
5311
5312static void
5313set_optimize_map_info(regex_t* reg, OptMapInfo* m)
5314{
5315 int i;
5316
5317 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5318 reg->map[i] = m->map[i];
5319
5320 reg->optimize = ONIG_OPTIMIZE_MAP;
5321 reg->dmin = m->mmd.min;
5322 reg->dmax = m->mmd.max;
5323
5324 if (reg->dmin != ONIG_INFINITE_DISTANCE) {
5325 reg->threshold_len = (int )(reg->dmin + 1);
5326 }
5327}
5328
5329static void
5330set_sub_anchor(regex_t* reg, OptAncInfo* anc)
5331{
5332 reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
5333 reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
5334}
5335
5336#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5337static void print_optimize_info(FILE* f, regex_t* reg);
5338#endif
5339
5340static int
5341set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
5342{
5343
5344 int r;
5345 NodeOptInfo opt;
5346 OptEnv env;
5347
5348 env.enc = reg->enc;
5349 env.options = reg->options;
5350 env.case_fold_flag = reg->case_fold_flag;
5351 env.scan_env = scan_env;
5352 clear_mml(&env.mmd);
5353
5354 r = optimize_node_left(node, &opt, &env);
5355 if (r) return r;
5356
5357 reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
5358 ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
5359 ANCHOR_LOOK_BEHIND);
5360
5361 if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
5362 reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
5363
5364 reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
5365 ANCHOR_PREC_READ_NOT);
5366
5367 if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
5368 reg->anchor_dmin = opt.len.min;
5369 reg->anchor_dmax = opt.len.max;
5370 }
5371
5372 if (opt.exb.len > 0 || opt.exm.len > 0) {
5373 select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
5374 if (opt.map.value > 0 &&
5375 comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
5376 goto set_map;
5377 }
5378 else {
5379 r = set_optimize_exact_info(reg, &opt.exb);
5380 set_sub_anchor(reg, &opt.exb.anc);
5381 }
5382 }
5383 else if (opt.map.value > 0) {
5384 set_map:
5385 set_optimize_map_info(reg, &opt.map);
5386 set_sub_anchor(reg, &opt.map.anc);
5387 }
5388 else {
5389 reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
5390 if (opt.len.max == 0)
5391 reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
5392 }
5393
5394#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5395 print_optimize_info(stderr, reg);
5396#endif
5397 return r;
5398}
5399
5400static void
5401clear_optimize_info(regex_t* reg)
5402{
5403 reg->optimize = ONIG_OPTIMIZE_NONE;
5404 reg->anchor = 0;
5405 reg->anchor_dmin = 0;
5406 reg->anchor_dmax = 0;
5407 reg->sub_anchor = 0;
5408 reg->exact_end = (UChar* )NULL;
5409 reg->threshold_len = 0;
5410 xfree(reg->exact);
5411 reg->exact = (UChar* )NULL;
5412}
5413
5414#ifdef ONIG_DEBUG
5415
5416static void print_enc_string(FILE* fp, OnigEncoding enc,
5417 const UChar *s, const UChar *end)
5418{
5419 fprintf(fp, "\nPATTERN: /");
5420
5421 if (ONIGENC_MBC_MINLEN(enc) > 1) {
5422 const UChar *p;
5423 OnigCodePoint code;
5424
5425 p = s;
5426 while (p < end) {
5427 code = ONIGENC_MBC_TO_CODE(enc, p, end);
5428 if (code >= 0x80) {
5429 fprintf(fp, " 0x%04x ", (int )code);
5430 }
5431 else {
5432 fputc((int )code, fp);
5433 }
5434
5435 p += enclen(enc, p, end);
5436 }
5437 }
5438 else {
5439 while (s < end) {
5440 fputc((int )*s, fp);
5441 s++;
5442 }
5443 }
5444
5445 fprintf(fp, "/ (%s)\n", enc->name);
5446}
5447#endif /* ONIG_DEBUG */
5448
5449#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
5450static void
5451print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
5452{
5453 if (a == ONIG_INFINITE_DISTANCE)
5454 fputs("inf", f);
5455 else
5456 fprintf(f, "(%"PRIuPTR")", a);
5457
5458 fputs("-", f);
5459
5460 if (b == ONIG_INFINITE_DISTANCE)
5461 fputs("inf", f);
5462 else
5463 fprintf(f, "(%"PRIuPTR")", b);
5464}
5465
5466static void
5467print_anchor(FILE* f, int anchor)
5468{
5469 int q = 0;
5470
5471 fprintf(f, "[");
5472
5473 if (anchor & ANCHOR_BEGIN_BUF) {
5474 fprintf(f, "begin-buf");
5475 q = 1;
5476 }
5477 if (anchor & ANCHOR_BEGIN_LINE) {
5478 if (q) fprintf(f, ", ");
5479 q = 1;
5480 fprintf(f, "begin-line");
5481 }
5482 if (anchor & ANCHOR_BEGIN_POSITION) {
5483 if (q) fprintf(f, ", ");
5484 q = 1;
5485 fprintf(f, "begin-pos");
5486 }
5487 if (anchor & ANCHOR_END_BUF) {
5488 if (q) fprintf(f, ", ");
5489 q = 1;
5490 fprintf(f, "end-buf");
5491 }
5492 if (anchor & ANCHOR_SEMI_END_BUF) {
5493 if (q) fprintf(f, ", ");
5494 q = 1;
5495 fprintf(f, "semi-end-buf");
5496 }
5497 if (anchor & ANCHOR_END_LINE) {
5498 if (q) fprintf(f, ", ");
5499 q = 1;
5500 fprintf(f, "end-line");
5501 }
5502 if (anchor & ANCHOR_ANYCHAR_STAR) {
5503 if (q) fprintf(f, ", ");
5504 q = 1;
5505 fprintf(f, "anychar-star");
5506 }
5507 if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
5508 if (q) fprintf(f, ", ");
5509 fprintf(f, "anychar-star-ml");
5510 }
5511
5512 fprintf(f, "]");
5513}
5514
5515static void
5516print_optimize_info(FILE* f, regex_t* reg)
5517{
5518 static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
5519 "EXACT_IC", "MAP",
5520 "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" };
5521
5522 fprintf(f, "optimize: %s\n", on[reg->optimize]);
5523 fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
5524 if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
5525 print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
5526 fprintf(f, "\n");
5527
5528 if (reg->optimize) {
5529 fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
5530 fprintf(f, "\n");
5531 }
5532 fprintf(f, "\n");
5533
5534 if (reg->exact) {
5535 UChar *p;
5536 fprintf(f, "exact: [");
5537 for (p = reg->exact; p < reg->exact_end; p++) {
5538 fputc(*p, f);
5539 }
5540 fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact));
5541 }
5542 else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
5543 int c, i, n = 0;
5544
5545 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
5546 if (reg->map[i]) n++;
5547
5548 fprintf(f, "map: n=%d\n", n);
5549 if (n > 0) {
5550 c = 0;
5551 fputc('[', f);
5552 for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
5553 if (reg->map[i] != 0) {
5554 if (c > 0) fputs(", ", f);
5555 c++;
5556 if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
5557 ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
5558 fputc(i, f);
5559 else
5560 fprintf(f, "%d", i);
5561 }
5562 }
5563 fprintf(f, "]\n");
5564 }
5565 }
5566}
5567#endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */
5568
5569
5570extern void
5571onig_free_body(regex_t* reg)
5572{
5573 if (IS_NOT_NULL(reg)) {
5574 xfree(reg->p);
5575 xfree(reg->exact);
5576 xfree(reg->repeat_range);
5577 onig_free(reg->chain);
5578
5579#ifdef USE_NAMED_GROUP
5580 onig_names_free(reg);
5581#endif
5582 }
5583}
5584
5585extern void
5586onig_free(regex_t* reg)
5587{
5588 if (IS_NOT_NULL(reg)) {
5589 onig_free_body(reg);
5590 xfree(reg);
5591 }
5592}
5593
5594static void*
5595dup_copy(const void *ptr, size_t size)
5596{
5597 void *newptr = xmalloc(size);
5598 if (IS_NOT_NULL(newptr)) {
5599 memcpy(newptr, ptr, size);
5600 }
5601 return newptr;
5602}
5603
5604extern int
5605onig_reg_copy(regex_t** nreg, regex_t* oreg)
5606{
5607 if (IS_NOT_NULL(oreg)) {
5608 regex_t *reg = *nreg = (regex_t* )xmalloc(sizeof(regex_t));
5609 if (IS_NULL(reg)) return ONIGERR_MEMORY;
5610
5611 *reg = *oreg;
5612
5613# define COPY_FAILED(mem, size) IS_NULL(reg->mem = dup_copy(reg->mem, size))
5614
5615 if (IS_NOT_NULL(reg->exact)) {
5616 size_t exact_size = reg->exact_end - reg->exact;
5617 if (COPY_FAILED(exact, exact_size))
5618 goto err;
5619 (reg)->exact_end = (reg)->exact + exact_size;
5620 }
5621
5622 if (IS_NOT_NULL(reg->p)) {
5623 if (COPY_FAILED(p, reg->alloc))
5624 goto err_p;
5625 }
5626 if (IS_NOT_NULL(reg->repeat_range)) {
5627 if (COPY_FAILED(repeat_range, reg->repeat_range_alloc * sizeof(OnigRepeatRange)))
5628 goto err_repeat_range;
5629 }
5630 if (IS_NOT_NULL(reg->name_table)) {
5631 if (onig_names_copy(reg, oreg))
5632 goto err_name_table;
5633 }
5634 if (IS_NOT_NULL(reg->chain)) {
5635 if (onig_reg_copy(&reg->chain, reg->chain))
5636 goto err_chain;
5637 }
5638 return 0;
5639# undef COPY_FAILED
5640
5641 err_chain:
5642 onig_names_free(reg);
5643 err_name_table:
5644 xfree(reg->repeat_range);
5645 err_repeat_range:
5646 xfree(reg->p);
5647 err_p:
5648 xfree(reg->exact);
5649 err:
5650 xfree(reg);
5651 return ONIGERR_MEMORY;
5652 }
5653 return 0;
5654}
5655
5656#ifdef RUBY
5657size_t
5658onig_memsize(const regex_t *reg)
5659{
5660 size_t size = sizeof(regex_t);
5661 if (IS_NULL(reg)) return 0;
5662 if (IS_NOT_NULL(reg->p)) size += reg->alloc;
5663 if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
5664 if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
5665 if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
5666
5667 return size;
5668}
5669
5670size_t
5671onig_region_memsize(const OnigRegion *regs)
5672{
5673 size_t size = sizeof(*regs);
5674 if (IS_NULL(regs)) return 0;
5675 size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end));
5676 return size;
5677}
5678#endif
5679
5680#define REGEX_TRANSFER(to,from) do {\
5681 onig_free_body(to);\
5682 xmemcpy(to, from, sizeof(regex_t));\
5683 xfree(from);\
5684} while (0)
5685
5686#if 0
5687extern void
5688onig_transfer(regex_t* to, regex_t* from)
5689{
5690 REGEX_TRANSFER(to, from);
5691}
5692#endif
5693
5694#ifdef ONIG_DEBUG_COMPILE
5695static void print_compiled_byte_code_list(FILE* f, regex_t* reg);
5696#endif
5697#ifdef ONIG_DEBUG_PARSE_TREE
5698static void print_tree(FILE* f, Node* node);
5699#endif
5700
5701#ifdef RUBY
5702extern int
5703onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5704 OnigErrorInfo* einfo)
5705{
5706 return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0);
5707}
5708#endif
5709
5710#ifdef RUBY
5711extern int
5712onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5713 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
5714#else
5715extern int
5716onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
5717 OnigErrorInfo* einfo)
5718#endif
5719{
5720#define COMPILE_INIT_SIZE 20
5721
5722 int r;
5723 OnigDistance init_size;
5724 Node* root;
5725 ScanEnv scan_env = {0};
5726#ifdef USE_SUBEXP_CALL
5727 UnsetAddrList uslist;
5728#endif
5729
5730 if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
5731
5732#ifdef RUBY
5733 scan_env.sourcefile = sourcefile;
5734 scan_env.sourceline = sourceline;
5735#endif
5736
5737#ifdef ONIG_DEBUG
5738 print_enc_string(stderr, reg->enc, pattern, pattern_end);
5739#endif
5740
5741 if (reg->alloc == 0) {
5742 init_size = (pattern_end - pattern) * 2;
5743 if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
5744 r = BBUF_INIT(reg, init_size);
5745 if (r != 0) goto end;
5746 }
5747 else
5748 reg->used = 0;
5749
5750 reg->num_mem = 0;
5751 reg->num_repeat = 0;
5752 reg->num_null_check = 0;
5753 reg->repeat_range_alloc = 0;
5754 reg->repeat_range = (OnigRepeatRange* )NULL;
5755#ifdef USE_COMBINATION_EXPLOSION_CHECK
5756 reg->num_comb_exp_check = 0;
5757#endif
5758
5759 r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
5760 if (r != 0) goto err;
5761
5762#ifdef ONIG_DEBUG_PARSE_TREE
5763# if 0
5764 fprintf(stderr, "ORIGINAL PARSE TREE:\n");
5765 print_tree(stderr, root);
5766# endif
5767#endif
5768
5769#ifdef USE_NAMED_GROUP
5770 /* mixed use named group and no-named group */
5771 if (scan_env.num_named > 0 &&
5772 IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
5773 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
5774 if (scan_env.num_named != scan_env.num_mem)
5775 r = disable_noname_group_capture(&root, reg, &scan_env);
5776 else
5777 r = numbered_ref_check(root);
5778
5779 if (r != 0) goto err;
5780 }
5781#endif
5782
5783#ifdef USE_SUBEXP_CALL
5784 if (scan_env.num_call > 0) {
5785 r = unset_addr_list_init(&uslist, scan_env.num_call);
5786 if (r != 0) goto err;
5787 scan_env.unset_addr_list = &uslist;
5788 r = setup_subexp_call(root, &scan_env);
5789 if (r != 0) goto err_unset;
5790 r = subexp_recursive_check_trav(root, &scan_env);
5791 if (r < 0) goto err_unset;
5792 r = subexp_inf_recursive_check_trav(root, &scan_env);
5793 if (r != 0) goto err_unset;
5794
5795 reg->num_call = scan_env.num_call;
5796 }
5797 else
5798 reg->num_call = 0;
5799#endif
5800
5801 r = setup_tree(root, reg, 0, &scan_env);
5802 if (r != 0) goto err_unset;
5803
5804#ifdef ONIG_DEBUG_PARSE_TREE
5805 print_tree(stderr, root);
5806#endif
5807
5808 reg->capture_history = scan_env.capture_history;
5809 reg->bt_mem_start = scan_env.bt_mem_start;
5810 reg->bt_mem_start |= reg->capture_history;
5811 if (IS_FIND_CONDITION(reg->options))
5812 BIT_STATUS_ON_ALL(reg->bt_mem_end);
5813 else {
5814 reg->bt_mem_end = scan_env.bt_mem_end;
5815 reg->bt_mem_end |= reg->capture_history;
5816 }
5817
5818#ifdef USE_COMBINATION_EXPLOSION_CHECK
5819 if (scan_env.backrefed_mem == 0
5820# ifdef USE_SUBEXP_CALL
5821 || scan_env.num_call == 0
5822# endif
5823 ) {
5824 setup_comb_exp_check(root, 0, &scan_env);
5825# ifdef USE_SUBEXP_CALL
5826 if (scan_env.has_recursion != 0) {
5827 scan_env.num_comb_exp_check = 0;
5828 }
5829 else
5830# endif
5831 if (scan_env.comb_exp_max_regnum > 0) {
5832 int i;
5833 for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
5834 if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
5835 scan_env.num_comb_exp_check = 0;
5836 break;
5837 }
5838 }
5839 }
5840 }
5841
5842 reg->num_comb_exp_check = scan_env.num_comb_exp_check;
5843#endif
5844
5845 clear_optimize_info(reg);
5846#ifndef ONIG_DONT_OPTIMIZE
5847 r = set_optimize_info_from_tree(root, reg, &scan_env);
5848 if (r != 0) goto err_unset;
5849#endif
5850
5851 if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
5852 xfree(scan_env.mem_nodes_dynamic);
5853 scan_env.mem_nodes_dynamic = (Node** )NULL;
5854 }
5855
5856 r = compile_tree(root, reg);
5857 if (r == 0) {
5858 r = add_opcode(reg, OP_END);
5859#ifdef USE_SUBEXP_CALL
5860 if (scan_env.num_call > 0) {
5861 r = unset_addr_list_fix(&uslist, reg);
5862 unset_addr_list_end(&uslist);
5863 if (r) goto err;
5864 }
5865#endif
5866
5867 if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
5868 reg->stack_pop_level = STACK_POP_LEVEL_ALL;
5869 else {
5870 if (reg->bt_mem_start != 0)
5871 reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
5872 else
5873 reg->stack_pop_level = STACK_POP_LEVEL_FREE;
5874 }
5875 }
5876#ifdef USE_SUBEXP_CALL
5877 else if (scan_env.num_call > 0) {
5878 unset_addr_list_end(&uslist);
5879 }
5880#endif
5881 onig_node_free(root);
5882
5883#ifdef ONIG_DEBUG_COMPILE
5884# ifdef USE_NAMED_GROUP
5885 onig_print_names(stderr, reg);
5886# endif
5887 print_compiled_byte_code_list(stderr, reg);
5888#endif
5889
5890 end:
5891 onig_reg_resize(reg);
5892 return r;
5893
5894 err_unset:
5895#ifdef USE_SUBEXP_CALL
5896 if (scan_env.num_call > 0) {
5897 unset_addr_list_end(&uslist);
5898 }
5899#endif
5900 err:
5901 if (IS_NOT_NULL(scan_env.error)) {
5902 if (IS_NOT_NULL(einfo)) {
5903 einfo->enc = scan_env.enc;
5904 einfo->par = scan_env.error;
5905 einfo->par_end = scan_env.error_end;
5906 }
5907 }
5908
5909 onig_node_free(root);
5910 xfree(scan_env.mem_nodes_dynamic);
5911
5912 return r;
5913}
5914
5915static int onig_inited = 0;
5916
5917extern int
5918onig_reg_init(regex_t* reg, OnigOptionType option,
5919 OnigCaseFoldType case_fold_flag,
5920 OnigEncoding enc, const OnigSyntaxType* syntax)
5921{
5922 if (! onig_inited)
5923 onig_init();
5924
5925 if (IS_NULL(reg))
5926 return ONIGERR_INVALID_ARGUMENT;
5927
5928 (reg)->exact = (UChar* )NULL;
5929 (reg)->chain = (regex_t* )NULL;
5930 (reg)->p = (UChar* )NULL;
5931 (reg)->name_table = (void* )NULL;
5932 (reg)->repeat_range = (OnigRepeatRange* )NULL;
5933
5934 if (ONIGENC_IS_UNDEF(enc))
5935 return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;
5936
5937 if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
5938 == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
5939 return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
5940 }
5941
5942 if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
5943 option |= syntax->options;
5944 option &= ~ONIG_OPTION_SINGLELINE;
5945 }
5946 else
5947 option |= syntax->options;
5948
5949 (reg)->enc = enc;
5950 (reg)->options = option;
5951 (reg)->syntax = syntax;
5952 (reg)->optimize = 0;
5953
5954 (reg)->alloc = 0;
5955 (reg)->used = 0;
5956
5957 (reg)->case_fold_flag = case_fold_flag;
5958
5959 (reg)->timelimit = 0;
5960
5961 return 0;
5962}
5963
5964extern int
5965onig_new_without_alloc(regex_t* reg, const UChar* pattern,
5966 const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
5967 const OnigSyntaxType* syntax, OnigErrorInfo* einfo)
5968{
5969 int r;
5970
5971 r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
5972 if (r) return r;
5973
5974 r = onig_compile(reg, pattern, pattern_end, einfo);
5975 return r;
5976}
5977
5978extern int
5979onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
5980 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
5981 OnigErrorInfo* einfo)
5982{
5983 *reg = (regex_t* )xmalloc(sizeof(regex_t));
5984 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
5985
5986 int r = onig_new_without_alloc(*reg, pattern, pattern_end, option, enc, syntax, einfo);
5987 if (r) {
5988 onig_free(*reg);
5989 *reg = NULL;
5990 }
5991
5992 return r;
5993}
5994
5995extern int
5996onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED)
5997{
5998 return onig_init();
5999}
6000
6001extern int
6002onig_init(void)
6003{
6004 if (onig_inited != 0)
6005 return 0;
6006
6007 onig_inited = 1;
6008
6009#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6010 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
6011#endif
6012
6013 onigenc_init();
6014 /* onigenc_set_default_caseconv_table((UChar* )0); */
6015
6016#ifdef ONIG_DEBUG_STATISTICS
6017 onig_statistics_init();
6018#endif
6019
6020 return 0;
6021}
6022
6023
6024static OnigEndCallListItemType* EndCallTop;
6025
6026extern void onig_add_end_call(void (*func)(void))
6027{
6029
6030 item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
6031 if (item == 0) return ;
6032
6033 item->next = EndCallTop;
6034 item->func = func;
6035
6036 EndCallTop = item;
6037}
6038
6039static void
6040exec_end_call_list(void)
6041{
6043 void (*func)(void);
6044
6045 while (EndCallTop != 0) {
6046 func = EndCallTop->func;
6047 (*func)();
6048
6049 prev = EndCallTop;
6050 EndCallTop = EndCallTop->next;
6051 xfree(prev);
6052 }
6053}
6054
6055extern int
6056onig_end(void)
6057{
6058 exec_end_call_list();
6059
6060#ifdef ONIG_DEBUG_STATISTICS
6061 onig_print_statistics(stderr);
6062#endif
6063
6064#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
6065 _CrtDumpMemoryLeaks();
6066#endif
6067
6068 onig_inited = 0;
6069
6070 return 0;
6071}
6072
6073extern int
6074onig_is_in_code_range(const UChar* p, OnigCodePoint code)
6075{
6076 OnigCodePoint n, *data;
6077 OnigCodePoint low, high, x;
6078
6079 GET_CODE_POINT(n, p);
6080 data = (OnigCodePoint* )p;
6081 data++;
6082
6083 for (low = 0, high = n; low < high; ) {
6084 x = (low + high) >> 1;
6085 if (code > data[x * 2 + 1])
6086 low = x + 1;
6087 else
6088 high = x;
6089 }
6090
6091 return ((low < n && code >= data[low * 2]) ? 1 : 0);
6092}
6093
6094extern int
6095onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
6096{
6097 int found;
6098
6099 if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
6100 if (IS_NULL(cc->mbuf)) {
6101 found = 0;
6102 }
6103 else {
6104 found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
6105 }
6106 }
6107 else {
6108 found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
6109 }
6110
6111 if (IS_NCCLASS_NOT(cc))
6112 return !found;
6113 else
6114 return found;
6115}
6116
6117extern int
6118onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
6119{
6120 int len;
6121
6122 if (ONIGENC_MBC_MINLEN(enc) > 1) {
6123 len = 2;
6124 }
6125 else {
6126 len = ONIGENC_CODE_TO_MBCLEN(enc, code);
6127 }
6128 return onig_is_code_in_cc_len(len, code, cc);
6129}
6130
6131
6132#ifdef ONIG_DEBUG
6133
6134/* arguments type */
6135# define ARG_SPECIAL -1
6136# define ARG_NON 0
6137# define ARG_RELADDR 1
6138# define ARG_ABSADDR 2
6139# define ARG_LENGTH 3
6140# define ARG_MEMNUM 4
6141# define ARG_OPTION 5
6142# define ARG_STATE_CHECK 6
6143
6144OnigOpInfoType OnigOpInfo[] = {
6145 { OP_FINISH, "finish", ARG_NON },
6146 { OP_END, "end", ARG_NON },
6147 { OP_EXACT1, "exact1", ARG_SPECIAL },
6148 { OP_EXACT2, "exact2", ARG_SPECIAL },
6149 { OP_EXACT3, "exact3", ARG_SPECIAL },
6150 { OP_EXACT4, "exact4", ARG_SPECIAL },
6151 { OP_EXACT5, "exact5", ARG_SPECIAL },
6152 { OP_EXACTN, "exactn", ARG_SPECIAL },
6153 { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
6154 { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
6155 { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
6156 { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
6157 { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
6158 { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
6159 { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
6160 { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
6161 { OP_CCLASS, "cclass", ARG_SPECIAL },
6162 { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
6163 { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
6164 { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
6165 { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
6166 { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
6167 { OP_ANYCHAR, "anychar", ARG_NON },
6168 { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
6169 { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
6170 { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
6171 { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
6172 { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
6173 { OP_WORD, "word", ARG_NON },
6174 { OP_NOT_WORD, "not-word", ARG_NON },
6175 { OP_WORD_BOUND, "word-bound", ARG_NON },
6176 { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
6177 { OP_WORD_BEGIN, "word-begin", ARG_NON },
6178 { OP_WORD_END, "word-end", ARG_NON },
6179 { OP_ASCII_WORD, "ascii-word", ARG_NON },
6180 { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON },
6181 { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON },
6182 { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON },
6183 { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON },
6184 { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON },
6185 { OP_BEGIN_BUF, "begin-buf", ARG_NON },
6186 { OP_END_BUF, "end-buf", ARG_NON },
6187 { OP_BEGIN_LINE, "begin-line", ARG_NON },
6188 { OP_END_LINE, "end-line", ARG_NON },
6189 { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
6190 { OP_BEGIN_POSITION, "begin-position", ARG_NON },
6191 { OP_BACKREF1, "backref1", ARG_NON },
6192 { OP_BACKREF2, "backref2", ARG_NON },
6193 { OP_BACKREFN, "backrefn", ARG_MEMNUM },
6194 { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
6195 { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
6196 { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
6197 { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
6198 { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
6199 { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
6200 { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
6201 { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
6202 { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
6203 { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
6204 { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
6205 { OP_SET_OPTION, "set-option", ARG_OPTION },
6206 { OP_KEEP, "keep", ARG_NON },
6207 { OP_FAIL, "fail", ARG_NON },
6208 { OP_JUMP, "jump", ARG_RELADDR },
6209 { OP_PUSH, "push", ARG_RELADDR },
6210 { OP_POP, "pop", ARG_NON },
6211 { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
6212 { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
6213 { OP_REPEAT, "repeat", ARG_SPECIAL },
6214 { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
6215 { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
6216 { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
6217 { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
6218 { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
6219 { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
6220 { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
6221 { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
6222 { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
6223 { OP_PUSH_POS, "push-pos", ARG_NON },
6224 { OP_POP_POS, "pop-pos", ARG_NON },
6225 { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
6226 { OP_FAIL_POS, "fail-pos", ARG_NON },
6227 { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
6228 { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
6229 { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
6230 { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
6231 { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
6232 { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON },
6233 { OP_ABSENT, "absent", ARG_RELADDR },
6234 { OP_ABSENT_END, "absent-end", ARG_NON },
6235 { OP_CALL, "call", ARG_ABSADDR },
6236 { OP_RETURN, "return", ARG_NON },
6237 { OP_CONDITION, "condition", ARG_SPECIAL },
6238 { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
6239 { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
6240 { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
6241 { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
6242 { OP_STATE_CHECK_ANYCHAR_ML_STAR,
6243 "state-check-anychar-ml*", ARG_STATE_CHECK },
6244 { -1, "", ARG_NON }
6245};
6246
6247static const char*
6248op2name(int opcode)
6249{
6250 int i;
6251
6252 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6253 if (opcode == OnigOpInfo[i].opcode)
6254 return OnigOpInfo[i].name;
6255 }
6256 return "";
6257}
6258
6259static int
6260op2arg_type(int opcode)
6261{
6262 int i;
6263
6264 for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
6265 if (opcode == OnigOpInfo[i].opcode)
6266 return OnigOpInfo[i].arg_type;
6267 }
6268 return ARG_SPECIAL;
6269}
6270
6271# ifdef ONIG_DEBUG_PARSE_TREE
6272static void
6273Indent(FILE* f, int indent)
6274{
6275 int i;
6276 for (i = 0; i < indent; i++) putc(' ', f);
6277}
6278# endif /* ONIG_DEBUG_PARSE_TREE */
6279
6280static void
6281p_string(FILE* f, ptrdiff_t len, UChar* s)
6282{
6283 fputs(":", f);
6284 while (len-- > 0) { fputc(*s++, f); }
6285}
6286
6287static void
6288p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
6289{
6290 int x = len * mb_len;
6291
6292 fprintf(f, ":%d:", len);
6293 while (x-- > 0) { fputc(*s++, f); }
6294}
6295
6296extern void
6297onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
6298 OnigEncoding enc)
6299{
6300 int i, n, arg_type;
6301 RelAddrType addr;
6302 LengthType len;
6303 MemNumType mem;
6304 StateCheckNumType scn;
6305 OnigCodePoint code;
6306 UChar *q;
6307
6308 fprintf(f, "[%s", op2name(*bp));
6309 arg_type = op2arg_type(*bp);
6310 if (arg_type != ARG_SPECIAL) {
6311 bp++;
6312 switch (arg_type) {
6313 case ARG_NON:
6314 break;
6315 case ARG_RELADDR:
6316 GET_RELADDR_INC(addr, bp);
6317 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6318 break;
6319 case ARG_ABSADDR:
6320 GET_ABSADDR_INC(addr, bp);
6321 fprintf(f, ":(%d)", addr);
6322 break;
6323 case ARG_LENGTH:
6324 GET_LENGTH_INC(len, bp);
6325 fprintf(f, ":%d", len);
6326 break;
6327 case ARG_MEMNUM:
6328 mem = *((MemNumType* )bp);
6329 bp += SIZE_MEMNUM;
6330 fprintf(f, ":%d", mem);
6331 break;
6332 case ARG_OPTION:
6333 {
6334 OnigOptionType option = *((OnigOptionType* )bp);
6335 bp += SIZE_OPTION;
6336 fprintf(f, ":%d", option);
6337 }
6338 break;
6339
6340 case ARG_STATE_CHECK:
6341 scn = *((StateCheckNumType* )bp);
6342 bp += SIZE_STATE_CHECK_NUM;
6343 fprintf(f, ":%d", scn);
6344 break;
6345 }
6346 }
6347 else {
6348 switch (*bp++) {
6349 case OP_EXACT1:
6350 case OP_ANYCHAR_STAR_PEEK_NEXT:
6351 case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
6352 p_string(f, 1, bp++); break;
6353 case OP_EXACT2:
6354 p_string(f, 2, bp); bp += 2; break;
6355 case OP_EXACT3:
6356 p_string(f, 3, bp); bp += 3; break;
6357 case OP_EXACT4:
6358 p_string(f, 4, bp); bp += 4; break;
6359 case OP_EXACT5:
6360 p_string(f, 5, bp); bp += 5; break;
6361 case OP_EXACTN:
6362 GET_LENGTH_INC(len, bp);
6363 p_len_string(f, len, 1, bp);
6364 bp += len;
6365 break;
6366
6367 case OP_EXACTMB2N1:
6368 p_string(f, 2, bp); bp += 2; break;
6369 case OP_EXACTMB2N2:
6370 p_string(f, 4, bp); bp += 4; break;
6371 case OP_EXACTMB2N3:
6372 p_string(f, 6, bp); bp += 6; break;
6373 case OP_EXACTMB2N:
6374 GET_LENGTH_INC(len, bp);
6375 p_len_string(f, len, 2, bp);
6376 bp += len * 2;
6377 break;
6378 case OP_EXACTMB3N:
6379 GET_LENGTH_INC(len, bp);
6380 p_len_string(f, len, 3, bp);
6381 bp += len * 3;
6382 break;
6383 case OP_EXACTMBN:
6384 {
6385 int mb_len;
6386
6387 GET_LENGTH_INC(mb_len, bp);
6388 GET_LENGTH_INC(len, bp);
6389 fprintf(f, ":%d:%d:", mb_len, len);
6390 n = len * mb_len;
6391 while (n-- > 0) { fputc(*bp++, f); }
6392 }
6393 break;
6394
6395 case OP_EXACT1_IC:
6396 len = enclen(enc, bp, bpend);
6397 p_string(f, len, bp);
6398 bp += len;
6399 break;
6400 case OP_EXACTN_IC:
6401 GET_LENGTH_INC(len, bp);
6402 p_len_string(f, len, 1, bp);
6403 bp += len;
6404 break;
6405
6406 case OP_CCLASS:
6407 n = bitset_on_num((BitSetRef )bp);
6408 bp += SIZE_BITSET;
6409 fprintf(f, ":%d", n);
6410 break;
6411
6412 case OP_CCLASS_NOT:
6413 n = bitset_on_num((BitSetRef )bp);
6414 bp += SIZE_BITSET;
6415 fprintf(f, ":%d", n);
6416 break;
6417
6418 case OP_CCLASS_MB:
6419 case OP_CCLASS_MB_NOT:
6420 GET_LENGTH_INC(len, bp);
6421 q = bp;
6422# ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6423 ALIGNMENT_RIGHT(q);
6424# endif
6425 GET_CODE_POINT(code, q);
6426 bp += len;
6427 fprintf(f, ":%d:%d", (int )code, len);
6428 break;
6429
6430 case OP_CCLASS_MIX:
6431 case OP_CCLASS_MIX_NOT:
6432 n = bitset_on_num((BitSetRef )bp);
6433 bp += SIZE_BITSET;
6434 GET_LENGTH_INC(len, bp);
6435 q = bp;
6436# ifndef PLATFORM_UNALIGNED_WORD_ACCESS
6437 ALIGNMENT_RIGHT(q);
6438# endif
6439 GET_CODE_POINT(code, q);
6440 bp += len;
6441 fprintf(f, ":%d:%d:%d", n, (int )code, len);
6442 break;
6443
6444 case OP_BACKREFN_IC:
6445 mem = *((MemNumType* )bp);
6446 bp += SIZE_MEMNUM;
6447 fprintf(f, ":%d", mem);
6448 break;
6449
6450 case OP_BACKREF_MULTI_IC:
6451 case OP_BACKREF_MULTI:
6452 fputs(" ", f);
6453 GET_LENGTH_INC(len, bp);
6454 for (i = 0; i < len; i++) {
6455 GET_MEMNUM_INC(mem, bp);
6456 if (i > 0) fputs(", ", f);
6457 fprintf(f, "%d", mem);
6458 }
6459 break;
6460
6461 case OP_BACKREF_WITH_LEVEL:
6462 {
6463 OnigOptionType option;
6464 LengthType level;
6465
6466 GET_OPTION_INC(option, bp);
6467 fprintf(f, ":%d", option);
6468 GET_LENGTH_INC(level, bp);
6469 fprintf(f, ":%d", level);
6470
6471 fputs(" ", f);
6472 GET_LENGTH_INC(len, bp);
6473 for (i = 0; i < len; i++) {
6474 GET_MEMNUM_INC(mem, bp);
6475 if (i > 0) fputs(", ", f);
6476 fprintf(f, "%d", mem);
6477 }
6478 }
6479 break;
6480
6481 case OP_REPEAT:
6482 case OP_REPEAT_NG:
6483 {
6484 mem = *((MemNumType* )bp);
6485 bp += SIZE_MEMNUM;
6486 addr = *((RelAddrType* )bp);
6487 bp += SIZE_RELADDR;
6488 fprintf(f, ":%d:%d", mem, addr);
6489 }
6490 break;
6491
6492 case OP_PUSH_OR_JUMP_EXACT1:
6493 case OP_PUSH_IF_PEEK_NEXT:
6494 addr = *((RelAddrType* )bp);
6495 bp += SIZE_RELADDR;
6496 fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
6497 p_string(f, 1, bp);
6498 bp += 1;
6499 break;
6500
6501 case OP_LOOK_BEHIND:
6502 GET_LENGTH_INC(len, bp);
6503 fprintf(f, ":%d", len);
6504 break;
6505
6506 case OP_PUSH_LOOK_BEHIND_NOT:
6507 GET_RELADDR_INC(addr, bp);
6508 GET_LENGTH_INC(len, bp);
6509 fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr);
6510 break;
6511
6512 case OP_STATE_CHECK_PUSH:
6513 case OP_STATE_CHECK_PUSH_OR_JUMP:
6514 scn = *((StateCheckNumType* )bp);
6515 bp += SIZE_STATE_CHECK_NUM;
6516 addr = *((RelAddrType* )bp);
6517 bp += SIZE_RELADDR;
6518 fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr);
6519 break;
6520
6521 case OP_CONDITION:
6522 GET_MEMNUM_INC(mem, bp);
6523 GET_RELADDR_INC(addr, bp);
6524 fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr);
6525 break;
6526
6527 default:
6528 fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
6529 bp[-1]);
6530 }
6531 }
6532 fputs("]", f);
6533 if (nextp) *nextp = bp;
6534}
6535
6536# ifdef ONIG_DEBUG_COMPILE
6537static void
6538print_compiled_byte_code_list(FILE* f, regex_t* reg)
6539{
6540 int ncode;
6541 UChar* bp = reg->p;
6542 UChar* end = reg->p + reg->used;
6543
6544 fprintf(f, "code length: %d", reg->used);
6545
6546 ncode = -1;
6547 while (bp < end) {
6548 ncode++;
6549 if (ncode % 5 == 0)
6550 fprintf(f, "\n%ld:", bp - reg->p);
6551 else
6552 fprintf(f, " %ld:", bp - reg->p);
6553 onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
6554 }
6555
6556 fprintf(f, "\n");
6557}
6558# endif /* ONIG_DEBUG_COMPILE */
6559
6560# ifdef ONIG_DEBUG_PARSE_TREE
6561static void
6562print_indent_tree(FILE* f, Node* node, int indent)
6563{
6564 int i, type, container_p = 0;
6565 int add = 3;
6566 UChar* p;
6567
6568 Indent(f, indent);
6569 if (IS_NULL(node)) {
6570 fprintf(f, "ERROR: null node!!!\n");
6571 exit (0);
6572 }
6573
6574 type = NTYPE(node);
6575 switch (type) {
6576 case NT_LIST:
6577 case NT_ALT:
6578 if (NTYPE(node) == NT_LIST)
6579 fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node);
6580 else
6581 fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node);
6582
6583 print_indent_tree(f, NCAR(node), indent + add);
6584 while (IS_NOT_NULL(node = NCDR(node))) {
6585 if (NTYPE(node) != type) {
6586 fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
6587 exit(0);
6588 }
6589 print_indent_tree(f, NCAR(node), indent + add);
6590 }
6591 break;
6592
6593 case NT_STR:
6594 fprintf(f, "<string%s:%"PRIxPTR">",
6595 (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node);
6596 for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
6597 if (*p >= 0x20 && *p < 0x7f)
6598 fputc(*p, f);
6599 else {
6600 fprintf(f, " 0x%02x", *p);
6601 }
6602 }
6603 break;
6604
6605 case NT_CCLASS:
6606 fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node);
6607 if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f);
6608 if (NCCLASS(node)->mbuf) {
6609 BBuf* bbuf = NCCLASS(node)->mbuf;
6610 OnigCodePoint* data = (OnigCodePoint* )bbuf->p;
6611 OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used);
6612 fprintf(f, "%d", *data++);
6613 for (; data < end; data+=2) {
6614 fprintf(f, ",");
6615 fprintf(f, "%04x-%04x", data[0], data[1]);
6616 }
6617 }
6618 break;
6619
6620 case NT_CTYPE:
6621 fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node);
6622 switch (NCTYPE(node)->ctype) {
6623 case ONIGENC_CTYPE_WORD:
6624 if (NCTYPE(node)->not != 0)
6625 fputs("not word", f);
6626 else
6627 fputs("word", f);
6628 break;
6629
6630 default:
6631 fprintf(f, "ERROR: undefined ctype.\n");
6632 exit(0);
6633 }
6634 break;
6635
6636 case NT_CANY:
6637 fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node);
6638 break;
6639
6640 case NT_ANCHOR:
6641 fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node);
6642 switch (NANCHOR(node)->type) {
6643 case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
6644 case ANCHOR_END_BUF: fputs("end buf", f); break;
6645 case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
6646 case ANCHOR_END_LINE: fputs("end line", f); break;
6647 case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
6648 case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
6649
6650 case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
6651 case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
6652# ifdef USE_WORD_BEGIN_END
6653 case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
6654 case ANCHOR_WORD_END: fputs("word end", f); break;
6655# endif
6656 case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break;
6657 case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break;
6658 case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break;
6659 case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break;
6660 case ANCHOR_KEEP: fputs("keep",f); break;
6661
6662 default:
6663 fprintf(f, "ERROR: undefined anchor type.\n");
6664 break;
6665 }
6666 break;
6667
6668 case NT_BREF:
6669 {
6670 int* p;
6671 BRefNode* br = NBREF(node);
6672 p = BACKREFS_P(br);
6673 fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node);
6674 for (i = 0; i < br->back_num; i++) {
6675 if (i > 0) fputs(", ", f);
6676 fprintf(f, "%d", p[i]);
6677 }
6678 }
6679 break;
6680
6681# ifdef USE_SUBEXP_CALL
6682 case NT_CALL:
6683 {
6684 CallNode* cn = NCALL(node);
6685 fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node);
6686 p_string(f, cn->name_end - cn->name, cn->name);
6687 }
6688 break;
6689# endif
6690
6691 case NT_QTFR:
6692 fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node,
6693 NQTFR(node)->lower, NQTFR(node)->upper,
6694 (NQTFR(node)->greedy ? "" : "?"));
6695 print_indent_tree(f, NQTFR(node)->target, indent + add);
6696 break;
6697
6698 case NT_ENCLOSE:
6699 fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node);
6700 switch (NENCLOSE(node)->type) {
6701 case ENCLOSE_OPTION:
6702 fprintf(f, "option:%d", NENCLOSE(node)->option);
6703 break;
6704 case ENCLOSE_MEMORY:
6705 fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
6706 break;
6707 case ENCLOSE_STOP_BACKTRACK:
6708 fprintf(f, "stop-bt");
6709 break;
6710 case ENCLOSE_CONDITION:
6711 fprintf(f, "condition:%d", NENCLOSE(node)->regnum);
6712 break;
6713 case ENCLOSE_ABSENT:
6714 fprintf(f, "absent");
6715 break;
6716
6717 default:
6718 break;
6719 }
6720 fprintf(f, "\n");
6721 print_indent_tree(f, NENCLOSE(node)->target, indent + add);
6722 break;
6723
6724 default:
6725 fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
6726 break;
6727 }
6728
6729 if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
6730 type != NT_ENCLOSE)
6731 fprintf(f, "\n");
6732
6733 if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add);
6734
6735 fflush(f);
6736}
6737
6738static void
6739print_tree(FILE* f, Node* node)
6740{
6741 print_indent_tree(f, node, 0);
6742}
6743# endif /* ONIG_DEBUG_PARSE_TREE */
6744#endif /* ONIG_DEBUG */
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
int len
Length of the buffer.
Definition io.h:8
VALUE type(ANYARGS)
ANYARGS-ed function type.