Ruby 4.1.0dev (2026-04-04 revision 3b6245536cf55da9e8bfcdb03c845fe9ef931d7f)
regexp.c
1#include "prism/internal/regexp.h"
2
5#include "prism/internal/buffer.h"
6#include "prism/internal/char.h"
7#include "prism/internal/diagnostic.h"
8#include "prism/internal/encoding.h"
9#include "prism/internal/memchr.h"
10#include "prism/internal/parser.h"
11#include "prism/internal/stringy.h"
12#include "prism/internal/strncasecmp.h"
13
14#include <assert.h>
15#include <string.h>
16
18#define PM_REGEXP_PARSE_DEPTH_MAX 4096
19
23typedef struct {
26
28 const uint8_t *start;
29
31 const uint8_t *cursor;
32
34 const uint8_t *end;
35
38
40 pm_regexp_name_callback_t name_callback;
41
44
46 const uint8_t *node_start;
47
49 const uint8_t *node_end;
50
57
64 const uint8_t *property_name;
65
68
73 const uint8_t *unicode_property_name;
74
77
80
83
89
92
94 bool shared;
95
98
101
107
110
113
116
119
123
129static PRISM_INLINE void
130pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
131 pm_parser_t *pm = parser->parser;
132 uint32_t loc_start, loc_length;
133
134 if (parser->shared) {
135 loc_start = (uint32_t) (start - pm->start);
136 loc_length = (uint32_t) (end - start);
137 } else {
138 loc_start = (uint32_t) (parser->node_start - pm->start);
139 loc_length = (uint32_t) (parser->node_end - parser->node_start);
140 }
141
142 pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message);
143}
144
/**
 * Append a diagnostic with an arbitrary diagnostic ID and format arguments to
 * the error list of the underlying parser.
 *
 * The location is chosen in the same way as pm_regexp_parse_error: the
 * [err_start_, err_end_) range when (parser_)->shared is set, otherwise the
 * [node_start, node_end) range of the parser. The range arguments are only
 * evaluated when the parser is shared.
 */
#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \
    do { \
        pm_parser_t *pm__ = (parser_)->parser; \
        const uint8_t *from__ = (parser_)->shared ? (err_start_) : (parser_)->node_start; \
        const uint8_t *to__ = (parser_)->shared ? (err_end_) : (parser_)->node_end; \
        pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, (uint32_t) (from__ - pm__->start), (uint32_t) (to__ - from__), diag_id, __VA_ARGS__); \
    } while (0)
162
167static void
168pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
169 pm_string_t string;
170 pm_string_shared_init(&string, start, end);
171 parser->name_callback(parser->parser, &string, parser->shared, parser->name_data);
172 pm_string_cleanup(&string);
173}
174
178static PRISM_INLINE bool
179pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
180 return parser->cursor >= parser->end;
181}
182
186static PRISM_INLINE bool
187pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
188 if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
189 parser->cursor++;
190 return true;
191 }
192 return false;
193}
194
198static PRISM_INLINE bool
199pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
200 if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
201 parser->cursor++;
202 return true;
203 }
204 return false;
205}
206
210static bool
211pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
212 if (pm_regexp_char_is_eof(parser)) {
213 return false;
214 }
215
216 const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
217 if (end == NULL) {
218 return false;
219 }
220
221 parser->cursor = end + 1;
222 return true;
223}
224
230static PRISM_INLINE void
231pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) {
232 if (parser->hex_group_active) {
233 pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00);
234 parser->hex_group_active = false;
235 }
236}
237
241static PRISM_INLINE void
242pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) {
243 if (byte >= 0x80) {
244 pm_buffer_append_byte(&parser->hex_escape_buffer, byte);
245 parser->hex_group_active = true;
246 parser->has_hex_escape = true;
247
248 parser->explicit_encoding = parser->encoding;
249 parser->last_escape_was_unicode = false;
250 } else {
251 pm_regexp_hex_group_boundary(parser);
252 }
253}
254
/**
 * Convert an ASCII hexadecimal digit (0-9, a-f, A-F) into its numeric value.
 * Returns -1 when the byte is not a hexadecimal digit.
 */
static PRISM_INLINE int
pm_regexp_hex_digit_value(uint8_t byte) {
    switch (byte) {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return byte - '0';
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
            return byte - 'a' + 10;
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            return byte - 'A' + 10;
        default:
            return -1;
    }
}
265
298static bool
299pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
300 const uint8_t *savepoint = parser->cursor;
301
302 enum {
303 PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
304 PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
305 PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
306 PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
307 } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
308
309 while (1) {
310 if (parser->cursor >= parser->end) {
311 parser->cursor = savepoint;
312 return true;
313 }
314
315 switch (state) {
316 case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
317 switch (*parser->cursor) {
318 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
319 parser->cursor++;
320 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
321 break;
322 case ',':
323 parser->cursor++;
324 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
325 break;
326 default:
327 parser->cursor = savepoint;
328 return true;
329 }
330 break;
331 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
332 switch (*parser->cursor) {
333 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
334 parser->cursor++;
335 break;
336 case ',':
337 parser->cursor++;
338 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
339 break;
340 case '}':
341 parser->cursor++;
342 return true;
343 default:
344 parser->cursor = savepoint;
345 return true;
346 }
347 break;
348 case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
349 switch (*parser->cursor) {
350 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
351 parser->cursor++;
352 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
353 break;
354 default:
355 parser->cursor = savepoint;
356 return true;
357 }
358 break;
359 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
360 switch (*parser->cursor) {
361 case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
362 parser->cursor++;
363 break;
364 case '}':
365 parser->cursor++;
366 return true;
367 default:
368 parser->cursor = savepoint;
369 return true;
370 }
371 break;
372 }
373 }
374
375 return true;
376}
377
386static bool
387pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
388 while (!pm_regexp_char_is_eof(parser)) {
389 switch (*parser->cursor) {
390 case '*':
391 case '+':
392 case '?':
393 parser->cursor++;
394 break;
395 case '{':
396 parser->cursor++;
397 if (!pm_regexp_parse_range_quantifier(parser)) return false;
398 break;
399 default:
400 // In this case there is no quantifier.
401 return true;
402 }
403 }
404
405 return true;
406}
407
412static bool
413pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
414 if (!pm_regexp_char_expect(parser, ':')) {
415 return false;
416 }
417
418 pm_regexp_char_accept(parser, '^');
419
420 return (
421 pm_regexp_char_find(parser, ':') &&
422 pm_regexp_char_expect(parser, ']') &&
423 pm_regexp_char_expect(parser, ']')
424 );
425}
426
/**
 * The classification of a property name found inside \p{...} or \P{...}.
 *
 * The numeric order matters: code in this file compares against
 * PM_REGEXP_PROPERTY_SCRIPT with >= to mean "anything that is not a POSIX
 * property".
 */
typedef enum {
    /** A POSIX bracket-style property (for example Alnum or Word). */
    PM_REGEXP_PROPERTY_POSIX = 0,

    /** One of the explicitly recognized script properties (for example Han). */
    PM_REGEXP_PROPERTY_SCRIPT = 1,

    /** Any other property name. */
    PM_REGEXP_PROPERTY_UNICODE = 2
} pm_regexp_property_type_t;
443
448static pm_regexp_property_type_t
449pm_regexp_classify_property(const uint8_t *name, size_t length) {
450 // Skip leading '^' for negated properties like \p{^Hiragana}.
451 if (length > 0 && name[0] == '^') {
452 name++;
453 length--;
454 }
455
456#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0)
457
458 switch (length) {
459 case 3:
460 if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT;
461 break;
462 case 4:
463 if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX;
464 break;
465 case 5:
466 /* Most properties are length 5, so dispatch on first character. */
467 switch (name[0] | 0x20) {
468 case 'a':
469 if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX;
470 if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX;
471 if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX;
472 break;
473 case 'b':
474 if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX;
475 break;
476 case 'c':
477 if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX;
478 break;
479 case 'd':
480 if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX;
481 break;
482 case 'g':
483 if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX;
484 if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT;
485 break;
486 case 'l':
487 if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX;
488 if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT;
489 break;
490 case 'p':
491 if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX;
492 if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX;
493 break;
494 case 's':
495 if (PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX;
496 break;
497 case 'u':
498 if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX;
499 break;
500 }
501 break;
502 case 6:
503 if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX;
504 break;
505 case 8:
506 if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT;
507 if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT;
508 if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT;
509 break;
510 }
511
512#undef PM_REGEXP_CASECMP
513
514 // Everything else is Unicode-only (general categories, other scripts, etc.).
515 return PM_REGEXP_PROPERTY_UNICODE;
516}
517
527static bool
528pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) {
529 assert(*parser->cursor == 'p' || *parser->cursor == 'P');
530
531 if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') {
532 const uint8_t *name_start = parser->cursor + 2;
533 const uint8_t *search = name_start;
534
535 while (search < parser->end && *search != '}') search++;
536
537 if (search < parser->end) {
538 size_t name_length = (size_t) (search - name_start);
539 parser->has_property_escape = true;
540
541 pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length);
542
543 // Track the first non-POSIX property name (for /n error messages).
544 if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) {
545 parser->property_name = name_start;
546 parser->property_name_length = name_length;
547 }
548
549 // Track the first Unicode-only property name (for /e, /s error messages).
550 if (type == PM_REGEXP_PROPERTY_UNICODE) {
551 parser->has_unicode_property_escape = true;
552 if (parser->unicode_property_name == NULL) {
553 parser->unicode_property_name = name_start;
554 parser->unicode_property_name_length = name_length;
555 }
556 }
557
558 parser->cursor = search + 1; // skip past '}'
559 return true;
560 }
561 }
562
563 // Not a property escape, just skip the single character after '\'.
564 parser->cursor++;
565 return false;
566}
567
574static void
575pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) {
576 const uint8_t *escape_start = parser->cursor - 2; // points to '\'
577
578 if (pm_regexp_char_is_eof(parser)) {
579 pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
580 return;
581 }
582
583 if (*parser->cursor == '{') {
584 parser->cursor++; // skip '{'
585
586 // Skip leading whitespace.
587 while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
588 parser->cursor++;
589 }
590
591 bool has_codepoint = false;
592
593 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
594 // Parse the hex digits to compute the codepoint value.
595 uint32_t value = 0;
596 size_t hex_count = 0;
597
598 int digit;
599 while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
600 value = (value << 4) | (uint32_t) digit;
601 hex_count++;
602 parser->cursor++;
603 }
604
605 if (hex_count == 0) {
606 // Skip to '}' or end of regexp to find the full extent.
607 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
608 parser->cursor++;
609 }
610
611 const uint8_t *escape_end = parser->cursor;
612 if (!pm_regexp_char_is_eof(parser)) {
613 escape_end++;
614 parser->cursor++; // skip '}'
615 }
616
617 pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start);
618 return;
619 }
620
621 if (hex_count > 6) {
622 pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range");
623 }
624
625 // Track encoding state for this codepoint.
626 if (value >= 0x80) {
627 parser->has_unicode_escape = true;
628 parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
629 parser->last_escape_was_unicode = true;
630 pm_regexp_hex_group_boundary(parser);
631 }
632
633 // Check for invalid Unicode range (surrogates or > 0x10FFFF).
634 if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
635 parser->invalid_unicode_range = true;
636 }
637
638 has_codepoint = true;
639
640 // Skip whitespace between codepoints.
641 while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
642 parser->cursor++;
643 }
644 }
645
646 if (pm_regexp_char_is_eof(parser)) {
647 pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape");
648 } else {
649 if (!has_codepoint) {
650 pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start);
651 }
652 parser->cursor++; // skip '}'
653 }
654 } else {
655 // \uNNNN form — need exactly 4 hex digits.
656 uint32_t value = 0;
657 size_t hex_count = 0;
658
659 int digit;
660 while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
661 value = (value << 4) | (uint32_t) digit;
662 hex_count++;
663 parser->cursor++;
664 }
665
666 if (hex_count < 4) {
667 pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
668 } else if (value >= 0x80) {
669 parser->has_unicode_escape = true;
670 parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
671 parser->last_escape_was_unicode = true;
672 pm_regexp_hex_group_boundary(parser);
673 }
674
675 // Check for invalid Unicode range.
676 if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) {
677 parser->invalid_unicode_range = true;
678 }
679 }
680}
681
682// Forward declaration because character sets can be nested.
683static bool
684pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
685
690static int
691pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) {
692 int value = -1;
693
694 if (!pm_regexp_char_is_eof(parser)) {
695 int digit = pm_regexp_hex_digit_value(*parser->cursor);
696 if (digit >= 0) {
697 value = digit;
698 parser->cursor++;
699
700 if (!pm_regexp_char_is_eof(parser)) {
701 digit = pm_regexp_hex_digit_value(*parser->cursor);
702 if (digit >= 0) {
703 value = (value << 4) | digit;
704 parser->cursor++;
705 }
706 }
707 }
708 }
709
710 if (value >= 0) {
711 pm_regexp_track_hex_escape(parser, (uint8_t) value);
712 }
713
714 return value;
715}
716
722static void
723pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) {
724 if (pm_regexp_char_is_eof(parser)) return;
725
726 switch (*parser->cursor) {
727 case 'u':
728 parser->cursor++; // skip 'u'
729 pm_regexp_parse_unicode_escape(parser);
730 break;
731 case 'p':
732 case 'P':
733 pm_regexp_parse_property_escape(parser);
734 break;
735 case 'x':
736 parser->cursor++; // skip 'x'
737 pm_regexp_parse_hex_escape(parser);
738 break;
739 case 'M':
740 // \M-x produces (x | 0x80), always >= 0x80
741 if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
742 parser->cursor += 2; // skip 'M-'
743 if (!pm_regexp_char_is_eof(parser)) {
744 if (*parser->cursor == '\\') {
745 parser->cursor++;
746 // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80
747 // We just need to track it as a hex escape >= 0x80.
748 pm_regexp_parse_backslash_escape(parser);
749 } else {
750 parser->cursor++;
751 }
752 // \M-x always produces a byte >= 0x80
753 pm_regexp_track_hex_escape(parser, 0x80);
754 }
755 } else {
756 parser->cursor++;
757 }
758 break;
759 case 'C':
760 // \C-x produces (x & 0x1F)
761 if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
762 parser->cursor += 2; // skip 'C-'
763 if (!pm_regexp_char_is_eof(parser)) {
764 if (*parser->cursor == '\\') {
765 parser->cursor++;
766 pm_regexp_parse_backslash_escape(parser);
767 } else {
768 parser->cursor++;
769 }
770 }
771 } else {
772 parser->cursor++;
773 }
774 break;
775 case 'c':
776 // \cx produces (x & 0x1F)
777 parser->cursor++; // skip 'c'
778 if (!pm_regexp_char_is_eof(parser)) {
779 if (*parser->cursor == '\\') {
780 parser->cursor++;
781 pm_regexp_parse_backslash_escape(parser);
782 } else {
783 parser->cursor++;
784 }
785 }
786 break;
787 default:
788 pm_regexp_hex_group_boundary(parser);
789 parser->cursor++;
790 break;
791 }
792}
793
799static void
800pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) {
801 uint8_t byte = *cursor;
802 if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) {
803 size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor));
804 if (width > 1) {
805 parser->cursor += width - 1;
806 } else if (width == 0) {
807 parser->has_invalid_multibyte = true;
808 pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
809 }
810 }
811}
812
817static bool
818pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
819 pm_regexp_char_accept(parser, '^');
820
821 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
822 switch (*parser->cursor++) {
823 case '[':
824 pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
825 break;
826 case '\\':
827 pm_regexp_parse_backslash_escape(parser);
828 break;
829 default:
830 // We've already advanced the cursor by one byte. If the byte
831 // was >= 0x80 in a multibyte encoding, we may need to consume
832 // additional continuation bytes and validate the character.
833 if (*(parser->cursor - 1) >= 0x80) {
834 parser->non_ascii_literal_count++;
835 }
836 pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1);
837 break;
838 }
839 }
840
841 return pm_regexp_char_expect(parser, ']');
842}
843
847static bool
848pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
849 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
850 pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
851 return false;
852 }
853
854 if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
855 parser->cursor++;
856 pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
857 return true;
858 }
859
860 const uint8_t *reset = parser->cursor;
861
862 if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
863 parser->cursor++;
864 if (pm_regexp_parse_posix_class(parser)) return true;
865
866 parser->cursor = reset;
867 }
868
869 return pm_regexp_parse_character_set(parser, depth);
870}
871
872// Forward declaration here since parsing groups needs to go back up the grammar
873// to parse expressions within them.
874static bool
875pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
876
/**
 * The state of a single option within a group's option list.
 */
typedef enum {
    /** The option is not a valid option. */
    PM_REGEXP_OPTION_STATE_INVALID = 0,

    /** The option can be added or removed, but has not been touched yet. */
    PM_REGEXP_OPTION_STATE_TOGGLEABLE = 1,

    /** The option can only be added, and has not been added yet. */
    PM_REGEXP_OPTION_STATE_ADDABLE = 2,

    /** The option has been added. */
    PM_REGEXP_OPTION_STATE_ADDED = 3,

    /** The option has been removed. */
    PM_REGEXP_OPTION_STATE_REMOVED = 4
} pm_regexp_option_state_t;
888
889// These are the options that are configurable on the regular expression (or
890// from within a group).
891
/** The lowest option character that has a slot in the option state table. */
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'

/** The highest option character that has a slot in the option state table. */
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'

/**
 * The number of slots in the option state table: one for every character
 * between the minimum and the maximum, inclusive.
 */
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
900
904typedef struct {
906 uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
908
912static void
913pm_regexp_options_init(pm_regexp_options_t *options) {
914 memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
915 options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
916 options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
917 options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
918 options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
919 options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
920 options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
921}
922
927static bool
928pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
929 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
930 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
931
932 switch (options->values[key]) {
933 case PM_REGEXP_OPTION_STATE_INVALID:
934 case PM_REGEXP_OPTION_STATE_REMOVED:
935 return false;
936 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
937 case PM_REGEXP_OPTION_STATE_ADDABLE:
938 options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
939 return true;
940 case PM_REGEXP_OPTION_STATE_ADDED:
941 return true;
942 }
943 }
944
945 return false;
946}
947
952static bool
953pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
954 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
955 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
956
957 switch (options->values[key]) {
958 case PM_REGEXP_OPTION_STATE_INVALID:
959 case PM_REGEXP_OPTION_STATE_ADDABLE:
960 return false;
961 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
962 case PM_REGEXP_OPTION_STATE_ADDED:
963 case PM_REGEXP_OPTION_STATE_REMOVED:
964 options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
965 return true;
966 }
967 }
968
969 return false;
970}
971
975static uint8_t
976pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
977 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
978 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
979 return options->values[key];
980 }
981
982 return false;
983}
984
1006static bool
1007pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
1008 const uint8_t *group_start = parser->cursor;
1009
1010 pm_regexp_options_t options;
1011 pm_regexp_options_init(&options);
1012
1013 // First, parse any options for the group.
1014 if (pm_regexp_char_accept(parser, '?')) {
1015 if (pm_regexp_char_is_eof(parser)) {
1016 pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
1017 return false;
1018 }
1019
1020 switch (*parser->cursor) {
1021 case '#': { // inline comments
1022 parser->cursor++;
1023 if (pm_regexp_char_is_eof(parser)) {
1024 pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
1025 return false;
1026 }
1027
1028 if (parser->encoding_changed && parser->encoding->multibyte) {
1029 bool escaped = false;
1030
1031 // Here we're going to take a slow path and iterate through
1032 // each multibyte character to find the close paren. We do
1033 // this because \ can be a trailing byte in some encodings.
1034 while (parser->cursor < parser->end) {
1035 if (!escaped && *parser->cursor == ')') {
1036 parser->cursor++;
1037 return true;
1038 }
1039
1040 size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
1041 if (width == 0) {
1042 if (*parser->cursor >= 0x80) {
1043 parser->has_invalid_multibyte = true;
1044 pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
1045 parser->cursor++;
1046 continue;
1047 }
1048 return false;
1049 }
1050
1051 escaped = (width == 1) && (*parser->cursor == '\\');
1052 parser->cursor += width;
1053 }
1054
1055 return false;
1056 } else {
1057 // Here we can take the fast path and use memchr to find the
1058 // next ) because we are safe checking backward for \ since
1059 // it cannot be a trailing character.
1060 bool found = pm_regexp_char_find(parser, ')');
1061
1062 while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
1063 found = pm_regexp_char_find(parser, ')');
1064 }
1065
1066 return found;
1067 }
1068 }
1069 case ':': // non-capturing group
1070 case '=': // positive lookahead
1071 case '!': // negative lookahead
1072 case '>': // atomic group
1073 case '~': // absence operator
1074 parser->cursor++;
1075 break;
1076 case '<':
1077 parser->cursor++;
1078 if (pm_regexp_char_is_eof(parser)) {
1079 pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
1080 return false;
1081 }
1082
1083 switch (*parser->cursor) {
1084 case '=': // positive lookbehind
1085 case '!': // negative lookbehind
1086 parser->cursor++;
1087 break;
1088 default: { // named capture group
1089 const uint8_t *start = parser->cursor;
1090 if (!pm_regexp_char_find(parser, '>')) {
1091 return false;
1092 }
1093
1094 if (parser->cursor - start == 1) {
1095 pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
1096 }
1097
1098 if (parser->name_callback != NULL) {
1099 pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
1100 }
1101
1102 break;
1103 }
1104 }
1105 break;
1106 case '\'': { // named capture group
1107 const uint8_t *start = ++parser->cursor;
1108 if (!pm_regexp_char_find(parser, '\'')) {
1109 return false;
1110 }
1111
1112 if (parser->name_callback != NULL) {
1113 pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
1114 }
1115
1116 break;
1117 }
1118 case '(': // conditional expression
1119 if (!pm_regexp_char_find(parser, ')')) {
1120 return false;
1121 }
1122 break;
1123 case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
1124 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
1125 if (!pm_regexp_options_add(&options, *parser->cursor)) {
1126 return false;
1127 }
1128 parser->cursor++;
1129 }
1130
1131 if (pm_regexp_char_is_eof(parser)) {
1132 return false;
1133 }
1134
1135 // If we are at the end of the group of options and there is no
1136 // subexpression, then we are going to be setting the options
1137 // for the parent group. In this case we are safe to return now.
1138 if (*parser->cursor == ')') {
1139 if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
1140 parser->extended_mode = true;
1141 }
1142
1143 parser->cursor++;
1144 return true;
1145 }
1146
1147 // If we hit a -, then we're done parsing options.
1148 if (*parser->cursor != '-') break;
1149
1151 case '-':
1152 parser->cursor++;
1153 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
1154 if (!pm_regexp_options_remove(&options, *parser->cursor)) {
1155 return false;
1156 }
1157 parser->cursor++;
1158 }
1159
1160 if (pm_regexp_char_is_eof(parser)) {
1161 return false;
1162 }
1163
1164 // If we are at the end of the group of options and there is no
1165 // subexpression, then we are going to be setting the options
1166 // for the parent group. In this case we are safe to return now.
1167 if (*parser->cursor == ')') {
1168 switch (pm_regexp_options_state(&options, 'x')) {
1169 case PM_REGEXP_OPTION_STATE_ADDED:
1170 parser->extended_mode = true;
1171 break;
1172 case PM_REGEXP_OPTION_STATE_REMOVED:
1173 parser->extended_mode = false;
1174 break;
1175 }
1176
1177 parser->cursor++;
1178 return true;
1179 }
1180
1181 break;
1182 default:
1183 parser->cursor++;
1184 pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
1185 break;
1186 }
1187 }
1188
1189 bool extended_mode = parser->extended_mode;
1190 switch (pm_regexp_options_state(&options, 'x')) {
1191 case PM_REGEXP_OPTION_STATE_ADDED:
1192 parser->extended_mode = true;
1193 break;
1194 case PM_REGEXP_OPTION_STATE_REMOVED:
1195 parser->extended_mode = false;
1196 break;
1197 }
1198
1199 // Now, parse the expressions within this group.
1200 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
1201 if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
1202 parser->extended_mode = extended_mode;
1203 return false;
1204 }
1205 pm_regexp_char_accept(parser, '|');
1206 }
1207
1208 // Finally, make sure we have a closing parenthesis.
1209 parser->extended_mode = extended_mode;
1210 if (pm_regexp_char_expect(parser, ')')) return true;
1211
1212 pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
1213 return false;
1214}
1215
1228static bool
1229pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
1230 switch (*parser->cursor) {
1231 case '^':
1232 case '$':
1233 parser->cursor++;
1234 return pm_regexp_parse_quantifier(parser);
1235 case '\\':
1236 parser->cursor++;
1237 pm_regexp_parse_backslash_escape(parser);
1238 return pm_regexp_parse_quantifier(parser);
1239 case '(':
1240 parser->cursor++;
1241 return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
1242 case '[':
1243 parser->cursor++;
1244 return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
1245 case '*':
1246 case '?':
1247 case '+':
1248 parser->cursor++;
1249 pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
1250 return true;
1251 case ')':
1252 parser->cursor++;
1253 pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
1254 return true;
1255 case '#':
1256 if (parser->extended_mode) {
1257 if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
1258 return true;
1259 }
1261 default: {
1262 size_t width;
1263 if (!parser->encoding_changed) {
1264 width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
1265 } else {
1266 width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
1267 }
1268
1269 if (width == 0) {
1270 if (*parser->cursor >= 0x80 && parser->encoding_changed) {
1271 if (parser->encoding->multibyte) {
1272 // Invalid multibyte character in a multibyte encoding.
1273 // Emit the error at the byte location immediately.
1274 parser->has_invalid_multibyte = true;
1275 pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
1276 } else {
1277 // Non-ASCII byte in a single-byte encoding (e.g.,
1278 // US-ASCII). Count it for later error reporting.
1279 parser->non_ascii_literal_count++;
1280 }
1281 parser->cursor++;
1282 return pm_regexp_parse_quantifier(parser);
1283 }
1284 return false;
1285 }
1286
1287 // Count non-ASCII literal bytes.
1288 for (size_t i = 0; i < width; i++) {
1289 if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++;
1290 }
1291
1292 parser->cursor += width;
1293 return pm_regexp_parse_quantifier(parser);
1294 }
1295 }
1296}
1297
1302static bool
1303pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
1304 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
1305 pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
1306 return false;
1307 }
1308
1309 if (!pm_regexp_parse_item(parser, depth)) {
1310 return false;
1311 }
1312
1313 while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
1314 if (!pm_regexp_parse_item(parser, depth)) {
1315 return false;
1316 }
1317 }
1318
1319 return true;
1320}
1321
1328static bool
1329pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
1330 do {
1331 if (pm_regexp_char_is_eof(parser)) return true;
1332 if (!pm_regexp_parse_expression(parser, 0)) return false;
1333 } while (pm_regexp_char_accept(parser, '|'));
1334
1335 return pm_regexp_char_is_eof(parser);
1336}
1337
1338// ---------------------------------------------------------------------------
1339// Encoding validation
1340// ---------------------------------------------------------------------------
1341
1346static bool
1347pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) {
1348 const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer);
1349 size_t len = pm_buffer_length(buffer);
1350 size_t i = 0;
1351
1352 while (i < len) {
1353 size_t group_start = i;
1354 while (i < len && data[i] != 0x00) i++;
1355
1356 for (size_t j = group_start; j < i; ) {
1357 size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j));
1358 if (width == 0) return false;
1359 j += width;
1360 }
1361
1362 if (i < len) i++; // skip sentinel
1363 }
1364
1365 return true;
1366}
1367
1372static void
1373pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) {
1374 size_t index = 0;
1375
1376 if (encoding == PM_ENCODING_UTF_8_ENTRY) {
1377 pm_buffer_append_string(buffer, (const char *) source, length);
1378 return;
1379 }
1380
1381 while (index < length) {
1382 if (source[index] < 0x80) {
1383 pm_buffer_append_byte(buffer, source[index]);
1384 index++;
1385 } else if (encoding->multibyte) {
1386 size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index));
1387
1388 if (width > 1) {
1389 pm_buffer_append_string(buffer, "\\x{", 3);
1390 for (size_t i = 0; i < width; i++) {
1391 pm_buffer_append_format(buffer, "%02X", source[index + i]);
1392 }
1393 pm_buffer_append_byte(buffer, '}');
1394 index += width;
1395 } else {
1396 pm_buffer_append_format(buffer, "\\x%02X", source[index]);
1397 index++;
1398 }
1399 } else {
1400 pm_buffer_append_format(buffer, "\\x%02X", source[index]);
1401 index++;
1402 }
1403 }
1404}
1405
/**
 * Append an encoding-related diagnostic to the owning parser's error list.
 * The diagnostic always spans the whole regexp node (node_start..node_end),
 * expressed as an offset/length relative to the start of the parser's source.
 *
 * The first argument is a pointer to the regexp parser state; the second is
 * the diagnostic id; the remaining arguments are forwarded to
 * pm_diagnostic_list_append_format as the format arguments.
 *
 * The macro parameters carry a trailing underscore so that they cannot collide
 * with the `parser` struct member used in the expansion: with a parameter
 * spelled `parser`, the member access itself would be rewritten and the macro
 * would only work when the argument happened to be a variable named `parser`.
 *
 * Note that the first argument is evaluated several times, so it must be free
 * of side effects.
 */
#define PM_REGEXP_ENCODING_ERROR(parser_, diag_id_, ...) \
    pm_diagnostic_list_append_format( \
        &(parser_)->parser->metadata_arena, \
        &(parser_)->parser->error_list, \
        (uint32_t) ((parser_)->node_start - (parser_)->parser->start), \
        (uint32_t) ((parser_)->node_end - (parser_)->node_start), \
        (diag_id_), __VA_ARGS__)
1416
1436static pm_node_flags_t
1437pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) {
1438
1439 if (parser->explicit_encoding == NULL) {
1440 if (ascii_only) {
1441 // Check property escapes against the modifier's encoding tier.
1442 // /n (ASCII-8BIT): only POSIX properties are valid.
1443 // /e, /s: POSIX and script properties are valid.
1444 // /u: all properties are valid.
1445 if (modifier == 'n' && parser->property_name != NULL) {
1446 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
1447 (int) parser->property_name_length, (const char *) parser->property_name,
1448 source_length, source_start);
1449 } else if (modifier != 'u' && parser->has_unicode_property_escape) {
1450 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
1451 (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name,
1452 source_length, source_start);
1453 }
1454 return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
1455 }
1456
1457 if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
1458 for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
1459 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
1460 }
1461 } else if (parser->encoding != modifier_encoding) {
1462 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
1463
1464 if (modifier == 'n' && !ascii_only) {
1465 pm_buffer_t formatted = { 0 };
1466 pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length);
1467 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value);
1468 pm_buffer_cleanup(&formatted);
1469 }
1470 }
1471
1472 return flags;
1473 }
1474
1475 // Mixed unicode + hex escapes.
1476 if (parser->has_unicode_escape && parser->has_hex_escape) {
1477 if (modifier == 'n') {
1478 if (parser->last_escape_was_unicode) {
1479 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
1480 } else {
1481 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
1482 }
1483 } else {
1484 if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
1485 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1486 }
1487 }
1488
1489 return flags;
1490 }
1491
1492 if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
1493 if (parser->last_escape_was_unicode) {
1494 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
1495 } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
1496 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
1497 }
1498 }
1499
1500 if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
1501 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1502 }
1503
1504 return flags;
1505}
1506
1523static pm_node_flags_t
1524pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) {
1525
1526 // Invalid multibyte characters suppress further validation.
1527 // Errors were already emitted at the byte locations during parsing.
1528 if (parser->has_invalid_multibyte) {
1529 return flags;
1530 }
1531
1532 if (parser->invalid_unicode_range) {
1533 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start);
1534 return flags;
1535 }
1536
1537 // Check modifier flags first.
1538 if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
1539 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length);
1540 }
1541 if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
1542 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length);
1543 }
1544 if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
1545 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length);
1546 }
1547 if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
1548 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length);
1549 }
1550
1551 // No modifier — check for non-ASCII literals in US-ASCII encoding.
1552 if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
1553 for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
1554 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
1555 }
1556 }
1557
1558 // ASCII-only regexps get downgraded to US-ASCII, unless property escapes
1559 // force UTF-8.
1560 if (ascii_only) {
1561 if (parser->has_property_escape) {
1562 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
1563 }
1564 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
1565 }
1566
1567 // Check explicit encoding from escape sequences.
1568 if (parser->explicit_encoding != NULL) {
1569 // Mixed unicode + hex escapes without modifier.
1570 if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
1571 if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY &&
1572 parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY &&
1573 !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) {
1574 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1575 } else if (parser->last_escape_was_unicode) {
1576 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
1577 } else {
1578 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
1579 }
1580
1581 return 0;
1582 }
1583
1584 if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
1585 if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
1586 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1587 }
1588
1589 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
1590 } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
1591 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
1592 } else {
1593 if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
1594 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1595 }
1596 }
1597 }
1598
1599 return 0;
1600}
1601
1609pm_node_flags_t
1610pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
1611 const uint8_t *source = parser->start + node->content_loc.start;
1612 size_t size = node->content_loc.length;
1613 bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED);
1614 pm_node_flags_t flags = PM_NODE_FLAGS(node);
1615
1616 const uint8_t *node_start = parser->start + node->base.location.start;
1617 const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length;
1618
1619 // First pass: walk raw source for encoding validation (no name extraction).
1620 pm_regexp_parser_t regexp_parser = {
1621 .parser = parser,
1622 .start = source,
1623 .cursor = source,
1624 .end = source + size,
1625 .extended_mode = extended_mode,
1626 .encoding_changed = parser->encoding_changed,
1627 .encoding = parser->encoding,
1628 .name_callback = NULL,
1629 .name_data = NULL,
1630 .shared = true,
1631 .node_start = node_start,
1632 .node_end = node_end,
1633 .has_unicode_escape = false,
1634 .has_hex_escape = false,
1635 .last_escape_was_unicode = false,
1636 .explicit_encoding = NULL,
1637 .has_property_escape = false,
1638 .has_unicode_property_escape = false,
1639 .property_name = NULL,
1640 .property_name_length = 0,
1641 .unicode_property_name = NULL,
1642 .unicode_property_name_length = 0,
1643 .non_ascii_literal_count = 0,
1644 .invalid_unicode_range = false,
1645 .hex_escape_buffer = { 0 },
1646 .hex_group_active = false,
1647 .has_invalid_multibyte = false,
1648 };
1649
1650 pm_regexp_parse_pattern(&regexp_parser);
1651
1652 // Compute ascii_only from the regexp parser's tracked state. We cannot
1653 // use node->unescaped for this because regexp unescaped content preserves
1654 // escape text (e.g., \x80 is 4 ASCII chars), not the binary values.
1655 bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0;
1656 // Use the unescaped content for error messages to match CRuby's format,
1657 // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes
1658 // like \u{80} are preserved as text.
1659 const char *error_source = (const char *) pm_string_source(&node->unescaped);
1660 int error_source_length = (int) pm_string_length(&node->unescaped);
1661 pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(&regexp_parser, ascii_only, flags, error_source, error_source_length);
1662 pm_buffer_cleanup(&regexp_parser.hex_escape_buffer);
1663
1664 // Second pass: walk unescaped content for named capture extraction.
1665 if (name_callback != NULL) {
1666 bool shared = node->unescaped.type == PM_STRING_SHARED;
1667 pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data);
1668 }
1669
1670 return encoding_flags;
1671}
1672
1683void
1684pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
1685 pm_regexp_parser_t regexp_parser = {
1686 .parser = parser,
1687 .start = source,
1688 .cursor = source,
1689 .end = source + size,
1690 .extended_mode = extended_mode,
1691 .encoding_changed = parser->encoding_changed,
1692 .encoding = parser->encoding,
1693 .name_callback = name_callback,
1694 .name_data = name_data,
1695 .shared = shared,
1696 .node_start = source,
1697 .node_end = source + size,
1698 .has_unicode_escape = false,
1699 .has_hex_escape = false,
1700 .last_escape_was_unicode = false,
1701 .explicit_encoding = NULL,
1702 .has_property_escape = false,
1703 .has_unicode_property_escape = false,
1704 .property_name = NULL,
1705 .property_name_length = 0,
1706 .unicode_property_name = NULL,
1707 .unicode_property_name_length = 0,
1708 .non_ascii_literal_count = 0,
1709 .invalid_unicode_range = false,
1710 .hex_escape_buffer = { 0 },
1711 .hex_group_active = false,
1712 .has_invalid_multibyte = false,
1713 };
1714
1715 pm_regexp_parse_pattern(&regexp_parser);
1716 pm_buffer_cleanup(&regexp_parser.hex_escape_buffer);
1717}
#define PRISM_FALLTHROUGH
We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch.
Definition fallthrough.h:15
int len
Length of the buffer.
Definition io.h:8
#define PRISM_INLINE
Old Visual Studio versions do not support the inline keyword, so we need to define it to be __inline.
Definition inline.h:12
VALUE type(ANYARGS)
ANYARGS-ed function type.
uint32_t start
The offset of the location from the start of the source.
Definition ast.h:556
uint32_t length
The length of the location.
Definition ast.h:559
pm_location_t location
This is the location of the node in the source.
Definition ast.h:1088
This is the set of options that are configurable on the regular expression.
Definition regexp.c:904
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
Definition regexp.c:906
This is the parser that is going to handle parsing regular expressions.
Definition regexp.c:23
const uint8_t * cursor
A pointer to the current position in the source.
Definition regexp.c:31
bool has_unicode_escape
Whether a \u{...} escape with value >= 0x80 was seen.
Definition regexp.c:97
size_t unicode_property_name_length
Length of the first Unicode-only property name found.
Definition regexp.c:76
bool has_hex_escape
Whether a \xNN escape (or \M-x, etc.) with value >= 0x80 was seen.
Definition regexp.c:100
const uint8_t * start
A pointer to the start of the source that we are parsing.
Definition regexp.c:28
const uint8_t * end
A pointer to the end of the source that we are parsing.
Definition regexp.c:34
pm_regexp_name_data_t * name_data
The data to pass to the name callback.
Definition regexp.c:43
bool hex_group_active
Whether we are accumulating consecutive hex escape bytes.
Definition regexp.c:118
pm_buffer_t hex_escape_buffer
Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels.
Definition regexp.c:79
bool extended_mode
Whether or not the regular expression currently being parsed is in extended mode, wherein whitespace ...
Definition regexp.c:88
pm_parser_t * parser
The parser that is currently being used.
Definition regexp.c:25
size_t property_name_length
Length of the first non-POSIX property name found.
Definition regexp.c:67
bool shared
Whether the source content is shared (for named capture callback).
Definition regexp.c:94
bool invalid_unicode_range
Whether a \u escape with invalid range (surrogate or > 0x10FFFF) was seen.
Definition regexp.c:115
const uint8_t * property_name
Pointer to the first non-POSIX property name (for /n error messages).
Definition regexp.c:64
const pm_encoding_t * encoding
The encoding of the source.
Definition regexp.c:37
const pm_encoding_t * explicit_encoding
The explicit encoding determined by escape sequences.
Definition regexp.c:56
const uint8_t * node_end
The end of the regexp node (for error locations).
Definition regexp.c:49
uint32_t non_ascii_literal_count
Count of non-ASCII literal bytes (not from escapes).
Definition regexp.c:82
const uint8_t * node_start
The start of the regexp node (for error locations).
Definition regexp.c:46
const uint8_t * unicode_property_name
Pointer to the first Unicode-only property name (for /e, /s error messages).
Definition regexp.c:73
pm_regexp_name_callback_t name_callback
The callback to call when a named capture group is found.
Definition regexp.c:40
bool encoding_changed
Whether the encoding has changed from the default.
Definition regexp.c:91
bool has_unicode_property_escape
Whether a Unicode-only property escape was found (not POSIX or script).
Definition regexp.c:112
bool has_invalid_multibyte
Whether an invalid multibyte character was found during parsing.
Definition regexp.c:121
bool last_escape_was_unicode
Tracks whether the last encoding-setting escape was \u (true) or \x (false).
Definition regexp.c:106
bool has_property_escape
Whether any \p{...} or \P{...} property escape was found.
Definition regexp.c:109
RegularExpressionNode.
Definition ast.h:6976
pm_node_t base
The embedded base node.
Definition ast.h:6978
pm_string_t unescaped
RegularExpressionNode::unescaped.
Definition ast.h:6998
pm_location_t content_loc
RegularExpressionNode::content_loc.
Definition ast.h:6988
A generic string type that can have various ownership semantics.
Definition stringy.h:18
enum pm_string_t::@110 type
The type of the string.