1#include "prism/internal/regexp.h"
5#include "prism/internal/buffer.h"
6#include "prism/internal/char.h"
7#include "prism/internal/diagnostic.h"
8#include "prism/internal/encoding.h"
9#include "prism/internal/memchr.h"
10#include "prism/internal/parser.h"
11#include "prism/internal/stringy.h"
12#include "prism/internal/strncasecmp.h"
/**
 * The maximum nesting depth permitted while parsing a regular expression. The
 * parse functions thread a depth argument through their recursive calls
 * (incrementing it for nested groups and character sets); once that depth
 * reaches this value a "parse depth limit over" error is reported.
 */
18#define PM_REGEXP_PARSE_DEPTH_MAX 4096
130pm_regexp_parse_error(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end,
const char *message) {
132 uint32_t loc_start, loc_length;
135 loc_start = (uint32_t) (start - pm->start);
136 loc_length = (uint32_t) (end - start);
138 loc_start = (uint32_t) (parser->
node_start - pm->start);
142 pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message);
149#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \
151 pm_parser_t *pm__ = (parser_)->parser; \
152 uint32_t loc_start__, loc_length__; \
153 if ((parser_)->shared) { \
154 loc_start__ = (uint32_t) ((err_start_) - pm__->start); \
155 loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \
157 loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \
158 loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \
160 pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \
168pm_regexp_parser_named_capture(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end) {
170 pm_string_shared_init(&
string, start, end);
172 pm_string_cleanup(&
string);
188 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
200 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
212 if (pm_regexp_char_is_eof(parser)) {
251 pm_regexp_hex_group_boundary(parser);
259pm_regexp_hex_digit_value(uint8_t
byte) {
260 if (
byte >=
'0' &&
byte <=
'9')
return byte -
'0';
261 if (
byte >=
'a' &&
byte <=
'f')
return byte -
'a' + 10;
262 if (
byte >=
'A' &&
byte <=
'F')
return byte -
'A' + 10;
300 const uint8_t *savepoint = parser->
cursor;
303 PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
304 PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
305 PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
306 PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
307 } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
311 parser->
cursor = savepoint;
316 case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
317 switch (*parser->
cursor) {
318 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
320 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
324 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
327 parser->
cursor = savepoint;
331 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
332 switch (*parser->
cursor) {
333 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
338 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
344 parser->
cursor = savepoint;
348 case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
349 switch (*parser->
cursor) {
350 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
352 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
355 parser->
cursor = savepoint;
359 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
360 switch (*parser->
cursor) {
361 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
368 parser->
cursor = savepoint;
388 while (!pm_regexp_char_is_eof(parser)) {
389 switch (*parser->
cursor) {
397 if (!pm_regexp_parse_range_quantifier(parser))
return false;
414 if (!pm_regexp_char_expect(parser,
':')) {
418 pm_regexp_char_accept(parser,
'^');
421 pm_regexp_char_find(parser,
':') &&
422 pm_regexp_char_expect(parser,
']') &&
423 pm_regexp_char_expect(parser,
']')
439 PM_REGEXP_PROPERTY_POSIX,
440 PM_REGEXP_PROPERTY_SCRIPT,
441 PM_REGEXP_PROPERTY_UNICODE
442} pm_regexp_property_type_t;
448static pm_regexp_property_type_t
449pm_regexp_classify_property(
const uint8_t *name,
size_t length) {
451 if (length > 0 && name[0] ==
'^') {
456#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0)
460 if (PM_REGEXP_CASECMP(
"Han"))
return PM_REGEXP_PROPERTY_SCRIPT;
463 if (PM_REGEXP_CASECMP(
"Word"))
return PM_REGEXP_PROPERTY_POSIX;
467 switch (name[0] | 0x20) {
469 if (PM_REGEXP_CASECMP(
"Alnum"))
return PM_REGEXP_PROPERTY_POSIX;
470 if (PM_REGEXP_CASECMP(
"Alpha"))
return PM_REGEXP_PROPERTY_POSIX;
471 if (PM_REGEXP_CASECMP(
"ASCII"))
return PM_REGEXP_PROPERTY_POSIX;
474 if (PM_REGEXP_CASECMP(
"Blank"))
return PM_REGEXP_PROPERTY_POSIX;
477 if (PM_REGEXP_CASECMP(
"Cntrl"))
return PM_REGEXP_PROPERTY_POSIX;
480 if (PM_REGEXP_CASECMP(
"Digit"))
return PM_REGEXP_PROPERTY_POSIX;
483 if (PM_REGEXP_CASECMP(
"Graph"))
return PM_REGEXP_PROPERTY_POSIX;
484 if (PM_REGEXP_CASECMP(
"Greek"))
return PM_REGEXP_PROPERTY_SCRIPT;
487 if (PM_REGEXP_CASECMP(
"Lower"))
return PM_REGEXP_PROPERTY_POSIX;
488 if (PM_REGEXP_CASECMP(
"Latin"))
return PM_REGEXP_PROPERTY_SCRIPT;
491 if (PM_REGEXP_CASECMP(
"Print"))
return PM_REGEXP_PROPERTY_POSIX;
492 if (PM_REGEXP_CASECMP(
"Punct"))
return PM_REGEXP_PROPERTY_POSIX;
495 if (PM_REGEXP_CASECMP(
"Space"))
return PM_REGEXP_PROPERTY_POSIX;
498 if (PM_REGEXP_CASECMP(
"Upper"))
return PM_REGEXP_PROPERTY_POSIX;
503 if (PM_REGEXP_CASECMP(
"XDigit"))
return PM_REGEXP_PROPERTY_POSIX;
506 if (PM_REGEXP_CASECMP(
"Hiragana"))
return PM_REGEXP_PROPERTY_SCRIPT;
507 if (PM_REGEXP_CASECMP(
"Katakana"))
return PM_REGEXP_PROPERTY_SCRIPT;
508 if (PM_REGEXP_CASECMP(
"Cyrillic"))
return PM_REGEXP_PROPERTY_SCRIPT;
512#undef PM_REGEXP_CASECMP
515 return PM_REGEXP_PROPERTY_UNICODE;
529 assert(*parser->
cursor ==
'p' || *parser->
cursor ==
'P');
532 const uint8_t *name_start = parser->
cursor + 2;
533 const uint8_t *search = name_start;
535 while (search < parser->end && *search !=
'}') search++;
537 if (search < parser->end) {
538 size_t name_length = (size_t) (search - name_start);
541 pm_regexp_property_type_t
type = pm_regexp_classify_property(name_start, name_length);
544 if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->
property_name == NULL) {
550 if (type == PM_REGEXP_PROPERTY_UNICODE) {
558 parser->
cursor = search + 1;
576 const uint8_t *escape_start = parser->
cursor - 2;
578 if (pm_regexp_char_is_eof(parser)) {
579 pm_regexp_parse_error(parser, escape_start, parser->
cursor,
"invalid Unicode escape");
583 if (*parser->
cursor ==
'{') {
587 while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->
cursor)) {
591 bool has_codepoint =
false;
593 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
'}') {
596 size_t hex_count = 0;
599 while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->
cursor)) >= 0) {
600 value = (value << 4) | (uint32_t) digit;
605 if (hex_count == 0) {
607 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
'}') {
611 const uint8_t *escape_end = parser->
cursor;
612 if (!pm_regexp_char_is_eof(parser)) {
617 pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (
int) (escape_end - escape_start), (
const char *) escape_start);
622 pm_regexp_parse_error(parser, escape_start, parser->
cursor,
"invalid Unicode range");
630 pm_regexp_hex_group_boundary(parser);
634 if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
638 has_codepoint =
true;
641 while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->
cursor)) {
646 if (pm_regexp_char_is_eof(parser)) {
647 pm_regexp_parse_error(parser, escape_start, parser->
cursor,
"unterminated Unicode escape");
649 if (!has_codepoint) {
650 pm_regexp_parse_error_format(parser, escape_start, parser->
cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (
int) (parser->
cursor + 1 - escape_start), (
const char *) escape_start);
657 size_t hex_count = 0;
660 while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->
cursor)) >= 0) {
661 value = (value << 4) | (uint32_t) digit;
667 pm_regexp_parse_error(parser, escape_start, parser->
cursor,
"invalid Unicode escape");
668 }
else if (value >= 0x80) {
672 pm_regexp_hex_group_boundary(parser);
676 if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) {
694 if (!pm_regexp_char_is_eof(parser)) {
695 int digit = pm_regexp_hex_digit_value(*parser->
cursor);
700 if (!pm_regexp_char_is_eof(parser)) {
701 digit = pm_regexp_hex_digit_value(*parser->
cursor);
703 value = (value << 4) | digit;
711 pm_regexp_track_hex_escape(parser, (uint8_t) value);
724 if (pm_regexp_char_is_eof(parser))
return;
726 switch (*parser->
cursor) {
729 pm_regexp_parse_unicode_escape(parser);
733 pm_regexp_parse_property_escape(parser);
737 pm_regexp_parse_hex_escape(parser);
743 if (!pm_regexp_char_is_eof(parser)) {
744 if (*parser->
cursor ==
'\\') {
748 pm_regexp_parse_backslash_escape(parser);
753 pm_regexp_track_hex_escape(parser, 0x80);
763 if (!pm_regexp_char_is_eof(parser)) {
764 if (*parser->
cursor ==
'\\') {
766 pm_regexp_parse_backslash_escape(parser);
778 if (!pm_regexp_char_is_eof(parser)) {
779 if (*parser->
cursor ==
'\\') {
781 pm_regexp_parse_backslash_escape(parser);
788 pm_regexp_hex_group_boundary(parser);
801 uint8_t
byte = *cursor;
803 size_t width = parser->
encoding->char_width(cursor, (ptrdiff_t) (parser->
end - cursor));
805 parser->
cursor += width - 1;
806 }
else if (width == 0) {
808 pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->
encoding->name);
819 pm_regexp_char_accept(parser,
'^');
821 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
']') {
822 switch (*parser->
cursor++) {
824 pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
827 pm_regexp_parse_backslash_escape(parser);
833 if (*(parser->
cursor - 1) >= 0x80) {
836 pm_regexp_parse_invalid_multibyte(parser, parser->
cursor - 1);
841 return pm_regexp_char_expect(parser,
']');
849 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
850 pm_regexp_parse_error(parser, parser->
start, parser->
end,
"parse depth limit over");
856 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"empty char-class");
860 const uint8_t *reset = parser->
cursor;
864 if (pm_regexp_parse_posix_class(parser))
return true;
869 return pm_regexp_parse_character_set(parser, depth);
882 PM_REGEXP_OPTION_STATE_INVALID,
883 PM_REGEXP_OPTION_STATE_TOGGLEABLE,
884 PM_REGEXP_OPTION_STATE_ADDABLE,
885 PM_REGEXP_OPTION_STATE_ADDED,
886 PM_REGEXP_OPTION_STATE_REMOVED
887} pm_regexp_option_state_t;
/**
 * The lowest option character that has a slot in the option state table. An
 * option character is turned into a table index by subtracting this value.
 */
893#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
/**
 * The highest option character that has a slot in the option state table.
 * Characters outside the inclusive minimum..maximum range have no slot.
 */
896#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
/**
 * The number of slots in the option state table: one for every character in
 * the inclusive range from the minimum slot character to the maximum one. This
 * is the length of the values array that holds the per-option state.
 */
899#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
906 uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
914 memset(options, PM_REGEXP_OPTION_STATE_INVALID,
sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
915 options->
values[
'i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
916 options->
values[
'm' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
917 options->
values[
'x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
918 options->
values[
'd' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
919 options->
values[
'a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
920 options->
values[
'u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
929 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
930 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
932 switch (options->
values[key]) {
933 case PM_REGEXP_OPTION_STATE_INVALID:
934 case PM_REGEXP_OPTION_STATE_REMOVED:
936 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
937 case PM_REGEXP_OPTION_STATE_ADDABLE:
938 options->
values[key] = PM_REGEXP_OPTION_STATE_ADDED;
940 case PM_REGEXP_OPTION_STATE_ADDED:
954 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
955 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
957 switch (options->
values[key]) {
958 case PM_REGEXP_OPTION_STATE_INVALID:
959 case PM_REGEXP_OPTION_STATE_ADDABLE:
961 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
962 case PM_REGEXP_OPTION_STATE_ADDED:
963 case PM_REGEXP_OPTION_STATE_REMOVED:
964 options->
values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
977 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
978 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
979 return options->
values[key];
1008 const uint8_t *group_start = parser->
cursor;
1011 pm_regexp_options_init(&options);
1014 if (pm_regexp_char_accept(parser,
'?')) {
1015 if (pm_regexp_char_is_eof(parser)) {
1016 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern in group");
1020 switch (*parser->
cursor) {
1023 if (pm_regexp_char_is_eof(parser)) {
1024 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern in group");
1029 bool escaped =
false;
1035 if (!escaped && *parser->
cursor ==
')') {
1042 if (*parser->
cursor >= 0x80) {
1044 pm_regexp_parse_error_format(parser, parser->
cursor, parser->
cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->
encoding->name);
1051 escaped = (width == 1) && (*parser->
cursor ==
'\\');
1060 bool found = pm_regexp_char_find(parser,
')');
1062 while (found && (parser->
start <= parser->
cursor - 2) && (*(parser->
cursor - 2) ==
'\\')) {
1063 found = pm_regexp_char_find(parser,
')');
1078 if (pm_regexp_char_is_eof(parser)) {
1079 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern with unmatched parenthesis");
1083 switch (*parser->
cursor) {
1089 const uint8_t *start = parser->
cursor;
1090 if (!pm_regexp_char_find(parser,
'>')) {
1094 if (parser->
cursor - start == 1) {
1095 pm_regexp_parse_error(parser, start, parser->
cursor,
"group name is empty");
1099 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
1107 const uint8_t *start = ++parser->
cursor;
1108 if (!pm_regexp_char_find(parser,
'\'')) {
1113 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
1119 if (!pm_regexp_char_find(parser,
')')) {
1123 case 'i':
case 'm':
case 'x':
case 'd':
case 'a':
case 'u':
1124 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
'-' && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
1125 if (!pm_regexp_options_add(&options, *parser->
cursor)) {
1131 if (pm_regexp_char_is_eof(parser)) {
1138 if (*parser->
cursor ==
')') {
1139 if (pm_regexp_options_state(&options,
'x') == PM_REGEXP_OPTION_STATE_ADDED) {
1148 if (*parser->
cursor !=
'-')
break;
1153 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
1154 if (!pm_regexp_options_remove(&options, *parser->
cursor)) {
1160 if (pm_regexp_char_is_eof(parser)) {
1167 if (*parser->
cursor ==
')') {
1168 switch (pm_regexp_options_state(&options,
'x')) {
1169 case PM_REGEXP_OPTION_STATE_ADDED:
1172 case PM_REGEXP_OPTION_STATE_REMOVED:
1184 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"undefined group option");
1190 switch (pm_regexp_options_state(&options,
'x')) {
1191 case PM_REGEXP_OPTION_STATE_ADDED:
1194 case PM_REGEXP_OPTION_STATE_REMOVED:
1200 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')') {
1201 if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
1205 pm_regexp_char_accept(parser,
'|');
1210 if (pm_regexp_char_expect(parser,
')'))
return true;
1212 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern with unmatched parenthesis");
1230 switch (*parser->
cursor) {
1234 return pm_regexp_parse_quantifier(parser);
1237 pm_regexp_parse_backslash_escape(parser);
1238 return pm_regexp_parse_quantifier(parser);
1241 return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
1244 return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
1249 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"target of repeat operator is not specified");
1253 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"unmatched close parenthesis");
1257 if (!pm_regexp_char_find(parser,
'\n')) parser->
cursor = parser->
end;
1264 width = pm_encoding_utf_8_char_width(parser->
cursor, (ptrdiff_t) (parser->
end - parser->
cursor));
1275 pm_regexp_parse_error_format(parser, parser->
cursor, parser->
cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->
encoding->name);
1282 return pm_regexp_parse_quantifier(parser);
1288 for (
size_t i = 0; i < width; i++) {
1293 return pm_regexp_parse_quantifier(parser);
1304 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
1305 pm_regexp_parse_error(parser, parser->
start, parser->
end,
"parse depth limit over");
1309 if (!pm_regexp_parse_item(parser, depth)) {
1313 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')' && *parser->
cursor !=
'|') {
1314 if (!pm_regexp_parse_item(parser, depth)) {
1331 if (pm_regexp_char_is_eof(parser))
return true;
1332 if (!pm_regexp_parse_expression(parser, 0))
return false;
1333 }
while (pm_regexp_char_accept(parser,
'|'));
1335 return pm_regexp_char_is_eof(parser);
1348 const uint8_t *data = (
const uint8_t *) pm_buffer_value(buffer);
1349 size_t len = pm_buffer_length(buffer);
1353 size_t group_start = i;
1354 while (i <
len && data[i] != 0x00) i++;
1356 for (
size_t j = group_start; j < i; ) {
1357 size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j));
1358 if (width == 0)
return false;
1373pm_regexp_format_for_error(
pm_buffer_t *buffer,
const pm_encoding_t *encoding,
const uint8_t *source,
size_t length) {
1376 if (encoding == PM_ENCODING_UTF_8_ENTRY) {
1377 pm_buffer_append_string(buffer, (
const char *) source, length);
1381 while (index < length) {
1382 if (source[index] < 0x80) {
1383 pm_buffer_append_byte(buffer, source[index]);
1385 }
else if (encoding->multibyte) {
1386 size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index));
1389 pm_buffer_append_string(buffer,
"\\x{", 3);
1390 for (
size_t i = 0; i < width; i++) {
1391 pm_buffer_append_format(buffer,
"%02X", source[index + i]);
1393 pm_buffer_append_byte(buffer,
'}');
1396 pm_buffer_append_format(buffer,
"\\x%02X", source[index]);
1400 pm_buffer_append_format(buffer,
"\\x%02X", source[index]);
/**
 * Append a formatted diagnostic to the error list of the pm_parser_t that owns
 * the given regular expression parser. The reported location always covers the
 * whole regexp node: its start is the offset of node_start from the beginning
 * of the owning parser's source, and its length is node_end - node_start.
 *
 * The macro parameter is spelled parser_ (trailing underscore, the same
 * convention used by pm_regexp_parse_error_format) so that it cannot collide
 * with the `parser` member dereferenced in the expansion. If the parameter
 * were named `parser`, the preprocessor would also substitute the member name,
 * and the macro would only work when the argument happened to be spelled
 * exactly `parser`.
 *
 * Note that parser_ is evaluated several times, so the argument must not have
 * side effects.
 *
 * @param parser_ A pointer to the regular expression parser (an expression of
 *     type pm_regexp_parser_t *).
 * @param diag_id The ID of the diagnostic to append.
 * @param ... The arguments consumed by the diagnostic's format string.
 */
#define PM_REGEXP_ENCODING_ERROR(parser_, diag_id, ...) \
    pm_diagnostic_list_append_format( \
        &(parser_)->parser->metadata_arena, \
        &(parser_)->parser->error_list, \
        (uint32_t) ((parser_)->node_start - (parser_)->parser->start), \
        (uint32_t) ((parser_)->node_end - (parser_)->node_start), \
        diag_id, __VA_ARGS__)
1436static pm_node_flags_t
1437pm_regexp_validate_encoding_modifier(
pm_regexp_parser_t *parser,
bool ascii_only, pm_node_flags_t flags,
char modifier,
const pm_encoding_t *modifier_encoding,
const char *source_start,
int source_length) {
1446 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
1448 source_length, source_start);
1450 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
1452 source_length, source_start);
1454 return modifier ==
'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
1457 if (parser->
encoding == PM_ENCODING_US_ASCII_ENTRY) {
1459 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->
encoding->name);
1461 }
else if (parser->
encoding != modifier_encoding) {
1462 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->
encoding->name);
1464 if (modifier ==
'n' && !ascii_only) {
1466 pm_regexp_format_for_error(&formatted, parser->
encoding, (
const uint8_t *) source_start, (
size_t) source_length);
1467 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (
int) formatted.length, (
const char *) formatted.value);
1468 pm_buffer_cleanup(&formatted);
1477 if (modifier ==
'n') {
1479 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
1481 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
1484 if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->
hex_escape_buffer)) {
1485 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1494 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
1495 }
else if (parser->
encoding != PM_ENCODING_UTF_8_ENTRY) {
1496 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
1500 if (modifier !=
'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->
hex_escape_buffer)) {
1501 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1523static pm_node_flags_t
1524pm_regexp_validate_encoding(
pm_regexp_parser_t *parser,
bool ascii_only, pm_node_flags_t flags,
const char *source_start,
int source_length) {
1533 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start);
1538 if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
1539 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags,
'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length);
1541 if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
1542 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags,
'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length);
1544 if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
1545 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags,
'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length);
1547 if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
1548 return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags,
's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length);
1554 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->
encoding->name);
1562 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
1564 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
1571 if (parser->
encoding != PM_ENCODING_US_ASCII_ENTRY &&
1572 parser->
encoding != PM_ENCODING_ASCII_8BIT_ENTRY &&
1574 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1576 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
1578 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
1586 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1589 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
1590 }
else if (parser->
encoding == PM_ENCODING_US_ASCII_ENTRY) {
1591 return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
1594 PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
1613 bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED);
1614 pm_node_flags_t flags = PM_NODE_FLAGS(node);
1624 .end = source + size,
1625 .extended_mode = extended_mode,
1626 .encoding_changed = parser->encoding_changed,
1627 .encoding = parser->encoding,
1628 .name_callback = NULL,
1631 .node_start = node_start,
1632 .node_end = node_end,
1633 .has_unicode_escape =
false,
1634 .has_hex_escape =
false,
1635 .last_escape_was_unicode =
false,
1636 .explicit_encoding = NULL,
1637 .has_property_escape =
false,
1638 .has_unicode_property_escape =
false,
1639 .property_name = NULL,
1640 .property_name_length = 0,
1641 .unicode_property_name = NULL,
1642 .unicode_property_name_length = 0,
1643 .non_ascii_literal_count = 0,
1644 .invalid_unicode_range =
false,
1645 .hex_escape_buffer = { 0 },
1646 .hex_group_active =
false,
1647 .has_invalid_multibyte =
false,
1650 pm_regexp_parse_pattern(&regexp_parser);
1659 const char *error_source = (
const char *) pm_string_source(&node->
unescaped);
1660 int error_source_length = (int) pm_string_length(&node->
unescaped);
1661 pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(&regexp_parser, ascii_only, flags, error_source, error_source_length);
1665 if (name_callback != NULL) {
1667 pm_regexp_parse_named_captures(parser, pm_string_source(&node->
unescaped), pm_string_length(&node->
unescaped), shared, extended_mode, name_callback, name_data);
1670 return encoding_flags;
1684pm_regexp_parse_named_captures(
pm_parser_t *parser,
const uint8_t *source,
size_t size,
bool shared,
bool extended_mode, pm_regexp_name_callback_t name_callback,
pm_regexp_name_data_t *name_data) {
1689 .end = source + size,
1690 .extended_mode = extended_mode,
1691 .encoding_changed = parser->encoding_changed,
1692 .encoding = parser->encoding,
1693 .name_callback = name_callback,
1694 .name_data = name_data,
1696 .node_start = source,
1697 .node_end = source + size,
1698 .has_unicode_escape =
false,
1699 .has_hex_escape =
false,
1700 .last_escape_was_unicode =
false,
1701 .explicit_encoding = NULL,
1702 .has_property_escape =
false,
1703 .has_unicode_property_escape =
false,
1704 .property_name = NULL,
1705 .property_name_length = 0,
1706 .unicode_property_name = NULL,
1707 .unicode_property_name_length = 0,
1708 .non_ascii_literal_count = 0,
1709 .invalid_unicode_range =
false,
1710 .hex_escape_buffer = { 0 },
1711 .hex_group_active =
false,
1712 .has_invalid_multibyte =
false,
1715 pm_regexp_parse_pattern(&regexp_parser);
#define PRISM_FALLTHROUGH
We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch.
int len
Length of the buffer.
#define PRISM_INLINE
Old Visual Studio versions do not support the inline keyword, so we need to define it to be __inline.
VALUE type(ANYARGS)
ANYARGS-ed function type.
uint32_t start
The offset of the location from the start of the source.
uint32_t length
The length of the location.
pm_location_t location
This is the location of the node in the source.
This is the set of options that are configurable on the regular expression.
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
This is the parser that is going to handle parsing regular expressions.
const uint8_t * cursor
A pointer to the current position in the source.
bool has_unicode_escape
Whether a \u{...} escape with value >= 0x80 was seen.
size_t unicode_property_name_length
Length of the first Unicode-only property name found.
bool has_hex_escape
Whether a \xNN escape (or \M-x, etc.) with value >= 0x80 was seen.
const uint8_t * start
A pointer to the start of the source that we are parsing.
const uint8_t * end
A pointer to the end of the source that we are parsing.
pm_regexp_name_data_t * name_data
The data to pass to the name callback.
bool hex_group_active
Whether we are accumulating consecutive hex escape bytes.
pm_buffer_t hex_escape_buffer
Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels.
bool extended_mode
Whether or not the regular expression currently being parsed is in extended mode, wherein whitespace is ignored and comments are allowed.
pm_parser_t * parser
The parser that is currently being used.
size_t property_name_length
Length of the first non-POSIX property name found.
bool shared
Whether the source content is shared (for named capture callback).
bool invalid_unicode_range
Whether a \u escape with invalid range (surrogate or > 0x10FFFF) was seen.
const uint8_t * property_name
Pointer to the first non-POSIX property name (for /n error messages).
const pm_encoding_t * encoding
The encoding of the source.
const pm_encoding_t * explicit_encoding
The explicit encoding determined by escape sequences.
const uint8_t * node_end
The end of the regexp node (for error locations).
uint32_t non_ascii_literal_count
Count of non-ASCII literal bytes (not from escapes).
const uint8_t * node_start
The start of the regexp node (for error locations).
const uint8_t * unicode_property_name
Pointer to the first Unicode-only property name (for /e, /s error messages).
pm_regexp_name_callback_t name_callback
The callback to call when a named capture group is found.
bool encoding_changed
Whether the encoding has changed from the default.
bool has_unicode_property_escape
Whether a Unicode-only property escape was found (not POSIX or script).
bool has_invalid_multibyte
Whether an invalid multibyte character was found during parsing.
bool last_escape_was_unicode
Tracks whether the last encoding-setting escape was \u (true) or \x (false).
bool has_property_escape
Whether any \p{...} or \P{...} property escape was found.
pm_node_t base
The embedded base node.
pm_string_t unescaped
RegularExpressionNode::unescaped.
pm_location_t content_loc
RegularExpressionNode::content_loc.
A generic string type that can have various ownership semantics.
enum pm_string_t::@110 type
The type of the string.