3 #define PM_REGEXP_PARSE_DEPTH_MAX 4096
50 pm_regexp_parse_error(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end,
const char *message) {
59 pm_regexp_parser_named_capture(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end) {
79 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
91 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
103 if (pm_regexp_char_is_eof(parser)) {
151 const uint8_t *savepoint = parser->
cursor;
154 PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
155 PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
156 PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
157 PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
158 } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
162 case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
163 switch (*parser->
cursor) {
164 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
166 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
170 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
173 parser->
cursor = savepoint;
177 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
178 switch (*parser->
cursor) {
179 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
184 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
190 parser->
cursor = savepoint;
194 case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
195 switch (*parser->
cursor) {
196 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
198 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
201 parser->
cursor = savepoint;
205 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
206 switch (*parser->
cursor) {
207 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
214 parser->
cursor = savepoint;
234 while (!pm_regexp_char_is_eof(parser)) {
235 switch (*parser->
cursor) {
243 if (!pm_regexp_parse_range_quantifier(parser))
return false;
260 if (!pm_regexp_char_expect(parser,
':')) {
264 pm_regexp_char_accept(parser,
'^');
267 pm_regexp_char_find(parser,
':') &&
268 pm_regexp_char_expect(parser,
']') &&
269 pm_regexp_char_expect(parser,
']')
283 pm_regexp_char_accept(parser,
'^');
285 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
']') {
286 switch (*parser->
cursor++) {
288 pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
291 if (!pm_regexp_char_is_eof(parser)) {
301 return pm_regexp_char_expect(parser,
']');
309 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
310 pm_regexp_parse_error(parser, parser->
start, parser->
end,
"parse depth limit over");
316 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"empty char-class");
320 const uint8_t *reset = parser->
cursor;
324 if (pm_regexp_parse_posix_class(parser))
return true;
329 return pm_regexp_parse_character_set(parser, depth);
342 PM_REGEXP_OPTION_STATE_INVALID,
343 PM_REGEXP_OPTION_STATE_TOGGLEABLE,
344 PM_REGEXP_OPTION_STATE_ADDABLE,
345 PM_REGEXP_OPTION_STATE_ADDED,
346 PM_REGEXP_OPTION_STATE_REMOVED
347 } pm_regexp_option_state_t;
352 #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
353 #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
354 #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
361 uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
369 memset(options, PM_REGEXP_OPTION_STATE_INVALID,
sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
370 options->
values[
'i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
371 options->
values[
'm' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
372 options->
values[
'x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
373 options->
values[
'd' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
374 options->
values[
'a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
375 options->
values[
'u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
384 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
385 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
387 switch (options->
values[key]) {
388 case PM_REGEXP_OPTION_STATE_INVALID:
389 case PM_REGEXP_OPTION_STATE_REMOVED:
391 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
392 case PM_REGEXP_OPTION_STATE_ADDABLE:
393 options->
values[key] = PM_REGEXP_OPTION_STATE_ADDED;
395 case PM_REGEXP_OPTION_STATE_ADDED:
409 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
410 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
412 switch (options->
values[key]) {
413 case PM_REGEXP_OPTION_STATE_INVALID:
414 case PM_REGEXP_OPTION_STATE_ADDABLE:
416 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
417 case PM_REGEXP_OPTION_STATE_ADDED:
418 case PM_REGEXP_OPTION_STATE_REMOVED:
419 options->
values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
432 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
433 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
434 return options->
values[key];
463 const uint8_t *group_start = parser->
cursor;
466 pm_regexp_options_init(&options);
469 if (pm_regexp_char_accept(parser,
'?')) {
470 if (pm_regexp_char_is_eof(parser)) {
471 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern in group");
475 switch (*parser->
cursor) {
478 if (pm_regexp_char_is_eof(parser)) {
479 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern in group");
484 bool escaped =
false;
490 if (!escaped && *parser->
cursor ==
')') {
496 if (width == 0)
return false;
498 escaped = (width == 1) && (*parser->
cursor ==
'\\');
507 bool found = pm_regexp_char_find(parser,
')');
509 while (found && (parser->
start <= parser->
cursor - 2) && (*(parser->
cursor - 2) ==
'\\')) {
510 found = pm_regexp_char_find(parser,
')');
525 if (pm_regexp_char_is_eof(parser)) {
526 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern with unmatched parenthesis");
530 switch (*parser->
cursor) {
536 const uint8_t *start = parser->
cursor;
537 if (!pm_regexp_char_find(parser,
'>')) {
541 if (parser->
cursor - start == 1) {
542 pm_regexp_parse_error(parser, start, parser->
cursor,
"group name is empty");
546 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
554 const uint8_t *start = ++parser->
cursor;
555 if (!pm_regexp_char_find(parser,
'\'')) {
560 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
566 if (!pm_regexp_char_find(parser,
')')) {
570 case 'i':
case 'm':
case 'x':
case 'd':
case 'a':
case 'u':
571 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
'-' && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
572 if (!pm_regexp_options_add(&options, *parser->
cursor)) {
578 if (pm_regexp_char_is_eof(parser)) {
585 if (*parser->
cursor ==
')') {
586 if (pm_regexp_options_state(&options,
'x') == PM_REGEXP_OPTION_STATE_ADDED) {
595 if (*parser->
cursor !=
'-')
break;
601 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
602 if (!pm_regexp_options_remove(&options, *parser->
cursor)) {
608 if (pm_regexp_char_is_eof(parser)) {
615 if (*parser->
cursor ==
')') {
616 switch (pm_regexp_options_state(&options,
'x')) {
617 case PM_REGEXP_OPTION_STATE_ADDED:
620 case PM_REGEXP_OPTION_STATE_REMOVED:
632 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"undefined group option");
638 switch (pm_regexp_options_state(&options,
'x')) {
639 case PM_REGEXP_OPTION_STATE_ADDED:
642 case PM_REGEXP_OPTION_STATE_REMOVED:
648 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')') {
649 if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
653 pm_regexp_char_accept(parser,
'|');
658 if (pm_regexp_char_expect(parser,
')'))
return true;
660 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern with unmatched parenthesis");
678 switch (*parser->
cursor) {
682 return pm_regexp_parse_quantifier(parser);
685 if (!pm_regexp_char_is_eof(parser)) {
688 return pm_regexp_parse_quantifier(parser);
691 return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
694 return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
699 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"target of repeat operator is not specified");
703 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"unmatched close parenthesis");
707 if (!pm_regexp_char_find(parser,
'\n')) parser->
cursor = parser->
end;
719 if (width == 0)
return false;
722 return pm_regexp_parse_quantifier(parser);
733 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
734 pm_regexp_parse_error(parser, parser->
start, parser->
end,
"parse depth limit over");
738 if (!pm_regexp_parse_item(parser, depth)) {
742 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')' && *parser->
cursor !=
'|') {
743 if (!pm_regexp_parse_item(parser, depth)) {
760 if (pm_regexp_char_is_eof(parser))
return true;
761 if (!pm_regexp_parse_expression(parser, 0))
return false;
762 }
while (pm_regexp_char_accept(parser,
'|'));
764 return pm_regexp_char_is_eof(parser);
777 .end = source + size,
778 .extended_mode = extended_mode,
781 .name_callback = name_callback,
782 .name_data = name_data,
783 .error_callback = error_callback,
784 .error_data = error_data
void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding)
We need to roll our own memchr to handle cases where the encoding changes and we need to search for a...
void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end)
Initialize a shared string that is based on initial input.
PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string)
Free the associated memory of the given string.
#define PRISM_EXPORTED_FUNCTION
By default, we compile with -fvisibility=hidden.
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding.
A regular expression parser.
void(* pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data)
This callback is called when a parse error is found.
PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data)
Parse a regular expression.
void(* pm_regexp_name_callback_t)(const pm_string_t *name, void *data)
This callback is called when a named capture group is found.
This struct defines the functions necessary to implement the encoding interface so we can determine h...
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
bool multibyte
True if the encoding is a multibyte encoding.
This struct represents the overall parser.
const pm_encoding_t * encoding
The encoding functions for the current file are attached to the parser while it is parsing so that it can ...
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
const uint8_t * start
The pointer to the start of the source.
This is the set of options that are configurable on the regular expression.
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
This is the parser that is going to handle parsing regular expressions.
const uint8_t * cursor
A pointer to the current position in the source.
pm_regexp_error_callback_t error_callback
The callback to call when a parse error is found.
const uint8_t * start
A pointer to the start of the source that we are parsing.
const uint8_t * end
A pointer to the end of the source that we are parsing.
void * name_data
The data to pass to the name callback.
bool extended_mode
Whether or not the regular expression currently being parsed is in extended mode, wherein whitespace ...
pm_parser_t * parser
The parser that is currently being used.
const pm_encoding_t * encoding
The encoding of the source.
void * error_data
The data to pass to the error callback.
pm_regexp_name_callback_t name_callback
The callback to call when a named capture group is found.
bool encoding_changed
Whether the encoding has changed from the default.
A generic string type that can have various ownership semantics.