3 #define PM_REGEXP_PARSE_DEPTH_MAX 4096
50 pm_regexp_parse_error(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end,
const char *message) {
59 pm_regexp_parser_named_capture(
pm_regexp_parser_t *parser,
const uint8_t *start,
const uint8_t *end) {
79 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
91 if (!pm_regexp_char_is_eof(parser) && *parser->
cursor == value) {
103 if (pm_regexp_char_is_eof(parser)) {
151 const uint8_t *savepoint = parser->
cursor;
154 PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
155 PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
156 PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
157 PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
158 } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
162 parser->
cursor = savepoint;
167 case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
168 switch (*parser->
cursor) {
169 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
171 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
175 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
178 parser->
cursor = savepoint;
182 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
183 switch (*parser->
cursor) {
184 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
189 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
195 parser->
cursor = savepoint;
199 case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
200 switch (*parser->
cursor) {
201 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
203 state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
206 parser->
cursor = savepoint;
210 case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
211 switch (*parser->
cursor) {
212 case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
219 parser->
cursor = savepoint;
239 while (!pm_regexp_char_is_eof(parser)) {
240 switch (*parser->
cursor) {
248 if (!pm_regexp_parse_range_quantifier(parser))
return false;
265 if (!pm_regexp_char_expect(parser,
':')) {
269 pm_regexp_char_accept(parser,
'^');
272 pm_regexp_char_find(parser,
':') &&
273 pm_regexp_char_expect(parser,
']') &&
274 pm_regexp_char_expect(parser,
']')
288 pm_regexp_char_accept(parser,
'^');
290 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
']') {
291 switch (*parser->
cursor++) {
293 pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
296 if (!pm_regexp_char_is_eof(parser)) {
306 return pm_regexp_char_expect(parser,
']');
314 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
315 pm_regexp_parse_error(parser, parser->
start, parser->
end,
"parse depth limit over");
321 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"empty char-class");
325 const uint8_t *reset = parser->
cursor;
329 if (pm_regexp_parse_posix_class(parser))
return true;
334 return pm_regexp_parse_character_set(parser, depth);
347 PM_REGEXP_OPTION_STATE_INVALID,
348 PM_REGEXP_OPTION_STATE_TOGGLEABLE,
349 PM_REGEXP_OPTION_STATE_ADDABLE,
350 PM_REGEXP_OPTION_STATE_ADDED,
351 PM_REGEXP_OPTION_STATE_REMOVED
352 } pm_regexp_option_state_t;
357 #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
358 #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
359 #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
366 uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
374 memset(options, PM_REGEXP_OPTION_STATE_INVALID,
sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
375 options->
values[
'i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
376 options->
values[
'm' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
377 options->
values[
'x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
378 options->
values[
'd' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
379 options->
values[
'a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
380 options->
values[
'u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
389 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
390 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
392 switch (options->
values[key]) {
393 case PM_REGEXP_OPTION_STATE_INVALID:
394 case PM_REGEXP_OPTION_STATE_REMOVED:
396 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
397 case PM_REGEXP_OPTION_STATE_ADDABLE:
398 options->
values[key] = PM_REGEXP_OPTION_STATE_ADDED;
400 case PM_REGEXP_OPTION_STATE_ADDED:
414 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
415 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
417 switch (options->
values[key]) {
418 case PM_REGEXP_OPTION_STATE_INVALID:
419 case PM_REGEXP_OPTION_STATE_ADDABLE:
421 case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
422 case PM_REGEXP_OPTION_STATE_ADDED:
423 case PM_REGEXP_OPTION_STATE_REMOVED:
424 options->
values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
437 if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
438 key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
439 return options->
values[key];
468 const uint8_t *group_start = parser->
cursor;
471 pm_regexp_options_init(&options);
474 if (pm_regexp_char_accept(parser,
'?')) {
475 if (pm_regexp_char_is_eof(parser)) {
476 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern in group");
480 switch (*parser->
cursor) {
483 if (pm_regexp_char_is_eof(parser)) {
484 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern in group");
489 bool escaped =
false;
495 if (!escaped && *parser->
cursor ==
')') {
501 if (width == 0)
return false;
503 escaped = (width == 1) && (*parser->
cursor ==
'\\');
512 bool found = pm_regexp_char_find(parser,
')');
514 while (found && (parser->
start <= parser->
cursor - 2) && (*(parser->
cursor - 2) ==
'\\')) {
515 found = pm_regexp_char_find(parser,
')');
530 if (pm_regexp_char_is_eof(parser)) {
531 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern with unmatched parenthesis");
535 switch (*parser->
cursor) {
541 const uint8_t *start = parser->
cursor;
542 if (!pm_regexp_char_find(parser,
'>')) {
546 if (parser->
cursor - start == 1) {
547 pm_regexp_parse_error(parser, start, parser->
cursor,
"group name is empty");
551 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
559 const uint8_t *start = ++parser->
cursor;
560 if (!pm_regexp_char_find(parser,
'\'')) {
565 pm_regexp_parser_named_capture(parser, start, parser->
cursor - 1);
571 if (!pm_regexp_char_find(parser,
')')) {
575 case 'i':
case 'm':
case 'x':
case 'd':
case 'a':
case 'u':
576 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
'-' && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
577 if (!pm_regexp_options_add(&options, *parser->
cursor)) {
583 if (pm_regexp_char_is_eof(parser)) {
590 if (*parser->
cursor ==
')') {
591 if (pm_regexp_options_state(&options,
'x') == PM_REGEXP_OPTION_STATE_ADDED) {
600 if (*parser->
cursor !=
'-')
break;
606 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
':' && *parser->
cursor !=
')') {
607 if (!pm_regexp_options_remove(&options, *parser->
cursor)) {
613 if (pm_regexp_char_is_eof(parser)) {
620 if (*parser->
cursor ==
')') {
621 switch (pm_regexp_options_state(&options,
'x')) {
622 case PM_REGEXP_OPTION_STATE_ADDED:
625 case PM_REGEXP_OPTION_STATE_REMOVED:
637 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"undefined group option");
643 switch (pm_regexp_options_state(&options,
'x')) {
644 case PM_REGEXP_OPTION_STATE_ADDED:
647 case PM_REGEXP_OPTION_STATE_REMOVED:
653 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')') {
654 if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
658 pm_regexp_char_accept(parser,
'|');
663 if (pm_regexp_char_expect(parser,
')'))
return true;
665 pm_regexp_parse_error(parser, group_start, parser->
cursor,
"end pattern with unmatched parenthesis");
683 switch (*parser->
cursor) {
687 return pm_regexp_parse_quantifier(parser);
690 if (!pm_regexp_char_is_eof(parser)) {
693 return pm_regexp_parse_quantifier(parser);
696 return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
699 return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
704 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"target of repeat operator is not specified");
708 pm_regexp_parse_error(parser, parser->
cursor - 1, parser->
cursor,
"unmatched close parenthesis");
712 if (!pm_regexp_char_find(parser,
'\n')) parser->
cursor = parser->
end;
724 if (width == 0)
return false;
727 return pm_regexp_parse_quantifier(parser);
738 if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
739 pm_regexp_parse_error(parser, parser->
start, parser->
end,
"parse depth limit over");
743 if (!pm_regexp_parse_item(parser, depth)) {
747 while (!pm_regexp_char_is_eof(parser) && *parser->
cursor !=
')' && *parser->
cursor !=
'|') {
748 if (!pm_regexp_parse_item(parser, depth)) {
765 if (pm_regexp_char_is_eof(parser))
return true;
766 if (!pm_regexp_parse_expression(parser, 0))
return false;
767 }
while (pm_regexp_char_accept(parser,
'|'));
769 return pm_regexp_char_is_eof(parser);
782 .end = source + size,
783 .extended_mode = extended_mode,
786 .name_callback = name_callback,
787 .name_data = name_data,
788 .error_callback = error_callback,
789 .error_data = error_data
void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding)
We need to roll our own memchr to handle cases where the encoding changes and we need to search for a...
void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end)
Initialize a shared string that is based on initial input.
PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string)
Free the associated memory of the given string.
#define PRISM_EXPORTED_FUNCTION
By default, we compile with -fvisibility=hidden.
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding.
A regular expression parser.
void(* pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data)
This callback is called when a parse error is found.
PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data)
Parse a regular expression.
void(* pm_regexp_name_callback_t)(const pm_string_t *name, void *data)
This callback is called when a named capture group is found.
This struct defines the functions necessary to implement the encoding interface so we can determine h...
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
bool multibyte
Return true if the encoding is a multibyte encoding.
This struct represents the overall parser.
const pm_encoding_t * encoding
The encoding functions for the current file is attached to the parser as it's parsing so that it can ...
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
const uint8_t * start
The pointer to the start of the source.
This is the set of options that are configurable on the regular expression.
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
This is the parser that is going to handle parsing regular expressions.
const uint8_t * cursor
A pointer to the current position in the source.
pm_regexp_error_callback_t error_callback
The callback to call when a parse error is found.
const uint8_t * start
A pointer to the start of the source that we are parsing.
const uint8_t * end
A pointer to the end of the source that we are parsing.
void * name_data
The data to pass to the name callback.
bool extended_mode
Whether or not the regular expression currently being parsed is in extended mode, wherein whitespace ...
pm_parser_t * parser
The parser that is currently being used.
const pm_encoding_t * encoding
The encoding of the source.
void * error_data
The data to pass to the error callback.
pm_regexp_name_callback_t name_callback
The callback to call when a named capture group is found.
bool encoding_changed
Whether the encoding has changed from the default.
A generic string type that can have various ownership semantics.