Ruby  3.4.0dev (2024-12-06 revision 892c46283a5ea4179500d951c9d4866c0051f27b)
regexp.c
1 #include "prism/regexp.h"
2 
/**
 * The nesting depth at which the parser gives up. Both
 * pm_regexp_parse_lbracket and pm_regexp_parse_expression report
 * "parse depth limit over" and fail once their depth reaches this value.
 */
#define PM_REGEXP_PARSE_DEPTH_MAX 4096
4 
8 typedef struct {
11 
13  const uint8_t *start;
14 
16  const uint8_t *cursor;
17 
19  const uint8_t *end;
20 
26 
29 
32 
35 
37  void *name_data;
38 
41 
43  void *error_data;
45 
49 static inline void
50 pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
51  parser->error_callback(start, end, message, parser->error_data);
52 }
53 
58 static void
59 pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
60  pm_string_t string;
61  pm_string_shared_init(&string, start, end);
62  parser->name_callback(&string, parser->name_data);
63  pm_string_free(&string);
64 }
65 
69 static inline bool
70 pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
71  return parser->cursor >= parser->end;
72 }
73 
77 static inline bool
78 pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
79  if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
80  parser->cursor++;
81  return true;
82  }
83  return false;
84 }
85 
89 static inline bool
90 pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
91  if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
92  parser->cursor++;
93  return true;
94  }
95  return false;
96 }
97 
101 static bool
102 pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
103  if (pm_regexp_char_is_eof(parser)) {
104  return false;
105  }
106 
107  const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
108  if (end == NULL) {
109  return false;
110  }
111 
112  parser->cursor = end + 1;
113  return true;
114 }
115 
149 static bool
150 pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
151  const uint8_t *savepoint = parser->cursor;
152 
153  enum {
154  PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
155  PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
156  PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
157  PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
158  } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
159 
160  while (1) {
161  if (parser->cursor >= parser->end) {
162  parser->cursor = savepoint;
163  return true;
164  }
165 
166  switch (state) {
167  case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
168  switch (*parser->cursor) {
169  case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
170  parser->cursor++;
171  state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
172  break;
173  case ',':
174  parser->cursor++;
175  state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
176  break;
177  default:
178  parser->cursor = savepoint;
179  return true;
180  }
181  break;
182  case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
183  switch (*parser->cursor) {
184  case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
185  parser->cursor++;
186  break;
187  case ',':
188  parser->cursor++;
189  state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
190  break;
191  case '}':
192  parser->cursor++;
193  return true;
194  default:
195  parser->cursor = savepoint;
196  return true;
197  }
198  break;
199  case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
200  switch (*parser->cursor) {
201  case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
202  parser->cursor++;
203  state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
204  break;
205  default:
206  parser->cursor = savepoint;
207  return true;
208  }
209  break;
210  case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
211  switch (*parser->cursor) {
212  case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
213  parser->cursor++;
214  break;
215  case '}':
216  parser->cursor++;
217  return true;
218  default:
219  parser->cursor = savepoint;
220  return true;
221  }
222  break;
223  }
224  }
225 
226  return true;
227 }
228 
237 static bool
238 pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
239  while (!pm_regexp_char_is_eof(parser)) {
240  switch (*parser->cursor) {
241  case '*':
242  case '+':
243  case '?':
244  parser->cursor++;
245  break;
246  case '{':
247  parser->cursor++;
248  if (!pm_regexp_parse_range_quantifier(parser)) return false;
249  break;
250  default:
251  // In this case there is no quantifier.
252  return true;
253  }
254  }
255 
256  return true;
257 }
258 
263 static bool
264 pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
265  if (!pm_regexp_char_expect(parser, ':')) {
266  return false;
267  }
268 
269  pm_regexp_char_accept(parser, '^');
270 
271  return (
272  pm_regexp_char_find(parser, ':') &&
273  pm_regexp_char_expect(parser, ']') &&
274  pm_regexp_char_expect(parser, ']')
275  );
276 }
277 
278 // Forward declaration because character sets can be nested.
279 static bool
280 pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
281 
286 static bool
287 pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
288  pm_regexp_char_accept(parser, '^');
289 
290  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
291  switch (*parser->cursor++) {
292  case '[':
293  pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
294  break;
295  case '\\':
296  if (!pm_regexp_char_is_eof(parser)) {
297  parser->cursor++;
298  }
299  break;
300  default:
301  // do nothing, we've already advanced the cursor
302  break;
303  }
304  }
305 
306  return pm_regexp_char_expect(parser, ']');
307 }
308 
312 static bool
313 pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
314  if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
315  pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
316  return false;
317  }
318 
319  if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
320  parser->cursor++;
321  pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
322  return true;
323  }
324 
325  const uint8_t *reset = parser->cursor;
326 
327  if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
328  parser->cursor++;
329  if (pm_regexp_parse_posix_class(parser)) return true;
330 
331  parser->cursor = reset;
332  }
333 
334  return pm_regexp_parse_character_set(parser, depth);
335 }
336 
337 // Forward declaration here since parsing groups needs to go back up the grammar
338 // to parse expressions within them.
339 static bool
340 pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
341 
/**
 * The states that a single option slot can be in.
 */
typedef enum {
    /** Not an option: it can be neither added nor removed. */
    PM_REGEXP_OPTION_STATE_INVALID,

    /** An option that has not been touched yet and can be added or removed. */
    PM_REGEXP_OPTION_STATE_TOGGLEABLE,

    /** An option that has not been touched yet and can only be added. */
    PM_REGEXP_OPTION_STATE_ADDABLE,

    /** An option that has been added. It can still be removed afterwards. */
    PM_REGEXP_OPTION_STATE_ADDED,

    /** An option that has been removed. It can no longer be added. */
    PM_REGEXP_OPTION_STATE_REMOVED
} pm_regexp_option_state_t;
353 
// The options that are configurable on the regular expression (or from within
// a group) are tracked in a table indexed by the option's character. These
// macros describe the range of characters that have a slot in that table.

/** The lowest option character that has a slot. */
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'

/** The highest option character that has a slot. */
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'

/** The number of slots needed to cover the whole range, both ends included. */
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
360 
364 typedef struct {
366  uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
368 
372 static void
373 pm_regexp_options_init(pm_regexp_options_t *options) {
374  memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
375  options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
376  options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
377  options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
378  options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
379  options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
380  options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
381 }
382 
387 static bool
388 pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
389  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
390  key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
391 
392  switch (options->values[key]) {
393  case PM_REGEXP_OPTION_STATE_INVALID:
394  case PM_REGEXP_OPTION_STATE_REMOVED:
395  return false;
396  case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
397  case PM_REGEXP_OPTION_STATE_ADDABLE:
398  options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
399  return true;
400  case PM_REGEXP_OPTION_STATE_ADDED:
401  return true;
402  }
403  }
404 
405  return false;
406 }
407 
412 static bool
413 pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
414  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
415  key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
416 
417  switch (options->values[key]) {
418  case PM_REGEXP_OPTION_STATE_INVALID:
419  case PM_REGEXP_OPTION_STATE_ADDABLE:
420  return false;
421  case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
422  case PM_REGEXP_OPTION_STATE_ADDED:
423  case PM_REGEXP_OPTION_STATE_REMOVED:
424  options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
425  return true;
426  }
427  }
428 
429  return false;
430 }
431 
435 static uint8_t
436 pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
437  if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
438  key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
439  return options->values[key];
440  }
441 
442  return false;
443 }
444 
466 static bool
467 pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
468  const uint8_t *group_start = parser->cursor;
469 
470  pm_regexp_options_t options;
471  pm_regexp_options_init(&options);
472 
473  // First, parse any options for the group.
474  if (pm_regexp_char_accept(parser, '?')) {
475  if (pm_regexp_char_is_eof(parser)) {
476  pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
477  return false;
478  }
479 
480  switch (*parser->cursor) {
481  case '#': { // inline comments
482  parser->cursor++;
483  if (pm_regexp_char_is_eof(parser)) {
484  pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
485  return false;
486  }
487 
488  if (parser->encoding_changed && parser->encoding->multibyte) {
489  bool escaped = false;
490 
491  // Here we're going to take a slow path and iterate through
492  // each multibyte character to find the close paren. We do
493  // this because \ can be a trailing byte in some encodings.
494  while (parser->cursor < parser->end) {
495  if (!escaped && *parser->cursor == ')') {
496  parser->cursor++;
497  return true;
498  }
499 
500  size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
501  if (width == 0) return false;
502 
503  escaped = (width == 1) && (*parser->cursor == '\\');
504  parser->cursor += width;
505  }
506 
507  return false;
508  } else {
509  // Here we can take the fast path and use memchr to find the
510  // next ) because we are safe checking backward for \ since
511  // it cannot be a trailing character.
512  bool found = pm_regexp_char_find(parser, ')');
513 
514  while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
515  found = pm_regexp_char_find(parser, ')');
516  }
517 
518  return found;
519  }
520  }
521  case ':': // non-capturing group
522  case '=': // positive lookahead
523  case '!': // negative lookahead
524  case '>': // atomic group
525  case '~': // absence operator
526  parser->cursor++;
527  break;
528  case '<':
529  parser->cursor++;
530  if (pm_regexp_char_is_eof(parser)) {
531  pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
532  return false;
533  }
534 
535  switch (*parser->cursor) {
536  case '=': // positive lookbehind
537  case '!': // negative lookbehind
538  parser->cursor++;
539  break;
540  default: { // named capture group
541  const uint8_t *start = parser->cursor;
542  if (!pm_regexp_char_find(parser, '>')) {
543  return false;
544  }
545 
546  if (parser->cursor - start == 1) {
547  pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
548  }
549 
550  if (parser->name_callback != NULL) {
551  pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
552  }
553 
554  break;
555  }
556  }
557  break;
558  case '\'': { // named capture group
559  const uint8_t *start = ++parser->cursor;
560  if (!pm_regexp_char_find(parser, '\'')) {
561  return false;
562  }
563 
564  if (parser->name_callback != NULL) {
565  pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
566  }
567 
568  break;
569  }
570  case '(': // conditional expression
571  if (!pm_regexp_char_find(parser, ')')) {
572  return false;
573  }
574  break;
575  case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
576  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
577  if (!pm_regexp_options_add(&options, *parser->cursor)) {
578  return false;
579  }
580  parser->cursor++;
581  }
582 
583  if (pm_regexp_char_is_eof(parser)) {
584  return false;
585  }
586 
587  // If we are at the end of the group of options and there is no
588  // subexpression, then we are going to be setting the options
589  // for the parent group. In this case we are safe to return now.
590  if (*parser->cursor == ')') {
591  if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
592  parser->extended_mode = true;
593  }
594 
595  parser->cursor++;
596  return true;
597  }
598 
599  // If we hit a -, then we're done parsing options.
600  if (*parser->cursor != '-') break;
601 
602  // Otherwise, fallthrough to the - case.
603  /* fallthrough */
604  case '-':
605  parser->cursor++;
606  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
607  if (!pm_regexp_options_remove(&options, *parser->cursor)) {
608  return false;
609  }
610  parser->cursor++;
611  }
612 
613  if (pm_regexp_char_is_eof(parser)) {
614  return false;
615  }
616 
617  // If we are at the end of the group of options and there is no
618  // subexpression, then we are going to be setting the options
619  // for the parent group. In this case we are safe to return now.
620  if (*parser->cursor == ')') {
621  switch (pm_regexp_options_state(&options, 'x')) {
622  case PM_REGEXP_OPTION_STATE_ADDED:
623  parser->extended_mode = true;
624  break;
625  case PM_REGEXP_OPTION_STATE_REMOVED:
626  parser->extended_mode = false;
627  break;
628  }
629 
630  parser->cursor++;
631  return true;
632  }
633 
634  break;
635  default:
636  parser->cursor++;
637  pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
638  break;
639  }
640  }
641 
642  bool extended_mode = parser->extended_mode;
643  switch (pm_regexp_options_state(&options, 'x')) {
644  case PM_REGEXP_OPTION_STATE_ADDED:
645  parser->extended_mode = true;
646  break;
647  case PM_REGEXP_OPTION_STATE_REMOVED:
648  parser->extended_mode = false;
649  break;
650  }
651 
652  // Now, parse the expressions within this group.
653  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
654  if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
655  parser->extended_mode = extended_mode;
656  return false;
657  }
658  pm_regexp_char_accept(parser, '|');
659  }
660 
661  // Finally, make sure we have a closing parenthesis.
662  parser->extended_mode = extended_mode;
663  if (pm_regexp_char_expect(parser, ')')) return true;
664 
665  pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
666  return false;
667 }
668 
681 static bool
682 pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
683  switch (*parser->cursor) {
684  case '^':
685  case '$':
686  parser->cursor++;
687  return pm_regexp_parse_quantifier(parser);
688  case '\\':
689  parser->cursor++;
690  if (!pm_regexp_char_is_eof(parser)) {
691  parser->cursor++;
692  }
693  return pm_regexp_parse_quantifier(parser);
694  case '(':
695  parser->cursor++;
696  return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
697  case '[':
698  parser->cursor++;
699  return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
700  case '*':
701  case '?':
702  case '+':
703  parser->cursor++;
704  pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
705  return true;
706  case ')':
707  parser->cursor++;
708  pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
709  return true;
710  case '#':
711  if (parser->extended_mode) {
712  if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
713  return true;
714  }
715  /* fallthrough */
716  default: {
717  size_t width;
718  if (!parser->encoding_changed) {
719  width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
720  } else {
721  width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
722  }
723 
724  if (width == 0) return false; // TODO: add appropriate error
725  parser->cursor += width;
726 
727  return pm_regexp_parse_quantifier(parser);
728  }
729  }
730 }
731 
736 static bool
737 pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
738  if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
739  pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
740  return false;
741  }
742 
743  if (!pm_regexp_parse_item(parser, depth)) {
744  return false;
745  }
746 
747  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
748  if (!pm_regexp_parse_item(parser, depth)) {
749  return false;
750  }
751  }
752 
753  return true;
754 }
755 
762 static bool
763 pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
764  do {
765  if (pm_regexp_char_is_eof(parser)) return true;
766  if (!pm_regexp_parse_expression(parser, 0)) return false;
767  } while (pm_regexp_char_accept(parser, '|'));
768 
769  return pm_regexp_char_is_eof(parser);
770 }
771 
777 pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
778  pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
779  .parser = parser,
780  .start = source,
781  .cursor = source,
782  .end = source + size,
783  .extended_mode = extended_mode,
784  .encoding_changed = parser->encoding_changed,
785  .encoding = parser->encoding,
786  .name_callback = name_callback,
787  .name_data = name_data,
788  .error_callback = error_callback,
789  .error_data = error_data
790  });
791 }
void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding)
We need to roll our own memchr to handle cases where the encoding changes and we need to search for a...
Definition: pm_memchr.c:11
void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end)
Initialize a shared string that is based on initial input.
Definition: pm_string.c:16
PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string)
Free the associated memory of the given string.
Definition: pm_string.c:369
#define PRISM_EXPORTED_FUNCTION
By default, we compile with -fvisibility=hidden.
Definition: defines.h:50
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding.
Definition: encoding.c:2287
A regular expression parser.
void(* pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data)
This callback is called when a parse error is found.
Definition: regexp.h:27
PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data)
Parse a regular expression.
Definition: regexp.c:777
void(* pm_regexp_name_callback_t)(const pm_string_t *name, void *data)
This callback is called when a named capture group is found.
Definition: regexp.h:22
This struct defines the functions necessary to implement the encoding interface so we can determine h...
Definition: encoding.h:23
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
Definition: encoding.h:29
bool multibyte
Return true if the encoding is a multibyte encoding.
Definition: encoding.h:61
This struct represents the overall parser.
Definition: parser.h:640
const pm_encoding_t * encoding
The encoding functions for the current file are attached to the parser as it's parsing so that it can ...
Definition: parser.h:755
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
Definition: parser.h:903
const uint8_t * start
The pointer to the start of the source.
Definition: parser.h:691
This is the set of options that are configurable on the regular expression.
Definition: regexp.c:364
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]
The current state of each option.
Definition: regexp.c:366
This is the parser that is going to handle parsing regular expressions.
Definition: regexp.c:8
const uint8_t * cursor
A pointer to the current position in the source.
Definition: regexp.c:16
pm_regexp_error_callback_t error_callback
The callback to call when a parse error is found.
Definition: regexp.c:40
const uint8_t * start
A pointer to the start of the source that we are parsing.
Definition: regexp.c:13
const uint8_t * end
A pointer to the end of the source that we are parsing.
Definition: regexp.c:19
void * name_data
The data to pass to the name callback.
Definition: regexp.c:37
bool extended_mode
Whether or not the regular expression currently being parsed is in extended mode, wherein whitespace ...
Definition: regexp.c:25
pm_parser_t * parser
The parser that is currently being used.
Definition: regexp.c:10
const pm_encoding_t * encoding
The encoding of the source.
Definition: regexp.c:31
void * error_data
The data to pass to the error callback.
Definition: regexp.c:43
pm_regexp_name_callback_t name_callback
The callback to call when a named capture group is found.
Definition: regexp.c:34
bool encoding_changed
Whether the encoding has changed from the default.
Definition: regexp.c:28
A generic string type that can have various ownership semantics.
Definition: pm_string.h:33