/*
 * Ruby 4.1.0dev (2026-03-28 revision 634707a7255f132eb486eaf57473925c288ef7bd)
 * parser.h
 */
1#ifndef PRISM_INTERNAL_PARSER_H
2#define PRISM_INTERNAL_PARSER_H
3
5
6#include "prism/internal/arena.h"
7#include "prism/internal/constant_pool.h"
8#include "prism/internal/encoding.h"
9#include "prism/internal/list.h"
10#include "prism/internal/options.h"
11#include "prism/internal/static_literals.h"
12
13#include "prism/ast.h"
15#include "prism/parser.h"
16
17#include <stdbool.h>
18#include <stddef.h>
19#include <stdint.h>
20
/*
 * This enum provides various bits that represent different kinds of states that
 * the lexer can track. This is used to determine which kind of token to return
 * based on the context of the parser.
 *
 * Each enumerator is a bit position (0 through 12), not a mask. The masks
 * themselves are built from these positions in pm_lex_state_t.
 */
typedef enum {
    PM_LEX_STATE_BIT_BEG,
    PM_LEX_STATE_BIT_END,
    PM_LEX_STATE_BIT_ENDARG,
    PM_LEX_STATE_BIT_ENDFN,
    PM_LEX_STATE_BIT_ARG,
    PM_LEX_STATE_BIT_CMDARG,
    PM_LEX_STATE_BIT_MID,
    PM_LEX_STATE_BIT_FNAME,
    PM_LEX_STATE_BIT_DOT,
    PM_LEX_STATE_BIT_CLASS,
    PM_LEX_STATE_BIT_LABEL,
    PM_LEX_STATE_BIT_LABELED,
    PM_LEX_STATE_BIT_FITEM
} pm_lex_state_bit_t;

/*
 * This enum combines the various bits from the above enum into individual
 * values that represent the various states of the lexer.
 *
 * The single-state values are one-hot masks (1 << bit position). The *_ANY
 * values are unions of related states, intended for membership tests with
 * bitwise AND.
 */
typedef enum {
    PM_LEX_STATE_NONE = 0,
    PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
    PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
    PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
    PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
    PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
    PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
    PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
    PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
    PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
    PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
    PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
    PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
    PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
    PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
    PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
    PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
} pm_lex_state_t;

/*
 * The type of quote that a heredoc uses.
 *
 * Apart from PM_HEREDOC_QUOTE_NONE, each value is the quote character itself.
 */
typedef enum {
    PM_HEREDOC_QUOTE_NONE,
    PM_HEREDOC_QUOTE_SINGLE = '\'',
    PM_HEREDOC_QUOTE_DOUBLE = '"',
    PM_HEREDOC_QUOTE_BACKTICK = '`',
} pm_heredoc_quote_t;

/*
 * The type of indentation that a heredoc uses.
 */
typedef enum {
    PM_HEREDOC_INDENT_NONE,
    PM_HEREDOC_INDENT_DASH,
    PM_HEREDOC_INDENT_TILDE,
} pm_heredoc_indent_t;

/*
 * All of the information that must be stored in order to lex a heredoc.
 *
 * NOTE(review): the closing line of this typedef was missing from the listing
 * this header was recovered from. The name pm_heredoc_lex_mode_t is restored
 * from the upstream prism sources -- confirm.
 */
typedef struct {
    /* A pointer to the start of the heredoc identifier. */
    const uint8_t *ident_start;

    /* The length of the heredoc identifier. */
    size_t ident_length;

    /* The type of quote that the heredoc uses. */
    pm_heredoc_quote_t quote;

    /* The type of indentation that the heredoc uses. */
    pm_heredoc_indent_t indent;
} pm_heredoc_lex_mode_t;

/*
 * When lexing Ruby source, the lexer has a small amount of state to tell which
 * kind of token it is currently lexing. For example, when we find the start of
 * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
 * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
 * are found as part of a string.
 *
 * Lex modes form a singly linked stack through the prev pointer.
 */
typedef struct pm_lex_mode {
    /* The type of this lex mode. */
    enum {
        /* This state is used when any given token is being lexed. */
        PM_LEX_DEFAULT,

        /*
         * This state is used when we're lexing as normal but inside an embedded
         * expression of a string.
         */
        PM_LEX_EMBEXPR,

        /*
         * This state is used when we're lexing a variable that is embedded
         * directly inside of a string with the # shorthand.
         */
        PM_LEX_EMBVAR,

        /* This state is used when you are inside the content of a heredoc. */
        PM_LEX_HEREDOC,

        /*
         * This state is used when we are lexing a list of tokens, as in a %w
         * word list literal or a %i symbol list literal.
         */
        PM_LEX_LIST,

        /*
         * This state is used when a regular expression has been begun and we
         * are looking for the terminator.
         */
        PM_LEX_REGEXP,

        /*
         * This state is used when we are lexing a string or a string-like
         * token, as in string content with either quote or an xstring.
         */
        PM_LEX_STRING
    } mode;

    /* The data associated with this type of lex mode. */
    union {
        struct {
            /* This keeps track of the nesting level of the list. */
            size_t nesting;

            /* Whether or not interpolation is allowed in this list. */
            bool interpolation;

            /*
             * When lexing a list, it takes into account balancing the
             * terminator if the terminator is one of (), [], {}, or <>.
             */
            uint8_t incrementor;

            /* This is the terminator of the list literal. */
            uint8_t terminator;

            /*
             * This is the character set that should be used to delimit the
             * tokens within the list.
             */
            uint8_t breakpoints[11];
        } list;

        struct {
            /*
             * This keeps track of the nesting level of the regular expression.
             */
            size_t nesting;

            /*
             * When lexing a regular expression, it takes into account balancing
             * the terminator if the terminator is one of (), [], {}, or <>.
             */
            uint8_t incrementor;

            /* This is the terminator of the regular expression. */
            uint8_t terminator;

            /*
             * This is the character set that should be used to delimit the
             * tokens within the regular expression.
             */
            uint8_t breakpoints[7];
        } regexp;

        struct {
            /* This keeps track of the nesting level of the string. */
            size_t nesting;

            /* Whether or not interpolation is allowed in this string. */
            bool interpolation;

            /*
             * Whether or not at the end of the string we should allow a :,
             * which would indicate this was a dynamic symbol instead of a
             * string.
             */
            bool label_allowed;

            /*
             * When lexing a string, it takes into account balancing the
             * terminator if the terminator is one of (), [], {}, or <>.
             */
            uint8_t incrementor;

            /*
             * This is the terminator of the string. It is typically either a
             * single or double quote.
             */
            uint8_t terminator;

            /*
             * This is the character set that should be used to delimit the
             * tokens within the string.
             */
            uint8_t breakpoints[7];
        } string;

        struct {
            /*
             * All of the data necessary to lex a heredoc.
             *
             * NOTE(review): this member's declaration was missing from the
             * listing this header was recovered from; it is restored from the
             * upstream prism sources -- confirm.
             */
            pm_heredoc_lex_mode_t base;

            /*
             * This is the pointer to the character where lexing should resume
             * once the heredoc has been completely processed.
             */
            const uint8_t *next_start;

            /*
             * This is used to track the amount of common whitespace on each
             * line so that we know how much to dedent each line in the case of
             * a tilde heredoc.
             */
            size_t *common_whitespace;

            /* True if the previous token ended with a line continuation. */
            bool line_continuation;
        } heredoc;
    } as;

    /* The previous lex state so that it knows how to pop. */
    struct pm_lex_mode *prev;
} pm_lex_mode_t;

/*
 * We pre-allocate a certain number of lex states in order to avoid having to
 * call malloc too many times while parsing. You really shouldn't need more than
 * this because you only really nest deeply when doing string interpolation.
 */
#define PM_LEX_STACK_SIZE 4

/*
 * While parsing, we keep track of a stack of contexts. This is helpful for
 * error recovery so that we can pop back to a previous context when we hit a
 * token that is understood by a parent context but not by the current context.
 */
typedef enum {
    /* a null context, used for returning a value from a function */
    PM_CONTEXT_NONE = 0,

    /* a begin statement */
    PM_CONTEXT_BEGIN,

    /* an ensure statement with an explicit begin */
    PM_CONTEXT_BEGIN_ENSURE,

    /* a rescue else statement with an explicit begin */
    PM_CONTEXT_BEGIN_ELSE,

    /* a rescue statement with an explicit begin */
    PM_CONTEXT_BEGIN_RESCUE,

    /* expressions in block arguments using braces */
    PM_CONTEXT_BLOCK_BRACES,

    /* expressions in block arguments using do..end */
    PM_CONTEXT_BLOCK_KEYWORDS,

    /* an ensure statement within a do..end block */
    PM_CONTEXT_BLOCK_ENSURE,

    /* a rescue else statement within a do..end block */
    PM_CONTEXT_BLOCK_ELSE,

    /* expressions in block parameters `foo do |...| end ` */
    PM_CONTEXT_BLOCK_PARAMETERS,

    /* a rescue statement within a do..end block */
    PM_CONTEXT_BLOCK_RESCUE,

    /* a case when statement */
    PM_CONTEXT_CASE_WHEN,

    /* a case in statement */
    PM_CONTEXT_CASE_IN,

    /* a class declaration */
    PM_CONTEXT_CLASS,

    /* an ensure statement within a class statement */
    PM_CONTEXT_CLASS_ENSURE,

    /* a rescue else statement within a class statement */
    PM_CONTEXT_CLASS_ELSE,

    /* a rescue statement within a class statement */
    PM_CONTEXT_CLASS_RESCUE,

    /* a method definition */
    PM_CONTEXT_DEF,

    /* an ensure statement within a method definition */
    PM_CONTEXT_DEF_ENSURE,

    /* a rescue else statement within a method definition */
    PM_CONTEXT_DEF_ELSE,

    /* a rescue statement within a method definition */
    PM_CONTEXT_DEF_RESCUE,

    /* a method definition's parameters */
    PM_CONTEXT_DEF_PARAMS,

    /* a defined? expression */
    PM_CONTEXT_DEFINED,

    /* a method definition's default parameter */
    PM_CONTEXT_DEFAULT_PARAMS,

    /* an else clause */
    PM_CONTEXT_ELSE,

    /* an elsif clause */
    PM_CONTEXT_ELSIF,

    /* an interpolated expression */
    PM_CONTEXT_EMBEXPR,

    /* a for loop */
    PM_CONTEXT_FOR,

    /* a for loop's index */
    PM_CONTEXT_FOR_INDEX,

    /* an if statement */
    PM_CONTEXT_IF,

    /* a lambda expression with braces */
    PM_CONTEXT_LAMBDA_BRACES,

    /* a lambda expression with do..end */
    PM_CONTEXT_LAMBDA_DO_END,

    /* an ensure statement within a lambda expression */
    PM_CONTEXT_LAMBDA_ENSURE,

    /* a rescue else statement within a lambda expression */
    PM_CONTEXT_LAMBDA_ELSE,

    /* a rescue statement within a lambda expression */
    PM_CONTEXT_LAMBDA_RESCUE,

    /* the predicate clause of a loop statement */
    PM_CONTEXT_LOOP_PREDICATE,

    /* the top level context */
    PM_CONTEXT_MAIN,

    /* a module declaration */
    PM_CONTEXT_MODULE,

    /* an ensure statement within a module statement */
    PM_CONTEXT_MODULE_ENSURE,

    /* a rescue else statement within a module statement */
    PM_CONTEXT_MODULE_ELSE,

    /* a rescue statement within a module statement */
    PM_CONTEXT_MODULE_RESCUE,

    /* a multiple target expression */
    PM_CONTEXT_MULTI_TARGET,

    /* a parenthesized expression */
    PM_CONTEXT_PARENS,

    /* an END block */
    PM_CONTEXT_POSTEXE,

    /* a predicate inside an if/elsif/unless statement */
    PM_CONTEXT_PREDICATE,

    /* a BEGIN block */
    PM_CONTEXT_PREEXE,

    /* a modifier rescue clause */
    PM_CONTEXT_RESCUE_MODIFIER,

    /* a singleton class definition */
    PM_CONTEXT_SCLASS,

    /* an ensure statement with a singleton class */
    PM_CONTEXT_SCLASS_ENSURE,

    /* a rescue else statement with a singleton class */
    PM_CONTEXT_SCLASS_ELSE,

    /* a rescue statement with a singleton class */
    PM_CONTEXT_SCLASS_RESCUE,

    /* a ternary expression */
    PM_CONTEXT_TERNARY,

    /* an unless statement */
    PM_CONTEXT_UNLESS,

    /* an until statement */
    PM_CONTEXT_UNTIL,

    /* a while statement */
    PM_CONTEXT_WHILE,
} pm_context_t;

436/* This is a node in a linked list of contexts. */
437typedef struct pm_context_node {
438 /* The context that this node represents. */
439 pm_context_t context;
440
441 /* A pointer to the previous context in the linked list. */
442 struct pm_context_node *prev;
444
445/* The type of shareable constant value that can be set. */
446typedef uint8_t pm_shareable_constant_value_t;
447static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0;
448static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL;
449static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING;
450static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY;
451
452/*
453 * This tracks an individual local variable in a certain lexical context, as
454 * well as the number of times is it read.
455 */
456typedef struct {
457 /* The name of the local variable. */
458 pm_constant_id_t name;
459
460 /* The location of the local variable in the source. */
461 pm_location_t location;
462
463 /* The index of the local variable in the local table. */
464 uint32_t index;
465
466 /* The number of times the local variable is read. */
467 uint32_t reads;
468
469 /* The hash of the local variable. */
470 uint32_t hash;
471} pm_local_t;
472
473/*
474 * This is a set of local variables in a certain lexical context (method, class,
475 * module, etc.). We need to track how many times these variables are read in
476 * order to warn if they only get written.
477 */
478typedef struct pm_locals {
479 /* The number of local variables in the set. */
480 uint32_t size;
481
482 /* The capacity of the local variables set. */
483 uint32_t capacity;
484
485 /*
486 * A bloom filter over constant IDs stored in this set. Used to quickly
487 * reject lookups for names that are definitely not present, avoiding the
488 * cost of a linear scan or hash probe.
489 */
490 uint32_t bloom;
491
492 /* The nullable allocated memory for the local variables in the set. */
493 pm_local_t *locals;
495
/*
 * The flags about scope parameters that can be set. Each constant other than
 * NONE occupies its own bit, so the flags can be combined with bitwise OR.
 */
typedef uint8_t pm_scope_parameters_t;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40;

507/*
508 * This struct represents a node in a linked list of scopes. Some scopes can see
509 * into their parent scopes, while others cannot.
510 */
511typedef struct pm_scope {
512 /* A pointer to the previous scope in the linked list. */
513 struct pm_scope *previous;
514
515 /* The IDs of the locals in the given scope. */
516 pm_locals_t locals;
517
518 /*
519 * This is a list of the implicit parameters contained within the block.
520 * These will be processed after the block is parsed to determine the kind
521 * of parameters node that should be used and to check if any errors need to
522 * be added.
523 */
524 pm_node_list_t implicit_parameters;
525
526 /*
527 * This is a bitfield that indicates the parameters that are being used in
528 * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants.
529 * There are three different kinds of parameters that can be used in a
530 * scope:
531 *
532 * - Ordinary parameters (e.g., def foo(bar); end)
533 * - Numbered parameters (e.g., def foo; _1; end)
534 * - The it parameter (e.g., def foo; it; end)
535 *
536 * If ordinary parameters are being used, then certain parameters can be
537 * forwarded to another method/structure. Those are indicated by four
538 * additional bits in the params field. For example, some combinations of:
539 *
540 * - def foo(*); end
541 * - def foo(**); end
542 * - def foo(&); end
543 * - def foo(...); end
544 */
545 pm_scope_parameters_t parameters;
546
547 /*
548 * The current state of constant shareability for this scope. This is
549 * changed by magic shareable_constant_value comments.
550 */
551 pm_shareable_constant_value_t shareable_constant;
552
553 /*
554 * A boolean indicating whether or not this scope can see into its parent.
555 * If closed is true, then the scope cannot see into its parent.
556 */
557 bool closed;
558} pm_scope_t;
559
/*
 * A struct that represents a stack of boolean values. The stack is stored in a
 * single 32-bit unsigned integer.
 */
typedef uint32_t pm_state_stack_t;

565/*
566 * This struct represents the overall parser. It contains a reference to the
567 * source file, as well as pointers that indicate where in the source it's
568 * currently parsing. It also contains the most recent and current token that
569 * it's considering.
570 */
572 /* The arena used for all AST-lifetime allocations. Caller-owned. */
573 pm_arena_t *arena;
574
575 /* The arena used for parser metadata (comments, diagnostics, etc.). */
576 pm_arena_t metadata_arena;
577
578 /*
579 * The next node identifier that will be assigned. This is a unique
580 * identifier used to track nodes such that the syntax tree can be dropped
581 * but the node can be found through another parse.
582 */
583 uint32_t node_id;
584
585 /*
586 * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant
587 * constant pool lookups when the same token is resolved multiple times
588 * (e.g., once during lexing for local variable detection, and again
589 * during parsing for node creation).
590 */
591 struct {
592 const uint8_t *start;
593 const uint8_t *end;
595 } constant_cache;
596
597 /* The current state of the lexer. */
598 pm_lex_state_t lex_state;
599
600 /* Tracks the current nesting of (), [], and {}. */
601 int enclosure_nesting;
602
603 /*
604 * Used to temporarily track the nesting of enclosures to determine if a {
605 * is the beginning of a lambda following the parameters of a lambda.
606 */
607 int lambda_enclosure_nesting;
608
609 /*
610 * Used to track the nesting of braces to ensure we get the correct value
611 * when we are interpolating blocks with braces.
612 */
613 int brace_nesting;
614
615 /*
616 * The stack used to determine if a do keyword belongs to the predicate of a
617 * while, until, or for loop.
618 */
619 pm_state_stack_t do_loop_stack;
620
621 /*
622 * The stack used to determine if a do keyword belongs to the beginning of a
623 * block.
624 */
625 pm_state_stack_t accepts_block_stack;
626
627 /* A stack of lex modes. */
628 struct {
629 /* The current mode of the lexer. */
630 pm_lex_mode_t *current;
631
632 /* The stack of lexer modes. */
633 pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
634
635 /* The current index into the lexer mode stack. */
636 size_t index;
637 } lex_modes;
638
639 /* The pointer to the start of the source. */
640 const uint8_t *start;
641
642 /* The pointer to the end of the source. */
643 const uint8_t *end;
644
645 /* The previous token we were considering. */
646 pm_token_t previous;
647
648 /* The current token we're considering. */
649 pm_token_t current;
650
651 /*
652 * This is a special field set on the parser when we need the parser to jump
653 * to a specific location when lexing the next token, as opposed to just
654 * using the end of the previous token. Normally this is NULL.
655 */
656 const uint8_t *next_start;
657
658 /*
659 * This field indicates the end of a heredoc whose identifier was found on
660 * the current line. If another heredoc is found on the same line, then this
661 * will be moved forward to the end of that heredoc. If no heredocs are
662 * found on a line then this is NULL.
663 */
664 const uint8_t *heredoc_end;
665
666 /* The list of comments that have been found while parsing. */
667 pm_list_t comment_list;
668
669 /* The list of magic comments that have been found while parsing. */
670 pm_list_t magic_comment_list;
671
672 /*
673 * An optional location that represents the location of the __END__ marker
674 * and the rest of the content of the file. This content is loaded into the
675 * DATA constant when the file being parsed is the main file being executed.
676 */
677 pm_location_t data_loc;
678
679 /* The list of warnings that have been found while parsing. */
680 pm_list_t warning_list;
681
682 /* The list of errors that have been found while parsing. */
683 pm_list_t error_list;
684
685 /* The current local scope. */
686 pm_scope_t *current_scope;
687
688 /* The current parsing context. */
689 pm_context_node_t *current_context;
690
691 /*
692 * The hash keys for the hash that is currently being parsed. This is not
693 * usually necessary because it can pass it down the various call chains,
694 * but in the event that you're parsing a hash that is being directly
695 * pushed into another hash with **, we need to share the hash keys so that
696 * we can warn for the nested hash as well.
697 */
698 pm_static_literals_t *current_hash_keys;
699
700 /*
701 * The encoding functions for the current file is attached to the parser as
702 * it's parsing so that it can change with a magic comment.
703 */
704 const pm_encoding_t *encoding;
705
706 /*
707 * When the encoding that is being used to parse the source is changed by
708 * prism, we provide the ability here to call out to a user-defined
709 * function.
710 */
711 pm_encoding_changed_callback_t encoding_changed_callback;
712
713 /*
714 * This pointer indicates where a comment must start if it is to be
715 * considered an encoding comment.
716 */
717 const uint8_t *encoding_comment_start;
718
719 /*
720 * When you are lexing through a file, the lexer needs all of the information
721 * that the parser additionally provides (for example, the local table). So if
722 * you want to properly lex Ruby, you need to actually lex it in the context of
723 * the parser. In order to provide this functionality, we optionally allow a
724 * struct to be attached to the parser that calls back out to a user-provided
725 * callback when each token is lexed.
726 */
727 struct {
728 /*
729 * This is the callback that is called when a token is lexed. It is
730 * passed the opaque data pointer, the parser, and the token that was
731 * lexed.
732 */
733 pm_lex_callback_t callback;
734
735 /*
736 * This opaque pointer is used to provide whatever information the user
737 * deemed necessary to the callback. In our case we use it to pass the
738 * array that the tokens get appended into.
739 */
740 void *data;
741 } lex_callback;
742
743 /*
744 * This is the path of the file being parsed. We use the filepath when
745 * constructing SourceFileNodes.
746 */
747 pm_string_t filepath;
748
749 /*
750 * This constant pool keeps all of the constants defined throughout the file
751 * so that we can reference them later.
752 */
753 pm_constant_pool_t constant_pool;
754
755 /* This is the list of line offsets in the source file. */
756 pm_line_offset_list_t line_offsets;
757
758 /*
759 * State communicated from the lexer to the parser for integer tokens.
760 */
761 struct {
762 /*
763 * A flag indicating the base of the integer (binary, octal, decimal,
764 * hexadecimal). Set during lexing and read during node creation.
765 */
766 pm_node_flags_t base;
767
768 /*
769 * When lexing a decimal integer that fits in a uint32_t, we compute
770 * the value during lexing to avoid re-scanning the digits during
771 * parsing. If lexed is true, this holds the result and
772 * pm_integer_parse can be skipped.
773 */
774 uint32_t value;
775
776 /* Whether value holds a valid pre-computed integer. */
777 bool lexed;
778 } integer;
779
780 /*
781 * This string is used to pass information from the lexer to the parser. It
782 * is particularly necessary because of escape sequences.
783 */
784 pm_string_t current_string;
785
786 /*
787 * The line number at the start of the parse. This will be used to offset
788 * the line numbers of all of the locations.
789 */
790 int32_t start_line;
791
792 /*
793 * When a string-like expression is being lexed, any byte or escape sequence
794 * that resolves to a value whose top bit is set (i.e., >= 0x80) will
795 * explicitly set the encoding to the same encoding as the source.
796 * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
797 * resolves to a value whose top bit is set, then the encoding will be
798 * explicitly set to UTF-8.
799 *
800 * The _next_ time this happens, if the encoding that is about to become the
801 * explicitly set encoding does not match the previously set explicit
802 * encoding, a mixed encoding error will be emitted.
803 *
804 * When the expression is finished being lexed, the explicit encoding
805 * controls the encoding of the expression. For the most part this means
806 * that the expression will either be encoded in the source encoding or
807 * UTF-8. This holds for all encodings except US-ASCII. If the source is
808 * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
809 * expression will be encoded as ASCII-8BIT.
810 *
811 * Note that if the expression is a list, different elements within the same
812 * list can have different encodings, so this will get reset between each
813 * element. Furthermore all of this only applies to lists that support
814 * interpolation, because otherwise escapes that could change the encoding
815 * are ignored.
816 *
817 * At first glance, it may make more sense for this to live on the lexer
818 * mode, but we need it here to communicate back to the parser for character
819 * literals that do not push a new lexer mode.
820 */
821 const pm_encoding_t *explicit_encoding;
822
823 /*
824 * When parsing block exits (e.g., break, next, redo), we need to validate
825 * that they are in correct contexts. For the most part we can do this by
826 * looking at our parent contexts. However, modifier while and until
827 * expressions can change that context to make block exits valid. In these
828 * cases, we need to keep track of the block exits and then validate them
829 * after the expression has been parsed.
830 *
831 * We use a pointer here because we don't want to keep a whole list attached
832 * since this will only be used in the context of begin/end expressions.
833 */
834 pm_node_list_t *current_block_exits;
835
836 /* The version of prism that we should use to parse. */
837 pm_options_version_t version;
838
839 /* The command line flags given from the options. */
840 uint8_t command_line;
841
842 /*
843 * Whether or not we have found a frozen_string_literal magic comment with
844 * a true or false value.
845 * May be:
846 * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED
847 * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED
848 * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
849 */
850 int8_t frozen_string_literal;
851
852 /*
853 * Whether or not we are parsing an eval string. This impacts whether or not
854 * we should evaluate if block exits/yields are valid.
855 */
856 bool parsing_eval;
857
858 /*
859 * Whether or not we are parsing a "partial" script, which is a script that
860 * will be evaluated in the context of another script, so we should not
861 * check jumps (next/break/etc.) for validity.
862 */
863 bool partial_script;
864
865 /* Whether or not we're at the beginning of a command. */
866 bool command_start;
867
868 /*
869 * Whether or not we're currently parsing the body of an endless method
870 * definition. In this context, PM_TOKEN_KEYWORD_DO_BLOCK should not be
871 * consumed by commands (it should bubble up to the outer context).
872 */
873 bool in_endless_def_body;
874
875 /* Whether or not we're currently recovering from a syntax error. */
876 bool recovering;
877
878 /*
879 * Whether or not the source being parsed could become valid if more input
880 * were appended. This is set to false when the parser encounters a token
881 * that is definitively wrong (e.g., a stray `end` or `]`) as opposed to
882 * merely incomplete.
883 */
884 bool continuable;
885
886 /*
887 * This is very specialized behavior for when you want to parse in a context
888 * that does not respect encoding comments. Its main use case is translating
889 * into the whitequark/parser AST which re-encodes source files in UTF-8
890 * before they are parsed and ignores encoding comments.
891 */
892 bool encoding_locked;
893
894 /*
895 * Whether or not the encoding has been changed by a magic comment. We use
896 * this to provide a fast path for the lexer instead of going through the
897 * function pointer.
898 */
899 bool encoding_changed;
900
901 /*
902 * This flag indicates that we are currently parsing a pattern matching
903 * expression and impacts that calculation of newlines.
904 */
905 bool pattern_matching_newlines;
906
907 /* This flag indicates that we are currently parsing a keyword argument. */
908 bool in_keyword_arg;
909
910 /*
911 * Whether or not the parser has seen a token that has semantic meaning
912 * (i.e., a token that is not a comment or whitespace).
913 */
914 bool semantic_token_seen;
915
916 /*
917 * By default, Ruby always warns about mismatched indentation. This can be
918 * toggled with a magic comment.
919 */
920 bool warn_mismatched_indentation;
921
922#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR)
923 /*
924 * Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding
925 * the nibble-based tables on every call when the charset hasn't changed
926 * (which is the common case during string/regex/list lexing).
927 */
928 struct {
929 /* The cached charset (null-terminated, max 11 chars + NUL). */
930 uint8_t charset[12];
931
932 /* Nibble-based low lookup table for SIMD matching. */
933 uint8_t low_lut[16];
934
935 /* Nibble-based high lookup table for SIMD matching. */
936 uint8_t high_lut[16];
937
938 /* Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */
939 uint64_t table[4];
940 } strpbrk_cache;
941#endif
942};
943
944/*
945 * Initialize a parser with the given start and end pointers.
946 */
947void pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options);
948
949/*
950 * Free the memory held by the given parser.
951 *
952 * This does not free the `pm_options_t` object that was used to initialize the
953 * parser.
954 */
955void pm_parser_cleanup(pm_parser_t *parser);
956
957#endif
/*
 * Cross-reference notes for types used in this header:
 *
 * uint32_t pm_constant_id_t
 *   A constant id is a unique identifier for a constant in the constant pool.
 * A list of byte offsets of newlines in a string.
 * The parser used to parse Ruby source.
 * void(* pm_lex_callback_t)(pm_parser_t *parser, pm_token_t *token, void *data)
 *   This is the callback that is called when a token is lexed.
 *   Definition parser.h:55
 * void(* pm_encoding_changed_callback_t)(pm_parser_t *parser)
 *   When the encoding that is being used to parse the source is changed by prism, we provide the ability ...
 *   Definition parser.h:49
 * C99 shim for <stdbool.h>
 * A list of offsets of the start of lines in a string.
 * This struct represents a slice in the source code, defined by an offset and a length.
 *   Definition ast.h:554
 * A list of nodes in the source, most often used for lists of children.
 *   Definition ast.h:567
 * A generic string type that can have various ownership semantics.
 *   Definition stringy.h:18
 * This struct represents a token in the Ruby source.
 *   Definition ast.h:526
 */