Ruby  3.4.0dev (2024-12-06 revision 892c46283a5ea4179500d951c9d4866c0051f27b)
pm_strpbrk.c
2 
6 static inline void
7 pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
8  pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
9 }
10 
14 static inline void
15 pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
16  if (parser->explicit_encoding != NULL) {
17  if (parser->explicit_encoding == parser->encoding) {
18  // Okay, we already locked to this encoding.
19  } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
20  // Not okay, we already found a Unicode escape sequence and this
21  // conflicts.
22  pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
23  } else {
24  // Should not be anything else.
25  assert(false && "unreachable");
26  }
27  }
28 
29  parser->explicit_encoding = parser->encoding;
30 }
31 
35 static inline const uint8_t *
36 pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
37  size_t index = 0;
38 
39  while (index < maximum) {
40  if (strchr((const char *) charset, source[index]) != NULL) {
41  return source + index;
42  }
43 
44  if (source[index] < 0x80) {
45  index++;
46  } else {
47  size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
48 
49  if (width > 0) {
50  index += width;
51  } else if (!validate) {
52  index++;
53  } else {
54  // At this point we know we have an invalid multibyte character.
55  // We'll walk forward as far as we can until we find the next
56  // valid character so that we don't spam the user with a ton of
57  // the same kind of error.
58  const size_t start = index;
59 
60  do {
61  index++;
62  } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
63 
64  pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
65  }
66  }
67  }
68 
69  return NULL;
70 }
71 
75 static inline const uint8_t *
76 pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
77  size_t index = 0;
78 
79  while (index < maximum) {
80  if (strchr((const char *) charset, source[index]) != NULL) {
81  return source + index;
82  }
83 
84  if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
85  index++;
86  }
87 
88  return NULL;
89 }
90 
94 static inline const uint8_t *
95 pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
96  size_t index = 0;
97  const pm_encoding_t *encoding = parser->encoding;
98 
99  while (index < maximum) {
100  if (strchr((const char *) charset, source[index]) != NULL) {
101  return source + index;
102  }
103 
104  if (source[index] < 0x80) {
105  index++;
106  } else {
107  size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
108  if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
109 
110  if (width > 0) {
111  index += width;
112  } else if (!validate) {
113  index++;
114  } else {
115  // At this point we know we have an invalid multibyte character.
116  // We'll walk forward as far as we can until we find the next
117  // valid character so that we don't spam the user with a ton of
118  // the same kind of error.
119  const size_t start = index;
120 
121  do {
122  index++;
123  } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
124 
125  pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
126  }
127  }
128  }
129 
130  return NULL;
131 }
132 
137 static inline const uint8_t *
138 pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
139  size_t index = 0;
140  const pm_encoding_t *encoding = parser->encoding;
141 
142  while (index < maximum) {
143  if (strchr((const char *) charset, source[index]) != NULL) {
144  return source + index;
145  }
146 
147  if (source[index] < 0x80 || !validate) {
148  index++;
149  } else {
150  size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
151  pm_strpbrk_explicit_encoding_set(parser, source, width);
152 
153  if (width > 0) {
154  index += width;
155  } else {
156  // At this point we know we have an invalid multibyte character.
157  // We'll walk forward as far as we can until we find the next
158  // valid character so that we don't spam the user with a ton of
159  // the same kind of error.
160  const size_t start = index;
161 
162  do {
163  index++;
164  } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
165 
166  pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
167  }
168  }
169  }
170 
171  return NULL;
172 }
173 
193 const uint8_t *
194 pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
195  if (length <= 0) {
196  return NULL;
197  } else if (!parser->encoding_changed) {
198  return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
199  } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
200  return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
201  } else if (parser->encoding->multibyte) {
202  return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
203  } else {
204  return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
205  }
206 }
bool pm_diagnostic_list_append_format(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id,...)
Append a diagnostic to the given list of diagnostics that is using a format string for its message.
Definition: diagnostic.c:787
A custom strpbrk implementation.
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate)
Here we have rolled our own version of strpbrk.
Definition: pm_strpbrk.c:194
#define PM_ENCODING_UTF_8_ENTRY
This is the default UTF-8 encoding.
Definition: encoding.h:245
#define PM_ENCODING_ASCII_8BIT_ENTRY
This is the ASCII-8BIT encoding.
Definition: encoding.h:259
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding.
Definition: encoding.c:2287
This struct defines the functions necessary to implement the encoding interface so we can determine h...
Definition: encoding.h:23
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
Definition: encoding.h:29
bool multibyte
Return true if the encoding is a multibyte encoding.
Definition: encoding.h:61
const char * name
The name of the encoding.
Definition: encoding.h:56
This struct represents the overall parser.
Definition: parser.h:640
const pm_encoding_t * explicit_encoding
When a string-like expression is being lexed, any byte or escape sequence that resolves to a value wh...
Definition: parser.h:840
const pm_encoding_t * encoding
The encoding functions for the current file are attached to the parser as it's parsing so that it can ...
Definition: parser.h:755
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
Definition: parser.h:903
pm_list_t error_list
The list of errors that have been found while parsing.
Definition: parser.h:734