Ruby 3.5.0dev (2025-02-22 revision 412997300569c1853c09813e4924b6df3d7e8669)
pm_strpbrk.c
2
6static inline void
7pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
8 pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
9}
10
14static inline void
15pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
16 if (parser->explicit_encoding != NULL) {
17 if (parser->explicit_encoding == parser->encoding) {
18 // Okay, we already locked to this encoding.
19 } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
20 // Not okay, we already found a Unicode escape sequence and this
21 // conflicts.
22 pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
23 } else {
24 // Should not be anything else.
25 assert(false && "unreachable");
26 }
27 }
28
29 parser->explicit_encoding = parser->encoding;
30}
31
35static inline const uint8_t *
36pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
37 size_t index = 0;
38
39 while (index < maximum) {
40 if (strchr((const char *) charset, source[index]) != NULL) {
41 return source + index;
42 }
43
44 if (source[index] < 0x80) {
45 index++;
46 } else {
47 size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
48
49 if (width > 0) {
50 index += width;
51 } else if (!validate) {
52 index++;
53 } else {
54 // At this point we know we have an invalid multibyte character.
55 // We'll walk forward as far as we can until we find the next
56 // valid character so that we don't spam the user with a ton of
57 // the same kind of error.
58 const size_t start = index;
59
60 do {
61 index++;
62 } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
63
64 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
65 }
66 }
67 }
68
69 return NULL;
70}
71
75static inline const uint8_t *
76pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
77 size_t index = 0;
78
79 while (index < maximum) {
80 if (strchr((const char *) charset, source[index]) != NULL) {
81 return source + index;
82 }
83
84 if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
85 index++;
86 }
87
88 return NULL;
89}
90
94static inline const uint8_t *
95pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
96 size_t index = 0;
97 const pm_encoding_t *encoding = parser->encoding;
98
99 while (index < maximum) {
100 if (strchr((const char *) charset, source[index]) != NULL) {
101 return source + index;
102 }
103
104 if (source[index] < 0x80) {
105 index++;
106 } else {
107 size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
108 if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
109
110 if (width > 0) {
111 index += width;
112 } else if (!validate) {
113 index++;
114 } else {
115 // At this point we know we have an invalid multibyte character.
116 // We'll walk forward as far as we can until we find the next
117 // valid character so that we don't spam the user with a ton of
118 // the same kind of error.
119 const size_t start = index;
120
121 do {
122 index++;
123 } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
124
125 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
126 }
127 }
128 }
129
130 return NULL;
131}
132
137static inline const uint8_t *
138pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
139 size_t index = 0;
140 const pm_encoding_t *encoding = parser->encoding;
141
142 while (index < maximum) {
143 if (strchr((const char *) charset, source[index]) != NULL) {
144 return source + index;
145 }
146
147 if (source[index] < 0x80 || !validate) {
148 index++;
149 } else {
150 size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
151 pm_strpbrk_explicit_encoding_set(parser, source, width);
152
153 if (width > 0) {
154 index += width;
155 } else {
156 // At this point we know we have an invalid multibyte character.
157 // We'll walk forward as far as we can until we find the next
158 // valid character so that we don't spam the user with a ton of
159 // the same kind of error.
160 const size_t start = index;
161
162 do {
163 index++;
164 } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
165
166 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
167 }
168 }
169 }
170
171 return NULL;
172}
173
193const uint8_t *
194pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
195 if (length <= 0) {
196 return NULL;
197 } else if (!parser->encoding_changed) {
198 return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
199 } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
200 return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
201 } else if (parser->encoding->multibyte) {
202 return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
203 } else {
204 return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
205 }
206}
A custom strpbrk implementation.
#define PM_ENCODING_UTF_8_ENTRY
This is the default UTF-8 encoding.
Definition encoding.h:245
#define PM_ENCODING_ASCII_8BIT_ENTRY
This is the ASCII-8BIT encoding.
Definition encoding.h:259
This struct defines the functions necessary to implement the encoding interface so we can determine h...
Definition encoding.h:23
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
Definition encoding.h:29
bool multibyte
Return true if the encoding is a multibyte encoding.
Definition encoding.h:61
const char * name
The name of the encoding.
Definition encoding.h:56
This struct represents the overall parser.
Definition parser.h:640
const pm_encoding_t * explicit_encoding
When a string-like expression is being lexed, any byte or escape sequence that resolves to a value wh...
Definition parser.h:840
const pm_encoding_t * encoding
The encoding functions for the current file are attached to the parser as it's parsing so that it can ...
Definition parser.h:755
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
Definition parser.h:903
pm_list_t error_list
The list of errors that have been found while parsing.
Definition parser.h:734