7 pm_strpbrk_invalid_multibyte_character(
pm_parser_t *parser,
const uint8_t *start,
const uint8_t *end) {
15 pm_strpbrk_explicit_encoding_set(
pm_parser_t *parser,
const uint8_t *source,
size_t width) {
25 assert(
false &&
"unreachable");
35 static inline const uint8_t *
36 pm_strpbrk_utf8(
pm_parser_t *parser,
const uint8_t *source,
const uint8_t *charset,
size_t maximum,
bool validate) {
39 while (index < maximum) {
40 if (strchr((
const char *) charset, source[index]) != NULL) {
41 return source + index;
44 if (source[index] < 0x80) {
51 }
else if (!validate) {
58 const size_t start = index;
64 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
75 static inline const uint8_t *
76 pm_strpbrk_ascii_8bit(
pm_parser_t *parser,
const uint8_t *source,
const uint8_t *charset,
size_t maximum,
bool validate) {
79 while (index < maximum) {
80 if (strchr((
const char *) charset, source[index]) != NULL) {
81 return source + index;
84 if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
94 static inline const uint8_t *
95 pm_strpbrk_multi_byte(
pm_parser_t *parser,
const uint8_t *source,
const uint8_t *charset,
size_t maximum,
bool validate) {
99 while (index < maximum) {
100 if (strchr((
const char *) charset, source[index]) != NULL) {
101 return source + index;
104 if (source[index] < 0x80) {
107 size_t width = encoding->
char_width(source + index, (ptrdiff_t) (maximum - index));
108 if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
112 }
else if (!validate) {
119 const size_t start = index;
123 }
while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
125 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
137 static inline const uint8_t *
138 pm_strpbrk_single_byte(
pm_parser_t *parser,
const uint8_t *source,
const uint8_t *charset,
size_t maximum,
bool validate) {
142 while (index < maximum) {
143 if (strchr((
const char *) charset, source[index]) != NULL) {
144 return source + index;
147 if (source[index] < 0x80 || !validate) {
150 size_t width = encoding->
char_width(source + index, (ptrdiff_t) (maximum - index));
151 pm_strpbrk_explicit_encoding_set(parser, source, width);
160 const size_t start = index;
164 }
while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
166 pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
198 return pm_strpbrk_utf8(parser, source, charset, (
size_t) length, validate);
200 return pm_strpbrk_ascii_8bit(parser, source, charset, (
size_t) length, validate);
202 return pm_strpbrk_multi_byte(parser, source, charset, (
size_t) length, validate);
204 return pm_strpbrk_single_byte(parser, source, charset, (
size_t) length, validate);
bool pm_diagnostic_list_append_format(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id,...)
Append a diagnostic to the given list of diagnostics that is using a format string for its message.
A custom strpbrk implementation.
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate)
Here we have rolled our own version of strpbrk.
#define PM_ENCODING_UTF_8_ENTRY
This is the default UTF-8 encoding.
#define PM_ENCODING_ASCII_8BIT_ENTRY
This is the ASCII-8BIT encoding.
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding.
This struct defines the functions necessary to implement the encoding interface so we can determine h...
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
bool multibyte
True if the encoding is a multibyte encoding.
const char * name
The name of the encoding.
This struct represents the overall parser.
const pm_encoding_t * explicit_encoding
When a string-like expression is being lexed, any byte or escape sequence that resolves to a value wh...
const pm_encoding_t * encoding
The encoding functions for the current file are attached to the parser as it's parsing so that they can ...
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
pm_list_t error_list
The list of errors that have been found while parsing.