dc/d33/pm__strpbrk_8c_source.html

#include "prism/util/pm_strpbrk.h"


/**
 * Append an "invalid multibyte character" diagnostic to the parser's error
 * list for the byte range [start, end).
 *
 * @param parser The parser whose error list receives the diagnostic.
 * @param start The first byte of the invalid range; its value is also handed
 *     to the diagnostic as the format argument.
 * @param end One past the last byte of the invalid range.
 */
static inline void
pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
    const uint8_t first_byte = *start;

    pm_diagnostic_list_append_format(
        &parser->error_list,
        start,
        end,
        PM_ERR_INVALID_MULTIBYTE_CHARACTER,
        first_byte
    );
}


/**
 * Lock the parser's explicit encoding to the parser's current encoding.
 *
 * If an explicit encoding was already recorded and it differs from the current
 * encoding, the only expected value is the UTF-8 entry; in that case a mixed
 * encoding diagnostic is appended for the range [source, source + width). Any
 * other previously recorded value trips an assertion.
 *
 * @param parser The parser whose explicit_encoding field is updated.
 * @param source The start of the range reported if a conflict is found.
 * @param width The length in bytes of the range reported on conflict.
 */
static inline void
pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
    const bool already_locked = (parser->explicit_encoding != NULL);

    // Nothing to check when no encoding was locked yet, or when the locked
    // encoding is already the one we are about to set.
    if (already_locked && parser->explicit_encoding != parser->encoding) {
        if (parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) {
            // Should not be anything else.
            assert(false && "unreachable");
        } else {
            // A Unicode escape sequence was already seen and it conflicts
            // with the current encoding.
            pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
        }
    }

    parser->explicit_encoding = parser->encoding;
}


/**
 * Search at most `maximum` bytes of `source` for the first byte contained in
 * `charset`, stepping over UTF-8 characters as whole units.
 *
 * Membership is tested with strchr, so the NUL terminator of `charset` counts
 * as a member and a NUL byte in `source` is always reported as a match.
 *
 * @param parser The parser that receives diagnostics when validating.
 * @param source The bytes to search.
 * @param charset A NUL-terminated list of bytes to look for.
 * @param maximum The number of bytes of `source` that may be examined.
 * @param validate When true, runs of bytes that do not form a valid UTF-8
 *     character are reported as invalid multibyte characters; when false they
 *     are skipped one byte at a time.
 * @return A pointer to the first matching byte, or NULL if none was found.
 */
static inline const uint8_t *
pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t offset = 0;

    while (offset < maximum) {
        const uint8_t byte = source[offset];

        if (strchr((const char *) charset, byte) != NULL) return source + offset;

        // Plain ASCII bytes are always a single character.
        if (byte < 0x80) {
            offset++;
            continue;
        }

        const size_t width = pm_encoding_utf_8_char_width(source + offset, (ptrdiff_t) (maximum - offset));

        if (width > 0) {
            offset += width;
            continue;
        }

        if (!validate) {
            offset++;
            continue;
        }

        // We have an invalid multibyte character. Keep walking until the next
        // position that starts a valid character so that a run of bad bytes
        // produces a single error instead of one per byte.
        const size_t invalid_start = offset;

        for (offset++; offset < maximum; offset++) {
            if (pm_encoding_utf_8_char_width(source + offset, (ptrdiff_t) (maximum - offset)) != 0) break;
        }

        pm_strpbrk_invalid_multibyte_character(parser, source + invalid_start, source + offset);
    }

    return NULL;
}


/**
 * Search at most `maximum` bytes of `source` for the first byte contained in
 * `charset`, treating every byte as its own character.
 *
 * Membership is tested with strchr, so the NUL terminator of `charset` counts
 * as a member and a NUL byte in `source` is always reported as a match.
 *
 * @param parser The parser whose explicit encoding is updated when validating.
 * @param source The bytes to search.
 * @param charset A NUL-terminated list of bytes to look for.
 * @param maximum The number of bytes of `source` that may be examined.
 * @param validate When true, every skipped byte >= 0x80 is passed to
 *     pm_strpbrk_explicit_encoding_set.
 * @return A pointer to the first matching byte, or NULL if none was found.
 */
static inline const uint8_t *
pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t index = 0;

    while (index < maximum) {
        if (strchr((const char *) charset, source[index]) != NULL) {
            return source + index;
        }

        if (validate && source[index] >= 0x80) {
            // Pass the address of the offending byte (not the start of the
            // searched region) so that any mixed-encoding diagnostic is
            // located on the byte that triggered it.
            pm_strpbrk_explicit_encoding_set(parser, source + index, 1);
        }

        index++;
    }

    return NULL;
}


/**
 * Search at most `maximum` bytes of `source` for the first byte contained in
 * `charset`, stepping over non-ASCII characters using the width reported by
 * the parser's current encoding.
 *
 * Membership is tested with strchr, so the NUL terminator of `charset` counts
 * as a member and a NUL byte in `source` is always reported as a match.
 *
 * @param parser The parser supplying the encoding and receiving diagnostics.
 * @param source The bytes to search.
 * @param charset A NUL-terminated list of bytes to look for.
 * @param maximum The number of bytes of `source` that may be examined.
 * @param validate When true, each non-ASCII character is passed to
 *     pm_strpbrk_explicit_encoding_set and runs of bytes that the encoding
 *     rejects are reported as invalid multibyte characters; when false,
 *     rejected bytes are skipped one at a time.
 * @return A pointer to the first matching byte, or NULL if none was found.
 */
static inline const uint8_t *
pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t index = 0;
    const pm_encoding_t *encoding = parser->encoding;

    while (index < maximum) {
        if (strchr((const char *) charset, source[index]) != NULL) {
            return source + index;
        }

        if (source[index] < 0x80) {
            index++;
        } else {
            size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));

            if (validate) {
                // Pass the address of the character being examined (not the
                // start of the searched region) so that any mixed-encoding
                // diagnostic is located on the character that triggered it.
                pm_strpbrk_explicit_encoding_set(parser, source + index, width);
            }

            if (width > 0) {
                index += width;
            } else if (!validate) {
                index++;
            } else {
                // At this point we know we have an invalid multibyte character.
                // We'll walk forward as far as we can until we find the next
                // valid character so that we don't spam the user with a ton of
                // the same kind of error.
                const size_t start = index;

                do {
                    index++;
                } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);

                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
            }
        }
    }

    return NULL;
}


/**
 * Search at most `maximum` bytes of `source` for the first byte contained in
 * `charset`, for encodings dispatched here by pm_strpbrk as non-multibyte.
 *
 * Membership is tested with strchr, so the NUL terminator of `charset` counts
 * as a member and a NUL byte in `source` is always reported as a match.
 *
 * @param parser The parser supplying the encoding and receiving diagnostics.
 * @param source The bytes to search.
 * @param charset A NUL-terminated list of bytes to look for.
 * @param maximum The number of bytes of `source` that may be examined.
 * @param validate When false, every non-matching byte is skipped without
 *     consulting the encoding. When true, each byte >= 0x80 is passed to
 *     pm_strpbrk_explicit_encoding_set and runs of bytes that the encoding
 *     rejects are reported as invalid multibyte characters.
 * @return A pointer to the first matching byte, or NULL if none was found.
 */
static inline const uint8_t *
pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t index = 0;
    const pm_encoding_t *encoding = parser->encoding;

    while (index < maximum) {
        if (strchr((const char *) charset, source[index]) != NULL) {
            return source + index;
        }

        if (source[index] < 0x80 || !validate) {
            index++;
        } else {
            size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));

            // Pass the address of the character being examined (not the start
            // of the searched region) so that any mixed-encoding diagnostic is
            // located on the character that triggered it.
            pm_strpbrk_explicit_encoding_set(parser, source + index, width);

            if (width > 0) {
                index += width;
            } else {
                // At this point we know we have an invalid multibyte character.
                // We'll walk forward as far as we can until we find the next
                // valid character so that we don't spam the user with a ton of
                // the same kind of error.
                const size_t start = index;

                do {
                    index++;
                } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);

                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
            }
        }
    }

    return NULL;
}


/**
 * Find the first byte of `source`, within `length` bytes, that appears in
 * `charset`, choosing a scanning strategy based on the parser's encoding.
 *
 * The strategy is selected in this order: the UTF-8 scanner when the parser's
 * encoding_changed flag is not set, the ASCII-8BIT scanner when the encoding
 * is the ASCII-8BIT entry, the multibyte scanner when the encoding is flagged
 * as multibyte, and the single-byte scanner otherwise.
 *
 * @param parser The parser supplying the encoding and receiving diagnostics.
 * @param source The bytes to search.
 * @param charset A NUL-terminated list of bytes to look for.
 * @param length The number of bytes to search; values <= 0 yield NULL.
 * @param validate Forwarded to the selected scanner to enable its encoding
 *     validation and diagnostics.
 * @return A pointer to the first matching byte, or NULL if none was found.
 */
const uint8_t *
pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
    if (length <= 0) return NULL;

    const size_t maximum = (size_t) length;

    if (!parser->encoding_changed) {
        return pm_strpbrk_utf8(parser, source, charset, maximum, validate);
    }

    if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
        return pm_strpbrk_ascii_8bit(parser, source, charset, maximum, validate);
    }

    if (parser->encoding->multibyte) {
        return pm_strpbrk_multi_byte(parser, source, charset, maximum, validate);
    }

    return pm_strpbrk_single_byte(parser, source, charset, maximum, validate);
}


pm_strpbrk.h
A custom strpbrk implementation.

PM_ENCODING_UTF_8_ENTRY
#define PM_ENCODING_UTF_8_ENTRY
This is the default UTF-8 encoding.
Definition encoding.h:245

PM_ENCODING_ASCII_8BIT_ENTRY
#define PM_ENCODING_ASCII_8BIT_ENTRY
This is the ASCII-8BIT encoding.
Definition encoding.h:259

pm_encoding_t
This struct defines the functions necessary to implement the encoding interface so we can determine h...
Definition encoding.h:23

pm_encoding_t::char_width
size_t(* char_width)(const uint8_t *b, ptrdiff_t n)
Return the number of bytes that the next character takes if it is valid in the encoding.
Definition encoding.h:29

pm_encoding_t::multibyte
bool multibyte
Return true if the encoding is a multibyte encoding.
Definition encoding.h:61

pm_encoding_t::name
const char * name
The name of the encoding.
Definition encoding.h:56

pm_parser
This struct represents the overall parser.
Definition parser.h:640

pm_parser::explicit_encoding
const pm_encoding_t * explicit_encoding
When a string-like expression is being lexed, any byte or escape sequence that resolves to a value wh...
Definition parser.h:840

pm_parser::encoding
const pm_encoding_t * encoding
The encoding functions for the current file are attached to the parser as it's parsing so that it can ...
Definition parser.h:755

pm_parser::encoding_changed
bool encoding_changed
Whether or not the encoding has been changed by a magic comment.
Definition parser.h:903

pm_parser::error_list
pm_list_t error_list
The list of errors that have been found while parsing.
Definition parser.h:734