(9aca729140424bbf465c11ab8ab53e5cc6602c01)

The encoding interface and implementations used by the parser. More...

#include "prism/defines.h"
#include "prism/util/pm_strncasecmp.h"
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

Include dependency graph for encoding.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures
struct	pm_encoding_t
	This struct defines the functions necessary to implement the encoding interface so we can determine how many bytes the subsequent character takes. More...

Macros
#define	PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
	All of the lookup tables use the first bit of each embedded byte to indicate whether the codepoint is alphabetical.

#define	PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
	All of the lookup tables use the second bit of each embedded byte to indicate whether the codepoint is alphanumeric.

#define	PRISM_ENCODING_UPPERCASE_BIT 1 << 2
	All of the lookup tables use the third bit of each embedded byte to indicate whether the codepoint is uppercase.

#define	PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
	This is the default UTF-8 encoding.

#define	PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
	This is the US-ASCII encoding.

#define	PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
	This is the ASCII-8BIT encoding.

#define	PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
	This is the EUC-JP encoding.

#define	PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
	This is the Windows-31J encoding.

Enumerations
enum	pm_encoding_type_t { PM_ENCODING_UTF_8 = 0 , PM_ENCODING_US_ASCII , PM_ENCODING_ASCII_8BIT , PM_ENCODING_EUC_JP , PM_ENCODING_WINDOWS_31J , PM_ENCODING_BIG5 , PM_ENCODING_BIG5_HKSCS , PM_ENCODING_BIG5_UAO , PM_ENCODING_CESU_8 , PM_ENCODING_CP51932 , PM_ENCODING_CP850 , PM_ENCODING_CP852 , PM_ENCODING_CP855 , PM_ENCODING_CP949 , PM_ENCODING_CP950 , PM_ENCODING_CP951 , PM_ENCODING_EMACS_MULE , PM_ENCODING_EUC_JP_MS , PM_ENCODING_EUC_JIS_2004 , PM_ENCODING_EUC_KR , PM_ENCODING_EUC_TW , PM_ENCODING_GB12345 , PM_ENCODING_GB18030 , PM_ENCODING_GB1988 , PM_ENCODING_GB2312 , PM_ENCODING_GBK , PM_ENCODING_IBM437 , PM_ENCODING_IBM720 , PM_ENCODING_IBM737 , PM_ENCODING_IBM775 , PM_ENCODING_IBM852 , PM_ENCODING_IBM855 , PM_ENCODING_IBM857 , PM_ENCODING_IBM860 , PM_ENCODING_IBM861 , PM_ENCODING_IBM862 , PM_ENCODING_IBM863 , PM_ENCODING_IBM864 , PM_ENCODING_IBM865 , PM_ENCODING_IBM866 , PM_ENCODING_IBM869 , PM_ENCODING_ISO_8859_1 , PM_ENCODING_ISO_8859_2 , PM_ENCODING_ISO_8859_3 , PM_ENCODING_ISO_8859_4 , PM_ENCODING_ISO_8859_5 , PM_ENCODING_ISO_8859_6 , PM_ENCODING_ISO_8859_7 , PM_ENCODING_ISO_8859_8 , PM_ENCODING_ISO_8859_9 , PM_ENCODING_ISO_8859_10 , PM_ENCODING_ISO_8859_11 , PM_ENCODING_ISO_8859_13 , PM_ENCODING_ISO_8859_14 , PM_ENCODING_ISO_8859_15 , PM_ENCODING_ISO_8859_16 , PM_ENCODING_KOI8_R , PM_ENCODING_KOI8_U , PM_ENCODING_MAC_CENT_EURO , PM_ENCODING_MAC_CROATIAN , PM_ENCODING_MAC_CYRILLIC , PM_ENCODING_MAC_GREEK , PM_ENCODING_MAC_ICELAND , PM_ENCODING_MAC_JAPANESE , PM_ENCODING_MAC_ROMAN , PM_ENCODING_MAC_ROMANIA , PM_ENCODING_MAC_THAI , PM_ENCODING_MAC_TURKISH , PM_ENCODING_MAC_UKRAINE , PM_ENCODING_SHIFT_JIS , PM_ENCODING_SJIS_DOCOMO , PM_ENCODING_SJIS_KDDI , PM_ENCODING_SJIS_SOFTBANK , PM_ENCODING_STATELESS_ISO_2022_JP , PM_ENCODING_STATELESS_ISO_2022_JP_KDDI , PM_ENCODING_TIS_620 , PM_ENCODING_UTF8_MAC , PM_ENCODING_UTF8_DOCOMO , PM_ENCODING_UTF8_KDDI , PM_ENCODING_UTF8_SOFTBANK , PM_ENCODING_WINDOWS_1250 , PM_ENCODING_WINDOWS_1251 , PM_ENCODING_WINDOWS_1252 , PM_ENCODING_WINDOWS_1253 , PM_ENCODING_WINDOWS_1254 , PM_ENCODING_WINDOWS_1255 , PM_ENCODING_WINDOWS_1256 , PM_ENCODING_WINDOWS_1257 , PM_ENCODING_WINDOWS_1258 , PM_ENCODING_WINDOWS_874 , PM_ENCODING_MAXIMUM }
	These are all of the encodings that prism supports. More...

Functions
size_t	pm_encoding_utf_8_char_width (const uint8_t *b, ptrdiff_t n)
	Return the size of the next character in the UTF-8 encoding.

size_t	pm_encoding_utf_8_alpha_char (const uint8_t *b, ptrdiff_t n)
	Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.

size_t	pm_encoding_utf_8_alnum_char (const uint8_t *b, ptrdiff_t n)
	Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.

bool	pm_encoding_utf_8_isupper_char (const uint8_t *b, ptrdiff_t n)
	Return true if the next character in the UTF-8 encoding if it is an uppercase character.

const pm_encoding_t *	pm_encoding_find (const uint8_t start, const uint8_t end)
	Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.

Variables
const uint8_t	pm_encoding_unicode_table [256]
	This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

const pm_encoding_t	pm_encodings [PM_ENCODING_MAXIMUM]
	This is the table of all of the encodings that prism supports.

Detailed Description

The encoding interface and implementations used by the parser.

Definition in file encoding.h.

Macro Definition Documentation

◆ PM_ENCODING_ASCII_8BIT_ENTRY

#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])

This is the ASCII-8BIT encoding.

We need a reference to it so that pm_strpbrk can compare against it because invalid multibyte characters are not a thing in this encoding. It is also needed for handling Regexp encoding flags.

Definition at line 259 of file encoding.h.

◆ PM_ENCODING_EUC_JP_ENTRY

#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])

This is the EUC-JP encoding.

We need a reference to it to quickly process regular expression modifiers.

Definition at line 265 of file encoding.h.

◆ PM_ENCODING_US_ASCII_ENTRY

#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])

This is the US-ASCII encoding.

We need a reference to it to be able to compare against it when a string is being created because it could possibly need to fall back to ASCII-8BIT.

Definition at line 252 of file encoding.h.

◆ PM_ENCODING_UTF_8_ENTRY

#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])

This is the default UTF-8 encoding.

We need a reference to it to quickly create parsers.

Definition at line 245 of file encoding.h.

◆ PM_ENCODING_WINDOWS_31J_ENTRY

#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])

This is the Windows-31J encoding.

We need a reference to it to quickly process regular expression modifiers.

Definition at line 271 of file encoding.h.

◆ PRISM_ENCODING_ALPHABETIC_BIT

#define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0

All of the lookup tables use the first bit of each embedded byte to indicate whether the codepoint is alphabetical.

Definition at line 68 of file encoding.h.

◆ PRISM_ENCODING_ALPHANUMERIC_BIT

#define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1

All of the lookup tables use the second bit of each embedded byte to indicate whether the codepoint is alphanumeric.

Definition at line 74 of file encoding.h.

◆ PRISM_ENCODING_UPPERCASE_BIT

#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2

All of the lookup tables use the third bit of each embedded byte to indicate whether the codepoint is uppercase.

Definition at line 80 of file encoding.h.

Enumeration Type Documentation

◆ pm_encoding_type_t

enum pm_encoding_type_t

These are all of the encodings that prism supports.

Definition at line 136 of file encoding.h.

Function Documentation

◆ pm_encoding_find()

const pm_encoding_t * pm_encoding_find	(	const uint8_t *	start,
		const uint8_t *	end
	)

Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one can be found, otherwise return NULL.

Parameters

start	A pointer to the first byte of the name.
end	A pointer to the last byte of the name.

Returns: A pointer to the encoding struct if one is found, otherwise NULL.

Definition at line 5131 of file encoding.c.

◆ pm_encoding_utf_8_alnum_char()

size_t pm_encoding_utf_8_alnum_char	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

Definition at line 2403 of file encoding.c.

◆ pm_encoding_utf_8_alpha_char()

size_t pm_encoding_utf_8_alpha_char	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

Definition at line 2379 of file encoding.c.

◆ pm_encoding_utf_8_char_width()

size_t pm_encoding_utf_8_char_width	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return the size of the next character in the UTF-8 encoding.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: The number of bytes that the next character takes if it is valid in the encoding, or 0 if it is not.

Definition at line 2360 of file encoding.c.

◆ pm_encoding_utf_8_isupper_char()

bool pm_encoding_utf_8_isupper_char	(	const uint8_t *	b,
		ptrdiff_t	n
	)

Return true if the next character in the UTF-8 encoding if it is an uppercase character.

Parameters

b	The bytes to read.
n	The number of bytes that can be read.

Returns: True if the next character is valid in the encoding and is an uppercase character, or false if it is not.

Definition at line 2427 of file encoding.c.

Variable Documentation

◆ pm_encoding_unicode_table

const uint8_t pm_encoding_unicode_table[256]

extern

This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

It is used to indicate whether a character is alphabetical, alphanumeric, or uppercase in unicode mappings.

This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to speed up the default encoding processing.

Note that this table is different from other encodings where we used a lookup table because the indices of those tables are the byte representations, not the codepoints themselves.

Definition at line 2237 of file encoding.c.

◆ pm_encodings

const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]

extern

This is the table of all of the encodings that prism supports.

Definition at line 4400 of file encoding.c.

(9aca729140424bbf465c11ab8ab53e5cc6602c01)

Data Structures

Macros

Enumerations

Functions

Variables

Detailed Description

Macro Definition Documentation

◆ PM_ENCODING_ASCII_8BIT_ENTRY

◆ PM_ENCODING_EUC_JP_ENTRY

◆ PM_ENCODING_US_ASCII_ENTRY

◆ PM_ENCODING_UTF_8_ENTRY

◆ PM_ENCODING_WINDOWS_31J_ENTRY

◆ PRISM_ENCODING_ALPHABETIC_BIT

◆ PRISM_ENCODING_ALPHANUMERIC_BIT

◆ PRISM_ENCODING_UPPERCASE_BIT

Enumeration Type Documentation

◆ pm_encoding_type_t

Function Documentation

◆ pm_encoding_find()

◆ pm_encoding_utf_8_alnum_char()

◆ pm_encoding_utf_8_alpha_char()

◆ pm_encoding_utf_8_char_width()

◆ pm_encoding_utf_8_isupper_char()

Variable Documentation

◆ pm_encoding_unicode_table

◆ pm_encodings