Ruby 4.1.0dev (2026-03-28 revision 6035121c933c965cebf93f624066c95dabbc6d1d)
encoding.h
1#ifndef PRISM_INTERNAL_ENCODING_H
2#define PRISM_INTERNAL_ENCODING_H
3
4#include <stdbool.h>
5#include <stddef.h>
6#include <stdint.h>
7
/*
 * This struct bundles the callbacks and metadata that implement the encoding
 * interface, so that callers can determine how many bytes the subsequent
 * character takes. Each width-returning callback should return the number of
 * bytes, or 0 if the next bytes are invalid for the encoding and type.
 */
typedef struct {
    /*
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding. Does not read more than n bytes. It is assumed that n is
     * at least 1.
     */
    size_t (*char_width)(const uint8_t *b, ptrdiff_t n);

    /*
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding and is alphabetical. Does not read more than n bytes. It
     * is assumed that n is at least 1.
     */
    size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);

    /*
     * Return the number of bytes that the next character takes if it is valid
     * in the encoding and is alphanumeric. Does not read more than n bytes. It
     * is assumed that n is at least 1.
     */
    size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);

    /*
     * Return true if the next character is valid in the encoding and is an
     * uppercase character. Does not read more than n bytes. It is assumed that
     * n is at least 1.
     */
    bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);

    /*
     * The name of the encoding. This should correspond to a value that can be
     * passed to Encoding.find in Ruby.
     */
    const char *name;

    /* True if the encoding is a multibyte encoding. */
    bool multibyte;
} pm_encoding_t;
52
/*
 * All of the lookup tables use the first bit of each embedded byte to indicate
 * whether the codepoint is alphabetical. The expansion is parenthesized so that
 * the shift is always treated as a single operand, whatever operators surround
 * the macro at the point of use.
 */
#define PRISM_ENCODING_ALPHABETIC_BIT (1 << 0)
58
/*
 * All of the lookup tables use the second bit of each embedded byte to indicate
 * whether the codepoint is alphanumeric. The expansion is parenthesized so that
 * the shift is always treated as a single operand, whatever operators surround
 * the macro at the point of use.
 */
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)
64
/*
 * All of the lookup tables use the third bit of each embedded byte to indicate
 * whether the codepoint is uppercase. The expansion is parenthesized so that
 * the shift is always treated as a single operand, whatever operators surround
 * the macro at the point of use.
 */
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)
70
/*
 * UTF-8 character width: return the size of the next character found at b in
 * the UTF-8 encoding. The signature matches the char_width callback.
 */
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);
73
/*
 * UTF-8 alphabetical check: return the size of the next character found at b
 * in the UTF-8 encoding, provided that character is alphabetical. The
 * signature matches the alpha_char callback.
 */
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
79
/*
 * UTF-8 alphanumeric check: return the size of the next character found at b
 * in the UTF-8 encoding, provided that character is alphanumeric. The
 * signature matches the alnum_char callback.
 */
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
85
/*
 * UTF-8 uppercase check: return true if the next character found at b in the
 * UTF-8 encoding is an uppercase character. The signature matches the
 * isupper_char callback.
 */
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
91
/*
 * Lookup table shared by the UTF-8 encoding file and the parser itself; the
 * parser references it directly in order to speed up processing of the default
 * encoding. Its entries indicate whether a character is alphabetical,
 * alphanumeric, or uppercase in unicode mappings.
 */
extern const uint8_t pm_encoding_unicode_table[256];
99
/*
 * Identifiers for every encoding that prism supports. PM_ENCODING_MAXIMUM is
 * the final enumerator, so its value equals the number of encodings compiled
 * in.
 */
typedef enum {
    /* These encodings are present in every build. */
    PM_ENCODING_UTF_8 = 0,
    PM_ENCODING_US_ASCII,
    PM_ENCODING_ASCII_8BIT,
    PM_ENCODING_EUC_JP,
    PM_ENCODING_WINDOWS_31J,

/*
 * Defining PRISM_ENCODING_EXCLUDE_FULL drops the full set of encodings below,
 * leaving only the minimum necessary to process Ruby code without encoding
 * comments.
 */
#ifndef PRISM_ENCODING_EXCLUDE_FULL
    PM_ENCODING_BIG5,
    PM_ENCODING_BIG5_HKSCS,
    PM_ENCODING_BIG5_UAO,
    PM_ENCODING_CESU_8,
    PM_ENCODING_CP51932,
    PM_ENCODING_CP850,
    PM_ENCODING_CP852,
    PM_ENCODING_CP855,
    PM_ENCODING_CP949,
    PM_ENCODING_CP950,
    PM_ENCODING_CP951,
    PM_ENCODING_EMACS_MULE,
    PM_ENCODING_EUC_JP_MS,
    PM_ENCODING_EUC_JIS_2004,
    PM_ENCODING_EUC_KR,
    PM_ENCODING_EUC_TW,
    PM_ENCODING_GB12345,
    PM_ENCODING_GB18030,
    PM_ENCODING_GB1988,
    PM_ENCODING_GB2312,
    PM_ENCODING_GBK,
    PM_ENCODING_IBM437,
    PM_ENCODING_IBM720,
    PM_ENCODING_IBM737,
    PM_ENCODING_IBM775,
    PM_ENCODING_IBM852,
    PM_ENCODING_IBM855,
    PM_ENCODING_IBM857,
    PM_ENCODING_IBM860,
    PM_ENCODING_IBM861,
    PM_ENCODING_IBM862,
    PM_ENCODING_IBM863,
    PM_ENCODING_IBM864,
    PM_ENCODING_IBM865,
    PM_ENCODING_IBM866,
    PM_ENCODING_IBM869,
    PM_ENCODING_ISO_8859_1,
    PM_ENCODING_ISO_8859_2,
    PM_ENCODING_ISO_8859_3,
    PM_ENCODING_ISO_8859_4,
    PM_ENCODING_ISO_8859_5,
    PM_ENCODING_ISO_8859_6,
    PM_ENCODING_ISO_8859_7,
    PM_ENCODING_ISO_8859_8,
    PM_ENCODING_ISO_8859_9,
    PM_ENCODING_ISO_8859_10,
    PM_ENCODING_ISO_8859_11,
    PM_ENCODING_ISO_8859_13,
    PM_ENCODING_ISO_8859_14,
    PM_ENCODING_ISO_8859_15,
    PM_ENCODING_ISO_8859_16,
    PM_ENCODING_KOI8_R,
    PM_ENCODING_KOI8_U,
    PM_ENCODING_MAC_CENT_EURO,
    PM_ENCODING_MAC_CROATIAN,
    PM_ENCODING_MAC_CYRILLIC,
    PM_ENCODING_MAC_GREEK,
    PM_ENCODING_MAC_ICELAND,
    PM_ENCODING_MAC_JAPANESE,
    PM_ENCODING_MAC_ROMAN,
    PM_ENCODING_MAC_ROMANIA,
    PM_ENCODING_MAC_THAI,
    PM_ENCODING_MAC_TURKISH,
    PM_ENCODING_MAC_UKRAINE,
    PM_ENCODING_SHIFT_JIS,
    PM_ENCODING_SJIS_DOCOMO,
    PM_ENCODING_SJIS_KDDI,
    PM_ENCODING_SJIS_SOFTBANK,
    PM_ENCODING_STATELESS_ISO_2022_JP,
    PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
    PM_ENCODING_TIS_620,
    PM_ENCODING_UTF8_MAC,
    PM_ENCODING_UTF8_DOCOMO,
    PM_ENCODING_UTF8_KDDI,
    PM_ENCODING_UTF8_SOFTBANK,
    PM_ENCODING_WINDOWS_1250,
    PM_ENCODING_WINDOWS_1251,
    PM_ENCODING_WINDOWS_1252,
    PM_ENCODING_WINDOWS_1253,
    PM_ENCODING_WINDOWS_1254,
    PM_ENCODING_WINDOWS_1255,
    PM_ENCODING_WINDOWS_1256,
    PM_ENCODING_WINDOWS_1257,
    PM_ENCODING_WINDOWS_1258,
    PM_ENCODING_WINDOWS_874,
#endif

    /* Must remain last: one past the final supported encoding. */
    PM_ENCODING_MAXIMUM
} pm_encoding_type_t;
200
201/* This is the table of all of the encodings that prism supports. */
202extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
203
/*
 * Pointer to the pm_encodings entry for UTF-8, the default encoding. Kept as a
 * ready-made reference so that parsers can be created quickly.
 */
#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
209
/*
 * Pointer to the pm_encodings entry for US-ASCII. It exists so that an encoding
 * can be compared against it when a string is being created, because such a
 * string could possibly need to fall back to ASCII-8BIT.
 */
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
216
/*
 * Pointer to the pm_encodings entry for ASCII-8BIT. pm_strpbrk compares against
 * it because invalid multibyte characters are not a thing in this encoding. It
 * is also needed for handling Regexp encoding flags.
 */
#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
223
/*
 * Pointer to the pm_encodings entry for EUC-JP. Kept as a ready-made reference
 * so that regular expression modifiers can be processed quickly.
 */
#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
229
/*
 * Pointer to the pm_encodings entry for Windows-31J. Kept as a ready-made
 * reference so that regular expression modifiers can be processed quickly.
 */
#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
235
236/*
237 * Parse the given name of an encoding and return a pointer to the corresponding
238 * encoding struct if one can be found, otherwise return NULL.
239 */
240const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);
241
242#endif
C99 shim for <stdbool.h>