Ruby  3.4.0dev (2024-11-22 revision 0989400a925cd201defdca9eb28eb87200b30785)
encoding.h
Go to the documentation of this file.
1 
6 #ifndef PRISM_ENCODING_H
7 #define PRISM_ENCODING_H
8 
9 #include "prism/defines.h"
11 
12 #include <assert.h>
13 #include <stdbool.h>
14 #include <stddef.h>
15 #include <stdint.h>
16 
23 typedef struct {
29  size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
30 
36  size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
37 
43  size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
44 
50  bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
51 
56  const char *name;
57 
61  bool multibyte;
62 } pm_encoding_t;
63 
68 #define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
69 
74 #define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
75 
80 #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
81 
90 size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);
91 
101 size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
102 
112 size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
113 
123 bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
124 
131 extern const uint8_t pm_encoding_unicode_table[256];
132 
136 typedef enum {
137  PM_ENCODING_UTF_8 = 0,
138  PM_ENCODING_US_ASCII,
139  PM_ENCODING_ASCII_8BIT,
140  PM_ENCODING_EUC_JP,
141  PM_ENCODING_WINDOWS_31J,
142 
143 // We optionally support excluding the full set of encodings to only support the
144 // minimum necessary to process Ruby code without encoding comments.
145 #ifndef PRISM_ENCODING_EXCLUDE_FULL
146  PM_ENCODING_BIG5,
147  PM_ENCODING_BIG5_HKSCS,
148  PM_ENCODING_BIG5_UAO,
149  PM_ENCODING_CESU_8,
150  PM_ENCODING_CP51932,
151  PM_ENCODING_CP850,
152  PM_ENCODING_CP852,
153  PM_ENCODING_CP855,
154  PM_ENCODING_CP949,
155  PM_ENCODING_CP950,
156  PM_ENCODING_CP951,
157  PM_ENCODING_EMACS_MULE,
158  PM_ENCODING_EUC_JP_MS,
159  PM_ENCODING_EUC_JIS_2004,
160  PM_ENCODING_EUC_KR,
161  PM_ENCODING_EUC_TW,
162  PM_ENCODING_GB12345,
163  PM_ENCODING_GB18030,
164  PM_ENCODING_GB1988,
165  PM_ENCODING_GB2312,
166  PM_ENCODING_GBK,
167  PM_ENCODING_IBM437,
168  PM_ENCODING_IBM720,
169  PM_ENCODING_IBM737,
170  PM_ENCODING_IBM775,
171  PM_ENCODING_IBM852,
172  PM_ENCODING_IBM855,
173  PM_ENCODING_IBM857,
174  PM_ENCODING_IBM860,
175  PM_ENCODING_IBM861,
176  PM_ENCODING_IBM862,
177  PM_ENCODING_IBM863,
178  PM_ENCODING_IBM864,
179  PM_ENCODING_IBM865,
180  PM_ENCODING_IBM866,
181  PM_ENCODING_IBM869,
182  PM_ENCODING_ISO_8859_1,
183  PM_ENCODING_ISO_8859_2,
184  PM_ENCODING_ISO_8859_3,
185  PM_ENCODING_ISO_8859_4,
186  PM_ENCODING_ISO_8859_5,
187  PM_ENCODING_ISO_8859_6,
188  PM_ENCODING_ISO_8859_7,
189  PM_ENCODING_ISO_8859_8,
190  PM_ENCODING_ISO_8859_9,
191  PM_ENCODING_ISO_8859_10,
192  PM_ENCODING_ISO_8859_11,
193  PM_ENCODING_ISO_8859_13,
194  PM_ENCODING_ISO_8859_14,
195  PM_ENCODING_ISO_8859_15,
196  PM_ENCODING_ISO_8859_16,
197  PM_ENCODING_KOI8_R,
198  PM_ENCODING_KOI8_U,
199  PM_ENCODING_MAC_CENT_EURO,
200  PM_ENCODING_MAC_CROATIAN,
201  PM_ENCODING_MAC_CYRILLIC,
202  PM_ENCODING_MAC_GREEK,
203  PM_ENCODING_MAC_ICELAND,
204  PM_ENCODING_MAC_JAPANESE,
205  PM_ENCODING_MAC_ROMAN,
206  PM_ENCODING_MAC_ROMANIA,
207  PM_ENCODING_MAC_THAI,
208  PM_ENCODING_MAC_TURKISH,
209  PM_ENCODING_MAC_UKRAINE,
210  PM_ENCODING_SHIFT_JIS,
211  PM_ENCODING_SJIS_DOCOMO,
212  PM_ENCODING_SJIS_KDDI,
213  PM_ENCODING_SJIS_SOFTBANK,
214  PM_ENCODING_STATELESS_ISO_2022_JP,
215  PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
216  PM_ENCODING_TIS_620,
217  PM_ENCODING_UTF8_MAC,
218  PM_ENCODING_UTF8_DOCOMO,
219  PM_ENCODING_UTF8_KDDI,
220  PM_ENCODING_UTF8_SOFTBANK,
221  PM_ENCODING_WINDOWS_1250,
222  PM_ENCODING_WINDOWS_1251,
223  PM_ENCODING_WINDOWS_1252,
224  PM_ENCODING_WINDOWS_1253,
225  PM_ENCODING_WINDOWS_1254,
226  PM_ENCODING_WINDOWS_1255,
227  PM_ENCODING_WINDOWS_1256,
228  PM_ENCODING_WINDOWS_1257,
229  PM_ENCODING_WINDOWS_1258,
230  PM_ENCODING_WINDOWS_874,
231 #endif
232 
233  PM_ENCODING_MAXIMUM
234 } pm_encoding_type_t;
235 
239 extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
240 
245 #define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
246 
252 #define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
253 
259 #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
260 
265 #define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
266 
271 #define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
272 
281 const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);
282 
283 #endif
A custom strncasecmp implementation.
Macro definitions used throughout the prism library.
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n)
Return true if the next character in the UTF-8 encoding is an uppercase character.
Definition: encoding.c:2346
const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]
This is the table of all of the encodings that prism supports.
Definition: encoding.c:4295
pm_encoding_type_t
These are all of the encodings that prism supports.
Definition: encoding.h:136
const uint8_t pm_encoding_unicode_table[256]
This lookup table is referenced in both the UTF-8 encoding file and the parser directly in order to s...
Definition: encoding.c:2164
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding if it is an alphabetical character.
Definition: encoding.c:2306
const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end)
Parse the given name of an encoding and return a pointer to the corresponding encoding struct if one ...
Definition: encoding.c:5026
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding.
Definition: encoding.c:2287
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n)
Return the size of the next character in the UTF-8 encoding if it is an alphanumeric character.
Definition: encoding.c:2326
C99 shim for <stdbool.h>
This struct defines the functions necessary to implement the encoding interface so we can determine h...
Definition: encoding.h:23
bool multibyte
Return true if the encoding is a multibyte encoding.
Definition: encoding.h:61
const char * name
The name of the encoding.
Definition: encoding.h:56