/*
 * Listing generated from Ruby 4.1.0dev
 * (2026-04-04, revision 3b6245536cf55da9e8bfcdb03c845fe9ef931d7f).
 * File: regenc.c (3b6245536cf55da9e8bfcdb03c845fe9ef931d7f)
 */
1/**********************************************************************
2 regenc.c - Onigmo (Oniguruma-mod) (regular expression library)
3**********************************************************************/
4/*-
5 * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
6 * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include "regint.h"
32
33OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
34
/*
 * Encoding-layer initialisation entry point.
 *
 * There is no state to set up here, so the function always returns 0.
 */
extern int
onigenc_init(void)
{
  /* Nothing to initialise. */
  return 0;
}
40
41extern OnigEncoding
42onigenc_get_default_encoding(void)
43{
44 return OnigEncDefaultCharEncoding;
45}
46
47extern int
48onigenc_set_default_encoding(OnigEncoding enc)
49{
50 OnigEncDefaultCharEncoding = enc;
51 return 0;
52}
53
54extern int
55onigenc_mbclen(const OnigUChar* p,const OnigUChar* e, OnigEncoding enc)
56{
57 int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e);
58 if (ONIGENC_MBCLEN_CHARFOUND_P(ret)) {
59 ret = ONIGENC_MBCLEN_CHARFOUND_LEN(ret);
60 if (p + ret > e) ret = (int)(e - p); // just for case
61 return ret;
62 }
63 else if (ONIGENC_MBCLEN_NEEDMORE_P(ret)) {
64 return (int)(e - p);
65 }
66 return p < e ? 1 : 0;
67}
68
69extern int
70onigenc_mbclen_approximate(const OnigUChar* p,const OnigUChar* e, OnigEncoding enc)
71{
72 int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc, p, e);
73 if (ONIGENC_MBCLEN_CHARFOUND_P(ret))
74 return ONIGENC_MBCLEN_CHARFOUND_LEN(ret);
75 else if (ONIGENC_MBCLEN_NEEDMORE_P(ret))
76 return (int )(e - p) + ONIGENC_MBCLEN_NEEDMORE_LEN(ret);
77 return 1;
78}
79
80extern UChar*
81onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
82{
83 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
84 if (p < s) {
85 p += enclen(enc, p, end);
86 }
87 return p;
88}
89
90extern UChar*
91onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
92 const UChar* start, const UChar* s, const UChar* end, const UChar** prev)
93{
94 UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
95
96 if (p < s) {
97 if (prev) *prev = (const UChar* )p;
98 p += enclen(enc, p, end);
99 }
100 else {
101 if (prev) *prev = (const UChar* )NULL; /* Sorry */
102 }
103 return p;
104}
105
106extern UChar*
107onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
108{
109 if (s <= start)
110 return (UChar* )NULL;
111
112 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1, end);
113}
114
115extern UChar*
116onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end, int n)
117{
118 while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
119 if (s <= start)
120 return (UChar* )NULL;
121
122 s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1, end);
123 }
124 return (UChar* )s;
125}
126
127extern UChar*
128onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
129{
130 UChar* q = (UChar* )p;
131 while (n-- > 0) {
132 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
133 }
134 return (q <= end ? q : NULL);
135}
136
137extern int
138onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
139{
140 int n = 0;
141 UChar* q = (UChar* )p;
142
143 while (q < end) {
144 q += ONIGENC_MBC_ENC_LEN(enc, q, end);
145 n++;
146 }
147 return n;
148}
149
150extern int
151onigenc_strlen_null(OnigEncoding enc, const UChar* s)
152{
153 int n = 0;
154 UChar* p = (UChar* )s;
155 UChar* e;
156
157 while (1) {
158 if (*p == '\0') {
159 UChar* q;
160 int len = ONIGENC_MBC_MINLEN(enc);
161
162 if (len == 1) return n;
163 q = p + 1;
164 while (len > 1) {
165 if (*q != '\0') break;
166 q++;
167 len--;
168 }
169 if (len == 1) return n;
170 }
171 e = p + ONIGENC_MBC_MAXLEN(enc);
172 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
173 n++;
174 }
175}
176
177extern int
178onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
179{
180 UChar* start = (UChar* )s;
181 UChar* p = (UChar* )s;
182 UChar* e;
183
184 while (1) {
185 if (*p == '\0') {
186 UChar* q;
187 int len = ONIGENC_MBC_MINLEN(enc);
188
189 if (len == 1) return (int )(p - start);
190 q = p + 1;
191 while (len > 1) {
192 if (*q != '\0') break;
193 q++;
194 len--;
195 }
196 if (len == 1) return (int )(p - start);
197 }
198 e = p + ONIGENC_MBC_MAXLEN(enc);
199 p += ONIGENC_MBC_ENC_LEN(enc, p, e);
200 }
201}
202
203const UChar OnigEncAsciiToLowerCaseTable[] = {
204 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
205 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
206 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
207 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
208 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
209 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
210 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
211 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
212 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
213 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
214 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
215 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
216 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
217 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
218 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
219 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
220 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
221 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
222 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
223 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
224 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
225 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
226 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
227 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
228 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
229 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
230 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
231 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
232 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
233 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
234 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
235 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
236};
237
238#ifdef USE_UPPER_CASE_TABLE
239const UChar OnigEncAsciiToUpperCaseTable[256] = {
240 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
241 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
242 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
243 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
244 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
245 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
246 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
247 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
248 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
249 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
250 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
251 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
252 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
253 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
254 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
255 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
256 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
257 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
258 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
259 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
260 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
261 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
262 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
263 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
264 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
265 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
266 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
267 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
268 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
269 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
270 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
271 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
272};
273#endif
274
/*
 * Per-byte character-type bit sets for ASCII, indexed by byte value.
 * Only bytes 0x00..0x7f carry flags.  Entries 0x80..0xff are not listed and
 * are therefore zero-initialised, which matches their explicit value of
 * 0x0000 in the fully spelled-out form of this table.
 * Rows are 8 entries wide; the row comment is the first index of the row.
 */
const unsigned short OnigEncAsciiCtypeTable[256] = {
  /* 0x00 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x08 */ 0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
  /* 0x10 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x18 */ 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
  /* 0x20 */ 0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x28 */ 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x30 */ 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
  /* 0x38 */ 0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
  /* 0x40 */ 0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
  /* 0x48 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x50 */ 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
  /* 0x58 */ 0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
  /* 0x60 */ 0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
  /* 0x68 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x70 */ 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
  /* 0x78 */ 0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008
  /* 0x80..0xff: implicitly 0x0000 */
};
309
310const UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
311 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
312 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
313 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
314 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
315 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
316 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
317 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
318 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
319 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
320 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
321 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
322 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
323 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
324 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
325 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
326 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
327 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
328 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
329 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
330 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
331 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
332 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
333 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
334 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
335 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
336 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
337 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
338 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
339 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
340 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
341 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
342 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
343};
344
345#ifdef USE_UPPER_CASE_TABLE
346const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
347 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
348 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
349 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
350 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
351 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
352 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
353 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
354 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
355 '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
356 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
357 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
358 '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
359 '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
360 '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
361 '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
362 '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
363 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
364 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
365 '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
366 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
367 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
368 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
369 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
370 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
371 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
372 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
373 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
374 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
375 '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
376 '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
377 '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
378 '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
379};
380#endif
381
#if 0
/* Disabled (compiled out): obsolete no-op setter kept for reference. */
extern void
onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED)
{
  /* nothing; obsoleted. */
}
#endif
390
391extern UChar*
392onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end)
393{
394 return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s, end);
395}
396
397const OnigPairCaseFoldCodes OnigAsciiLowerMap[] = {
398 { 0x41, 0x61 },
399 { 0x42, 0x62 },
400 { 0x43, 0x63 },
401 { 0x44, 0x64 },
402 { 0x45, 0x65 },
403 { 0x46, 0x66 },
404 { 0x47, 0x67 },
405 { 0x48, 0x68 },
406 { 0x49, 0x69 },
407 { 0x4a, 0x6a },
408 { 0x4b, 0x6b },
409 { 0x4c, 0x6c },
410 { 0x4d, 0x6d },
411 { 0x4e, 0x6e },
412 { 0x4f, 0x6f },
413 { 0x50, 0x70 },
414 { 0x51, 0x71 },
415 { 0x52, 0x72 },
416 { 0x53, 0x73 },
417 { 0x54, 0x74 },
418 { 0x55, 0x75 },
419 { 0x56, 0x76 },
420 { 0x57, 0x77 },
421 { 0x58, 0x78 },
422 { 0x59, 0x79 },
423 { 0x5a, 0x7a }
424};
425
426extern int
427onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
428 OnigApplyAllCaseFoldFunc f, void* arg,
429 OnigEncoding enc ARG_UNUSED)
430{
431 OnigCodePoint code;
432 int i, r;
433
434 for (i = 0; i < numberof(OnigAsciiLowerMap); i++) {
435 code = OnigAsciiLowerMap[i].to;
436 r = (*f)(OnigAsciiLowerMap[i].from, &code, 1, arg);
437 if (r != 0) return r;
438
439 code = OnigAsciiLowerMap[i].from;
440 r = (*f)(OnigAsciiLowerMap[i].to, &code, 1, arg);
441 if (r != 0) return r;
442 }
443
444 return 0;
445}
446
447extern int
448onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
449 const OnigUChar* p, const OnigUChar* end ARG_UNUSED,
450 OnigCaseFoldCodeItem items[], OnigEncoding enc ARG_UNUSED)
451{
452 if (0x41 <= *p && *p <= 0x5a) {
453 items[0].byte_len = 1;
454 items[0].code_len = 1;
455 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
456 return 1;
457 }
458 else if (0x61 <= *p && *p <= 0x7a) {
459 items[0].byte_len = 1;
460 items[0].code_len = 1;
461 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
462 return 1;
463 }
464 else
465 return 0;
466}
467
468static int
469ss_apply_all_case_fold(OnigCaseFoldType flag ARG_UNUSED,
470 OnigApplyAllCaseFoldFunc f, void* arg)
471{
472 OnigCodePoint ss[] = { 0x73, 0x73 };
473
474 return (*f)((OnigCodePoint )0xdf, ss, 2, arg);
475}
476
477extern int
478onigenc_apply_all_case_fold_with_map(int map_size,
479 const OnigPairCaseFoldCodes map[],
480 int ess_tsett_flag, OnigCaseFoldType flag,
481 OnigApplyAllCaseFoldFunc f, void* arg)
482{
483 OnigCodePoint code;
484 int i, r;
485
486 r = onigenc_ascii_apply_all_case_fold(flag, f, arg, 0);
487 if (r != 0) return r;
488
489 for (i = 0; i < map_size; i++) {
490 code = map[i].to;
491 r = (*f)(map[i].from, &code, 1, arg);
492 if (r != 0) return r;
493
494 code = map[i].from;
495 r = (*f)(map[i].to, &code, 1, arg);
496 if (r != 0) return r;
497 }
498
499 if (ess_tsett_flag != 0)
500 return ss_apply_all_case_fold(flag, f, arg);
501
502 return 0;
503}
504
505extern int
506onigenc_get_case_fold_codes_by_str_with_map(int map_size,
507 const OnigPairCaseFoldCodes map[],
508 int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED,
509 const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])
510{
511 if (0x41 <= *p && *p <= 0x5a) {
512 items[0].byte_len = 1;
513 items[0].code_len = 1;
514 items[0].code[0] = (OnigCodePoint )(*p + 0x20);
515 if (*p == 0x53 && ess_tsett_flag != 0 && end > p + 1
516 && (*(p+1) == 0x53 || *(p+1) == 0x73)) {
517 /* SS */
518 items[1].byte_len = 2;
519 items[1].code_len = 1;
520 items[1].code[0] = (OnigCodePoint )0xdf;
521 return 2;
522 }
523 else
524 return 1;
525 }
526 else if (0x61 <= *p && *p <= 0x7a) {
527 items[0].byte_len = 1;
528 items[0].code_len = 1;
529 items[0].code[0] = (OnigCodePoint )(*p - 0x20);
530 if (*p == 0x73 && ess_tsett_flag != 0 && end > p + 1
531 && (*(p+1) == 0x73 || *(p+1) == 0x53)) {
532 /* ss */
533 items[1].byte_len = 2;
534 items[1].code_len = 1;
535 items[1].code[0] = (OnigCodePoint )0xdf;
536 return 2;
537 }
538 else
539 return 1;
540 }
541 else if (*p == 0xdf && ess_tsett_flag != 0) {
542 items[0].byte_len = 1;
543 items[0].code_len = 2;
544 items[0].code[0] = (OnigCodePoint )'s';
545 items[0].code[1] = (OnigCodePoint )'s';
546
547 items[1].byte_len = 1;
548 items[1].code_len = 2;
549 items[1].code[0] = (OnigCodePoint )'S';
550 items[1].code[1] = (OnigCodePoint )'S';
551
552 items[2].byte_len = 1;
553 items[2].code_len = 2;
554 items[2].code[0] = (OnigCodePoint )'s';
555 items[2].code[1] = (OnigCodePoint )'S';
556
557 items[3].byte_len = 1;
558 items[3].code_len = 2;
559 items[3].code[0] = (OnigCodePoint )'S';
560 items[3].code[1] = (OnigCodePoint )'s';
561
562 return 4;
563 }
564 else {
565 int i;
566
567 for (i = 0; i < map_size; i++) {
568 if (*p == map[i].from) {
569 items[0].byte_len = 1;
570 items[0].code_len = 1;
571 items[0].code[0] = map[i].to;
572 return 1;
573 }
574 else if (*p == map[i].to) {
575 items[0].byte_len = 1;
576 items[0].code_len = 1;
577 items[0].code[0] = map[i].from;
578 return 1;
579 }
580 }
581 }
582
583 return 0;
584}
585
586
587extern int
588onigenc_not_support_get_ctype_code_range(OnigCtype ctype ARG_UNUSED,
589 OnigCodePoint* sb_out ARG_UNUSED,
590 const OnigCodePoint* ranges[] ARG_UNUSED,
591 OnigEncoding enc)
592{
593 return ONIG_NO_SUPPORT_CONFIG;
594}
595
596extern int
597onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end, OnigEncoding enc ARG_UNUSED)
598{
599 if (p < end) {
600 if (*p == 0x0a) return 1;
601 }
602 return 0;
603}
604
605/* for single byte encodings */
606extern int
607onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, const UChar** p,
608 const UChar* end, UChar* lower, OnigEncoding enc ARG_UNUSED)
609{
610 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
611
612 (*p)++;
613 return 1; /* return byte length of converted char to lower */
614}
615
#if 0
/*
 * Disabled (compiled out).  Advances *pp by one byte and reports
 * ONIGENC_IS_ASCII_CODE_CASE_AMBIG() for the byte that was skipped.
 */
extern int
onigenc_ascii_is_mbc_ambiguous(OnigCaseFoldType flag ARG_UNUSED,
                               const UChar** pp, const UChar* end ARG_UNUSED)
{
  const UChar* p = *pp;

  (*pp)++;
  return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
}
#endif
627
628extern int
629onigenc_single_byte_mbc_enc_len(const UChar* p ARG_UNUSED, const UChar* e ARG_UNUSED,
630 OnigEncoding enc ARG_UNUSED)
631{
632 return 1;
633}
634
635extern OnigCodePoint
636onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
637 OnigEncoding enc ARG_UNUSED)
638{
639 return (OnigCodePoint )(*p);
640}
641
642extern int
643onigenc_single_byte_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
644{
645 if (code > 0xff)
646 return ONIGERR_INVALID_CODE_POINT_VALUE;
647 return 1;
648}
649
650extern int
651onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED)
652{
653 if (code > 0xff) {
654 return ONIGERR_INVALID_CODE_POINT_VALUE;
655 }
656 *buf = (UChar )(code & 0xff);
657 return 1;
658}
659
660extern UChar*
661onigenc_single_byte_left_adjust_char_head(const UChar* start ARG_UNUSED,
662 const UChar* s,
663 const UChar* end ARG_UNUSED,
664 OnigEncoding enc ARG_UNUSED)
665{
666 return (UChar* )s;
667}
668
669extern int
670onigenc_always_true_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
671 const UChar* end ARG_UNUSED,
672 OnigEncoding enc ARG_UNUSED)
673{
674 return TRUE;
675}
676
677extern int
678onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
679 const UChar* end ARG_UNUSED,
680 OnigEncoding enc ARG_UNUSED)
681{
682 return FALSE;
683}
684
685extern int
686onigenc_ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype,
687 OnigEncoding enc ARG_UNUSED)
688{
689 if (code < 128)
690 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
691 else
692 return FALSE;
693}
694
695extern OnigCodePoint
696onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
697{
698 int c, i, len;
699 OnigCodePoint n;
700
701 len = enclen(enc, p, end);
702 n = (OnigCodePoint )(*p++);
703 if (len == 1) return n;
704
705 for (i = 1; i < len; i++) {
706 if (p >= end) break;
707 c = *p++;
708 n <<= 8; n += c;
709 }
710 return n;
711}
712
713extern int
714onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag ARG_UNUSED,
715 const UChar** pp, const UChar* end ARG_UNUSED,
716 UChar* lower)
717{
718 int len;
719 const UChar *p = *pp;
720
721 if (ONIGENC_IS_MBC_ASCII(p)) {
722 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
723 (*pp)++;
724 return 1;
725 }
726 else {
727 int i;
728
729 len = enclen(enc, p, end);
730 for (i = 0; i < len; i++) {
731 *lower++ = *p++;
732 }
733 (*pp) += len;
734 return len; /* return byte length of converted to lower char */
735 }
736}
737
#if 0
/*
 * Disabled (compiled out).
 * NOTE(review): this code calls enclen() with two arguments, whereas the
 * live code in this file passes three (enc, p, end); it would need updating
 * before being re-enabled.
 */
extern int
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,
                             const UChar** pp, const UChar* end ARG_UNUSED)
{
  const UChar* p = *pp;

  if (ONIGENC_IS_MBC_ASCII(p)) {
    (*pp)++;
    return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
  }

  (*pp) += enclen(enc, p);
  return FALSE;
}
#endif
754
755extern int
756onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
757{
758 if (code <= 0xff) return 1;
759 if (code <= 0xffff) return 2;
760 return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
761}
762
763extern int
764onigenc_mb4_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
765{
766 if ((code & 0xff000000) != 0) return 4;
767 else if ((code & 0xff0000) != 0) return 3;
768 else if ((code & 0xff00) != 0) return 2;
769 else return 1;
770}
771
772extern int
773onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
774{
775 UChar *p = buf;
776
777 if ((code & 0xff00) != 0) {
778 *p++ = (UChar )((code >> 8) & 0xff);
779 }
780 *p++ = (UChar )(code & 0xff);
781
782#if 1
783 if (enclen(enc, buf, p) != (p - buf))
784 return ONIGERR_INVALID_CODE_POINT_VALUE;
785#endif
786 return (int )(p - buf);
787}
788
789extern int
790onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
791{
792 UChar *p = buf;
793
794 if ((code & 0xff000000) != 0) {
795 *p++ = (UChar )((code >> 24) & 0xff);
796 }
797 if ((code & 0xff0000) != 0 || p != buf) {
798 *p++ = (UChar )((code >> 16) & 0xff);
799 }
800 if ((code & 0xff00) != 0 || p != buf) {
801 *p++ = (UChar )((code >> 8) & 0xff);
802 }
803 *p++ = (UChar )(code & 0xff);
804
805#if 1
806 if (enclen(enc, buf, p) != (p - buf))
807 return ONIGERR_INVALID_CODE_POINT_VALUE;
808#endif
809 return (int )(p - buf);
810}
811
812extern int
813onigenc_minimum_property_name_to_ctype(OnigEncoding enc, const UChar* p, const UChar* end)
814{
815 static const PosixBracketEntryType PBS[] = {
816 POSIX_BRACKET_ENTRY_INIT("Alnum", ONIGENC_CTYPE_ALNUM),
817 POSIX_BRACKET_ENTRY_INIT("Alpha", ONIGENC_CTYPE_ALPHA),
818 POSIX_BRACKET_ENTRY_INIT("Blank", ONIGENC_CTYPE_BLANK),
819 POSIX_BRACKET_ENTRY_INIT("Cntrl", ONIGENC_CTYPE_CNTRL),
820 POSIX_BRACKET_ENTRY_INIT("Digit", ONIGENC_CTYPE_DIGIT),
821 POSIX_BRACKET_ENTRY_INIT("Graph", ONIGENC_CTYPE_GRAPH),
822 POSIX_BRACKET_ENTRY_INIT("Lower", ONIGENC_CTYPE_LOWER),
823 POSIX_BRACKET_ENTRY_INIT("Print", ONIGENC_CTYPE_PRINT),
824 POSIX_BRACKET_ENTRY_INIT("Punct", ONIGENC_CTYPE_PUNCT),
825 POSIX_BRACKET_ENTRY_INIT("Space", ONIGENC_CTYPE_SPACE),
826 POSIX_BRACKET_ENTRY_INIT("Upper", ONIGENC_CTYPE_UPPER),
827 POSIX_BRACKET_ENTRY_INIT("XDigit", ONIGENC_CTYPE_XDIGIT),
828 POSIX_BRACKET_ENTRY_INIT("ASCII", ONIGENC_CTYPE_ASCII),
829 POSIX_BRACKET_ENTRY_INIT("Word", ONIGENC_CTYPE_WORD),
830 };
831
832 const PosixBracketEntryType *pb;
833 int len;
834
835 len = onigenc_strlen(enc, p, end);
836 for (pb = PBS; pb < PBS + numberof(PBS); pb++) {
837 if (len == pb->len &&
838 onigenc_with_ascii_strnicmp(enc, p, end, pb->name, pb->len) == 0)
839 return pb->ctype;
840 }
841
842 return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
843}
844
845extern int
846onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
847 unsigned int ctype)
848{
849 if (code < 128)
850 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
851 else {
852 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
853 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
854 }
855 }
856
857 return FALSE;
858}
859
860extern int
861onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
862 unsigned int ctype)
863{
864 if (code < 128)
865 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
866 else {
867 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
868 return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
869 }
870 }
871
872 return FALSE;
873}
874
875extern int
876onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
877 const UChar* sascii /* ascii */, int n)
878{
879 int x, c;
880
881 while (n-- > 0) {
882 if (p >= end) return (int )(*sascii);
883
884 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
885 x = *sascii - c;
886 if (x) return x;
887
888 sascii++;
889 p += enclen(enc, p, end);
890 }
891 return 0;
892}
893
894extern int
895onigenc_with_ascii_strnicmp(OnigEncoding enc, const UChar* p, const UChar* end,
896 const UChar* sascii /* ascii */, int n)
897{
898 int x, c;
899
900 while (n-- > 0) {
901 if (p >= end) return (int )(*sascii);
902
903 c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
904 if (ONIGENC_IS_ASCII_CODE(c))
905 c = ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
906 x = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*sascii) - c;
907 if (x) return x;
908
909 sascii++;
910 p += enclen(enc, p, end);
911 }
912 return 0;
913}
914
#if 0
/* Property management -- disabled (compiled out), kept for reference. */

/*
 * Grow (or first allocate) the property list to `new_size` entries.
 * On success *plist and *psize are updated and 0 is returned; on allocation
 * failure ONIGERR_MEMORY is returned and *plist / *psize are left untouched.
 */
static int
resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize)
{
  size_t size;
  const OnigCodePoint **list = *plist;

  size = sizeof(OnigCodePoint*) * new_size;
  if (IS_NULL(list)) {
    list = (const OnigCodePoint** )xmalloc(size);
    if (IS_NULL(list)) return ONIGERR_MEMORY;
  }
  else {
    /* Reallocate through a temporary so the old list survives a failure. */
    const OnigCodePoint **tmp;
    tmp = (const OnigCodePoint** )xrealloc((void* )list, size);
    if (IS_NULL(tmp)) return ONIGERR_MEMORY;
    list = tmp;
  }

  *plist = list;
  *psize = new_size;

  return 0;
}

/*
 * Append `prop` to the property list (growing it when full: 16 entries
 * initially, doubled afterwards), create the name table on first use, and
 * register `name` with the value (*pnum + ONIGENC_MAX_STD_CTYPE) computed
 * after *pnum has been incremented.
 * Returns 0 on success or ONIGERR_MEMORY when an allocation fails; the
 * result of the table insertion itself is not checked.
 */
extern int
onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop,
     hash_table_type **table, const OnigCodePoint*** plist, int *pnum,
     int *psize)
{
#define PROP_INIT_SIZE     16

  int r;

  if (*psize <= *pnum) {
    int new_size = (*psize == 0 ? PROP_INIT_SIZE : *psize * 2);
    r = resize_property_list(new_size, plist, psize);
    if (r != 0) return r;
  }

  (*plist)[*pnum] = prop;

  if (ONIG_IS_NULL(*table)) {
    *table = onig_st_init_strend_table_with_size(PROP_INIT_SIZE);
    if (ONIG_IS_NULL(*table)) return ONIGERR_MEMORY;
  }

  *pnum = *pnum + 1;
  onig_st_insert_strend(*table, name, name + strlen((char* )name),
                        (hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE));
  return 0;
}
#endif
969
970#ifdef USE_CASE_MAP_API
971extern int
972onigenc_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end,
973 OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc)
974{
975 OnigCodePoint code;
976 OnigUChar *to_start = to;
977 OnigCaseFoldType flags = *flagP;
978 int codepoint_length;
979
980 while (*pp < end && to < to_end) {
981 codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end);
982 if (codepoint_length < 0)
983 return codepoint_length; /* encoding invalid */
984 code = ONIGENC_MBC_TO_CODE(enc, *pp, end);
985 *pp += codepoint_length;
986
987 if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) {
988 flags |= ONIGENC_CASE_MODIFIED;
989 code -= 'a' - 'A';
990 }
991 else if (code >= 'A' && code <= 'Z' &&
992 (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
993 flags |= ONIGENC_CASE_MODIFIED;
994 code += 'a' - 'A';
995 }
996 to += ONIGENC_CODE_TO_MBC(enc, code, to);
997 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
998 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
999 }
1000 *flagP = flags;
1001 return (int )(to - to_start);
1002}
1003
1004extern int
1005onigenc_single_byte_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp,
1006 const OnigUChar* end, OnigUChar* to, OnigUChar* to_end,
1007 const struct OnigEncodingTypeST* enc)
1008{
1009 OnigCodePoint code;
1010 OnigUChar *to_start = to;
1011 OnigCaseFoldType flags = *flagP;
1012
1013 while (*pp < end && to < to_end) {
1014 code = *(*pp)++;
1015
1016 if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) {
1017 flags |= ONIGENC_CASE_MODIFIED;
1018 code -= 'a' - 'A';
1019 }
1020 else if (code >= 'A' && code <= 'Z' &&
1021 (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) {
1022 flags |= ONIGENC_CASE_MODIFIED;
1023 code += 'a' - 'A';
1024 }
1025 *to++ = code;
1026 if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */
1027 flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE);
1028 }
1029 *flagP = flags;
1030 return (int )(to - to_start);
1031}
1032#endif
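/*
 * Cross-references carried over from the generated documentation page:
 *   #define xrealloc - Old name of ruby_xrealloc.  Definition: xmalloc.h:56
 *   #define xmalloc  - Old name of ruby_xmalloc.   Definition: xmalloc.h:53
 *   int len          - Length of the buffer.       Definition: io.h:8
 *   (unnamed entry)                                Definition: regenc.h:119
 */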