Ruby 3.5.0dev (2025-02-20 revision 34098b669c0cbc024cd08e686891f1dfe0a10aaf)
transcode.c (34098b669c0cbc024cd08e686891f1dfe0a10aaf)
/**********************************************************************

  transcode.c -

  $Author$
  created at: Tue Oct 30 16:10:22 JST 2007

  Copyright (C) 2007 Martin Duerst

**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "ruby/encoding.h"
23
24#include "transcode_data.h"
25#include "id.h"
26
27#define ENABLE_ECONV_NEWLINE_OPTION 1
28
29/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
30static VALUE rb_eUndefinedConversionError;
31static VALUE rb_eInvalidByteSequenceError;
32static VALUE rb_eConverterNotFoundError;
33
34VALUE rb_cEncodingConverter;
35
36static ID id_destination_encoding;
37static ID id_destination_encoding_name;
38static ID id_error_bytes;
39static ID id_error_char;
40static ID id_incomplete_input;
41static ID id_readagain_bytes;
42static ID id_source_encoding;
43static ID id_source_encoding_name;
44
45static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
46static VALUE sym_xml, sym_text, sym_attr;
47static VALUE sym_universal_newline;
48static VALUE sym_crlf_newline;
49static VALUE sym_cr_newline;
50static VALUE sym_lf_newline;
51#ifdef ENABLE_ECONV_NEWLINE_OPTION
52static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
53#endif
54static VALUE sym_partial_input;
55
56static VALUE sym_invalid_byte_sequence;
57static VALUE sym_undefined_conversion;
58static VALUE sym_destination_buffer_full;
59static VALUE sym_source_buffer_empty;
60static VALUE sym_finished;
61static VALUE sym_after_output;
62static VALUE sym_incomplete_input;
63
64static unsigned char *
65allocate_converted_string(const char *sname, const char *dname,
66 const unsigned char *str, size_t len,
67 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
68 size_t *dst_len_ptr);
69
70/* dynamic structure, one per conversion (similar to iconv_t) */
71/* may carry conversion state (e.g. for iso-2022-jp) */
72typedef struct rb_transcoding {
73 const rb_transcoder *transcoder;
74
75 int flags;
76
77 int resume_position;
78 unsigned int next_table;
79 VALUE next_info;
80 unsigned char next_byte;
81 unsigned int output_index;
82
83 ssize_t recognized_len; /* already interpreted */
84 ssize_t readagain_len; /* not yet interpreted */
85 union {
86 unsigned char ary[8]; /* max_input <= sizeof(ary) */
87 unsigned char *ptr; /* length: max_input */
88 } readbuf; /* recognized_len + readagain_len used */
89
90 ssize_t writebuf_off;
91 ssize_t writebuf_len;
92 union {
93 unsigned char ary[8]; /* max_output <= sizeof(ary) */
94 unsigned char *ptr; /* length: max_output */
95 } writebuf;
96
97 union rb_transcoding_state_t { /* opaque data for stateful encoding */
98 void *ptr;
99 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
100 double dummy_for_alignment;
101 } state;
103#define TRANSCODING_READBUF(tc) \
104 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
105 (tc)->readbuf.ary : \
106 (tc)->readbuf.ptr)
107#define TRANSCODING_WRITEBUF(tc) \
108 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
109 (tc)->writebuf.ary : \
110 (tc)->writebuf.ptr)
111#define TRANSCODING_WRITEBUF_SIZE(tc) \
112 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
113 sizeof((tc)->writebuf.ary) : \
114 (size_t)(tc)->transcoder->max_output)
115#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
116#define TRANSCODING_STATE(tc) \
117 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
118 (tc)->state.ary : \
119 (tc)->state.ptr)
120
121typedef struct {
122 struct rb_transcoding *tc;
123 unsigned char *out_buf_start;
124 unsigned char *out_data_start;
125 unsigned char *out_data_end;
126 unsigned char *out_buf_end;
127 rb_econv_result_t last_result;
129
131 int flags;
132 int started; /* bool */
133
134 const char *source_encoding_name;
135 const char *destination_encoding_name;
136
137 const unsigned char *replacement_str;
138 size_t replacement_len;
139 const char *replacement_enc;
140
141 unsigned char *in_buf_start;
142 unsigned char *in_data_start;
143 unsigned char *in_data_end;
144 unsigned char *in_buf_end;
145 rb_econv_elem_t *elems;
146 int replacement_allocated; /* bool */
147 int num_allocated;
148 int num_trans;
149 int num_finished;
150 struct rb_transcoding *last_tc;
151
152 /* last error */
153 struct {
154 rb_econv_result_t result;
155 struct rb_transcoding *error_tc;
156 const char *source_encoding;
157 const char *destination_encoding;
158 const unsigned char *error_bytes_start;
159 size_t error_bytes_len;
160 size_t readagain_len;
161 } last_error;
162
163 /* The following fields are only for Encoding::Converter.
164 * rb_econv_open set them NULL. */
165 rb_encoding *source_encoding;
166 rb_encoding *destination_encoding;
167};
168
169/*
170 * Dispatch data and logic
171 */
172
173#define DECORATOR_P(sname, dname) (*(sname) == '\0')
174
175typedef struct {
176 const char *sname;
177 const char *dname;
178 const char *lib; /* null means no need to load a library */
179 const rb_transcoder *transcoder;
181
182static st_table *transcoder_table;
183
184static int
185free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
186{
187 xfree((void *)val);
188 return ST_DELETE;
189}
190
191static int
192free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
193{
194 st_foreach((void *)val, free_inner_transcode_i, 0);
195 st_free_table((void *)val);
196 return ST_DELETE;
197}
198
199void
200rb_free_transcoder_table(void)
201{
202 st_foreach(transcoder_table, free_transcode_i, 0);
203 st_free_table(transcoder_table);
204}
205
206static transcoder_entry_t *
207make_transcoder_entry(const char *sname, const char *dname)
208{
209 st_data_t val;
210 st_table *table2;
211
212 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
213 val = (st_data_t)st_init_strcasetable();
214 st_add_direct(transcoder_table, (st_data_t)sname, val);
215 }
216 table2 = (st_table *)val;
217 if (!st_lookup(table2, (st_data_t)dname, &val)) {
219 entry->sname = sname;
220 entry->dname = dname;
221 entry->lib = NULL;
222 entry->transcoder = NULL;
223 val = (st_data_t)entry;
224 st_add_direct(table2, (st_data_t)dname, val);
225 }
226 return (transcoder_entry_t *)val;
227}
228
229static transcoder_entry_t *
230get_transcoder_entry(const char *sname, const char *dname)
231{
232 st_data_t val;
233 st_table *table2;
234
235 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
236 return NULL;
237 }
238 table2 = (st_table *)val;
239 if (!st_lookup(table2, (st_data_t)dname, &val)) {
240 return NULL;
241 }
242 return (transcoder_entry_t *)val;
243}
244
245void
246rb_register_transcoder(const rb_transcoder *tr)
247{
248 const char *const sname = tr->src_encoding;
249 const char *const dname = tr->dst_encoding;
250
251 transcoder_entry_t *entry;
252
253 entry = make_transcoder_entry(sname, dname);
254 if (entry->transcoder) {
255 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
256 sname, dname);
257 }
258
259 entry->transcoder = tr;
260}
261
262static void
263declare_transcoder(const char *sname, const char *dname, const char *lib)
264{
265 transcoder_entry_t *entry;
266
267 entry = make_transcoder_entry(sname, dname);
268 entry->lib = lib;
269}
270
271static const char transcoder_lib_prefix[] = "enc/trans/";
272
273void
274rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
275{
276 if (!lib) {
277 rb_raise(rb_eArgError, "invalid library name - (null)");
278 }
279 declare_transcoder(enc1, enc2, lib);
280}
281
282#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
283
284typedef struct search_path_queue_tag {
285 struct search_path_queue_tag *next;
286 const char *enc;
288
289typedef struct {
290 st_table *visited;
291 search_path_queue_t *queue;
292 search_path_queue_t **queue_last_ptr;
293 const char *base_enc;
295
296static int
297transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
298{
299 const char *dname = (const char *)key;
302
303 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
304 return ST_CONTINUE;
305 }
306
308 q->enc = dname;
309 q->next = NULL;
310 *bfs->queue_last_ptr = q;
311 bfs->queue_last_ptr = &q->next;
312
313 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
314 return ST_CONTINUE;
315}
316
317static int
318transcode_search_path(const char *sname, const char *dname,
319 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
320 void *arg)
321{
324 st_data_t val;
325 st_table *table2;
326 int found;
327 int pathlen = -1;
328
329 if (encoding_equal(sname, dname))
330 return -1;
331
333 q->enc = sname;
334 q->next = NULL;
335 bfs.queue_last_ptr = &q->next;
336 bfs.queue = q;
337
338 bfs.visited = st_init_strcasetable();
339 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
340
341 while (bfs.queue) {
342 q = bfs.queue;
343 bfs.queue = q->next;
344 if (!bfs.queue)
345 bfs.queue_last_ptr = &bfs.queue;
346
347 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
348 xfree(q);
349 continue;
350 }
351 table2 = (st_table *)val;
352
353 if (st_lookup(table2, (st_data_t)dname, &val)) {
354 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
355 xfree(q);
356 found = 1;
357 goto cleanup;
358 }
359
360 bfs.base_enc = q->enc;
361 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
362 bfs.base_enc = NULL;
363
364 xfree(q);
365 }
366 found = 0;
367
368 cleanup:
369 while (bfs.queue) {
370 q = bfs.queue;
371 bfs.queue = q->next;
372 xfree(q);
373 }
374
375 if (found) {
376 const char *enc = dname;
377 int depth;
378 pathlen = 0;
379 while (1) {
380 st_lookup(bfs.visited, (st_data_t)enc, &val);
381 if (!val)
382 break;
383 pathlen++;
384 enc = (const char *)val;
385 }
386 depth = pathlen;
387 enc = dname;
388 while (1) {
389 st_lookup(bfs.visited, (st_data_t)enc, &val);
390 if (!val)
391 break;
392 callback((const char *)val, enc, --depth, arg);
393 enc = (const char *)val;
394 }
395 }
396
397 st_free_table(bfs.visited);
398
399 return pathlen; /* is -1 if not found */
400}
401
402int rb_require_internal_silent(VALUE fname);
403
404static const rb_transcoder *
405load_transcoder_entry(transcoder_entry_t *entry)
406{
407 if (entry->transcoder)
408 return entry->transcoder;
409
410 if (entry->lib) {
411 const char *const lib = entry->lib;
412 const size_t len = strlen(lib);
413 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
414 const VALUE fn = rb_str_new(0, total_len);
415 char *const path = RSTRING_PTR(fn);
416
417 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
418 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
419 rb_str_set_len(fn, total_len);
420 OBJ_FREEZE(fn);
421 rb_require_internal_silent(fn);
422 }
423
424 if (entry->transcoder)
425 return entry->transcoder;
426
427 return NULL;
428}
429
430static const char*
431get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
432{
433 if (encoding_equal(encname, "UTF-8")) {
434 *len_ret = 3;
435 *repl_encname_ptr = "UTF-8";
436 return "\xEF\xBF\xBD";
437 }
438 else {
439 *len_ret = 1;
440 *repl_encname_ptr = "US-ASCII";
441 return "?";
442 }
443}
444
445/*
446 * Transcoding engine logic
447 */
448
449static const unsigned char *
450transcode_char_start(rb_transcoding *tc,
451 const unsigned char *in_start,
452 const unsigned char *inchar_start,
453 const unsigned char *in_p,
454 size_t *char_len_ptr)
455{
456 const unsigned char *ptr;
457 if (inchar_start - in_start < tc->recognized_len) {
458 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
459 inchar_start, unsigned char, in_p - inchar_start);
460 ptr = TRANSCODING_READBUF(tc);
461 }
462 else {
463 ptr = inchar_start - tc->recognized_len;
464 }
465 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
466 return ptr;
467}
468
470transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
471 const unsigned char *in_stop, unsigned char *out_stop,
472 rb_transcoding *tc,
473 const int opt)
474{
475 const rb_transcoder *tr = tc->transcoder;
476 int unitlen = tr->input_unit_length;
477 ssize_t readagain_len = 0;
478
479 const unsigned char *inchar_start;
480 const unsigned char *in_p;
481
482 unsigned char *out_p;
483
484 in_p = inchar_start = *in_pos;
485
486 out_p = *out_pos;
487
488#define SUSPEND(ret, num) \
489 do { \
490 tc->resume_position = (num); \
491 if (0 < in_p - inchar_start) \
492 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
493 inchar_start, unsigned char, in_p - inchar_start); \
494 *in_pos = in_p; \
495 *out_pos = out_p; \
496 tc->recognized_len += in_p - inchar_start; \
497 if (readagain_len) { \
498 tc->recognized_len -= readagain_len; \
499 tc->readagain_len = readagain_len; \
500 } \
501 return (ret); \
502 resume_label ## num:; \
503 } while (0)
504#define SUSPEND_OBUF(num) \
505 do { \
506 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
507 } while (0)
508
509#define SUSPEND_AFTER_OUTPUT(num) \
510 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
511 SUSPEND(econv_after_output, num); \
512 }
513
514#define next_table (tc->next_table)
515#define next_info (tc->next_info)
516#define next_byte (tc->next_byte)
517#define writebuf_len (tc->writebuf_len)
518#define writebuf_off (tc->writebuf_off)
519
520 switch (tc->resume_position) {
521 case 0: break;
522 case 1: goto resume_label1;
523 case 2: goto resume_label2;
524 case 3: goto resume_label3;
525 case 4: goto resume_label4;
526 case 5: goto resume_label5;
527 case 6: goto resume_label6;
528 case 7: goto resume_label7;
529 case 8: goto resume_label8;
530 case 9: goto resume_label9;
531 case 10: goto resume_label10;
532 case 11: goto resume_label11;
533 case 12: goto resume_label12;
534 case 13: goto resume_label13;
535 case 14: goto resume_label14;
536 case 15: goto resume_label15;
537 case 16: goto resume_label16;
538 case 17: goto resume_label17;
539 case 18: goto resume_label18;
540 case 19: goto resume_label19;
541 case 20: goto resume_label20;
542 case 21: goto resume_label21;
543 case 22: goto resume_label22;
544 case 23: goto resume_label23;
545 case 24: goto resume_label24;
546 case 25: goto resume_label25;
547 case 26: goto resume_label26;
548 case 27: goto resume_label27;
549 case 28: goto resume_label28;
550 case 29: goto resume_label29;
551 case 30: goto resume_label30;
552 case 31: goto resume_label31;
553 case 32: goto resume_label32;
554 case 33: goto resume_label33;
555 case 34: goto resume_label34;
556 }
557
558 while (1) {
559 inchar_start = in_p;
560 tc->recognized_len = 0;
561 next_table = tr->conv_tree_start;
562
563 SUSPEND_AFTER_OUTPUT(24);
564
565 if (in_stop <= in_p) {
566 if (!(opt & ECONV_PARTIAL_INPUT))
567 break;
568 SUSPEND(econv_source_buffer_empty, 7);
569 continue;
570 }
571
572#define BYTE_ADDR(index) (tr->byte_array + (index))
573#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
574#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
575#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
576#define BL_MIN_BYTE (BL_BASE[0])
577#define BL_MAX_BYTE (BL_BASE[1])
578#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
579#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
580
581 next_byte = (unsigned char)*in_p++;
582 follow_byte:
583 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
584 next_info = INVALID;
585 else {
586 next_info = (VALUE)BL_ACTION(next_byte);
587 }
588 follow_info:
589 switch (next_info & 0x1F) {
590 case NOMAP:
591 {
592 const unsigned char *p = inchar_start;
593 writebuf_off = 0;
594 while (p < in_p) {
595 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
596 }
597 writebuf_len = writebuf_off;
598 writebuf_off = 0;
599 while (writebuf_off < writebuf_len) {
600 SUSPEND_OBUF(3);
601 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
602 }
603 }
604 continue;
605 case 0x00: case 0x04: case 0x08: case 0x0C:
606 case 0x10: case 0x14: case 0x18: case 0x1C:
607 SUSPEND_AFTER_OUTPUT(25);
608 while (in_p >= in_stop) {
609 if (!(opt & ECONV_PARTIAL_INPUT))
610 goto incomplete;
611 SUSPEND(econv_source_buffer_empty, 5);
612 }
613 next_byte = (unsigned char)*in_p++;
614 next_table = (unsigned int)next_info;
615 goto follow_byte;
616 case ZERObt: /* drop input */
617 continue;
618 case ONEbt:
619 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
620 continue;
621 case TWObt:
622 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
623 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
624 continue;
625 case THREEbt:
626 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
627 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
628 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
629 continue;
630 case FOURbt:
631 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
632 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
633 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
634 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
635 continue;
636 case GB4bt:
637 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
638 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
639 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
640 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
641 continue;
642 case STR1:
643 tc->output_index = 0;
644 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
645 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
646 tc->output_index++;
647 }
648 continue;
649 case FUNii:
650 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
651 goto follow_info;
652 case FUNsi:
653 {
654 const unsigned char *char_start;
655 size_t char_len;
656 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
657 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
658 goto follow_info;
659 }
660 case FUNio:
661 SUSPEND_OBUF(13);
662 if (tr->max_output <= out_stop - out_p)
663 out_p += tr->func_io(TRANSCODING_STATE(tc),
664 next_info, out_p, out_stop - out_p);
665 else {
666 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
667 next_info,
668 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
669 writebuf_off = 0;
670 while (writebuf_off < writebuf_len) {
671 SUSPEND_OBUF(20);
672 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
673 }
674 }
675 break;
676 case FUNso:
677 {
678 const unsigned char *char_start;
679 size_t char_len;
680 SUSPEND_OBUF(14);
681 if (tr->max_output <= out_stop - out_p) {
682 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
683 out_p += tr->func_so(TRANSCODING_STATE(tc),
684 char_start, (size_t)char_len,
685 out_p, out_stop - out_p);
686 }
687 else {
688 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
689 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
690 char_start, (size_t)char_len,
691 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
692 writebuf_off = 0;
693 while (writebuf_off < writebuf_len) {
694 SUSPEND_OBUF(22);
695 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
696 }
697 }
698 break;
699 }
700 case FUNsio:
701 {
702 const unsigned char *char_start;
703 size_t char_len;
704 SUSPEND_OBUF(33);
705 if (tr->max_output <= out_stop - out_p) {
706 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
707 out_p += tr->func_sio(TRANSCODING_STATE(tc),
708 char_start, (size_t)char_len, next_info,
709 out_p, out_stop - out_p);
710 }
711 else {
712 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
713 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
714 char_start, (size_t)char_len, next_info,
715 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
716 writebuf_off = 0;
717 while (writebuf_off < writebuf_len) {
718 SUSPEND_OBUF(34);
719 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
720 }
721 }
722 break;
723 }
724 case INVALID:
725 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
726 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
727 SUSPEND_AFTER_OUTPUT(26);
728 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
729 in_p = in_stop;
730 SUSPEND(econv_source_buffer_empty, 8);
731 }
732 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
733 in_p = in_stop;
734 }
735 else {
736 in_p = inchar_start + (unitlen - tc->recognized_len);
737 }
738 }
739 else {
740 ssize_t invalid_len; /* including the last byte which causes invalid */
741 ssize_t discard_len;
742 invalid_len = tc->recognized_len + (in_p - inchar_start);
743 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
744 readagain_len = invalid_len - discard_len;
745 }
746 goto invalid;
747 case UNDEF:
748 goto undef;
749 default:
750 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
751 }
752 continue;
753
754 invalid:
755 SUSPEND(econv_invalid_byte_sequence, 1);
756 continue;
757
758 incomplete:
759 SUSPEND(econv_incomplete_input, 27);
760 continue;
761
762 undef:
763 SUSPEND(econv_undefined_conversion, 2);
764 continue;
765 }
766
767 /* cleanup */
768 if (tr->finish_func) {
769 SUSPEND_OBUF(4);
770 if (tr->max_output <= out_stop - out_p) {
771 out_p += tr->finish_func(TRANSCODING_STATE(tc),
772 out_p, out_stop - out_p);
773 }
774 else {
775 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
776 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
777 writebuf_off = 0;
778 while (writebuf_off < writebuf_len) {
779 SUSPEND_OBUF(23);
780 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
781 }
782 }
783 }
784 while (1)
785 SUSPEND(econv_finished, 6);
786#undef SUSPEND
787#undef next_table
788#undef next_info
789#undef next_byte
790#undef writebuf_len
791#undef writebuf_off
792}
793
795transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
796 const unsigned char *in_stop, unsigned char *out_stop,
797 rb_transcoding *tc,
798 const int opt)
799{
800 if (tc->readagain_len) {
801 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
802 const unsigned char *readagain_pos = readagain_buf;
803 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
805
806 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
807 unsigned char, tc->readagain_len);
808 tc->readagain_len = 0;
809 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
810 if (res != econv_source_buffer_empty) {
811 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
812 readagain_pos, unsigned char, readagain_stop - readagain_pos);
813 tc->readagain_len += readagain_stop - readagain_pos;
814 return res;
815 }
816 }
817 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
818}
819
820static rb_transcoding *
821rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
822{
823 rb_transcoding *tc;
824
825 tc = ALLOC(rb_transcoding);
826 tc->transcoder = tr;
827 tc->flags = flags;
828 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
829 tc->state.ptr = xmalloc(tr->state_size);
830 if (tr->state_init_func) {
831 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
832 }
833 tc->resume_position = 0;
834 tc->recognized_len = 0;
835 tc->readagain_len = 0;
836 tc->writebuf_len = 0;
837 tc->writebuf_off = 0;
838 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
839 tc->readbuf.ptr = xmalloc(tr->max_input);
840 }
841 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
842 tc->writebuf.ptr = xmalloc(tr->max_output);
843 }
844 return tc;
845}
846
848rb_transcoding_convert(rb_transcoding *tc,
849 const unsigned char **input_ptr, const unsigned char *input_stop,
850 unsigned char **output_ptr, unsigned char *output_stop,
851 int flags)
852{
853 return transcode_restartable(
854 input_ptr, output_ptr,
855 input_stop, output_stop,
856 tc, flags);
857}
858
859static void
860rb_transcoding_close(rb_transcoding *tc)
861{
862 const rb_transcoder *tr = tc->transcoder;
863 if (tr->state_fini_func) {
864 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
865 }
866 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
867 xfree(tc->state.ptr);
868 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
869 xfree(tc->readbuf.ptr);
870 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
871 xfree(tc->writebuf.ptr);
872 xfree(tc);
873}
874
875static size_t
876rb_transcoding_memsize(rb_transcoding *tc)
877{
878 size_t size = sizeof(rb_transcoding);
879 const rb_transcoder *tr = tc->transcoder;
880
881 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
882 size += tr->state_size;
883 }
884 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
885 size += tr->max_input;
886 }
887 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
888 size += tr->max_output;
889 }
890 return size;
891}
892
893static rb_econv_t *
894rb_econv_alloc(int n_hint)
895{
896 rb_econv_t *ec;
897
898 if (n_hint <= 0)
899 n_hint = 1;
900
901 ec = ALLOC(rb_econv_t);
902 ec->flags = 0;
903 ec->source_encoding_name = NULL;
904 ec->destination_encoding_name = NULL;
905 ec->started = 0;
906 ec->replacement_str = NULL;
907 ec->replacement_len = 0;
908 ec->replacement_enc = NULL;
909 ec->replacement_allocated = 0;
910 ec->in_buf_start = NULL;
911 ec->in_data_start = NULL;
912 ec->in_data_end = NULL;
913 ec->in_buf_end = NULL;
914 ec->num_allocated = n_hint;
915 ec->num_trans = 0;
916 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
917 ec->num_finished = 0;
918 ec->last_tc = NULL;
919 ec->last_error.result = econv_source_buffer_empty;
920 ec->last_error.error_tc = NULL;
921 ec->last_error.source_encoding = NULL;
922 ec->last_error.destination_encoding = NULL;
923 ec->last_error.error_bytes_start = NULL;
924 ec->last_error.error_bytes_len = 0;
925 ec->last_error.readagain_len = 0;
926 ec->source_encoding = NULL;
927 ec->destination_encoding = NULL;
928 return ec;
929}
930
931static int
932rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
933{
934 int n, j;
935 int bufsize = 4096;
936 unsigned char *p;
937
938 if (ec->num_trans == ec->num_allocated) {
939 n = ec->num_allocated * 2;
940 REALLOC_N(ec->elems, rb_econv_elem_t, n);
941 ec->num_allocated = n;
942 }
943
944 p = xmalloc(bufsize);
945
946 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
947
948 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
949 ec->elems[i].out_buf_start = p;
950 ec->elems[i].out_buf_end = p + bufsize;
951 ec->elems[i].out_data_start = p;
952 ec->elems[i].out_data_end = p;
953 ec->elems[i].last_result = econv_source_buffer_empty;
954
955 ec->num_trans++;
956
957 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
958 for (j = ec->num_trans-1; i <= j; j--) {
959 rb_transcoding *tc = ec->elems[j].tc;
960 const rb_transcoder *tr2 = tc->transcoder;
961 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
962 ec->last_tc = tc;
963 break;
964 }
965 }
966
967 return 0;
968}
969
970static rb_econv_t *
971rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
972{
973 rb_econv_t *ec;
974 int i, ret;
975
976 for (i = 0; i < n; i++) {
977 const rb_transcoder *tr;
978 tr = load_transcoder_entry(entries[i]);
979 if (!tr)
980 return NULL;
981 }
982
983 ec = rb_econv_alloc(n);
984
985 for (i = 0; i < n; i++) {
986 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
987 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
988 if (ret == -1) {
989 rb_econv_close(ec);
990 return NULL;
991 }
992 }
993
994 return ec;
995}
996
998 transcoder_entry_t **entries;
999 int num_additional;
1000};
1001
1002static void
1003trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1004{
1005 struct trans_open_t *toarg = arg;
1006
1007 if (!toarg->entries) {
1008 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
1009 }
1010 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1011}
1012
1013static rb_econv_t *
1014rb_econv_open0(const char *sname, const char *dname, int ecflags)
1015{
1016 transcoder_entry_t **entries = NULL;
1017 int num_trans;
1018 rb_econv_t *ec;
1019
1020 /* Just check if sname and dname are defined */
1021 /* (This check is needed?) */
1022 if (*sname) rb_enc_find_index(sname);
1023 if (*dname) rb_enc_find_index(dname);
1024
1025 if (*sname == '\0' && *dname == '\0') {
1026 num_trans = 0;
1027 entries = NULL;
1028 sname = dname = "";
1029 }
1030 else {
1031 struct trans_open_t toarg;
1032 toarg.entries = NULL;
1033 toarg.num_additional = 0;
1034 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1035 entries = toarg.entries;
1036 if (num_trans < 0) {
1037 xfree(entries);
1038 return NULL;
1039 }
1040 }
1041
1042 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1043 xfree(entries);
1044 if (!ec)
1045 return NULL;
1046
1047 ec->flags = ecflags;
1048 ec->source_encoding_name = sname;
1049 ec->destination_encoding_name = dname;
1050
1051 return ec;
1052}
1053
1054#define MAX_ECFLAGS_DECORATORS 32
1055
1056static int
1057decorator_names(int ecflags, const char **decorators_ret)
1058{
1059 int num_decorators;
1060
1061 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1066 case 0:
1067 break;
1068 default:
1069 return -1;
1070 }
1071
1072 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1074 return -1;
1075
1076 num_decorators = 0;
1077
1078 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1079 decorators_ret[num_decorators++] = "xml_text_escape";
1081 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1082 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1083 decorators_ret[num_decorators++] = "xml_attr_quote";
1084
1085 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1086 decorators_ret[num_decorators++] = "crlf_newline";
1087 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1088 decorators_ret[num_decorators++] = "cr_newline";
1089 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1090 decorators_ret[num_decorators++] = "lf_newline";
1092 decorators_ret[num_decorators++] = "universal_newline";
1093
1094 return num_decorators;
1095}
1096
1097rb_econv_t *
1098rb_econv_open(const char *sname, const char *dname, int ecflags)
1099{
1100 rb_econv_t *ec;
1101 int num_decorators;
1102 const char *decorators[MAX_ECFLAGS_DECORATORS];
1103 int i;
1104
1105 num_decorators = decorator_names(ecflags, decorators);
1106 if (num_decorators == -1)
1107 return NULL;
1108
1109 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1110 if (!ec)
1111 return NULL;
1112
1113 for (i = 0; i < num_decorators; i++)
1114 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1115 rb_econv_close(ec);
1116 return NULL;
1117 }
1118
1119 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1120
1121 return ec;
1122}
1123
1124static int
1125trans_sweep(rb_econv_t *ec,
1126 const unsigned char **input_ptr, const unsigned char *input_stop,
1127 unsigned char **output_ptr, unsigned char *output_stop,
1128 int flags,
1129 int start)
1130{
1131 int try;
1132 int i, f;
1133
1134 const unsigned char **ipp, *is, *iold;
1135 unsigned char **opp, *os, *oold;
1137
1138 try = 1;
1139 while (try) {
1140 try = 0;
1141 for (i = start; i < ec->num_trans; i++) {
1142 rb_econv_elem_t *te = &ec->elems[i];
1143
1144 if (i == 0) {
1145 ipp = input_ptr;
1146 is = input_stop;
1147 }
1148 else {
1149 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1150 ipp = (const unsigned char **)&prev_te->out_data_start;
1151 is = prev_te->out_data_end;
1152 }
1153
1154 if (i == ec->num_trans-1) {
1155 opp = output_ptr;
1156 os = output_stop;
1157 }
1158 else {
1159 if (te->out_buf_start != te->out_data_start) {
1160 ssize_t len = te->out_data_end - te->out_data_start;
1161 ssize_t off = te->out_data_start - te->out_buf_start;
1162 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1163 te->out_data_start = te->out_buf_start;
1164 te->out_data_end -= off;
1165 }
1166 opp = &te->out_data_end;
1167 os = te->out_buf_end;
1168 }
1169
1170 f = flags;
1171 if (ec->num_finished != i)
1173 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1174 start = 1;
1175 flags &= ~ECONV_AFTER_OUTPUT;
1176 }
1177 if (i != 0)
1178 f &= ~ECONV_AFTER_OUTPUT;
1179 iold = *ipp;
1180 oold = *opp;
1181 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1182 if (iold != *ipp || oold != *opp)
1183 try = 1;
1184
1185 switch (res) {
1189 case econv_after_output:
1190 return i;
1191
1194 break;
1195
1196 case econv_finished:
1197 ec->num_finished = i+1;
1198 break;
1199 }
1200 }
1201 }
1202 return -1;
1203}
1204
1205static rb_econv_result_t
1206rb_trans_conv(rb_econv_t *ec,
1207 const unsigned char **input_ptr, const unsigned char *input_stop,
1208 unsigned char **output_ptr, unsigned char *output_stop,
1209 int flags,
1210 int *result_position_ptr)
1211{
1212 int i;
1213 int needreport_index;
1214 int sweep_start;
1215
1216 unsigned char empty_buf;
1217 unsigned char *empty_ptr = &empty_buf;
1218
1219 if (!input_ptr) {
1220 input_ptr = (const unsigned char **)&empty_ptr;
1221 input_stop = empty_ptr;
1222 }
1223
1224 if (!output_ptr) {
1225 output_ptr = &empty_ptr;
1226 output_stop = empty_ptr;
1227 }
1228
1229 if (ec->elems[0].last_result == econv_after_output)
1230 ec->elems[0].last_result = econv_source_buffer_empty;
1231
1232 for (i = ec->num_trans-1; 0 <= i; i--) {
1233 switch (ec->elems[i].last_result) {
1237 case econv_after_output:
1238 case econv_finished:
1239 sweep_start = i+1;
1240 goto found_needreport;
1241
1244 break;
1245
1246 default:
1247 rb_bug("unexpected transcode last result");
1248 }
1249 }
1250
1251 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1252
1253 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1254 (flags & ECONV_AFTER_OUTPUT)) {
1256
1257 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1259 result_position_ptr);
1260
1261 if (res == econv_source_buffer_empty)
1262 return econv_after_output;
1263 return res;
1264 }
1265
1266 sweep_start = 0;
1267
1268 found_needreport:
1269
1270 do {
1271 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1272 sweep_start = needreport_index + 1;
1273 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1274
1275 for (i = ec->num_trans-1; 0 <= i; i--) {
1276 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1277 rb_econv_result_t res = ec->elems[i].last_result;
1278 if (res == econv_invalid_byte_sequence ||
1279 res == econv_incomplete_input ||
1281 res == econv_after_output) {
1282 ec->elems[i].last_result = econv_source_buffer_empty;
1283 }
1284 if (result_position_ptr)
1285 *result_position_ptr = i;
1286 return res;
1287 }
1288 }
1289 if (result_position_ptr)
1290 *result_position_ptr = -1;
1292}
1293
1294static rb_econv_result_t
1295rb_econv_convert0(rb_econv_t *ec,
1296 const unsigned char **input_ptr, const unsigned char *input_stop,
1297 unsigned char **output_ptr, unsigned char *output_stop,
1298 int flags)
1299{
1301 int result_position;
1302 int has_output = 0;
1303
1304 memset(&ec->last_error, 0, sizeof(ec->last_error));
1305
1306 if (ec->num_trans == 0) {
1307 size_t len;
1308 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1309 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1310 len = output_stop - *output_ptr;
1311 memcpy(*output_ptr, ec->in_data_start, len);
1312 *output_ptr = output_stop;
1313 ec->in_data_start += len;
1315 goto gotresult;
1316 }
1317 len = ec->in_data_end - ec->in_data_start;
1318 memcpy(*output_ptr, ec->in_data_start, len);
1319 *output_ptr += len;
1320 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1321 if (flags & ECONV_AFTER_OUTPUT) {
1322 res = econv_after_output;
1323 goto gotresult;
1324 }
1325 }
1326 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1327 len = output_stop - *output_ptr;
1328 }
1329 else {
1330 len = input_stop - *input_ptr;
1331 }
1332 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1333 *(*output_ptr)++ = *(*input_ptr)++;
1334 res = econv_after_output;
1335 goto gotresult;
1336 }
1337 memcpy(*output_ptr, *input_ptr, len);
1338 *output_ptr += len;
1339 *input_ptr += len;
1340 if (*input_ptr != input_stop)
1342 else if (flags & ECONV_PARTIAL_INPUT)
1344 else
1345 res = econv_finished;
1346 goto gotresult;
1347 }
1348
1349 if (ec->elems[ec->num_trans-1].out_data_start) {
1350 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1351 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1352 if (data_start != data_end) {
1353 size_t len;
1354 if (output_stop - *output_ptr < data_end - data_start) {
1355 len = output_stop - *output_ptr;
1356 memcpy(*output_ptr, data_start, len);
1357 *output_ptr = output_stop;
1358 ec->elems[ec->num_trans-1].out_data_start += len;
1360 goto gotresult;
1361 }
1362 len = data_end - data_start;
1363 memcpy(*output_ptr, data_start, len);
1364 *output_ptr += len;
1365 ec->elems[ec->num_trans-1].out_data_start =
1366 ec->elems[ec->num_trans-1].out_data_end =
1367 ec->elems[ec->num_trans-1].out_buf_start;
1368 has_output = 1;
1369 }
1370 }
1371
1372 if (ec->in_buf_start &&
1373 ec->in_data_start != ec->in_data_end) {
1374 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1375 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1376 if (res != econv_source_buffer_empty)
1377 goto gotresult;
1378 }
1379
1380 if (has_output &&
1381 (flags & ECONV_AFTER_OUTPUT) &&
1382 *input_ptr != input_stop) {
1383 input_stop = *input_ptr;
1384 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1385 if (res == econv_source_buffer_empty)
1386 res = econv_after_output;
1387 }
1388 else if ((flags & ECONV_AFTER_OUTPUT) ||
1389 ec->num_trans == 1) {
1390 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1391 }
1392 else {
1393 flags |= ECONV_AFTER_OUTPUT;
1394 do {
1395 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1396 } while (res == econv_after_output);
1397 }
1398
1399 gotresult:
1400 ec->last_error.result = res;
1401 if (res == econv_invalid_byte_sequence ||
1402 res == econv_incomplete_input ||
1404 rb_transcoding *error_tc = ec->elems[result_position].tc;
1405 ec->last_error.error_tc = error_tc;
1406 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1407 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1408 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1409 ec->last_error.error_bytes_len = error_tc->recognized_len;
1410 ec->last_error.readagain_len = error_tc->readagain_len;
1411 }
1412
1413 return res;
1414}
1415
1416static int output_replacement_character(rb_econv_t *ec);
1417
1418static int
1419output_hex_charref(rb_econv_t *ec)
1420{
1421 int ret;
1422 unsigned char utfbuf[1024];
1423 const unsigned char *utf;
1424 size_t utf_len;
1425 int utf_allocated = 0;
1426 char charef_buf[16];
1427 const unsigned char *p;
1428
1429 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1430 utf = ec->last_error.error_bytes_start;
1431 utf_len = ec->last_error.error_bytes_len;
1432 }
1433 else {
1434 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1435 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1436 utfbuf, sizeof(utfbuf),
1437 &utf_len);
1438 if (!utf)
1439 return -1;
1440 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1441 utf_allocated = 1;
1442 }
1443
1444 if (utf_len % 4 != 0)
1445 goto fail;
1446
1447 p = utf;
1448 while (4 <= utf_len) {
1449 unsigned int u = 0;
1450 u += p[0] << 24;
1451 u += p[1] << 16;
1452 u += p[2] << 8;
1453 u += p[3];
1454 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1455
1456 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1457 if (ret == -1)
1458 goto fail;
1459
1460 p += 4;
1461 utf_len -= 4;
1462 }
1463
1464 if (utf_allocated)
1465 xfree((void *)utf);
1466 return 0;
1467
1468 fail:
1469 if (utf_allocated)
1470 xfree((void *)utf);
1471 return -1;
1472}
1473
1476 const unsigned char **input_ptr, const unsigned char *input_stop,
1477 unsigned char **output_ptr, unsigned char *output_stop,
1478 int flags)
1479{
1481
1482 unsigned char empty_buf;
1483 unsigned char *empty_ptr = &empty_buf;
1484
1485 ec->started = 1;
1486
1487 if (!input_ptr) {
1488 input_ptr = (const unsigned char **)&empty_ptr;
1489 input_stop = empty_ptr;
1490 }
1491
1492 if (!output_ptr) {
1493 output_ptr = &empty_ptr;
1494 output_stop = empty_ptr;
1495 }
1496
1497 resume:
1498 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1499
1500 if (ret == econv_invalid_byte_sequence ||
1501 ret == econv_incomplete_input) {
1502 /* deal with invalid byte sequence */
1503 /* todo: add more alternative behaviors */
1504 switch (ec->flags & ECONV_INVALID_MASK) {
1506 if (output_replacement_character(ec) == 0)
1507 goto resume;
1508 }
1509 }
1510
1511 if (ret == econv_undefined_conversion) {
1512 /* valid character in source encoding
1513 * but no related character(s) in destination encoding */
1514 /* todo: add more alternative behaviors */
1515 switch (ec->flags & ECONV_UNDEF_MASK) {
1517 if (output_replacement_character(ec) == 0)
1518 goto resume;
1519 break;
1520
1522 if (output_hex_charref(ec) == 0)
1523 goto resume;
1524 break;
1525 }
1526 }
1527
1528 return ret;
1529}
1530
1531const char *
1533{
1534 rb_transcoding *tc = ec->last_tc;
1535 const rb_transcoder *tr;
1536
1537 if (tc == NULL)
1538 return "";
1539
1540 tr = tc->transcoder;
1541
1542 if (tr->asciicompat_type == asciicompat_encoder)
1543 return tr->src_encoding;
1544 return tr->dst_encoding;
1545}
1546
1547static unsigned char *
1548allocate_converted_string(const char *sname, const char *dname,
1549 const unsigned char *str, size_t len,
1550 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1551 size_t *dst_len_ptr)
1552{
1553 unsigned char *dst_str;
1554 size_t dst_len;
1555 size_t dst_bufsize;
1556
1557 rb_econv_t *ec;
1559
1560 const unsigned char *sp;
1561 unsigned char *dp;
1562
1563 if (caller_dst_buf)
1564 dst_bufsize = caller_dst_bufsize;
1565 else if (len == 0)
1566 dst_bufsize = 1;
1567 else
1568 dst_bufsize = len;
1569
1570 ec = rb_econv_open(sname, dname, 0);
1571 if (ec == NULL)
1572 return NULL;
1573 if (caller_dst_buf)
1574 dst_str = caller_dst_buf;
1575 else
1576 dst_str = xmalloc(dst_bufsize);
1577 dst_len = 0;
1578 sp = str;
1579 dp = dst_str+dst_len;
1580 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1581 dst_len = dp - dst_str;
1582 while (res == econv_destination_buffer_full) {
1583 if (SIZE_MAX/2 < dst_bufsize) {
1584 goto fail;
1585 }
1586 dst_bufsize *= 2;
1587 if (dst_str == caller_dst_buf) {
1588 unsigned char *tmp;
1589 tmp = xmalloc(dst_bufsize);
1590 memcpy(tmp, dst_str, dst_bufsize/2);
1591 dst_str = tmp;
1592 }
1593 else {
1594 dst_str = xrealloc(dst_str, dst_bufsize);
1595 }
1596 dp = dst_str+dst_len;
1597 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1598 dst_len = dp - dst_str;
1599 }
1600 if (res != econv_finished) {
1601 goto fail;
1602 }
1603 rb_econv_close(ec);
1604 *dst_len_ptr = dst_len;
1605 return dst_str;
1606
1607 fail:
1608 if (dst_str != caller_dst_buf)
1609 xfree(dst_str);
1610 rb_econv_close(ec);
1611 return NULL;
1612}
1613
1614/* result: 0:success -1:failure */
1615int
1617 const unsigned char *str, size_t len, const char *str_encoding)
1618{
1619 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1620 unsigned char insert_buf[4096];
1621 const unsigned char *insert_str = NULL;
1622 size_t insert_len;
1623
1624 int last_trans_index;
1625 rb_transcoding *tc;
1626
1627 unsigned char **buf_start_p;
1628 unsigned char **data_start_p;
1629 unsigned char **data_end_p;
1630 unsigned char **buf_end_p;
1631
1632 size_t need;
1633
1634 ec->started = 1;
1635
1636 if (len == 0)
1637 return 0;
1638
1639 if (encoding_equal(insert_encoding, str_encoding)) {
1640 insert_str = str;
1641 insert_len = len;
1642 }
1643 else {
1644 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1645 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1646 if (insert_str == NULL)
1647 return -1;
1648 }
1649
1650 need = insert_len;
1651
1652 last_trans_index = ec->num_trans-1;
1653 if (ec->num_trans == 0) {
1654 tc = NULL;
1655 buf_start_p = &ec->in_buf_start;
1656 data_start_p = &ec->in_data_start;
1657 data_end_p = &ec->in_data_end;
1658 buf_end_p = &ec->in_buf_end;
1659 }
1660 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1661 tc = ec->elems[last_trans_index].tc;
1662 need += tc->readagain_len;
1663 if (need < insert_len)
1664 goto fail;
1665 if (last_trans_index == 0) {
1666 buf_start_p = &ec->in_buf_start;
1667 data_start_p = &ec->in_data_start;
1668 data_end_p = &ec->in_data_end;
1669 buf_end_p = &ec->in_buf_end;
1670 }
1671 else {
1672 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1673 buf_start_p = &ee->out_buf_start;
1674 data_start_p = &ee->out_data_start;
1675 data_end_p = &ee->out_data_end;
1676 buf_end_p = &ee->out_buf_end;
1677 }
1678 }
1679 else {
1680 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1681 buf_start_p = &ee->out_buf_start;
1682 data_start_p = &ee->out_data_start;
1683 data_end_p = &ee->out_data_end;
1684 buf_end_p = &ee->out_buf_end;
1685 tc = ec->elems[last_trans_index].tc;
1686 }
1687
1688 if (*buf_start_p == NULL) {
1689 unsigned char *buf = xmalloc(need);
1690 *buf_start_p = buf;
1691 *data_start_p = buf;
1692 *data_end_p = buf;
1693 *buf_end_p = buf+need;
1694 }
1695 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1696 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1697 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1698 *data_start_p = *buf_start_p;
1699 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1700 unsigned char *buf;
1701 size_t s = (*data_end_p - *buf_start_p) + need;
1702 if (s < need)
1703 goto fail;
1704 buf = xrealloc(*buf_start_p, s);
1705 *data_start_p = buf;
1706 *data_end_p = buf + (*data_end_p - *buf_start_p);
1707 *buf_start_p = buf;
1708 *buf_end_p = buf + s;
1709 }
1710 }
1711
1712 memcpy(*data_end_p, insert_str, insert_len);
1713 *data_end_p += insert_len;
1714 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1715 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1716 *data_end_p += tc->readagain_len;
1717 tc->readagain_len = 0;
1718 }
1719
1720 if (insert_str != str && insert_str != insert_buf)
1721 xfree((void*)insert_str);
1722 return 0;
1723
1724 fail:
1725 if (insert_str != str && insert_str != insert_buf)
1726 xfree((void*)insert_str);
1727 return -1;
1728}
1729
1730void
1732{
1733 int i;
1734
1735 if (ec->replacement_allocated) {
1736 xfree((void *)ec->replacement_str);
1737 }
1738 for (i = 0; i < ec->num_trans; i++) {
1739 rb_transcoding_close(ec->elems[i].tc);
1740 xfree(ec->elems[i].out_buf_start);
1741 }
1742 xfree(ec->in_buf_start);
1743 xfree(ec->elems);
1744 xfree(ec);
1745}
1746
1747size_t
1748rb_econv_memsize(rb_econv_t *ec)
1749{
1750 size_t size = sizeof(rb_econv_t);
1751 int i;
1752
1753 if (ec->replacement_allocated) {
1754 size += ec->replacement_len;
1755 }
1756 for (i = 0; i < ec->num_trans; i++) {
1757 size += rb_transcoding_memsize(ec->elems[i].tc);
1758
1759 if (ec->elems[i].out_buf_start) {
1760 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1761 }
1762 }
1763 size += ec->in_buf_end - ec->in_buf_start;
1764 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1765
1766 return size;
1767}
1768
1769int
1771{
1772 if (ec->num_trans == 0)
1773 return 0;
1774#if SIZEOF_SIZE_T > SIZEOF_INT
1775 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1776#endif
1777 return (int)ec->elems[0].tc->readagain_len;
1778}
1779
1780void
1781rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1782{
1783 rb_transcoding *tc;
1784 if (ec->num_trans == 0 || n == 0)
1785 return;
1786 tc = ec->elems[0].tc;
1787 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1788 tc->readagain_len -= n;
1789}
1790
/* argument/result record passed to asciicompat_encoding_i() via st_foreach() */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;   /* output: set by the callback, NULL if none found */
    const char *ascii_incompat_name; /* input: the encoding being looked up */
};
1795
1796static int
1797asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1798{
1799 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1800 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1801 const rb_transcoder *tr;
1802
1803 if (DECORATOR_P(entry->sname, entry->dname))
1804 return ST_CONTINUE;
1805 tr = load_transcoder_entry(entry);
1806 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1807 data->ascii_compat_name = tr->dst_encoding;
1808 return ST_STOP;
1809 }
1810 return ST_CONTINUE;
1811}
1812
1813const char *
1814rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1815{
1816 st_data_t v;
1817 st_table *table2;
1818 struct asciicompat_encoding_t data;
1819
1820 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1821 return NULL;
1822 table2 = (st_table *)v;
1823
1824 /*
1825 * Assumption:
1826 * There is at most one transcoder for
1827 * converting from ASCII incompatible encoding.
1828 *
1829 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1830 */
1831 if (table2->num_entries != 1)
1832 return NULL;
1833
1834 data.ascii_incompat_name = ascii_incompat_name;
1835 data.ascii_compat_name = NULL;
1836 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1837 return data.ascii_compat_name;
1838}
1839
1840/*
1841 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1842 *
1843 * If the result of the conversion is not compatible with the encoding of
1844 * `dst`, `dst` may not be valid encoding.
1845 */
1846VALUE
1847rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1848{
1849 unsigned const char *sp, *se;
1850 unsigned char *ds, *dp, *de;
1852 int max_output;
1853 enum ruby_coderange_type coderange;
1854 rb_encoding *dst_enc = ec->destination_encoding;
1855
1856 if (NIL_P(dst)) {
1857 dst = rb_str_buf_new(len);
1858 if (dst_enc) {
1859 rb_enc_associate(dst, dst_enc);
1860 }
1861 coderange = ENC_CODERANGE_7BIT; // scan from the start
1862 }
1863 else {
1864 dst_enc = rb_enc_get(dst);
1865 coderange = rb_enc_str_coderange(dst);
1866 }
1867
1868 if (ec->last_tc)
1869 max_output = ec->last_tc->transcoder->max_output;
1870 else
1871 max_output = 1;
1872
1873 do {
1874 int cr;
1875 long dlen = RSTRING_LEN(dst);
1876 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1877 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1878 if (LONG_MAX < new_capa)
1879 rb_raise(rb_eArgError, "too long string");
1880 rb_str_modify_expand(dst, new_capa - dlen);
1881 }
1882 sp = (const unsigned char *)ss;
1883 se = sp + len;
1884 ds = (unsigned char *)RSTRING_PTR(dst);
1885 de = ds + rb_str_capacity(dst);
1886 dp = ds += dlen;
1887 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1888 switch (coderange) {
1889 case ENC_CODERANGE_7BIT:
1891 cr = (int)coderange;
1892 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1893 coderange = cr;
1894 ENC_CODERANGE_SET(dst, coderange);
1895 break;
1898 break;
1899 }
1900 len -= (const char *)sp - ss;
1901 ss = (const char *)sp;
1902 rb_str_set_len(dst, dlen + (dp - ds));
1904 } while (res == econv_destination_buffer_full);
1905
1906 return dst;
1907}
1908
1909VALUE
1910rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1911{
1912 src = rb_str_new_frozen(src);
1913 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1914 RB_GC_GUARD(src);
1915 return dst;
1916}
1917
1918VALUE
1920{
1921 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1922}
1923
1924VALUE
1925rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1926{
1927 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1928}
1929
1930VALUE
1932{
1933 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1934}
1935
1936static int
1937rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1938{
1939 transcoder_entry_t *entry;
1940 const rb_transcoder *tr;
1941
1942 if (ec->started != 0)
1943 return -1;
1944
1945 entry = get_transcoder_entry(sname, dname);
1946 if (!entry)
1947 return -1;
1948
1949 tr = load_transcoder_entry(entry);
1950 if (!tr) return -1;
1951
1952 return rb_econv_add_transcoder_at(ec, tr, n);
1953}
1954
1955static int
1956rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1957{
1958 return rb_econv_add_converter(ec, "", decorator_name, n);
1959}
1960
1961int
1962rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1963{
1964 const rb_transcoder *tr;
1965
1966 if (ec->num_trans == 0)
1967 return rb_econv_decorate_at(ec, decorator_name, 0);
1968
1969 tr = ec->elems[0].tc->transcoder;
1970
1971 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1972 tr->asciicompat_type == asciicompat_decoder)
1973 return rb_econv_decorate_at(ec, decorator_name, 1);
1974
1975 return rb_econv_decorate_at(ec, decorator_name, 0);
1976}
1977
1978int
1979rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1980{
1981 const rb_transcoder *tr;
1982
1983 if (ec->num_trans == 0)
1984 return rb_econv_decorate_at(ec, decorator_name, 0);
1985
1986 tr = ec->elems[ec->num_trans-1].tc->transcoder;
1987
1988 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1989 tr->asciicompat_type == asciicompat_encoder)
1990 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1991
1992 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1993}
1994
1995void
1997{
1998 const char *dname = 0;
1999
2000 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2002 dname = "universal_newline";
2003 break;
2005 dname = "crlf_newline";
2006 break;
2008 dname = "cr_newline";
2009 break;
2011 dname = "lf_newline";
2012 break;
2013 }
2014
2015 if (dname) {
2016 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2017 int num_trans = ec->num_trans;
2018 int i, j = 0;
2019
2020 for (i=0; i < num_trans; i++) {
2021 if (transcoder == ec->elems[i].tc->transcoder) {
2022 rb_transcoding_close(ec->elems[i].tc);
2023 xfree(ec->elems[i].out_buf_start);
2024 ec->num_trans--;
2025 }
2026 else
2027 ec->elems[j++] = ec->elems[i];
2028 }
2029 }
2030
2031 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2032}
2033
2034static VALUE
2035econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2036{
2037 int has_description = 0;
2038
2039 if (NIL_P(mesg))
2040 mesg = rb_str_new(NULL, 0);
2041
2042 if (*sname != '\0' || *dname != '\0') {
2043 if (*sname == '\0')
2044 rb_str_cat2(mesg, dname);
2045 else if (*dname == '\0')
2046 rb_str_cat2(mesg, sname);
2047 else
2048 rb_str_catf(mesg, "%s to %s", sname, dname);
2049 has_description = 1;
2050 }
2051
2052 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2056 const char *pre = "";
2057 if (has_description)
2058 rb_str_cat2(mesg, " with ");
2059 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2060 rb_str_cat2(mesg, pre); pre = ",";
2061 rb_str_cat2(mesg, "universal_newline");
2062 }
2063 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2064 rb_str_cat2(mesg, pre); pre = ",";
2065 rb_str_cat2(mesg, "crlf_newline");
2066 }
2067 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2068 rb_str_cat2(mesg, pre); pre = ",";
2069 rb_str_cat2(mesg, "cr_newline");
2070 }
2071 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2072 rb_str_cat2(mesg, pre); pre = ",";
2073 rb_str_cat2(mesg, "lf_newline");
2074 }
2075 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2076 rb_str_cat2(mesg, pre); pre = ",";
2077 rb_str_cat2(mesg, "xml_text");
2078 }
2079 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2080 rb_str_cat2(mesg, pre); pre = ",";
2081 rb_str_cat2(mesg, "xml_attr_content");
2082 }
2083 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2084 rb_str_cat2(mesg, pre); pre = ",";
2085 rb_str_cat2(mesg, "xml_attr_quote");
2086 }
2087 has_description = 1;
2088 }
2089 if (!has_description) {
2090 rb_str_cat2(mesg, "no-conversion");
2091 }
2092
2093 return mesg;
2094}
2095
2096VALUE
2097rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2098{
2099 VALUE mesg, exc;
2100 mesg = rb_str_new_cstr("code converter not found (");
2101 econv_description(sname, dname, ecflags, mesg);
2102 rb_str_cat2(mesg, ")");
2103 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2104 return exc;
2105}
2106
2107static VALUE
2108make_econv_exception(rb_econv_t *ec)
2109{
2110 VALUE mesg, exc;
2111 if (ec->last_error.result == econv_invalid_byte_sequence ||
2112 ec->last_error.result == econv_incomplete_input) {
2113 const char *err = (const char *)ec->last_error.error_bytes_start;
2114 size_t error_len = ec->last_error.error_bytes_len;
2115 VALUE bytes = rb_str_new(err, error_len);
2116 VALUE dumped = rb_str_dump(bytes);
2117 size_t readagain_len = ec->last_error.readagain_len;
2118 VALUE bytes2 = Qnil;
2119 VALUE dumped2;
2120 if (ec->last_error.result == econv_incomplete_input) {
2121 mesg = rb_sprintf("incomplete %s on %s",
2122 StringValueCStr(dumped),
2123 ec->last_error.source_encoding);
2124 }
2125 else if (readagain_len) {
2126 bytes2 = rb_str_new(err+error_len, readagain_len);
2127 dumped2 = rb_str_dump(bytes2);
2128 mesg = rb_sprintf("%s followed by %s on %s",
2129 StringValueCStr(dumped),
2130 StringValueCStr(dumped2),
2131 ec->last_error.source_encoding);
2132 }
2133 else {
2134 mesg = rb_sprintf("%s on %s",
2135 StringValueCStr(dumped),
2136 ec->last_error.source_encoding);
2137 }
2138
2139 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2140 rb_ivar_set(exc, id_error_bytes, bytes);
2141 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2142 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2143 goto set_encs;
2144 }
2145 if (ec->last_error.result == econv_undefined_conversion) {
2146 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2147 ec->last_error.error_bytes_len);
2148 VALUE dumped = Qnil;
2149 int idx;
2150 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2151 rb_encoding *utf8 = rb_utf8_encoding();
2152 const char *start, *end;
2153 int n;
2154 start = (const char *)ec->last_error.error_bytes_start;
2155 end = start + ec->last_error.error_bytes_len;
2156 n = rb_enc_precise_mbclen(start, end, utf8);
2157 if (MBCLEN_CHARFOUND_P(n) &&
2158 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2159 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2160 dumped = rb_sprintf("U+%04X", cc);
2161 }
2162 }
2163 if (NIL_P(dumped))
2164 dumped = rb_str_dump(bytes);
2165 if (strcmp(ec->last_error.source_encoding,
2166 ec->source_encoding_name) == 0 &&
2167 strcmp(ec->last_error.destination_encoding,
2168 ec->destination_encoding_name) == 0) {
2169 mesg = rb_sprintf("%s from %s to %s",
2170 StringValueCStr(dumped),
2171 ec->last_error.source_encoding,
2172 ec->last_error.destination_encoding);
2173 }
2174 else {
2175 int i;
2176 mesg = rb_sprintf("%s to %s in conversion from %s",
2177 StringValueCStr(dumped),
2178 ec->last_error.destination_encoding,
2179 ec->source_encoding_name);
2180 for (i = 0; i < ec->num_trans; i++) {
2181 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2182 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2183 rb_str_catf(mesg, " to %s",
2184 ec->elems[i].tc->transcoder->dst_encoding);
2185 }
2186 }
2187 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2188 idx = rb_enc_find_index(ec->last_error.source_encoding);
2189 if (0 <= idx)
2190 rb_enc_associate_index(bytes, idx);
2191 rb_ivar_set(exc, id_error_char, bytes);
2192 goto set_encs;
2193 }
2194 return Qnil;
2195
2196 set_encs:
2197 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2198 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2199 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2200 if (0 <= idx)
2201 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2202 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2203 if (0 <= idx)
2204 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2205 return exc;
2206}
2207
2208static void
2209more_output_buffer(
2210 VALUE destination,
2211 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2212 int max_output,
2213 unsigned char **out_start_ptr,
2214 unsigned char **out_pos,
2215 unsigned char **out_stop_ptr)
2216{
2217 size_t len = (*out_pos - *out_start_ptr);
2218 size_t new_len = (len + max_output) * 2;
2219 *out_start_ptr = resize_destination(destination, len, new_len);
2220 *out_pos = *out_start_ptr + len;
2221 *out_stop_ptr = *out_start_ptr + new_len;
2222}
2223
2224static int
2225make_replacement(rb_econv_t *ec)
2226{
2227 rb_transcoding *tc;
2228 const rb_transcoder *tr;
2229 const unsigned char *replacement;
2230 const char *repl_enc;
2231 const char *ins_enc;
2232 size_t len;
2233
2234 if (ec->replacement_str)
2235 return 0;
2236
2238
2239 tc = ec->last_tc;
2240 if (*ins_enc) {
2241 tr = tc->transcoder;
2242 rb_enc_find(tr->dst_encoding);
2243 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2244 }
2245 else {
2246 replacement = (unsigned char *)"?";
2247 len = 1;
2248 repl_enc = "";
2249 }
2250
2251 ec->replacement_str = replacement;
2252 ec->replacement_len = len;
2253 ec->replacement_enc = repl_enc;
2254 ec->replacement_allocated = 0;
2255 return 0;
2256}
2257
2258int
2260 const unsigned char *str, size_t len, const char *encname)
2261{
2262 unsigned char *str2;
2263 size_t len2;
2264 const char *encname2;
2265
2267
2268 if (!*encname2 || encoding_equal(encname, encname2)) {
2269 str2 = xmalloc(len);
2270 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2271 len2 = len;
2272 encname2 = encname;
2273 }
2274 else {
2275 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2276 if (!str2)
2277 return -1;
2278 }
2279
2280 if (ec->replacement_allocated) {
2281 xfree((void *)ec->replacement_str);
2282 }
2283 ec->replacement_allocated = 1;
2284 ec->replacement_str = str2;
2285 ec->replacement_len = len2;
2286 ec->replacement_enc = encname2;
2287 return 0;
2288}
2289
2290static int
2291output_replacement_character(rb_econv_t *ec)
2292{
2293 int ret;
2294
2295 if (make_replacement(ec) == -1)
2296 return -1;
2297
2298 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2299 if (ret == -1)
2300 return -1;
2301
2302 return 0;
2303}
2304
2305#if 1
2306#define hash_fallback rb_hash_aref
2307
2308static VALUE
2309proc_fallback(VALUE fallback, VALUE c)
2310{
2311 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2312}
2313
2314static VALUE
2315method_fallback(VALUE fallback, VALUE c)
2316{
2317 return rb_method_call(1, &c, fallback);
2318}
2319
2320static VALUE
2321aref_fallback(VALUE fallback, VALUE c)
2322{
2323 return rb_funcallv_public(fallback, idAREF, 1, &c);
2324}
2325
2326static void
2327transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2328 const unsigned char *in_stop, unsigned char *out_stop,
2329 VALUE destination,
2330 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2331 const char *src_encoding,
2332 const char *dst_encoding,
2333 int ecflags,
2334 VALUE ecopts)
2335{
2336 rb_econv_t *ec;
2337 rb_transcoding *last_tc;
2339 unsigned char *out_start = *out_pos;
2340 int max_output;
2341 VALUE exc;
2342 VALUE fallback = Qnil;
2343 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2344
2345 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2346 if (!ec)
2347 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2348
2349 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2350 fallback = rb_hash_aref(ecopts, sym_fallback);
2351 if (RB_TYPE_P(fallback, T_HASH)) {
2352 fallback_func = hash_fallback;
2353 }
2354 else if (rb_obj_is_proc(fallback)) {
2355 fallback_func = proc_fallback;
2356 }
2357 else if (rb_obj_is_method(fallback)) {
2358 fallback_func = method_fallback;
2359 }
2360 else {
2361 fallback_func = aref_fallback;
2362 }
2363 }
2364 last_tc = ec->last_tc;
2365 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2366
2367 resume:
2368 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2369
2370 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2371 VALUE rep = rb_enc_str_new(
2372 (const char *)ec->last_error.error_bytes_start,
2373 ec->last_error.error_bytes_len,
2374 rb_enc_find(ec->last_error.source_encoding));
2375 rep = (*fallback_func)(fallback, rep);
2376 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2377 StringValue(rep);
2378 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2379 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2380 if ((int)ret == -1) {
2381 rb_raise(rb_eArgError, "too big fallback string");
2382 }
2383 goto resume;
2384 }
2385 }
2386
2387 if (ret == econv_invalid_byte_sequence ||
2388 ret == econv_incomplete_input ||
2390 exc = make_econv_exception(ec);
2391 rb_econv_close(ec);
2392 rb_exc_raise(exc);
2393 }
2394
2395 if (ret == econv_destination_buffer_full) {
2396 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2397 goto resume;
2398 }
2399
2400 rb_econv_close(ec);
2401 return;
2402}
2403#else
2404/* sample transcode_loop implementation in byte-by-byte stream style */
2405static void
2406transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2407 const unsigned char *in_stop, unsigned char *out_stop,
2408 VALUE destination,
2409 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2410 const char *src_encoding,
2411 const char *dst_encoding,
2412 int ecflags,
2413 VALUE ecopts)
2414{
2415 rb_econv_t *ec;
2416 rb_transcoding *last_tc;
2418 unsigned char *out_start = *out_pos;
2419 const unsigned char *ptr;
2420 int max_output;
2421 VALUE exc;
2422
2423 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2424 if (!ec)
2425 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2426
2427 last_tc = ec->last_tc;
2428 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2429
2431 ptr = *in_pos;
2432 while (ret != econv_finished) {
2433 unsigned char input_byte;
2434 const unsigned char *p = &input_byte;
2435
2436 if (ret == econv_source_buffer_empty) {
2437 if (ptr < in_stop) {
2438 input_byte = *ptr;
2439 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2440 }
2441 else {
2442 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2443 }
2444 }
2445 else {
2446 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2447 }
2448 if (&input_byte != p)
2449 ptr += p - &input_byte;
2450 switch (ret) {
2454 exc = make_econv_exception(ec);
2455 rb_econv_close(ec);
2456 rb_exc_raise(exc);
2457 break;
2458
2460 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2461 break;
2462
2464 break;
2465
2466 case econv_finished:
2467 break;
2468 }
2469 }
2470 rb_econv_close(ec);
2471 *in_pos = in_stop;
2472 return;
2473}
2474#endif
2475
2476
2477/*
2478 * String-specific code
2479 */
2480
2481static unsigned char *
2482str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2483{
2484 rb_str_resize(destination, new_len);
2485 return (unsigned char *)RSTRING_PTR(destination);
2486}
2487
2488static int
2489econv_opts(VALUE opt, int ecflags)
2490{
2491 VALUE v;
2492 int newlineflag = 0;
2493
2494 v = rb_hash_aref(opt, sym_invalid);
2495 if (NIL_P(v)) {
2496 }
2497 else if (v==sym_replace) {
2498 ecflags |= ECONV_INVALID_REPLACE;
2499 }
2500 else {
2501 rb_raise(rb_eArgError, "unknown value for invalid character option");
2502 }
2503
2504 v = rb_hash_aref(opt, sym_undef);
2505 if (NIL_P(v)) {
2506 }
2507 else if (v==sym_replace) {
2508 ecflags |= ECONV_UNDEF_REPLACE;
2509 }
2510 else {
2511 rb_raise(rb_eArgError, "unknown value for undefined character option");
2512 }
2513
2514 v = rb_hash_aref(opt, sym_replace);
2515 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2516 ecflags |= ECONV_UNDEF_REPLACE;
2517 }
2518
2519 v = rb_hash_aref(opt, sym_xml);
2520 if (!NIL_P(v)) {
2521 if (v==sym_text) {
2523 }
2524 else if (v==sym_attr) {
2526 }
2527 else if (SYMBOL_P(v)) {
2528 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2529 }
2530 else {
2531 rb_raise(rb_eArgError, "unexpected value for xml option");
2532 }
2533 }
2534
2535#ifdef ENABLE_ECONV_NEWLINE_OPTION
2536 v = rb_hash_aref(opt, sym_newline);
2537 if (!NIL_P(v)) {
2538 newlineflag = 2;
2539 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2540 if (v == sym_universal) {
2542 }
2543 else if (v == sym_crlf) {
2545 }
2546 else if (v == sym_cr) {
2547 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2548 }
2549 else if (v == sym_lf) {
2550 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2551 }
2552 else if (SYMBOL_P(v)) {
2553 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2554 rb_sym2str(v));
2555 }
2556 else {
2557 rb_raise(rb_eArgError, "unexpected value for newline option");
2558 }
2559 }
2560#endif
2561 {
2562 int setflags = 0;
2563
2564 v = rb_hash_aref(opt, sym_universal_newline);
2565 if (RTEST(v))
2567 newlineflag |= !NIL_P(v);
2568
2569 v = rb_hash_aref(opt, sym_crlf_newline);
2570 if (RTEST(v))
2571 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2572 newlineflag |= !NIL_P(v);
2573
2574 v = rb_hash_aref(opt, sym_cr_newline);
2575 if (RTEST(v))
2576 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2577 newlineflag |= !NIL_P(v);
2578
2579 v = rb_hash_aref(opt, sym_lf_newline);
2580 if (RTEST(v))
2581 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2582 newlineflag |= !NIL_P(v);
2583
2584 switch (newlineflag) {
2585 case 1:
2586 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2587 ecflags |= setflags;
2588 break;
2589
2590 case 3:
2591 rb_warning(":newline option precedes other newline options");
2592 break;
2593 }
2594 }
2595
2596 return ecflags;
2597}
2598
2599int
2600rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2601{
2602 VALUE newhash = Qnil;
2603 VALUE v;
2604
2605 if (NIL_P(opthash)) {
2606 *opts = Qnil;
2607 return ecflags;
2608 }
2609 ecflags = econv_opts(opthash, ecflags);
2610
2611 v = rb_hash_aref(opthash, sym_replace);
2612 if (!NIL_P(v)) {
2613 StringValue(v);
2614 if (is_broken_string(v)) {
2615 VALUE dumped = rb_str_dump(v);
2616 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2617 StringValueCStr(dumped),
2618 rb_enc_name(rb_enc_get(v)));
2619 }
2620 v = rb_str_new_frozen(v);
2621 newhash = rb_hash_new();
2622 rb_hash_aset(newhash, sym_replace, v);
2623 }
2624
2625 v = rb_hash_aref(opthash, sym_fallback);
2626 if (!NIL_P(v)) {
2627 VALUE h = rb_check_hash_type(v);
2628 if (NIL_P(h)
2629 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2630 : (v = h, 1)) {
2631 if (NIL_P(newhash))
2632 newhash = rb_hash_new();
2633 rb_hash_aset(newhash, sym_fallback, v);
2634 }
2635 }
2636
2637 if (!NIL_P(newhash))
2638 rb_hash_freeze(newhash);
2639 *opts = newhash;
2640
2641 return ecflags;
2642}
2643
2644int
2646{
2647 return rb_econv_prepare_options(opthash, opts, 0);
2648}
2649
2650rb_econv_t *
2651rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2652{
2653 rb_econv_t *ec;
2654 VALUE replacement;
2655
2656 if (NIL_P(opthash)) {
2657 replacement = Qnil;
2658 }
2659 else {
2660 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2661 rb_bug("rb_econv_open_opts called with invalid opthash");
2662 replacement = rb_hash_aref(opthash, sym_replace);
2663 }
2664
2665 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2666 if (!ec)
2667 return ec;
2668
2669 if (!NIL_P(replacement)) {
2670 int ret;
2671 rb_encoding *enc = rb_enc_get(replacement);
2672
2673 ret = rb_econv_set_replacement(ec,
2674 (const unsigned char *)RSTRING_PTR(replacement),
2675 RSTRING_LEN(replacement),
2676 rb_enc_name(enc));
2677 if (ret == -1) {
2678 rb_econv_close(ec);
2679 return NULL;
2680 }
2681 }
2682 return ec;
2683}
2684
2685static int
2686enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2687{
2688 rb_encoding *enc;
2689 const char *n;
2690 int encidx;
2691 VALUE encval;
2692
2693 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2694 !(enc = rb_enc_from_index(encidx))) {
2695 enc = NULL;
2696 encidx = 0;
2697 n = StringValueCStr(*arg);
2698 }
2699 else {
2700 n = rb_enc_name(enc);
2701 }
2702
2703 *name_p = n;
2704 *enc_p = enc;
2705
2706 return encidx;
2707}
2708
2709static int
2710str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2711 const char **sname_p, rb_encoding **senc_p,
2712 const char **dname_p, rb_encoding **denc_p)
2713{
2714 rb_encoding *senc, *denc;
2715 const char *sname, *dname;
2716 int sencidx, dencidx;
2717
2718 dencidx = enc_arg(arg1, &dname, &denc);
2719
2720 if (NIL_P(*arg2)) {
2721 sencidx = rb_enc_get_index(str);
2722 senc = rb_enc_from_index(sencidx);
2723 sname = rb_enc_name(senc);
2724 }
2725 else {
2726 sencidx = enc_arg(arg2, &sname, &senc);
2727 }
2728
2729 *sname_p = sname;
2730 *senc_p = senc;
2731 *dname_p = dname;
2732 *denc_p = denc;
2733 return dencidx;
2734}
2735
2736static int
2737str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2738{
2739 VALUE dest;
2740 VALUE str = *self;
2741 VALUE arg1, arg2;
2742 long blen, slen;
2743 unsigned char *buf, *bp, *sp;
2744 const unsigned char *fromp;
2745 rb_encoding *senc, *denc;
2746 const char *sname, *dname;
2747 int dencidx;
2748 int explicitly_invalid_replace = TRUE;
2749
2750 rb_check_arity(argc, 0, 2);
2751
2752 if (argc == 0) {
2753 arg1 = rb_enc_default_internal();
2754 if (NIL_P(arg1)) {
2755 if (!ecflags) return -1;
2756 arg1 = rb_obj_encoding(str);
2757 }
2758 if (!(ecflags & ECONV_INVALID_MASK)) {
2759 explicitly_invalid_replace = FALSE;
2760 }
2762 }
2763 else {
2764 arg1 = argv[0];
2765 }
2766 arg2 = argc<=1 ? Qnil : argv[1];
2767 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2768
2769 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2773 if (senc && senc == denc) {
2774 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2775 VALUE rep = Qnil;
2776 if (!NIL_P(ecopts)) {
2777 rep = rb_hash_aref(ecopts, sym_replace);
2778 }
2779 dest = rb_enc_str_scrub(senc, str, rep);
2780 if (NIL_P(dest)) dest = str;
2781 *self = dest;
2782 return dencidx;
2783 }
2784 return NIL_P(arg2) ? -1 : dencidx;
2785 }
2786 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2787 if (is_ascii_string(str)) {
2788 return dencidx;
2789 }
2790 }
2791 if (encoding_equal(sname, dname)) {
2792 return NIL_P(arg2) ? -1 : dencidx;
2793 }
2794 }
2795 else {
2796 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2797 rb_encoding *utf8 = rb_utf8_encoding();
2798 str = rb_str_conv_enc(str, senc, utf8);
2799 senc = utf8;
2800 sname = "UTF-8";
2801 }
2802 if (encoding_equal(sname, dname)) {
2803 sname = "";
2804 dname = "";
2805 }
2806 }
2807
2808 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2809 slen = RSTRING_LEN(str);
2810 blen = slen + 30; /* len + margin */
2811 dest = rb_str_tmp_new(blen);
2812 bp = (unsigned char *)RSTRING_PTR(dest);
2813
2814 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2815 if (fromp != sp+slen) {
2816 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2817 }
2818 buf = (unsigned char *)RSTRING_PTR(dest);
2819 *bp = '\0';
2820 rb_str_set_len(dest, bp - buf);
2821
2822 /* set encoding */
2823 if (!denc) {
2824 dencidx = rb_define_dummy_encoding(dname);
2825 RB_GC_GUARD(arg1);
2826 RB_GC_GUARD(arg2);
2827 }
2828 *self = dest;
2829
2830 return dencidx;
2831}
2832
2833static int
2834str_transcode(int argc, VALUE *argv, VALUE *self)
2835{
2836 VALUE opt;
2837 int ecflags = 0;
2838 VALUE ecopts = Qnil;
2839
2840 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2841 if (!NIL_P(opt)) {
2842 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2843 }
2844 return str_transcode0(argc, argv, self, ecflags, ecopts);
2845}
2846
2847static inline VALUE
2848str_encode_associate(VALUE str, int encidx)
2849{
2850 int cr = 0;
2851
2852 rb_enc_associate_index(str, encidx);
2853
2854 /* transcoded string never be broken. */
2855 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2856 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2857 }
2858 else {
2860 }
2861 ENC_CODERANGE_SET(str, cr);
2862 return str;
2863}
2864
2865/*
2866 * call-seq:
2867 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2868 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2869 *
2870 * Like #encode, but applies encoding changes to +self+; returns +self+.
2871 *
2872 */
2873
2874static VALUE
2875str_encode_bang(int argc, VALUE *argv, VALUE str)
2876{
2877 VALUE newstr;
2878 int encidx;
2879
2880 rb_check_frozen(str);
2881
2882 newstr = str;
2883 encidx = str_transcode(argc, argv, &newstr);
2884
2885 if (encidx < 0) return str;
2886 if (newstr == str) {
2887 rb_enc_associate_index(str, encidx);
2888 return str;
2889 }
2890 rb_str_shared_replace(str, newstr);
2891 return str_encode_associate(str, encidx);
2892}
2893
2894static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2895
2896/*
2897 * call-seq:
2898 * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2899 * encode(dst_encoding, src_encoding, **enc_opts) -> string
2900 *
2901 * :include: doc/string/encode.rdoc
2902 *
2903 */
2904
2905static VALUE
2906str_encode(int argc, VALUE *argv, VALUE str)
2907{
2908 VALUE newstr = str;
2909 int encidx = str_transcode(argc, argv, &newstr);
2910 return encoded_dup(newstr, str, encidx);
2911}
2912
2913VALUE
2914rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2915{
2916 int argc = 1;
2917 VALUE *argv = &to;
2918 VALUE newstr = str;
2919 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2920 return encoded_dup(newstr, str, encidx);
2921}
2922
2923static VALUE
2924encoded_dup(VALUE newstr, VALUE str, int encidx)
2925{
2926 if (encidx < 0) return rb_str_dup(str);
2927 if (newstr == str) {
2928 newstr = rb_str_dup(str);
2929 rb_enc_associate_index(newstr, encidx);
2930 return newstr;
2931 }
2932 else {
2933 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2934 }
2935 return str_encode_associate(newstr, encidx);
2936}
2937
2938/*
2939 * Document-class: Encoding::Converter
2940 *
2941 * Encoding conversion class.
2942 */
2943static void
2944econv_free(void *ptr)
2945{
2946 rb_econv_t *ec = ptr;
2947 rb_econv_close(ec);
2948}
2949
2950static size_t
2951econv_memsize(const void *ptr)
2952{
2953 return sizeof(rb_econv_t);
2954}
2955
2956static const rb_data_type_t econv_data_type = {
2957 "econv",
2958 {0, econv_free, econv_memsize,},
2959 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2960};
2961
2962static VALUE
2963econv_s_allocate(VALUE klass)
2964{
2965 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2966}
2967
2968static rb_encoding *
2969make_dummy_encoding(const char *name)
2970{
2971 rb_encoding *enc;
2972 int idx;
2973 idx = rb_define_dummy_encoding(name);
2974 enc = rb_enc_from_index(idx);
2975 return enc;
2976}
2977
2978static rb_encoding *
2979make_encoding(const char *name)
2980{
2981 rb_encoding *enc;
2982 enc = rb_enc_find(name);
2983 if (!enc)
2984 enc = make_dummy_encoding(name);
2985 return enc;
2986}
2987
2988static VALUE
2989make_encobj(const char *name)
2990{
2991 return rb_enc_from_encoding(make_encoding(name));
2992}
2993
2994/*
2995 * call-seq:
2996 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2997 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2998 *
2999 * Returns the corresponding ASCII compatible encoding.
3000 *
3001 * Returns nil if the argument is an ASCII compatible encoding.
3002 *
3003 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
3004 * can represent exactly the same characters as the given ASCII incompatible encoding.
3005 * So, no conversion undefined error occurs when converting between the two encodings.
3006 *
3007 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3008 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3009 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3010 *
3011 */
3012static VALUE
3013econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3014{
3015 const char *arg_name, *result_name;
3016 rb_encoding *arg_enc, *result_enc;
3017
3018 enc_arg(&arg, &arg_name, &arg_enc);
3019
3020 result_name = rb_econv_asciicompat_encoding(arg_name);
3021
3022 if (result_name == NULL)
3023 return Qnil;
3024
3025 result_enc = make_encoding(result_name);
3026
3027 return rb_enc_from_encoding(result_enc);
3028}
3029
3030static void
3031econv_args(int argc, VALUE *argv,
3032 VALUE *snamev_p, VALUE *dnamev_p,
3033 const char **sname_p, const char **dname_p,
3034 rb_encoding **senc_p, rb_encoding **denc_p,
3035 int *ecflags_p,
3036 VALUE *ecopts_p)
3037{
3038 VALUE opt, flags_v, ecopts;
3039 int sidx, didx;
3040 const char *sname, *dname;
3041 rb_encoding *senc, *denc;
3042 int ecflags;
3043
3044 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3045
3046 if (!NIL_P(flags_v)) {
3047 if (!NIL_P(opt)) {
3048 rb_error_arity(argc + 1, 2, 3);
3049 }
3050 ecflags = NUM2INT(rb_to_int(flags_v));
3051 ecopts = Qnil;
3052 }
3053 else if (!NIL_P(opt)) {
3054 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3055 }
3056 else {
3057 ecflags = 0;
3058 ecopts = Qnil;
3059 }
3060
3061 senc = NULL;
3062 sidx = rb_to_encoding_index(*snamev_p);
3063 if (0 <= sidx) {
3064 senc = rb_enc_from_index(sidx);
3065 }
3066 else {
3067 StringValue(*snamev_p);
3068 }
3069
3070 denc = NULL;
3071 didx = rb_to_encoding_index(*dnamev_p);
3072 if (0 <= didx) {
3073 denc = rb_enc_from_index(didx);
3074 }
3075 else {
3076 StringValue(*dnamev_p);
3077 }
3078
3079 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3080 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3081
3082 *sname_p = sname;
3083 *dname_p = dname;
3084 *senc_p = senc;
3085 *denc_p = denc;
3086 *ecflags_p = ecflags;
3087 *ecopts_p = ecopts;
3088}
3089
3090static int
3091decorate_convpath(VALUE convpath, int ecflags)
3092{
3093 int num_decorators;
3094 const char *decorators[MAX_ECFLAGS_DECORATORS];
3095 int i;
3096 int n, len;
3097
3098 num_decorators = decorator_names(ecflags, decorators);
3099 if (num_decorators == -1)
3100 return -1;
3101
3102 len = n = RARRAY_LENINT(convpath);
3103 if (n != 0) {
3104 VALUE pair = RARRAY_AREF(convpath, n-1);
3105 if (RB_TYPE_P(pair, T_ARRAY)) {
3106 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3107 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3108 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3109 const rb_transcoder *tr = load_transcoder_entry(entry);
3110 if (!tr)
3111 return -1;
3112 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3113 tr->asciicompat_type == asciicompat_encoder) {
3114 n--;
3115 rb_ary_store(convpath, len + num_decorators - 1, pair);
3116 }
3117 }
3118 else {
3119 rb_ary_store(convpath, len + num_decorators - 1, pair);
3120 }
3121 }
3122
3123 for (i = 0; i < num_decorators; i++)
3124 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3125
3126 return 0;
3127}
3128
3129static void
3130search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3131{
3132 VALUE *ary_p = arg;
3133 VALUE v;
3134
3135 if (NIL_P(*ary_p)) {
3136 *ary_p = rb_ary_new();
3137 }
3138
3139 if (DECORATOR_P(sname, dname)) {
3140 v = rb_str_new_cstr(dname);
3141 }
3142 else {
3143 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3144 }
3145 rb_ary_store(*ary_p, depth, v);
3146}
3147
3148/*
3149 * call-seq:
3150 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3151 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3152 *
3153 * Returns a conversion path.
3154 *
3155 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3156 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3157 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3158 *
3159 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3160 * or
3161 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3162 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3163 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3164 * # "universal_newline"]
3165 *
3166 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3167 * or
3168 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3169 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3170 * # "universal_newline",
3171 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3172 */
3173static VALUE
3174econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3175{
3176 VALUE snamev, dnamev;
3177 const char *sname, *dname;
3178 rb_encoding *senc, *denc;
3179 int ecflags;
3180 VALUE ecopts;
3181 VALUE convpath;
3182
3183 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3184
3185 convpath = Qnil;
3186 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3187
3188 if (NIL_P(convpath)) {
3189 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3190 RB_GC_GUARD(snamev);
3191 RB_GC_GUARD(dnamev);
3192 rb_exc_raise(exc);
3193 }
3194
3195 if (decorate_convpath(convpath, ecflags) == -1) {
3196 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3197 RB_GC_GUARD(snamev);
3198 RB_GC_GUARD(dnamev);
3199 rb_exc_raise(exc);
3200 }
3201
3202 return convpath;
3203}
3204
3205/*
3206 * Check the existence of a conversion path.
3207 * Returns the number of converters in the conversion path.
3208 * result: >=0:success -1:failure
3209 */
3210int
3211rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3212{
3213 VALUE convpath = Qnil;
3214 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3215 &convpath);
3216 return RTEST(convpath);
3217}
3218
3220 rb_econv_t *ec;
3221 int index;
3222 int ret;
3223};
3224
3225static void
3226rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3227{
3229 int ret;
3230
3231 if (a->ret == -1)
3232 return;
3233
3234 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3235
3236 a->ret = ret;
3237 return;
3238}
3239
3240static rb_econv_t *
3241rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3242 const char **sname_p, const char **dname_p,
3243 rb_encoding **senc_p, rb_encoding**denc_p)
3244{
3245 rb_econv_t *ec;
3246 long i;
3247 int ret, first=1;
3248 VALUE elt;
3249 rb_encoding *senc = 0, *denc = 0;
3250 const char *sname, *dname;
3251
3252 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3253 DATA_PTR(self) = ec;
3254
3255 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3256 VALUE snamev, dnamev;
3257 VALUE pair;
3258 elt = rb_ary_entry(convpath, i);
3259 if (!NIL_P(pair = rb_check_array_type(elt))) {
3260 if (RARRAY_LEN(pair) != 2)
3261 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3262 snamev = rb_ary_entry(pair, 0);
3263 enc_arg(&snamev, &sname, &senc);
3264 dnamev = rb_ary_entry(pair, 1);
3265 enc_arg(&dnamev, &dname, &denc);
3266 }
3267 else {
3268 sname = "";
3269 dname = StringValueCStr(elt);
3270 }
3271 if (DECORATOR_P(sname, dname)) {
3272 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3273 if (ret == -1) {
3274 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3275 RB_GC_GUARD(snamev);
3276 RB_GC_GUARD(dnamev);
3277 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3278 }
3279 }
3280 else {
3281 int j = ec->num_trans;
3282 struct rb_econv_init_by_convpath_t arg;
3283 arg.ec = ec;
3284 arg.index = ec->num_trans;
3285 arg.ret = 0;
3286 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3287 if (ret == -1 || arg.ret == -1) {
3288 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3289 RB_GC_GUARD(snamev);
3290 RB_GC_GUARD(dnamev);
3291 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3292 }
3293 if (first) {
3294 first = 0;
3295 *senc_p = senc;
3296 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3297 }
3298 *denc_p = denc;
3299 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3300 }
3301 }
3302
3303 if (first) {
3304 *senc_p = NULL;
3305 *denc_p = NULL;
3306 *sname_p = "";
3307 *dname_p = "";
3308 }
3309
3310 ec->source_encoding_name = *sname_p;
3311 ec->destination_encoding_name = *dname_p;
3312
3313 return ec;
3314}
3315
3316/*
3317 * call-seq:
3318 * Encoding::Converter.new(source_encoding, destination_encoding)
3319 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3320 * Encoding::Converter.new(convpath)
3321 *
3322 * possible options elements:
3323 * hash form:
3324 * :invalid => nil # raise error on invalid byte sequence (default)
3325 * :invalid => :replace # replace invalid byte sequence
3326 * :undef => nil # raise error on undefined conversion (default)
3327 * :undef => :replace # replace undefined conversion
3328 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3329 * :newline => :universal # decorator for converting CRLF and CR to LF
3330 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3331 * :newline => :crlf # decorator for converting LF to CRLF
3332 * :newline => :cr # decorator for converting LF to CR
3333 * :universal_newline => true # decorator for converting CRLF and CR to LF
3334 * :crlf_newline => true # decorator for converting LF to CRLF
3335 * :cr_newline => true # decorator for converting LF to CR
3336 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3337 * :xml => :text # escape as XML CharData.
3338 * :xml => :attr # escape as XML AttValue
3339 * integer form:
3340 * Encoding::Converter::INVALID_REPLACE
3341 * Encoding::Converter::UNDEF_REPLACE
3342 * Encoding::Converter::UNDEF_HEX_CHARREF
3343 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3344 * Encoding::Converter::LF_NEWLINE_DECORATOR
3345 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3346 * Encoding::Converter::CR_NEWLINE_DECORATOR
3347 * Encoding::Converter::XML_TEXT_DECORATOR
3348 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3349 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3350 *
3351 * Encoding::Converter.new creates an instance of Encoding::Converter.
3352 *
3353 * Source_encoding and destination_encoding should be a string or
3354 * Encoding object.
3355 *
3356 * opt should be nil, a hash or an integer.
3357 *
3358 * convpath should be an array.
3359 * convpath may contain
3360 * - two-element arrays which contain encodings or encoding names, or
3361 * - strings representing decorator names.
3362 *
3363 * Encoding::Converter.new optionally takes an option.
3364 * The option should be a hash or an integer.
3365 * The option hash can contain :invalid => nil, etc.
3366 * The option integer should be logical-or of constants such as
3367 * Encoding::Converter::INVALID_REPLACE, etc.
3368 *
3369 * [:invalid => nil]
3370 * Raise error on invalid byte sequence. This is a default behavior.
3371 * [:invalid => :replace]
3372 * Replace invalid byte sequence by replacement string.
3373 * [:undef => nil]
3374 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3375 * This is a default behavior.
3376 * [:undef => :replace]
3377 * Replace undefined character in destination_encoding with replacement string.
3378 * [:replace => string]
3379 * Specify the replacement string.
3380 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3381 * [:universal_newline => true]
3382 * Convert CRLF and CR to LF.
3383 * [:crlf_newline => true]
3384 * Convert LF to CRLF.
3385 * [:cr_newline => true]
3386 * Convert LF to CR.
3387 * [:lf_newline => true]
3388 * Convert CRLF and CR to LF (when writing).
3389 * [:xml => :text]
3390 * Escape as XML CharData.
3391 * This form can be used as an HTML 4.0 #PCDATA.
3392 * - '&' -> '&amp;'
3393 * - '<' -> '&lt;'
3394 * - '>' -> '&gt;'
3395 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3396 * [:xml => :attr]
3397 * Escape as XML AttValue.
3398 * The converted result is quoted as "...".
3399 * This form can be used as an HTML 4.0 attribute value.
3400 * - '&' -> '&amp;'
3401 * - '<' -> '&lt;'
3402 * - '>' -> '&gt;'
3403 * - '"' -> '&quot;'
3404 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3405 *
3406 * Examples:
3407 * # UTF-16BE to UTF-8
3408 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3409 *
3410 * # Usually, decorators such as newline conversion are inserted last.
3411 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3412 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3413 * # "universal_newline"]
3414 *
3415 * # But, if the last encoding is ASCII incompatible,
3416 * # decorators are inserted before the last conversion.
3417 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3418 * p ec.convpath #=> ["crlf_newline",
3419 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3420 *
3421 * # Conversion path can be specified directly.
3422 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3423 * p ec.convpath #=> ["universal_newline",
3424 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3425 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3426 */
3427static VALUE
3428econv_init(int argc, VALUE *argv, VALUE self)
3429{
3430 VALUE ecopts;
3431 VALUE snamev, dnamev;
3432 const char *sname, *dname;
3433 rb_encoding *senc, *denc;
3434 rb_econv_t *ec;
3435 int ecflags;
3436 VALUE convpath;
3437
3438 if (rb_check_typeddata(self, &econv_data_type)) {
3439 rb_raise(rb_eTypeError, "already initialized");
3440 }
3441
3442 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3443 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3444 ecflags = 0;
3445 ecopts = Qnil;
3446 }
3447 else {
3448 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3449 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3450 }
3451
3452 if (!ec) {
3453 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3454 RB_GC_GUARD(snamev);
3455 RB_GC_GUARD(dnamev);
3456 rb_exc_raise(exc);
3457 }
3458
3459 if (!DECORATOR_P(sname, dname)) {
3460 if (!senc)
3461 senc = make_dummy_encoding(sname);
3462 if (!denc)
3463 denc = make_dummy_encoding(dname);
3464 RB_GC_GUARD(snamev);
3465 RB_GC_GUARD(dnamev);
3466 }
3467
3468 ec->source_encoding = senc;
3469 ec->destination_encoding = denc;
3470
3471 DATA_PTR(self) = ec;
3472
3473 return self;
3474}
3475
3476/*
3477 * call-seq:
3478 * ec.inspect -> string
3479 *
3480 * Returns a printable version of <i>ec</i>
3481 *
3482 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3483 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3484 *
3485 */
3486static VALUE
3487econv_inspect(VALUE self)
3488{
3489 const char *cname = rb_obj_classname(self);
3490 rb_econv_t *ec;
3491
3492 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3493 if (!ec)
3494 return rb_sprintf("#<%s: uninitialized>", cname);
3495 else {
3496 const char *sname = ec->source_encoding_name;
3497 const char *dname = ec->destination_encoding_name;
3498 VALUE str;
3499 str = rb_sprintf("#<%s: ", cname);
3500 econv_description(sname, dname, ec->flags, str);
3501 rb_str_cat2(str, ">");
3502 return str;
3503 }
3504}
3505
3506static rb_econv_t *
3507check_econv(VALUE self)
3508{
3509 rb_econv_t *ec;
3510
3511 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3512 if (!ec) {
3513 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3514 }
3515 return ec;
3516}
3517
3518static VALUE
3519econv_get_encoding(rb_encoding *encoding)
3520{
3521 if (!encoding)
3522 return Qnil;
3523 return rb_enc_from_encoding(encoding);
3524}
3525
3526/*
3527 * call-seq:
3528 * ec.source_encoding -> encoding
3529 *
3530 * Returns the source encoding as an Encoding object.
3531 */
3532static VALUE
3533econv_source_encoding(VALUE self)
3534{
3535 rb_econv_t *ec = check_econv(self);
3536 return econv_get_encoding(ec->source_encoding);
3537}
3538
3539/*
3540 * call-seq:
3541 * ec.destination_encoding -> encoding
3542 *
3543 * Returns the destination encoding as an Encoding object.
3544 */
3545static VALUE
3546econv_destination_encoding(VALUE self)
3547{
3548 rb_econv_t *ec = check_econv(self);
3549 return econv_get_encoding(ec->destination_encoding);
3550}
3551
3552/*
3553 * call-seq:
3554 * ec.convpath -> ary
3555 *
3556 * Returns the conversion path of ec.
3557 *
3558 * The result is an array of conversions.
3559 *
3560 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3561 * p ec.convpath
3562 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3563 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3564 * # "crlf_newline"]
3565 *
3566 * Each element of the array is a pair of encodings or a string.
3567 * A pair means an encoding conversion.
3568 * A string means a decorator.
3569 *
3570 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3571 * a converter from ISO-8859-1 to UTF-8.
3572 * "crlf_newline" means newline converter from LF to CRLF.
3573 */
3574static VALUE
3575econv_convpath(VALUE self)
3576{
3577 rb_econv_t *ec = check_econv(self);
3578 VALUE result;
3579 int i;
3580
3581 result = rb_ary_new();
3582 for (i = 0; i < ec->num_trans; i++) {
3583 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3584 VALUE v;
3585 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3586 v = rb_str_new_cstr(tr->dst_encoding);
3587 else
3588 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3589 rb_ary_push(result, v);
3590 }
3591 return result;
3592}
3593
3594/*
3595 * call-seq:
3596 * ec == other -> true or false
3597 */
3598static VALUE
3599econv_equal(VALUE self, VALUE other)
3600{
3601 rb_econv_t *ec1 = check_econv(self);
3602 rb_econv_t *ec2;
3603 int i;
3604
3605 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3606 return Qnil;
3607 }
3608 ec2 = DATA_PTR(other);
3609 if (!ec2) return Qfalse;
3610 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3611 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3612 return Qfalse;
3613 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3614 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3615 return Qfalse;
3616 if (ec1->flags != ec2->flags) return Qfalse;
3617 if (ec1->replacement_enc != ec2->replacement_enc &&
3618 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3619 return Qfalse;
3620 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3621 if (ec1->replacement_str != ec2->replacement_str &&
3622 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3623 return Qfalse;
3624
3625 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3626 for (i = 0; i < ec1->num_trans; i++) {
3627 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3628 return Qfalse;
3629 }
3630 return Qtrue;
3631}
3632
3633static VALUE
3634econv_result_to_symbol(rb_econv_result_t res)
3635{
3636 switch (res) {
3637 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3638 case econv_incomplete_input: return sym_incomplete_input;
3639 case econv_undefined_conversion: return sym_undefined_conversion;
3640 case econv_destination_buffer_full: return sym_destination_buffer_full;
3641 case econv_source_buffer_empty: return sym_source_buffer_empty;
3642 case econv_finished: return sym_finished;
3643 case econv_after_output: return sym_after_output;
3644 default: return INT2NUM(res); /* should not be reached */
3645 }
3646}
3647
3648/*
3649 * call-seq:
3650 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3651 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3652 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3653 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3654 *
3655 * possible opt elements:
3656 * hash form:
3657 * :partial_input => true # source buffer may be part of larger source
3658 * :after_output => true # stop conversion after output before input
3659 * integer form:
3660 * Encoding::Converter::PARTIAL_INPUT
3661 * Encoding::Converter::AFTER_OUTPUT
3662 *
3663 * possible results:
3664 * :invalid_byte_sequence
3665 * :incomplete_input
3666 * :undefined_conversion
3667 * :after_output
3668 * :destination_buffer_full
3669 * :source_buffer_empty
3670 * :finished
3671 *
3672 * primitive_convert converts source_buffer into destination_buffer.
3673 *
3674 * source_buffer should be a string or nil.
3675 * nil means an empty string.
3676 *
3677 * destination_buffer should be a string.
3678 *
3679 * destination_byteoffset should be an integer or nil.
3680 * nil means the end of destination_buffer.
3681 * If it is omitted, nil is assumed.
3682 *
3683 * destination_bytesize should be an integer or nil.
3684 * nil means unlimited.
3685 * If it is omitted, nil is assumed.
3686 *
3687 * opt should be nil, a hash or an integer.
3688 * nil means no flags.
3689 * If it is omitted, nil is assumed.
3690 *
 * primitive_convert converts the content of source_buffer from the beginning
 * and stores the result into destination_buffer.
 *
 * destination_byteoffset and destination_bytesize specify the region in which
 * the converted result is stored.
3696 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3697 * If destination_byteoffset is nil,
3698 * destination_buffer.bytesize is used for appending the result.
3699 * destination_bytesize specifies maximum number of bytes.
3700 * If destination_bytesize is nil,
3701 * destination size is unlimited.
3702 * After conversion, destination_buffer is resized to
3703 * destination_byteoffset + actually produced number of bytes.
3704 * Also destination_buffer's encoding is set to destination_encoding.
3705 *
3706 * primitive_convert drops the converted part of source_buffer.
3707 * the dropped part is converted in destination_buffer or
3708 * buffered in Encoding::Converter object.
3709 *
 * primitive_convert stops conversion when one of the following conditions is met.
 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - unexpected end of source buffer (:incomplete_input)
 *   this occurs only when :partial_input is not specified.
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - character not representable in output encoding (:undefined_conversion)
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - after some output is generated, before input is done (:after_output)
 *   this occurs only when :after_output is specified.
 * - destination buffer is full (:destination_buffer_full)
 *   this occurs only when destination_bytesize is non-nil.
 * - source buffer is empty (:source_buffer_empty)
 *   this occurs only when :partial_input is specified.
 * - conversion is finished (:finished)
3725 *
3726 * example:
3727 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3728 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3729 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3730 *
3731 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3732 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3733 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3734 * ret = ec.primitive_convert(src, dst="", nil, 1)
3735 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3736 * ret = ec.primitive_convert(src, dst="", nil, 1)
3737 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3738 * ret = ec.primitive_convert(src, dst="", nil, 1)
3739 * p [ret, src, dst] #=> [:finished, "", "i"]
3740 *
3741 */
3742static VALUE
3743econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3744{
3745 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3746 rb_econv_t *ec = check_econv(self);
3748 const unsigned char *ip, *is;
3749 unsigned char *op, *os;
3750 long output_byteoffset, output_bytesize;
3751 unsigned long output_byteend;
3752 int flags;
3753
3754 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3755
3756 if (NIL_P(output_byteoffset_v))
3757 output_byteoffset = 0; /* dummy */
3758 else
3759 output_byteoffset = NUM2LONG(output_byteoffset_v);
3760
3761 if (NIL_P(output_bytesize_v))
3762 output_bytesize = 0; /* dummy */
3763 else
3764 output_bytesize = NUM2LONG(output_bytesize_v);
3765
3766 if (!NIL_P(flags_v)) {
3767 if (!NIL_P(opt)) {
3768 rb_error_arity(argc + 1, 2, 5);
3769 }
3770 flags = NUM2INT(rb_to_int(flags_v));
3771 }
3772 else if (!NIL_P(opt)) {
3773 VALUE v;
3774 flags = 0;
3775 v = rb_hash_aref(opt, sym_partial_input);
3776 if (RTEST(v))
3777 flags |= ECONV_PARTIAL_INPUT;
3778 v = rb_hash_aref(opt, sym_after_output);
3779 if (RTEST(v))
3780 flags |= ECONV_AFTER_OUTPUT;
3781 }
3782 else {
3783 flags = 0;
3784 }
3785
3786 StringValue(output);
3787 if (!NIL_P(input))
3788 StringValue(input);
3789 rb_str_modify(output);
3790
3791 if (NIL_P(output_bytesize_v)) {
3792 output_bytesize = rb_str_capacity(output);
3793
3794 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3795 output_bytesize = RSTRING_LEN(input);
3796 }
3797
3798 retry:
3799
3800 if (NIL_P(output_byteoffset_v))
3801 output_byteoffset = RSTRING_LEN(output);
3802
3803 if (output_byteoffset < 0)
3804 rb_raise(rb_eArgError, "negative output_byteoffset");
3805
3806 if (RSTRING_LEN(output) < output_byteoffset)
3807 rb_raise(rb_eArgError, "output_byteoffset too big");
3808
3809 if (output_bytesize < 0)
3810 rb_raise(rb_eArgError, "negative output_bytesize");
3811
3812 output_byteend = (unsigned long)output_byteoffset +
3813 (unsigned long)output_bytesize;
3814
3815 if (output_byteend < (unsigned long)output_byteoffset ||
3816 LONG_MAX < output_byteend)
3817 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3818
3819 if (rb_str_capacity(output) < output_byteend)
3820 rb_str_resize(output, output_byteend);
3821
3822 if (NIL_P(input)) {
3823 ip = is = NULL;
3824 }
3825 else {
3826 ip = (const unsigned char *)RSTRING_PTR(input);
3827 is = ip + RSTRING_LEN(input);
3828 }
3829
3830 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3831 os = op + output_bytesize;
3832
3833 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3834 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3835 if (!NIL_P(input)) {
3836 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3837 }
3838
3839 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3840 if (LONG_MAX / 2 < output_bytesize)
3841 rb_raise(rb_eArgError, "too long conversion result");
3842 output_bytesize *= 2;
3843 output_byteoffset_v = Qnil;
3844 goto retry;
3845 }
3846
3847 if (ec->destination_encoding) {
3848 rb_enc_associate(output, ec->destination_encoding);
3849 }
3850
3851 return econv_result_to_symbol(res);
3852}
3853
3854/*
3855 * call-seq:
3856 * ec.convert(source_string) -> destination_string
3857 *
3858 * Convert source_string and return destination_string.
3859 *
3860 * source_string is assumed as a part of source.
3861 * i.e. :partial_input=>true is specified internally.
3862 * finish method should be used last.
3863 *
3864 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3865 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3866 * puts ec.finish.dump #=> ""
3867 *
3868 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3869 * puts ec.convert("\xA4").dump #=> ""
3870 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3871 * puts ec.finish.dump #=> ""
3872 *
3873 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3874 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3875 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3876 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3877 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3878 *
 * If a conversion error occurs,
3880 * Encoding::UndefinedConversionError or
3881 * Encoding::InvalidByteSequenceError is raised.
3882 * Encoding::Converter#convert doesn't supply methods to recover or restart
3883 * from these exceptions.
3884 * When you want to handle these conversion errors,
3885 * use Encoding::Converter#primitive_convert.
3886 *
3887 */
3888static VALUE
3889econv_convert(VALUE self, VALUE source_string)
3890{
3891 VALUE ret, dst;
3892 VALUE av[5];
3893 int ac;
3894 rb_econv_t *ec = check_econv(self);
3895
3896 StringValue(source_string);
3897
3898 dst = rb_str_new(NULL, 0);
3899
3900 av[0] = rb_str_dup(source_string);
3901 av[1] = dst;
3902 av[2] = Qnil;
3903 av[3] = Qnil;
3905 ac = 5;
3906
3907 ret = econv_primitive_convert(ac, av, self);
3908
3909 if (ret == sym_invalid_byte_sequence ||
3910 ret == sym_undefined_conversion ||
3911 ret == sym_incomplete_input) {
3912 VALUE exc = make_econv_exception(ec);
3913 rb_exc_raise(exc);
3914 }
3915
3916 if (ret == sym_finished) {
3917 rb_raise(rb_eArgError, "converter already finished");
3918 }
3919
3920 if (ret != sym_source_buffer_empty) {
3921 rb_bug("unexpected result of econv_primitive_convert");
3922 }
3923
3924 return dst;
3925}
3926
3927/*
3928 * call-seq:
3929 * ec.finish -> string
3930 *
3931 * Finishes the converter.
3932 * It returns the last part of the converted string.
3933 *
3934 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3935 * p ec.convert("\u3042") #=> "\e$B$\""
3936 * p ec.finish #=> "\e(B"
3937 */
3938static VALUE
3939econv_finish(VALUE self)
3940{
3941 VALUE ret, dst;
3942 VALUE av[5];
3943 int ac;
3944 rb_econv_t *ec = check_econv(self);
3945
3946 dst = rb_str_new(NULL, 0);
3947
3948 av[0] = Qnil;
3949 av[1] = dst;
3950 av[2] = Qnil;
3951 av[3] = Qnil;
3952 av[4] = INT2FIX(0);
3953 ac = 5;
3954
3955 ret = econv_primitive_convert(ac, av, self);
3956
3957 if (ret == sym_invalid_byte_sequence ||
3958 ret == sym_undefined_conversion ||
3959 ret == sym_incomplete_input) {
3960 VALUE exc = make_econv_exception(ec);
3961 rb_exc_raise(exc);
3962 }
3963
3964 if (ret != sym_finished) {
3965 rb_bug("unexpected result of econv_primitive_convert");
3966 }
3967
3968 return dst;
3969}
3970
3971/*
3972 * call-seq:
3973 * ec.primitive_errinfo -> array
3974 *
3975 * primitive_errinfo returns important information regarding the last error
3976 * as a 5-element array:
3977 *
3978 * [result, enc1, enc2, error_bytes, readagain_bytes]
3979 *
3980 * result is the last result of primitive_convert.
3981 *
3982 * Other elements are only meaningful when result is
3983 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3984 *
3985 * enc1 and enc2 indicate a conversion step as a pair of strings.
3986 * For example, a converter from EUC-JP to ISO-8859-1 converts
3987 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3988 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3989 *
3990 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3991 * error_bytes is discarded portion.
3992 * readagain_bytes is buffered portion which is read again on next conversion.
3993 *
3994 * Example:
3995 *
3996 * # \xff is invalid as EUC-JP.
3997 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3998 * ec.primitive_convert(src="\xff", dst="", nil, 10)
3999 * p ec.primitive_errinfo
4000 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4001 *
4002 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
 *   # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4004 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4005 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4006 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4007 * p ec.primitive_errinfo
4008 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4009 *
4010 * # partial character is invalid
4011 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4012 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4013 * p ec.primitive_errinfo
4014 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4015 *
4016 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4017 * # partial characters.
4018 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4019 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4020 * p ec.primitive_errinfo
4021 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4022 *
4023 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4024 * # no low surrogate after high surrogate (\xd8\x00).
4025 * # It is detected by 3rd byte (\00) which is part of next character.
4026 * # So the high surrogate (\xd8\x00) is discarded and
4027 * # the 3rd byte is read again later.
4028 * # Since the byte is buffered in ec, it is dropped from src.
4029 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4030 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4031 * p ec.primitive_errinfo
4032 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4033 * p src
4034 * #=> "@"
4035 *
4036 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4037 * # The problem is detected by 4th byte.
4038 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4039 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4040 * p ec.primitive_errinfo
4041 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4042 * p src
4043 * #=> ""
4044 *
4045 */
4046static VALUE
4047econv_primitive_errinfo(VALUE self)
4048{
4049 rb_econv_t *ec = check_econv(self);
4050
4051 VALUE ary;
4052
4053 ary = rb_ary_new2(5);
4054
4055 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4056 rb_ary_store(ary, 4, Qnil);
4057
4058 if (ec->last_error.source_encoding)
4059 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4060
4061 if (ec->last_error.destination_encoding)
4062 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4063
4064 if (ec->last_error.error_bytes_start) {
4065 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4066 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4067 }
4068
4069 return ary;
4070}
4071
4072/*
4073 * call-seq:
4074 * ec.insert_output(string) -> nil
4075 *
4076 * Inserts string into the encoding converter.
4077 * The string will be converted to the destination encoding and
4078 * output on later conversions.
4079 *
4080 * If the destination encoding is stateful,
4081 * string is converted according to the state and the state is updated.
4082 *
4083 * This method should be used only when a conversion error occurs.
4084 *
4085 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4086 * src = "HIRAGANA LETTER A is \u{3042}."
4087 * dst = ""
4088 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4089 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4090 * ec.insert_output("<err>")
4091 * p ec.primitive_convert(src, dst) #=> :finished
4092 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4093 *
4094 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4095 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4096 * dst = ""
4097 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4098 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4099 * ec.insert_output "?" # state change required to output "?".
4100 * p ec.primitive_convert(src, dst) #=> :finished
4101 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4102 *
4103 */
4104static VALUE
4105econv_insert_output(VALUE self, VALUE string)
4106{
4107 const char *insert_enc;
4108
4109 int ret;
4110
4111 rb_econv_t *ec = check_econv(self);
4112
4113 StringValue(string);
4114 insert_enc = rb_econv_encoding_to_insert_output(ec);
4115 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4116
4117 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4118 if (ret == -1) {
4119 rb_raise(rb_eArgError, "too big string");
4120 }
4121
4122 return Qnil;
4123}
4124
4125/*
4126 * call-seq:
4127 * ec.putback -> string
4128 * ec.putback(max_numbytes) -> string
4129 *
4130 * Put back the bytes which will be converted.
4131 *
 * These bytes are left over from an invalid_byte_sequence error.
 * When an invalid_byte_sequence error occurs, some bytes are discarded and
 * some bytes are buffered to be converted later.
 * The latter bytes can be put back.
4136 * It can be observed by
4137 * Encoding::InvalidByteSequenceError#readagain_bytes and
4138 * Encoding::Converter#primitive_errinfo.
4139 *
4140 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4141 * src = "\x00\xd8\x61\x00"
4142 * dst = ""
4143 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4144 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4145 * p ec.putback #=> "a\x00"
4146 * p ec.putback #=> "" # no more bytes to put back
4147 *
4148 */
4149static VALUE
4150econv_putback(int argc, VALUE *argv, VALUE self)
4151{
4152 rb_econv_t *ec = check_econv(self);
4153 int n;
4154 int putbackable;
4155 VALUE str, max;
4156
4157 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4158 n = rb_econv_putbackable(ec);
4159 }
4160 else {
4161 n = NUM2INT(max);
4162 putbackable = rb_econv_putbackable(ec);
4163 if (putbackable < n)
4164 n = putbackable;
4165 }
4166
4167 str = rb_str_new(NULL, n);
4168 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4169
4170 if (ec->source_encoding) {
4171 rb_enc_associate(str, ec->source_encoding);
4172 }
4173
4174 return str;
4175}
4176
4177/*
4178 * call-seq:
4179 * ec.last_error -> exception or nil
4180 *
4181 * Returns an exception object for the last conversion.
4182 * Returns nil if the last conversion did not produce an error.
4183 *
4184 * "error" means that
4185 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4186 * Encoding::Converter#convert and
4187 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4188 * Encoding::Converter#primitive_convert.
4189 *
4190 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4191 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4192 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4193 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4194 * p ec.last_error #=> nil
4195 *
4196 */
4197static VALUE
4198econv_last_error(VALUE self)
4199{
4200 rb_econv_t *ec = check_econv(self);
4201 VALUE exc;
4202
4203 exc = make_econv_exception(ec);
4204 if (NIL_P(exc))
4205 return Qnil;
4206 return exc;
4207}
4208
4209/*
4210 * call-seq:
4211 * ec.replacement -> string
4212 *
4213 * Returns the replacement string.
4214 *
4215 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4216 * p ec.replacement #=> "?"
4217 *
4218 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4219 * p ec.replacement #=> "\uFFFD"
4220 */
4221static VALUE
4222econv_get_replacement(VALUE self)
4223{
4224 rb_econv_t *ec = check_econv(self);
4225 int ret;
4226 rb_encoding *enc;
4227
4228 ret = make_replacement(ec);
4229 if (ret == -1) {
4230 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4231 }
4232
4233 enc = rb_enc_find(ec->replacement_enc);
4234 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4235}
4236
4237/*
4238 * call-seq:
4239 * ec.replacement = string
4240 *
4241 * Sets the replacement string.
4242 *
4243 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4244 * ec.replacement = "<undef>"
4245 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4246 */
4247static VALUE
4248econv_set_replacement(VALUE self, VALUE arg)
4249{
4250 rb_econv_t *ec = check_econv(self);
4251 VALUE string = arg;
4252 int ret;
4253 rb_encoding *enc;
4254
4255 StringValue(string);
4256 enc = rb_enc_get(string);
4257
4258 ret = rb_econv_set_replacement(ec,
4259 (const unsigned char *)RSTRING_PTR(string),
4260 RSTRING_LEN(string),
4261 rb_enc_name(enc));
4262
4263 if (ret == -1) {
4264 /* xxx: rb_eInvalidByteSequenceError? */
4265 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4266 }
4267
4268 return arg;
4269}
4270
4271VALUE
4273{
4274 return make_econv_exception(ec);
4275}
4276
4277void
4279{
4280 VALUE exc;
4281
4282 exc = make_econv_exception(ec);
4283 if (NIL_P(exc))
4284 return;
4285 rb_exc_raise(exc);
4286}
4287
4288/*
4289 * call-seq:
4290 * ecerr.source_encoding_name -> string
4291 *
4292 * Returns the source encoding name as a string.
4293 */
4294static VALUE
4295ecerr_source_encoding_name(VALUE self)
4296{
4297 return rb_attr_get(self, id_source_encoding_name);
4298}
4299
4300/*
4301 * call-seq:
4302 * ecerr.source_encoding -> encoding
4303 *
4304 * Returns the source encoding as an encoding object.
4305 *
4306 * Note that the result may not be equal to the source encoding of
4307 * the encoding converter if the conversion has multiple steps.
4308 *
4309 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4310 * begin
4311 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4312 * rescue Encoding::UndefinedConversionError
4313 * p $!.source_encoding #=> #<Encoding:UTF-8>
4314 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4315 * p $!.source_encoding_name #=> "UTF-8"
4316 * p $!.destination_encoding_name #=> "EUC-JP"
4317 * end
4318 *
4319 */
4320static VALUE
4321ecerr_source_encoding(VALUE self)
4322{
4323 return rb_attr_get(self, id_source_encoding);
4324}
4325
4326/*
4327 * call-seq:
4328 * ecerr.destination_encoding_name -> string
4329 *
4330 * Returns the destination encoding name as a string.
4331 */
4332static VALUE
4333ecerr_destination_encoding_name(VALUE self)
4334{
4335 return rb_attr_get(self, id_destination_encoding_name);
4336}
4337
4338/*
4339 * call-seq:
 *   ecerr.destination_encoding -> encoding
4341 *
4342 * Returns the destination encoding as an encoding object.
4343 */
4344static VALUE
4345ecerr_destination_encoding(VALUE self)
4346{
4347 return rb_attr_get(self, id_destination_encoding);
4348}
4349
4350/*
4351 * call-seq:
4352 * ecerr.error_char -> string
4353 *
 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4355 *
4356 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4357 * begin
4358 * ec.convert("\xa0")
4359 * rescue Encoding::UndefinedConversionError
4360 * puts $!.error_char.dump #=> "\xC2\xA0"
4361 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4362 * end
4363 *
4364 */
4365static VALUE
4366ecerr_error_char(VALUE self)
4367{
4368 return rb_attr_get(self, id_error_char);
4369}
4370
4371/*
4372 * call-seq:
4373 * ecerr.error_bytes -> string
4374 *
4375 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4376 *
4377 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4378 * begin
4379 * ec.convert("abc\xA1\xFFdef")
4380 * rescue Encoding::InvalidByteSequenceError
4381 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4382 * puts $!.error_bytes.dump #=> "\xA1"
4383 * puts $!.readagain_bytes.dump #=> "\xFF"
4384 * end
4385 */
4386static VALUE
4387ecerr_error_bytes(VALUE self)
4388{
4389 return rb_attr_get(self, id_error_bytes);
4390}
4391
4392/*
4393 * call-seq:
4394 * ecerr.readagain_bytes -> string
4395 *
4396 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4397 */
4398static VALUE
4399ecerr_readagain_bytes(VALUE self)
4400{
4401 return rb_attr_get(self, id_readagain_bytes);
4402}
4403
4404/*
4405 * call-seq:
4406 * ecerr.incomplete_input? -> true or false
4407 *
4408 * Returns true if the invalid byte sequence error is caused by
4409 * premature end of string.
4410 *
4411 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4412 *
4413 * begin
4414 * ec.convert("abc\xA1z")
4415 * rescue Encoding::InvalidByteSequenceError
4416 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4417 * p $!.incomplete_input? #=> false
4418 * end
4419 *
4420 * begin
4421 * ec.convert("abc\xA1")
4422 * ec.finish
4423 * rescue Encoding::InvalidByteSequenceError
4424 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4425 * p $!.incomplete_input? #=> true
4426 * end
4427 */
4428static VALUE
4429ecerr_incomplete_input(VALUE self)
4430{
4431 return rb_attr_get(self, id_incomplete_input);
4432}
4433
4434/*
4435 * Document-class: Encoding::UndefinedConversionError
4436 *
4437 * Raised by Encoding and String methods when a transcoding operation
4438 * fails.
4439 */
4440
4441/*
4442 * Document-class: Encoding::InvalidByteSequenceError
4443 *
4444 * Raised by Encoding and String methods when the string being
 * transcoded contains a byte invalid for either the source or
4446 * target encoding.
4447 */
4448
4449/*
4450 * Document-class: Encoding::ConverterNotFoundError
4451 *
4452 * Raised by transcoding methods when a named encoding does not
4453 * correspond with a known converter.
4454 */
4455
4456void
4457Init_transcode(void)
4458{
4459 transcoder_table = st_init_strcasetable();
4460
4461 id_destination_encoding = rb_intern_const("destination_encoding");
4462 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4463 id_error_bytes = rb_intern_const("error_bytes");
4464 id_error_char = rb_intern_const("error_char");
4465 id_incomplete_input = rb_intern_const("incomplete_input");
4466 id_readagain_bytes = rb_intern_const("readagain_bytes");
4467 id_source_encoding = rb_intern_const("source_encoding");
4468 id_source_encoding_name = rb_intern_const("source_encoding_name");
4469
4470 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4471 sym_undef = ID2SYM(rb_intern_const("undef"));
4472 sym_replace = ID2SYM(rb_intern_const("replace"));
4473 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4474 sym_xml = ID2SYM(rb_intern_const("xml"));
4475 sym_text = ID2SYM(rb_intern_const("text"));
4476 sym_attr = ID2SYM(rb_intern_const("attr"));
4477
4478 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4479 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4480 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4481 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4482 sym_finished = ID2SYM(rb_intern_const("finished"));
4483 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4484 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4485 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4486 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4487 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4488 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4489 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4490
4491#ifdef ENABLE_ECONV_NEWLINE_OPTION
4492 sym_newline = ID2SYM(rb_intern_const("newline"));
4493 sym_universal = ID2SYM(rb_intern_const("universal"));
4494 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4495 sym_cr = ID2SYM(rb_intern_const("cr"));
4496 sym_lf = ID2SYM(rb_intern_const("lf"));
4497#endif
4498
4499 InitVM(transcode);
4500}
4501
4502void
4503InitVM_transcode(void)
4504{
4505 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4506 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4507 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4508
4509 rb_define_method(rb_cString, "encode", str_encode, -1);
4510 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4511
4512 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4513 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4514 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4515 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4516 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4517 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4518 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4519 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4520 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4521 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4522 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4523 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4524 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4525 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4526 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4527 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4528 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4529 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4530 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4531
4532 /*
4533 *Mask for invalid byte sequences
4534 */
4535 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4536
4537 /*
4538 * Replace invalid byte sequences
4539 */
4540 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4541
4542 /*
4543 * Mask for a valid character in the source encoding but no related
4544 * character(s) in destination encoding.
4545 */
4546 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4547
4548 /*
4549 * Replace byte sequences that are undefined in the destination encoding.
4550 */
4551 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4552
4553 /*
4554 * Replace byte sequences that are undefined in the destination encoding
4555 * with an XML hexadecimal character reference. This is valid for XML
4556 * conversion.
4557 */
4558 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4559
4560 /*
4561 * Indicates the source may be part of a larger string. See
4562 * primitive_convert for an example.
4563 */
4564 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4565
4566 /*
4567 * Stop converting after some output is complete but before all of the
4568 * input was consumed. See primitive_convert for an example.
4569 */
4570 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4571
4572 /*
4573 * Decorator for converting CRLF and CR to LF
4574 */
4575 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4576
4577 /*
4578 * Decorator for converting CRLF and CR to LF when writing
4579 */
4580 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4581
4582 /*
4583 * Decorator for converting LF to CRLF
4584 */
4585 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4586
4587 /*
4588 * Decorator for converting LF to CR
4589 */
4590 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4591
4592 /*
4593 * Escape as XML CharData
4594 */
4595 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4596
4597 /*
4598 * Escape as XML AttValue
4599 */
4600 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4601
4602 /*
4603 * Escape as XML AttValue
4604 */
4605 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4606
4607 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4608 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4609 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4610 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4611 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4612
4613 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4614 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4615 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4616 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4617 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4618 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4619 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4620
4621 Init_newline();
4622}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:1012
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2635
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1675
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition memory.h:403
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:400
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:675
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition error.c:1380
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1397
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby string instead of a C string.
Definition error.c:1481
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1436
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:497
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:247
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:57
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3192
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1286
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:901
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:785
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2600
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2097
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1532
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2645
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1770
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3211
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1098
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1919
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1910
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1814
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1616
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition transcode.c:1931
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:1979
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:1996
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1962
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4272
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4278
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1925
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1847
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1781
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2259
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1150
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1003
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1661
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2539
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:119
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1672
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1714
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:955
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1462
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1917
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3269
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2649
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7326
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1644
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5697
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1844
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2953
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:986
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:384
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_AREF(a, i)
Definition rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:515
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:427
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:231
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
Definition st.h:79
Definition string.c:8284
Definition transcode.c:175
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376