Ruby 3.5.0dev (2025-10-30 revision 15f2dcceb4787c5738dde48f580019c3765ce1b8)
transcode.c (15f2dcceb4787c5738dde48f580019c3765ce1b8)
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "internal/encoding.h"
23#include "ruby/encoding.h"
24#include "vm_sync.h"
25
26#include "transcode_data.h"
27#include "id.h"
28
29#define ENABLE_ECONV_NEWLINE_OPTION 1
30
31/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
32static VALUE rb_eUndefinedConversionError;
33static VALUE rb_eInvalidByteSequenceError;
34static VALUE rb_eConverterNotFoundError;
35
36VALUE rb_cEncodingConverter;
37
38static ID id_destination_encoding;
39static ID id_destination_encoding_name;
40static ID id_error_bytes;
41static ID id_error_char;
42static ID id_incomplete_input;
43static ID id_readagain_bytes;
44static ID id_source_encoding;
45static ID id_source_encoding_name;
46
47static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
48static VALUE sym_xml, sym_text, sym_attr;
49static VALUE sym_universal_newline;
50static VALUE sym_crlf_newline;
51static VALUE sym_cr_newline;
52static VALUE sym_lf_newline;
53#ifdef ENABLE_ECONV_NEWLINE_OPTION
54static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
55#endif
56static VALUE sym_partial_input;
57
58static VALUE sym_invalid_byte_sequence;
59static VALUE sym_undefined_conversion;
60static VALUE sym_destination_buffer_full;
61static VALUE sym_source_buffer_empty;
62static VALUE sym_finished;
63static VALUE sym_after_output;
64static VALUE sym_incomplete_input;
65
66static unsigned char *
67allocate_converted_string(const char *sname, const char *dname,
68 const unsigned char *str, size_t len,
69 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
70 size_t *dst_len_ptr);
71
72/* dynamic structure, one per conversion (similar to iconv_t) */
73/* may carry conversion state (e.g. for iso-2022-jp) */
74typedef struct rb_transcoding {
75 const rb_transcoder *transcoder;
76
77 int flags;
78
79 int resume_position;
80 unsigned int next_table;
81 VALUE next_info;
82 unsigned char next_byte;
83 unsigned int output_index;
84
85 ssize_t recognized_len; /* already interpreted */
86 ssize_t readagain_len; /* not yet interpreted */
87 union {
88 unsigned char ary[8]; /* max_input <= sizeof(ary) */
89 unsigned char *ptr; /* length: max_input */
90 } readbuf; /* recognized_len + readagain_len used */
91
92 ssize_t writebuf_off;
93 ssize_t writebuf_len;
94 union {
95 unsigned char ary[8]; /* max_output <= sizeof(ary) */
96 unsigned char *ptr; /* length: max_output */
97 } writebuf;
98
99 union rb_transcoding_state_t { /* opaque data for stateful encoding */
100 void *ptr;
101 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
102 double dummy_for_alignment;
103 } state;
/*
 * Accessors for the inline-or-pointer unions of a transcoding.  Each one
 * uses the pointer member only when the transcoder's declared size exceeds
 * the inline array, and the inline array otherwise.
 */
#define TRANSCODING_READBUF(tc) \
    ((int)sizeof((tc)->readbuf.ary) < (tc)->transcoder->max_input ? \
     (tc)->readbuf.ptr : \
     (tc)->readbuf.ary)
#define TRANSCODING_WRITEBUF(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (tc)->writebuf.ptr : \
     (tc)->writebuf.ary)
/* Capacity in bytes of whatever TRANSCODING_WRITEBUF(tc) selects. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (size_t)(tc)->transcoder->max_output : \
     sizeof((tc)->writebuf.ary))
/* Largest state_size that still fits in the inline state union. */
#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
#define TRANSCODING_STATE(tc) \
    ((int)sizeof((tc)->state) < (tc)->transcoder->state_size ? \
     (tc)->state.ptr : \
     (tc)->state.ary)
122
123typedef struct {
124 struct rb_transcoding *tc;
125 unsigned char *out_buf_start;
126 unsigned char *out_data_start;
127 unsigned char *out_data_end;
128 unsigned char *out_buf_end;
129 rb_econv_result_t last_result;
131
133 int flags;
134 int started; /* bool */
135
136 const char *source_encoding_name;
137 const char *destination_encoding_name;
138
139 const unsigned char *replacement_str;
140 size_t replacement_len;
141 const char *replacement_enc;
142
143 unsigned char *in_buf_start;
144 unsigned char *in_data_start;
145 unsigned char *in_data_end;
146 unsigned char *in_buf_end;
147 rb_econv_elem_t *elems;
148 int replacement_allocated; /* bool */
149 int num_allocated;
150 int num_trans;
151 int num_finished;
152 struct rb_transcoding *last_tc;
153
154 /* last error */
155 struct {
156 rb_econv_result_t result;
157 struct rb_transcoding *error_tc;
158 const char *source_encoding;
159 const char *destination_encoding;
160 const unsigned char *error_bytes_start;
161 size_t error_bytes_len;
162 size_t readagain_len;
163 } last_error;
164
165 /* The following fields are only for Encoding::Converter.
166 * rb_econv_open set them NULL. */
167 rb_encoding *source_encoding;
168 rb_encoding *destination_encoding;
169};
170
171/*
172 * Dispatch data and logic
173 */
174
/* True when the source encoding name is the empty string; dname is unused. */
#define DECORATOR_P(sname, dname) ((sname)[0] == '\0')
176
177typedef struct {
178 const char *sname;
179 const char *dname;
180 const char *lib; /* null means no need to load a library */
181 const rb_transcoder *transcoder;
183
184static st_table *transcoder_table;
185
186static int
187free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
188{
189 xfree((void *)val);
190 return ST_DELETE;
191}
192
193static int
194free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
195{
196 st_foreach((void *)val, free_inner_transcode_i, 0);
197 st_free_table((void *)val);
198 return ST_DELETE;
199}
200
201void
202rb_free_transcoder_table(void)
203{
204 st_foreach(transcoder_table, free_transcode_i, 0);
205 st_free_table(transcoder_table);
206}
207
208static transcoder_entry_t *
209make_transcoder_entry(const char *sname, const char *dname)
210{
211 st_data_t val;
212 st_table *table2;
213
214 RB_VM_LOCKING() {
215 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
216 val = (st_data_t)st_init_strcasetable();
217 st_add_direct(transcoder_table, (st_data_t)sname, val);
218 }
219 table2 = (st_table *)val;
220 if (!st_lookup(table2, (st_data_t)dname, &val)) {
222 entry->sname = sname;
223 entry->dname = dname;
224 entry->lib = NULL;
225 entry->transcoder = NULL;
226 val = (st_data_t)entry;
227 st_add_direct(table2, (st_data_t)dname, val);
228 }
229 }
230 return (transcoder_entry_t *)val;
231}
232
233static transcoder_entry_t *
234get_transcoder_entry(const char *sname, const char *dname)
235{
236 st_data_t val = 0;
237 st_table *table2;
238 RB_VM_LOCKING() {
239 if (st_lookup(transcoder_table, (st_data_t)sname, &val)) {
240 table2 = (st_table *)val;
241 if (!st_lookup(table2, (st_data_t)dname, &val)) {
242 val = 0;
243 }
244 }
245 }
246 return (transcoder_entry_t *)val;
247}
248
249void
250rb_register_transcoder(const rb_transcoder *tr)
251{
252 const char *const sname = tr->src_encoding;
253 const char *const dname = tr->dst_encoding;
254
255 transcoder_entry_t *entry;
256
257 RB_VM_LOCKING() {
258 entry = make_transcoder_entry(sname, dname);
259 if (entry->transcoder) {
260 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
261 sname, dname);
262 }
263 entry->transcoder = tr;
264 }
265}
266
267static void
268declare_transcoder(const char *sname, const char *dname, const char *lib)
269{
270 transcoder_entry_t *entry;
271
272 entry = make_transcoder_entry(sname, dname);
273 entry->lib = lib;
274}
275
276static const char transcoder_lib_prefix[] = "enc/trans/";
277
278void
279rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
280{
281 if (!lib) {
282 rb_raise(rb_eArgError, "invalid library name - (null)");
283 }
284 declare_transcoder(enc1, enc2, lib);
285}
286
287#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
288
289typedef struct search_path_queue_tag {
290 struct search_path_queue_tag *next;
291 const char *enc;
293
294typedef struct {
295 st_table *visited;
296 search_path_queue_t *queue;
297 search_path_queue_t **queue_last_ptr;
298 const char *base_enc;
300
301static int
302transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
303{
304 const char *dname = (const char *)key;
307
308 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
309 return ST_CONTINUE;
310 }
311
313 q->enc = dname;
314 q->next = NULL;
315 *bfs->queue_last_ptr = q;
316 bfs->queue_last_ptr = &q->next;
317
318 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
319 return ST_CONTINUE;
320}
321
322static int
323transcode_search_path(const char *sname, const char *dname,
324 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
325 void *arg)
326{
329 st_data_t val;
330 st_table *table2;
331 int pathlen = -1;
332 bool found = false;
333 bool lookup_res;
334
335 if (encoding_equal(sname, dname))
336 return -1;
337
339 q->enc = sname;
340 q->next = NULL;
341 bfs.queue_last_ptr = &q->next;
342 bfs.queue = q;
343
344 bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop
345 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
346
347 RB_VM_LOCKING() {
348 while (bfs.queue) {
349 q = bfs.queue;
350 bfs.queue = q->next;
351 if (!bfs.queue) {
352 bfs.queue_last_ptr = &bfs.queue;
353 }
354
355 lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2
356 if (!lookup_res) {
357 xfree(q);
358 continue;
359 }
360 table2 = (st_table *)val;
361
362 if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv
363 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
364 xfree(q);
365 found = true;
366 break;
367 }
368
369 bfs.base_enc = q->enc;
370 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
371
372 bfs.base_enc = NULL;
373 xfree(q);
374 }
375 }
376
377 while (bfs.queue) {
378 q = bfs.queue;
379 bfs.queue = q->next;
380 xfree(q);
381 }
382
383 if (found) {
384 const char *enc = dname;
385 int depth;
386 pathlen = 0;
387 while (1) {
388 st_lookup(bfs.visited, (st_data_t)enc, &val);
389 if (!val)
390 break;
391 pathlen++;
392 enc = (const char *)val;
393 }
394 depth = pathlen;
395 enc = dname;
396 while (1) {
397 st_lookup(bfs.visited, (st_data_t)enc, &val);
398 if (!val)
399 break;
400 callback((const char *)val, enc, --depth, arg);
401 enc = (const char *)val;
402 }
403 }
404
405 st_free_table(bfs.visited);
406
407 return pathlen; /* is -1 if not found */
408}
409
410int rb_require_internal_silent(VALUE fname);
411
412static const rb_transcoder *
413load_transcoder_entry(transcoder_entry_t *entry)
414{
415 ASSERT_vm_unlocking();
416 if (entry->transcoder)
417 return entry->transcoder;
418
419 if (entry->lib) {
420 const char *const lib = entry->lib;
421 const size_t len = strlen(lib);
422 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
423 const VALUE fn = rb_str_new(0, total_len);
424 char *const path = RSTRING_PTR(fn);
425
426 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
427 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
428 rb_str_set_len(fn, total_len);
429 OBJ_FREEZE(fn);
430 rb_require_internal_silent(fn); // Sets entry->transcoder
431 }
432
433 if (entry->transcoder)
434 return entry->transcoder;
435
436 return NULL;
437}
438
/*
 * Pick the default replacement for destination encoding `encname`.
 *
 * For UTF-8 (compared case-insensitively) this is the three bytes
 * EF BF BD, reported as UTF-8; otherwise it is "?" reported as US-ASCII.
 * The byte length goes to *len_ret and the replacement's own encoding
 * name to *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    if (!encoding_equal(encname, "UTF-8")) {
        *len_ret = 1;
        *repl_encname_ptr = "US-ASCII";
        return "?";
    }

    *len_ret = 3;
    *repl_encname_ptr = "UTF-8";
    return "\xEF\xBF\xBD";
}
453
454/*
455 * Transcoding engine logic
456 */
457
458static const unsigned char *
459transcode_char_start(rb_transcoding *tc,
460 const unsigned char *in_start,
461 const unsigned char *inchar_start,
462 const unsigned char *in_p,
463 size_t *char_len_ptr)
464{
465 const unsigned char *ptr;
466 if (inchar_start - in_start < tc->recognized_len) {
467 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
468 inchar_start, unsigned char, in_p - inchar_start);
469 ptr = TRANSCODING_READBUF(tc);
470 }
471 else {
472 ptr = inchar_start - tc->recognized_len;
473 }
474 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
475 return ptr;
476}
477
479transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
480 const unsigned char *in_stop, unsigned char *out_stop,
481 rb_transcoding *tc,
482 const int opt)
483{
484 const rb_transcoder *tr = tc->transcoder;
485 int unitlen = tr->input_unit_length;
486 ssize_t readagain_len = 0;
487
488 const unsigned char *inchar_start;
489 const unsigned char *in_p;
490
491 unsigned char *out_p;
492
493 in_p = inchar_start = *in_pos;
494
495 out_p = *out_pos;
496
497#define SUSPEND(ret, num) \
498 do { \
499 tc->resume_position = (num); \
500 if (0 < in_p - inchar_start) \
501 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
502 inchar_start, unsigned char, in_p - inchar_start); \
503 *in_pos = in_p; \
504 *out_pos = out_p; \
505 tc->recognized_len += in_p - inchar_start; \
506 if (readagain_len) { \
507 tc->recognized_len -= readagain_len; \
508 tc->readagain_len = readagain_len; \
509 } \
510 return (ret); \
511 resume_label ## num:; \
512 } while (0)
513#define SUSPEND_OBUF(num) \
514 do { \
515 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
516 } while (0)
517
518#define SUSPEND_AFTER_OUTPUT(num) \
519 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
520 SUSPEND(econv_after_output, num); \
521 }
522
523#define next_table (tc->next_table)
524#define next_info (tc->next_info)
525#define next_byte (tc->next_byte)
526#define writebuf_len (tc->writebuf_len)
527#define writebuf_off (tc->writebuf_off)
528
529 switch (tc->resume_position) {
530 case 0: break;
531 case 1: goto resume_label1;
532 case 2: goto resume_label2;
533 case 3: goto resume_label3;
534 case 4: goto resume_label4;
535 case 5: goto resume_label5;
536 case 6: goto resume_label6;
537 case 7: goto resume_label7;
538 case 8: goto resume_label8;
539 case 9: goto resume_label9;
540 case 10: goto resume_label10;
541 case 11: goto resume_label11;
542 case 12: goto resume_label12;
543 case 13: goto resume_label13;
544 case 14: goto resume_label14;
545 case 15: goto resume_label15;
546 case 16: goto resume_label16;
547 case 17: goto resume_label17;
548 case 18: goto resume_label18;
549 case 19: goto resume_label19;
550 case 20: goto resume_label20;
551 case 21: goto resume_label21;
552 case 22: goto resume_label22;
553 case 23: goto resume_label23;
554 case 24: goto resume_label24;
555 case 25: goto resume_label25;
556 case 26: goto resume_label26;
557 case 27: goto resume_label27;
558 case 28: goto resume_label28;
559 case 29: goto resume_label29;
560 case 30: goto resume_label30;
561 case 31: goto resume_label31;
562 case 32: goto resume_label32;
563 case 33: goto resume_label33;
564 case 34: goto resume_label34;
565 }
566
567 while (1) {
568 inchar_start = in_p;
569 tc->recognized_len = 0;
570 next_table = tr->conv_tree_start;
571
572 SUSPEND_AFTER_OUTPUT(24);
573
574 if (in_stop <= in_p) {
575 if (!(opt & ECONV_PARTIAL_INPUT))
576 break;
577 SUSPEND(econv_source_buffer_empty, 7);
578 continue;
579 }
580
581#define BYTE_ADDR(index) (tr->byte_array + (index))
582#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
583#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
584#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
585#define BL_MIN_BYTE (BL_BASE[0])
586#define BL_MAX_BYTE (BL_BASE[1])
587#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
588#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
589
590 next_byte = (unsigned char)*in_p++;
591 follow_byte:
592 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
593 next_info = INVALID;
594 else {
595 next_info = (VALUE)BL_ACTION(next_byte);
596 }
597 follow_info:
598 switch (next_info & 0x1F) {
599 case NOMAP:
600 {
601 const unsigned char *p = inchar_start;
602 writebuf_off = 0;
603 while (p < in_p) {
604 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
605 }
606 writebuf_len = writebuf_off;
607 writebuf_off = 0;
608 while (writebuf_off < writebuf_len) {
609 SUSPEND_OBUF(3);
610 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
611 }
612 }
613 continue;
614 case 0x00: case 0x04: case 0x08: case 0x0C:
615 case 0x10: case 0x14: case 0x18: case 0x1C:
616 SUSPEND_AFTER_OUTPUT(25);
617 while (in_p >= in_stop) {
618 if (!(opt & ECONV_PARTIAL_INPUT))
619 goto incomplete;
620 SUSPEND(econv_source_buffer_empty, 5);
621 }
622 next_byte = (unsigned char)*in_p++;
623 next_table = (unsigned int)next_info;
624 goto follow_byte;
625 case ZERObt: /* drop input */
626 continue;
627 case ONEbt:
628 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
629 continue;
630 case TWObt:
631 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
632 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
633 continue;
634 case THREEbt:
635 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
636 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
637 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
638 continue;
639 case FOURbt:
640 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
641 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
642 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
643 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
644 continue;
645 case GB4bt:
646 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
647 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
648 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
649 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
650 continue;
651 case STR1:
652 tc->output_index = 0;
653 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
654 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
655 tc->output_index++;
656 }
657 continue;
658 case FUNii:
659 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
660 goto follow_info;
661 case FUNsi:
662 {
663 const unsigned char *char_start;
664 size_t char_len;
665 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
667 goto follow_info;
668 }
669 case FUNio:
670 SUSPEND_OBUF(13);
671 if (tr->max_output <= out_stop - out_p)
672 out_p += tr->func_io(TRANSCODING_STATE(tc),
673 next_info, out_p, out_stop - out_p);
674 else {
675 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
676 next_info,
677 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
678 writebuf_off = 0;
679 while (writebuf_off < writebuf_len) {
680 SUSPEND_OBUF(20);
681 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
682 }
683 }
684 break;
685 case FUNso:
686 {
687 const unsigned char *char_start;
688 size_t char_len;
689 SUSPEND_OBUF(14);
690 if (tr->max_output <= out_stop - out_p) {
691 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
692 out_p += tr->func_so(TRANSCODING_STATE(tc),
693 char_start, (size_t)char_len,
694 out_p, out_stop - out_p);
695 }
696 else {
697 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
698 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
699 char_start, (size_t)char_len,
700 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
701 writebuf_off = 0;
702 while (writebuf_off < writebuf_len) {
703 SUSPEND_OBUF(22);
704 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
705 }
706 }
707 break;
708 }
709 case FUNsio:
710 {
711 const unsigned char *char_start;
712 size_t char_len;
713 SUSPEND_OBUF(33);
714 if (tr->max_output <= out_stop - out_p) {
715 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
716 out_p += tr->func_sio(TRANSCODING_STATE(tc),
717 char_start, (size_t)char_len, next_info,
718 out_p, out_stop - out_p);
719 }
720 else {
721 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
722 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
723 char_start, (size_t)char_len, next_info,
724 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
725 writebuf_off = 0;
726 while (writebuf_off < writebuf_len) {
727 SUSPEND_OBUF(34);
728 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
729 }
730 }
731 break;
732 }
733 case INVALID:
734 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
735 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
736 SUSPEND_AFTER_OUTPUT(26);
737 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
738 in_p = in_stop;
739 SUSPEND(econv_source_buffer_empty, 8);
740 }
741 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
742 in_p = in_stop;
743 }
744 else {
745 in_p = inchar_start + (unitlen - tc->recognized_len);
746 }
747 }
748 else {
749 ssize_t invalid_len; /* including the last byte which causes invalid */
750 ssize_t discard_len;
751 invalid_len = tc->recognized_len + (in_p - inchar_start);
752 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
753 readagain_len = invalid_len - discard_len;
754 }
755 goto invalid;
756 case UNDEF:
757 goto undef;
758 default:
759 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
760 }
761 continue;
762
763 invalid:
764 SUSPEND(econv_invalid_byte_sequence, 1);
765 continue;
766
767 incomplete:
768 SUSPEND(econv_incomplete_input, 27);
769 continue;
770
771 undef:
772 SUSPEND(econv_undefined_conversion, 2);
773 continue;
774 }
775
776 /* cleanup */
777 if (tr->finish_func) {
778 SUSPEND_OBUF(4);
779 if (tr->max_output <= out_stop - out_p) {
780 out_p += tr->finish_func(TRANSCODING_STATE(tc),
781 out_p, out_stop - out_p);
782 }
783 else {
784 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
785 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
786 writebuf_off = 0;
787 while (writebuf_off < writebuf_len) {
788 SUSPEND_OBUF(23);
789 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
790 }
791 }
792 }
793 while (1)
794 SUSPEND(econv_finished, 6);
795#undef SUSPEND
796#undef next_table
797#undef next_info
798#undef next_byte
799#undef writebuf_len
800#undef writebuf_off
801}
802
804transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
805 const unsigned char *in_stop, unsigned char *out_stop,
806 rb_transcoding *tc,
807 const int opt)
808{
809 if (tc->readagain_len) {
810 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
811 const unsigned char *readagain_pos = readagain_buf;
812 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
814
815 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
816 unsigned char, tc->readagain_len);
817 tc->readagain_len = 0;
818 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
819 if (res != econv_source_buffer_empty) {
820 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
821 readagain_pos, unsigned char, readagain_stop - readagain_pos);
822 tc->readagain_len += readagain_stop - readagain_pos;
823 return res;
824 }
825 }
826 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
827}
828
829static rb_transcoding *
830rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
831{
832 rb_transcoding *tc;
833
834 tc = ALLOC(rb_transcoding);
835 tc->transcoder = tr;
836 tc->flags = flags;
837 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
838 tc->state.ptr = xmalloc(tr->state_size);
839 if (tr->state_init_func) {
840 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
841 }
842 tc->resume_position = 0;
843 tc->recognized_len = 0;
844 tc->readagain_len = 0;
845 tc->writebuf_len = 0;
846 tc->writebuf_off = 0;
847 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
848 tc->readbuf.ptr = xmalloc(tr->max_input);
849 }
850 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
851 tc->writebuf.ptr = xmalloc(tr->max_output);
852 }
853 return tc;
854}
855
857rb_transcoding_convert(rb_transcoding *tc,
858 const unsigned char **input_ptr, const unsigned char *input_stop,
859 unsigned char **output_ptr, unsigned char *output_stop,
860 int flags)
861{
862 return transcode_restartable(
863 input_ptr, output_ptr,
864 input_stop, output_stop,
865 tc, flags);
866}
867
868static void
869rb_transcoding_close(rb_transcoding *tc)
870{
871 const rb_transcoder *tr = tc->transcoder;
872 if (tr->state_fini_func) {
873 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
874 }
875 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
876 xfree(tc->state.ptr);
877 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
878 xfree(tc->readbuf.ptr);
879 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
880 xfree(tc->writebuf.ptr);
881 xfree(tc);
882}
883
884static size_t
885rb_transcoding_memsize(rb_transcoding *tc)
886{
887 size_t size = sizeof(rb_transcoding);
888 const rb_transcoder *tr = tc->transcoder;
889
890 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
891 size += tr->state_size;
892 }
893 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
894 size += tr->max_input;
895 }
896 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
897 size += tr->max_output;
898 }
899 return size;
900}
901
902static rb_econv_t *
903rb_econv_alloc(int n_hint)
904{
905 rb_econv_t *ec;
906
907 if (n_hint <= 0)
908 n_hint = 1;
909
910 ec = ALLOC(rb_econv_t);
911 ec->flags = 0;
912 ec->source_encoding_name = NULL;
913 ec->destination_encoding_name = NULL;
914 ec->started = 0;
915 ec->replacement_str = NULL;
916 ec->replacement_len = 0;
917 ec->replacement_enc = NULL;
918 ec->replacement_allocated = 0;
919 ec->in_buf_start = NULL;
920 ec->in_data_start = NULL;
921 ec->in_data_end = NULL;
922 ec->in_buf_end = NULL;
923 ec->num_allocated = n_hint;
924 ec->num_trans = 0;
925 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
926 ec->num_finished = 0;
927 ec->last_tc = NULL;
928 ec->last_error.result = econv_source_buffer_empty;
929 ec->last_error.error_tc = NULL;
930 ec->last_error.source_encoding = NULL;
931 ec->last_error.destination_encoding = NULL;
932 ec->last_error.error_bytes_start = NULL;
933 ec->last_error.error_bytes_len = 0;
934 ec->last_error.readagain_len = 0;
935 ec->source_encoding = NULL;
936 ec->destination_encoding = NULL;
937 return ec;
938}
939
940static int
941rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
942{
943 int n, j;
944 int bufsize = 4096;
945 unsigned char *p;
946
947 if (ec->num_trans == ec->num_allocated) {
948 n = ec->num_allocated * 2;
949 REALLOC_N(ec->elems, rb_econv_elem_t, n);
950 ec->num_allocated = n;
951 }
952
953 p = xmalloc(bufsize);
954
955 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
956
957 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
958 ec->elems[i].out_buf_start = p;
959 ec->elems[i].out_buf_end = p + bufsize;
960 ec->elems[i].out_data_start = p;
961 ec->elems[i].out_data_end = p;
962 ec->elems[i].last_result = econv_source_buffer_empty;
963
964 ec->num_trans++;
965
966 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
967 for (j = ec->num_trans-1; i <= j; j--) {
968 rb_transcoding *tc = ec->elems[j].tc;
969 const rb_transcoder *tr2 = tc->transcoder;
970 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
971 ec->last_tc = tc;
972 break;
973 }
974 }
975
976 return 0;
977}
978
979static rb_econv_t *
980rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
981{
982 rb_econv_t *ec;
983 int i, ret;
984
985 for (i = 0; i < n; i++) {
986 const rb_transcoder *tr;
987 tr = load_transcoder_entry(entries[i]);
988 if (!tr)
989 return NULL;
990 }
991
992 ec = rb_econv_alloc(n);
993
994 for (i = 0; i < n; i++) {
995 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
996 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
997 if (ret == -1) {
998 rb_econv_close(ec);
999 return NULL;
1000 }
1001 }
1002
1003 return ec;
1004}
1005
1007 transcoder_entry_t **entries;
1008 int num_additional;
1009};
1010
1011static void
1012trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1013{
1014 struct trans_open_t *toarg = arg;
1015
1016 if (!toarg->entries) {
1017 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
1018 }
1019 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1020}
1021
1022static rb_econv_t *
1023rb_econv_open0(const char *sname, const char *dname, int ecflags)
1024{
1025 transcoder_entry_t **entries = NULL;
1026 int num_trans;
1027 rb_econv_t *ec;
1028
1029 // loads encodings if not loaded already
1030 if (*sname) rb_enc_find_index(sname);
1031 if (*dname) rb_enc_find_index(dname);
1032
1033 if (*sname == '\0' && *dname == '\0') {
1034 num_trans = 0;
1035 entries = NULL;
1036 sname = dname = "";
1037 }
1038 else {
1039 struct trans_open_t toarg;
1040 toarg.entries = NULL;
1041 toarg.num_additional = 0;
1042 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1043 entries = toarg.entries;
1044 if (num_trans < 0) {
1045 xfree(entries);
1046 return NULL;
1047 }
1048 }
1049
1050 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1051 xfree(entries);
1052 if (!ec)
1053 return NULL;
1054
1055 ec->flags = ecflags;
1056 ec->source_encoding_name = sname;
1057 ec->destination_encoding_name = dname;
1058
1059 return ec;
1060}
1061
1062#define MAX_ECFLAGS_DECORATORS 32
1063
1064static int
1065decorator_names(int ecflags, const char **decorators_ret)
1066{
1067 int num_decorators;
1068
1069 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1074 case 0:
1075 break;
1076 default:
1077 return -1;
1078 }
1079
1080 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1082 return -1;
1083
1084 num_decorators = 0;
1085
1086 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1087 decorators_ret[num_decorators++] = "xml_text_escape";
1089 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1090 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1091 decorators_ret[num_decorators++] = "xml_attr_quote";
1092
1093 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1094 decorators_ret[num_decorators++] = "crlf_newline";
1095 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1096 decorators_ret[num_decorators++] = "cr_newline";
1097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1098 decorators_ret[num_decorators++] = "lf_newline";
1100 decorators_ret[num_decorators++] = "universal_newline";
1101
1102 return num_decorators;
1103}
1104
1105rb_econv_t *
1106rb_econv_open(const char *sname, const char *dname, int ecflags)
1107{
1108 rb_econv_t *ec;
1109 int num_decorators;
1110 const char *decorators[MAX_ECFLAGS_DECORATORS];
1111 int i;
1112
1113 num_decorators = decorator_names(ecflags, decorators);
1114 if (num_decorators == -1)
1115 return NULL;
1116
1117 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1118 if (ec) {
1119 for (i = 0; i < num_decorators; i++) {
1120 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1121 rb_econv_close(ec);
1122 ec = NULL;
1123 break;
1124 }
1125 }
1126 }
1127
1128 if (ec) {
1129 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1130 }
1131 return ec; // can be NULL
1132}
1133
1134static int
1135trans_sweep(rb_econv_t *ec,
1136 const unsigned char **input_ptr, const unsigned char *input_stop,
1137 unsigned char **output_ptr, unsigned char *output_stop,
1138 int flags,
1139 int start)
1140{
1141 int try;
1142 int i, f;
1143
1144 const unsigned char **ipp, *is, *iold;
1145 unsigned char **opp, *os, *oold;
1147
1148 try = 1;
1149 while (try) {
1150 try = 0;
1151 for (i = start; i < ec->num_trans; i++) {
1152 rb_econv_elem_t *te = &ec->elems[i];
1153
1154 if (i == 0) {
1155 ipp = input_ptr;
1156 is = input_stop;
1157 }
1158 else {
1159 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1160 ipp = (const unsigned char **)&prev_te->out_data_start;
1161 is = prev_te->out_data_end;
1162 }
1163
1164 if (i == ec->num_trans-1) {
1165 opp = output_ptr;
1166 os = output_stop;
1167 }
1168 else {
1169 if (te->out_buf_start != te->out_data_start) {
1170 ssize_t len = te->out_data_end - te->out_data_start;
1171 ssize_t off = te->out_data_start - te->out_buf_start;
1172 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1173 te->out_data_start = te->out_buf_start;
1174 te->out_data_end -= off;
1175 }
1176 opp = &te->out_data_end;
1177 os = te->out_buf_end;
1178 }
1179
1180 f = flags;
1181 if (ec->num_finished != i)
1183 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1184 start = 1;
1185 flags &= ~ECONV_AFTER_OUTPUT;
1186 }
1187 if (i != 0)
1188 f &= ~ECONV_AFTER_OUTPUT;
1189 iold = *ipp;
1190 oold = *opp;
1191 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1192 if (iold != *ipp || oold != *opp)
1193 try = 1;
1194
1195 switch (res) {
1199 case econv_after_output:
1200 return i;
1201
1204 break;
1205
1206 case econv_finished:
1207 ec->num_finished = i+1;
1208 break;
1209 }
1210 }
1211 }
1212 return -1;
1213}
1214
1215static rb_econv_result_t
1216rb_trans_conv(rb_econv_t *ec,
1217 const unsigned char **input_ptr, const unsigned char *input_stop,
1218 unsigned char **output_ptr, unsigned char *output_stop,
1219 int flags,
1220 int *result_position_ptr)
1221{
1222 int i;
1223 int needreport_index;
1224 int sweep_start;
1225
1226 unsigned char empty_buf;
1227 unsigned char *empty_ptr = &empty_buf;
1228
1229 if (!input_ptr) {
1230 input_ptr = (const unsigned char **)&empty_ptr;
1231 input_stop = empty_ptr;
1232 }
1233
1234 if (!output_ptr) {
1235 output_ptr = &empty_ptr;
1236 output_stop = empty_ptr;
1237 }
1238
1239 if (ec->elems[0].last_result == econv_after_output)
1240 ec->elems[0].last_result = econv_source_buffer_empty;
1241
1242 for (i = ec->num_trans-1; 0 <= i; i--) {
1243 switch (ec->elems[i].last_result) {
1247 case econv_after_output:
1248 case econv_finished:
1249 sweep_start = i+1;
1250 goto found_needreport;
1251
1254 break;
1255
1256 default:
1257 rb_bug("unexpected transcode last result");
1258 }
1259 }
1260
1261 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1262
1263 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1264 (flags & ECONV_AFTER_OUTPUT)) {
1266
1267 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1269 result_position_ptr);
1270
1271 if (res == econv_source_buffer_empty)
1272 return econv_after_output;
1273 return res;
1274 }
1275
1276 sweep_start = 0;
1277
1278 found_needreport:
1279
1280 do {
1281 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1282 sweep_start = needreport_index + 1;
1283 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1284
1285 for (i = ec->num_trans-1; 0 <= i; i--) {
1286 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1287 rb_econv_result_t res = ec->elems[i].last_result;
1288 if (res == econv_invalid_byte_sequence ||
1289 res == econv_incomplete_input ||
1291 res == econv_after_output) {
1292 ec->elems[i].last_result = econv_source_buffer_empty;
1293 }
1294 if (result_position_ptr)
1295 *result_position_ptr = i;
1296 return res;
1297 }
1298 }
1299 if (result_position_ptr)
1300 *result_position_ptr = -1;
1302}
1303
1304static rb_econv_result_t
1305rb_econv_convert0(rb_econv_t *ec,
1306 const unsigned char **input_ptr, const unsigned char *input_stop,
1307 unsigned char **output_ptr, unsigned char *output_stop,
1308 int flags)
1309{
1311 int result_position;
1312 int has_output = 0;
1313
1314 memset(&ec->last_error, 0, sizeof(ec->last_error));
1315
1316 if (ec->num_trans == 0) {
1317 size_t len;
1318 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1319 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1320 len = output_stop - *output_ptr;
1321 memcpy(*output_ptr, ec->in_data_start, len);
1322 *output_ptr = output_stop;
1323 ec->in_data_start += len;
1325 goto gotresult;
1326 }
1327 len = ec->in_data_end - ec->in_data_start;
1328 memcpy(*output_ptr, ec->in_data_start, len);
1329 *output_ptr += len;
1330 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1331 if (flags & ECONV_AFTER_OUTPUT) {
1332 res = econv_after_output;
1333 goto gotresult;
1334 }
1335 }
1336 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1337 len = output_stop - *output_ptr;
1338 }
1339 else {
1340 len = input_stop - *input_ptr;
1341 }
1342 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1343 *(*output_ptr)++ = *(*input_ptr)++;
1344 res = econv_after_output;
1345 goto gotresult;
1346 }
1347 memcpy(*output_ptr, *input_ptr, len);
1348 *output_ptr += len;
1349 *input_ptr += len;
1350 if (*input_ptr != input_stop)
1352 else if (flags & ECONV_PARTIAL_INPUT)
1354 else
1355 res = econv_finished;
1356 goto gotresult;
1357 }
1358
1359 if (ec->elems[ec->num_trans-1].out_data_start) {
1360 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1361 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1362 if (data_start != data_end) {
1363 size_t len;
1364 if (output_stop - *output_ptr < data_end - data_start) {
1365 len = output_stop - *output_ptr;
1366 memcpy(*output_ptr, data_start, len);
1367 *output_ptr = output_stop;
1368 ec->elems[ec->num_trans-1].out_data_start += len;
1370 goto gotresult;
1371 }
1372 len = data_end - data_start;
1373 memcpy(*output_ptr, data_start, len);
1374 *output_ptr += len;
1375 ec->elems[ec->num_trans-1].out_data_start =
1376 ec->elems[ec->num_trans-1].out_data_end =
1377 ec->elems[ec->num_trans-1].out_buf_start;
1378 has_output = 1;
1379 }
1380 }
1381
1382 if (ec->in_buf_start &&
1383 ec->in_data_start != ec->in_data_end) {
1384 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1385 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1386 if (res != econv_source_buffer_empty)
1387 goto gotresult;
1388 }
1389
1390 if (has_output &&
1391 (flags & ECONV_AFTER_OUTPUT) &&
1392 *input_ptr != input_stop) {
1393 input_stop = *input_ptr;
1394 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1395 if (res == econv_source_buffer_empty)
1396 res = econv_after_output;
1397 }
1398 else if ((flags & ECONV_AFTER_OUTPUT) ||
1399 ec->num_trans == 1) {
1400 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1401 }
1402 else {
1403 flags |= ECONV_AFTER_OUTPUT;
1404 do {
1405 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1406 } while (res == econv_after_output);
1407 }
1408
1409 gotresult:
1410 ec->last_error.result = res;
1411 if (res == econv_invalid_byte_sequence ||
1412 res == econv_incomplete_input ||
1414 rb_transcoding *error_tc = ec->elems[result_position].tc;
1415 ec->last_error.error_tc = error_tc;
1416 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1417 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1418 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1419 ec->last_error.error_bytes_len = error_tc->recognized_len;
1420 ec->last_error.readagain_len = error_tc->readagain_len;
1421 }
1422
1423 return res;
1424}
1425
1426static int output_replacement_character(rb_econv_t *ec);
1427
1428static int
1429output_hex_charref(rb_econv_t *ec)
1430{
1431 int ret;
1432 unsigned char utfbuf[1024];
1433 const unsigned char *utf;
1434 size_t utf_len;
1435 int utf_allocated = 0;
1436 char charef_buf[16];
1437 const unsigned char *p;
1438
1439 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1440 utf = ec->last_error.error_bytes_start;
1441 utf_len = ec->last_error.error_bytes_len;
1442 }
1443 else {
1444 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1445 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1446 utfbuf, sizeof(utfbuf),
1447 &utf_len);
1448 if (!utf)
1449 return -1;
1450 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1451 utf_allocated = 1;
1452 }
1453
1454 if (utf_len % 4 != 0)
1455 goto fail;
1456
1457 p = utf;
1458 while (4 <= utf_len) {
1459 unsigned int u = 0;
1460 u += p[0] << 24;
1461 u += p[1] << 16;
1462 u += p[2] << 8;
1463 u += p[3];
1464 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1465
1466 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1467 if (ret == -1)
1468 goto fail;
1469
1470 p += 4;
1471 utf_len -= 4;
1472 }
1473
1474 if (utf_allocated)
1475 xfree((void *)utf);
1476 return 0;
1477
1478 fail:
1479 if (utf_allocated)
1480 xfree((void *)utf);
1481 return -1;
1482}
1483
1486 const unsigned char **input_ptr, const unsigned char *input_stop,
1487 unsigned char **output_ptr, unsigned char *output_stop,
1488 int flags)
1489{
1491
1492 unsigned char empty_buf;
1493 unsigned char *empty_ptr = &empty_buf;
1494
1495 ec->started = 1;
1496
1497 if (!input_ptr) {
1498 input_ptr = (const unsigned char **)&empty_ptr;
1499 input_stop = empty_ptr;
1500 }
1501
1502 if (!output_ptr) {
1503 output_ptr = &empty_ptr;
1504 output_stop = empty_ptr;
1505 }
1506
1507 resume:
1508 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1509
1510 if (ret == econv_invalid_byte_sequence ||
1511 ret == econv_incomplete_input) {
1512 /* deal with invalid byte sequence */
1513 /* todo: add more alternative behaviors */
1514 switch (ec->flags & ECONV_INVALID_MASK) {
1516 if (output_replacement_character(ec) == 0)
1517 goto resume;
1518 }
1519 }
1520
1521 if (ret == econv_undefined_conversion) {
1522 /* valid character in source encoding
1523 * but no related character(s) in destination encoding */
1524 /* todo: add more alternative behaviors */
1525 switch (ec->flags & ECONV_UNDEF_MASK) {
1527 if (output_replacement_character(ec) == 0)
1528 goto resume;
1529 break;
1530
1532 if (output_hex_charref(ec) == 0)
1533 goto resume;
1534 break;
1535 }
1536 }
1537
1538 return ret;
1539}
1540
1541const char *
1543{
1544 rb_transcoding *tc = ec->last_tc;
1545 const rb_transcoder *tr;
1546
1547 if (tc == NULL)
1548 return "";
1549
1550 tr = tc->transcoder;
1551
1552 if (tr->asciicompat_type == asciicompat_encoder)
1553 return tr->src_encoding;
1554 return tr->dst_encoding;
1555}
1556
1557static unsigned char *
1558allocate_converted_string(const char *sname, const char *dname,
1559 const unsigned char *str, size_t len,
1560 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1561 size_t *dst_len_ptr)
1562{
1563 unsigned char *dst_str;
1564 size_t dst_len;
1565 size_t dst_bufsize;
1566
1567 rb_econv_t *ec;
1569
1570 const unsigned char *sp;
1571 unsigned char *dp;
1572
1573 if (caller_dst_buf)
1574 dst_bufsize = caller_dst_bufsize;
1575 else if (len == 0)
1576 dst_bufsize = 1;
1577 else
1578 dst_bufsize = len;
1579
1580 ec = rb_econv_open(sname, dname, 0);
1581 if (ec == NULL)
1582 return NULL;
1583 if (caller_dst_buf)
1584 dst_str = caller_dst_buf;
1585 else
1586 dst_str = xmalloc(dst_bufsize);
1587 dst_len = 0;
1588 sp = str;
1589 dp = dst_str+dst_len;
1590 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1591 dst_len = dp - dst_str;
1592 while (res == econv_destination_buffer_full) {
1593 if (SIZE_MAX/2 < dst_bufsize) {
1594 goto fail;
1595 }
1596 dst_bufsize *= 2;
1597 if (dst_str == caller_dst_buf) {
1598 unsigned char *tmp;
1599 tmp = xmalloc(dst_bufsize);
1600 memcpy(tmp, dst_str, dst_bufsize/2);
1601 dst_str = tmp;
1602 }
1603 else {
1604 dst_str = xrealloc(dst_str, dst_bufsize);
1605 }
1606 dp = dst_str+dst_len;
1607 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1608 dst_len = dp - dst_str;
1609 }
1610 if (res != econv_finished) {
1611 goto fail;
1612 }
1613 rb_econv_close(ec);
1614 *dst_len_ptr = dst_len;
1615 return dst_str;
1616
1617 fail:
1618 if (dst_str != caller_dst_buf)
1619 xfree(dst_str);
1620 rb_econv_close(ec);
1621 return NULL;
1622}
1623
1624/* result: 0:success -1:failure */
1625int
1627 const unsigned char *str, size_t len, const char *str_encoding)
1628{
1629 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1630 unsigned char insert_buf[4096];
1631 const unsigned char *insert_str = NULL;
1632 size_t insert_len;
1633
1634 int last_trans_index;
1635 rb_transcoding *tc;
1636
1637 unsigned char **buf_start_p;
1638 unsigned char **data_start_p;
1639 unsigned char **data_end_p;
1640 unsigned char **buf_end_p;
1641
1642 size_t need;
1643
1644 ec->started = 1;
1645
1646 if (len == 0)
1647 return 0;
1648
1649 if (encoding_equal(insert_encoding, str_encoding)) {
1650 insert_str = str;
1651 insert_len = len;
1652 }
1653 else {
1654 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1655 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1656 if (insert_str == NULL)
1657 return -1;
1658 }
1659
1660 need = insert_len;
1661
1662 last_trans_index = ec->num_trans-1;
1663 if (ec->num_trans == 0) {
1664 tc = NULL;
1665 buf_start_p = &ec->in_buf_start;
1666 data_start_p = &ec->in_data_start;
1667 data_end_p = &ec->in_data_end;
1668 buf_end_p = &ec->in_buf_end;
1669 }
1670 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1671 tc = ec->elems[last_trans_index].tc;
1672 need += tc->readagain_len;
1673 if (need < insert_len)
1674 goto fail;
1675 if (last_trans_index == 0) {
1676 buf_start_p = &ec->in_buf_start;
1677 data_start_p = &ec->in_data_start;
1678 data_end_p = &ec->in_data_end;
1679 buf_end_p = &ec->in_buf_end;
1680 }
1681 else {
1682 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1683 buf_start_p = &ee->out_buf_start;
1684 data_start_p = &ee->out_data_start;
1685 data_end_p = &ee->out_data_end;
1686 buf_end_p = &ee->out_buf_end;
1687 }
1688 }
1689 else {
1690 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1691 buf_start_p = &ee->out_buf_start;
1692 data_start_p = &ee->out_data_start;
1693 data_end_p = &ee->out_data_end;
1694 buf_end_p = &ee->out_buf_end;
1695 tc = ec->elems[last_trans_index].tc;
1696 }
1697
1698 if (*buf_start_p == NULL) {
1699 unsigned char *buf = xmalloc(need);
1700 *buf_start_p = buf;
1701 *data_start_p = buf;
1702 *data_end_p = buf;
1703 *buf_end_p = buf+need;
1704 }
1705 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1706 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1707 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1708 *data_start_p = *buf_start_p;
1709 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1710 unsigned char *buf;
1711 size_t s = (*data_end_p - *buf_start_p) + need;
1712 if (s < need)
1713 goto fail;
1714 buf = xrealloc(*buf_start_p, s);
1715 *data_start_p = buf;
1716 *data_end_p = buf + (*data_end_p - *buf_start_p);
1717 *buf_start_p = buf;
1718 *buf_end_p = buf + s;
1719 }
1720 }
1721
1722 memcpy(*data_end_p, insert_str, insert_len);
1723 *data_end_p += insert_len;
1724 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1725 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1726 *data_end_p += tc->readagain_len;
1727 tc->readagain_len = 0;
1728 }
1729
1730 if (insert_str != str && insert_str != insert_buf)
1731 xfree((void*)insert_str);
1732 return 0;
1733
1734 fail:
1735 if (insert_str != str && insert_str != insert_buf)
1736 xfree((void*)insert_str);
1737 return -1;
1738}
1739
1740void
1742{
1743 int i;
1744
1745 if (ec->replacement_allocated) {
1746 xfree((void *)ec->replacement_str);
1747 }
1748 for (i = 0; i < ec->num_trans; i++) {
1749 rb_transcoding_close(ec->elems[i].tc);
1750 xfree(ec->elems[i].out_buf_start);
1751 }
1752 xfree(ec->in_buf_start);
1753 xfree(ec->elems);
1754 xfree(ec);
1755}
1756
1757size_t
1758rb_econv_memsize(rb_econv_t *ec)
1759{
1760 size_t size = sizeof(rb_econv_t);
1761 int i;
1762
1763 if (ec->replacement_allocated) {
1764 size += ec->replacement_len;
1765 }
1766 for (i = 0; i < ec->num_trans; i++) {
1767 size += rb_transcoding_memsize(ec->elems[i].tc);
1768
1769 if (ec->elems[i].out_buf_start) {
1770 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1771 }
1772 }
1773 size += ec->in_buf_end - ec->in_buf_start;
1774 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1775
1776 return size;
1777}
1778
1779int
1781{
1782 if (ec->num_trans == 0)
1783 return 0;
1784#if SIZEOF_SIZE_T > SIZEOF_INT
1785 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1786#endif
1787 return (int)ec->elems[0].tc->readagain_len;
1788}
1789
1790void
1791rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1792{
1793 rb_transcoding *tc;
1794 if (ec->num_trans == 0 || n == 0)
1795 return;
1796 tc = ec->elems[0].tc;
1797 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1798 tc->readagain_len -= n;
1799}
1800
/* In/out argument for asciicompat_encoding_i(). */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;   /* result; NULL when none was found */
    const char *ascii_incompat_name; /* the encoding being looked up */
};
1805
1806static int
1807asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1808{
1809 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1810 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1811 const rb_transcoder *tr;
1812
1813 if (DECORATOR_P(entry->sname, entry->dname))
1814 return ST_CONTINUE;
1815 tr = load_transcoder_entry(entry);
1816 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1817 data->ascii_compat_name = tr->dst_encoding;
1818 return ST_STOP;
1819 }
1820 return ST_CONTINUE;
1821}
1822
1823const char *
1824rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1825{
1826 st_data_t v;
1827 st_table *table2;
1828 struct asciicompat_encoding_t data = {0};
1829
1830 unsigned int lev;
1831 RB_VM_LOCK_ENTER_LEV(&lev);
1832 {
1833 if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) {
1834 table2 = (st_table *)v;
1835 /*
1836 * Assumption:
1837 * There is at most one transcoder for
1838 * converting from ASCII incompatible encoding.
1839 *
1840 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1841 */
1842 if (table2->num_entries == 1) {
1843 data.ascii_incompat_name = ascii_incompat_name;
1844 data.ascii_compat_name = NULL;
1845 if (rb_multi_ractor_p()) {
1846 /*
1847 * We need to unlock in case `load_transcoder_entry` actually loads the encoding
1848 * and table2 could be inserted into when we unlock.
1849 */
1850 st_table *dup_table2 = st_copy(table2);
1851 RB_VM_LOCK_LEAVE_LEV(&lev);
1852 st_foreach(dup_table2, asciicompat_encoding_i, (st_data_t)&data);
1853 st_free_table(dup_table2);
1854 RB_VM_LOCK_ENTER_LEV(&lev);
1855 }
1856 else {
1857 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1858 }
1859 }
1860
1861 }
1862 }
1863 RB_VM_LOCK_LEAVE_LEV(&lev);
1864
1865 return data.ascii_compat_name; // can be NULL
1866}
1867
1868/*
1869 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1870 *
1871 * If the result of the conversion is not compatible with the encoding of
1872 * `dst`, `dst` may not be valid encoding.
1873 */
1874VALUE
1875rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1876{
1877 unsigned const char *sp, *se;
1878 unsigned char *ds, *dp, *de;
1880 int max_output;
1881 enum ruby_coderange_type coderange;
1882 rb_encoding *dst_enc = ec->destination_encoding;
1883
1884 if (NIL_P(dst)) {
1885 dst = rb_str_buf_new(len);
1886 if (dst_enc) {
1887 rb_enc_associate(dst, dst_enc);
1888 }
1889 coderange = ENC_CODERANGE_7BIT; // scan from the start
1890 }
1891 else {
1892 dst_enc = rb_enc_get(dst);
1893 coderange = rb_enc_str_coderange(dst);
1894 }
1895
1896 if (ec->last_tc)
1897 max_output = ec->last_tc->transcoder->max_output;
1898 else
1899 max_output = 1;
1900
1901 do {
1902 int cr;
1903 long dlen = RSTRING_LEN(dst);
1904 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1905 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1906 if (LONG_MAX < new_capa)
1907 rb_raise(rb_eArgError, "too long string");
1908 rb_str_modify_expand(dst, new_capa - dlen);
1909 }
1910 sp = (const unsigned char *)ss;
1911 se = sp + len;
1912 ds = (unsigned char *)RSTRING_PTR(dst);
1913 de = ds + rb_str_capacity(dst);
1914 dp = ds += dlen;
1915 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1916 switch (coderange) {
1917 case ENC_CODERANGE_7BIT:
1919 cr = (int)coderange;
1920 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1921 coderange = cr;
1922 ENC_CODERANGE_SET(dst, coderange);
1923 break;
1926 break;
1927 }
1928 len -= (const char *)sp - ss;
1929 ss = (const char *)sp;
1930 rb_str_set_len(dst, dlen + (dp - ds));
1932 } while (res == econv_destination_buffer_full);
1933
1934 return dst;
1935}
1936
1937VALUE
1938rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1939{
1940 src = rb_str_new_frozen(src);
1941 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1942 RB_GC_GUARD(src);
1943 return dst;
1944}
1945
1946VALUE
1948{
1949 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1950}
1951
1952VALUE
1953rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1954{
1955 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1956}
1957
1958VALUE
1960{
1961 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1962}
1963
1964static int
1965rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1966{
1967 transcoder_entry_t *entry;
1968 const rb_transcoder *tr = NULL;
1969
1970 if (ec->started != 0)
1971 return -1;
1972
1973 entry = get_transcoder_entry(sname, dname);
1974 if (entry) {
1975 tr = load_transcoder_entry(entry);
1976 }
1977
1978 return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1;
1979}
1980
1981static int
1982rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1983{
1984 return rb_econv_add_converter(ec, "", decorator_name, n);
1985}
1986
1987int
1988rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1989{
1990 const rb_transcoder *tr;
1991
1992 if (ec->num_trans == 0)
1993 return rb_econv_decorate_at(ec, decorator_name, 0);
1994
1995 tr = ec->elems[0].tc->transcoder;
1996
1997 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1998 tr->asciicompat_type == asciicompat_decoder)
1999 return rb_econv_decorate_at(ec, decorator_name, 1);
2000
2001 return rb_econv_decorate_at(ec, decorator_name, 0);
2002}
2003
2004int
2005rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
2006{
2007 const rb_transcoder *tr;
2008
2009 if (ec->num_trans == 0)
2010 return rb_econv_decorate_at(ec, decorator_name, 0);
2011
2012 tr = ec->elems[ec->num_trans-1].tc->transcoder;
2013
2014 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
2015 tr->asciicompat_type == asciicompat_encoder)
2016 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
2017
2018 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
2019}
2020
2021void
2023{
2024 const char *dname = 0;
2025
2026 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2028 dname = "universal_newline";
2029 break;
2031 dname = "crlf_newline";
2032 break;
2034 dname = "cr_newline";
2035 break;
2037 dname = "lf_newline";
2038 break;
2039 }
2040
2041 if (dname) {
2042 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2043 int num_trans = ec->num_trans;
2044 int i, j = 0;
2045
2046 for (i=0; i < num_trans; i++) {
2047 if (transcoder == ec->elems[i].tc->transcoder) {
2048 rb_transcoding_close(ec->elems[i].tc);
2049 xfree(ec->elems[i].out_buf_start);
2050 ec->num_trans--;
2051 }
2052 else
2053 ec->elems[j++] = ec->elems[i];
2054 }
2055 }
2056
2057 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2058}
2059
2060static VALUE
2061econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2062{
2063 int has_description = 0;
2064
2065 if (NIL_P(mesg))
2066 mesg = rb_str_new(NULL, 0);
2067
2068 if (*sname != '\0' || *dname != '\0') {
2069 if (*sname == '\0')
2070 rb_str_cat2(mesg, dname);
2071 else if (*dname == '\0')
2072 rb_str_cat2(mesg, sname);
2073 else
2074 rb_str_catf(mesg, "%s to %s", sname, dname);
2075 has_description = 1;
2076 }
2077
2078 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2082 const char *pre = "";
2083 if (has_description)
2084 rb_str_cat2(mesg, " with ");
2085 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2086 rb_str_cat2(mesg, pre); pre = ",";
2087 rb_str_cat2(mesg, "universal_newline");
2088 }
2089 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2090 rb_str_cat2(mesg, pre); pre = ",";
2091 rb_str_cat2(mesg, "crlf_newline");
2092 }
2093 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2094 rb_str_cat2(mesg, pre); pre = ",";
2095 rb_str_cat2(mesg, "cr_newline");
2096 }
2097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2098 rb_str_cat2(mesg, pre); pre = ",";
2099 rb_str_cat2(mesg, "lf_newline");
2100 }
2101 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2102 rb_str_cat2(mesg, pre); pre = ",";
2103 rb_str_cat2(mesg, "xml_text");
2104 }
2105 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2106 rb_str_cat2(mesg, pre); pre = ",";
2107 rb_str_cat2(mesg, "xml_attr_content");
2108 }
2109 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2110 rb_str_cat2(mesg, pre); pre = ",";
2111 rb_str_cat2(mesg, "xml_attr_quote");
2112 }
2113 has_description = 1;
2114 }
2115 if (!has_description) {
2116 rb_str_cat2(mesg, "no-conversion");
2117 }
2118
2119 return mesg;
2120}
2121
2122VALUE
2123rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2124{
2125 VALUE mesg, exc;
2126 mesg = rb_str_new_cstr("code converter not found (");
2127 econv_description(sname, dname, ecflags, mesg);
2128 rb_str_cat2(mesg, ")");
2129 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2130 return exc;
2131}
2132
2133static VALUE
2134make_econv_exception(rb_econv_t *ec)
2135{
2136 VALUE mesg, exc;
2137 if (ec->last_error.result == econv_invalid_byte_sequence ||
2138 ec->last_error.result == econv_incomplete_input) {
2139 const char *err = (const char *)ec->last_error.error_bytes_start;
2140 size_t error_len = ec->last_error.error_bytes_len;
2141 VALUE bytes = rb_str_new(err, error_len);
2142 VALUE dumped = rb_str_dump(bytes);
2143 size_t readagain_len = ec->last_error.readagain_len;
2144 VALUE bytes2 = Qnil;
2145 VALUE dumped2;
2146 if (ec->last_error.result == econv_incomplete_input) {
2147 mesg = rb_sprintf("incomplete %s on %s",
2148 StringValueCStr(dumped),
2149 ec->last_error.source_encoding);
2150 }
2151 else if (readagain_len) {
2152 bytes2 = rb_str_new(err+error_len, readagain_len);
2153 dumped2 = rb_str_dump(bytes2);
2154 mesg = rb_sprintf("%s followed by %s on %s",
2155 StringValueCStr(dumped),
2156 StringValueCStr(dumped2),
2157 ec->last_error.source_encoding);
2158 }
2159 else {
2160 mesg = rb_sprintf("%s on %s",
2161 StringValueCStr(dumped),
2162 ec->last_error.source_encoding);
2163 }
2164
2165 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2166 rb_ivar_set(exc, id_error_bytes, bytes);
2167 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2168 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2169 goto set_encs;
2170 }
2171 if (ec->last_error.result == econv_undefined_conversion) {
2172 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2173 ec->last_error.error_bytes_len);
2174 VALUE dumped = Qnil;
2175 int idx;
2176 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2177 rb_encoding *utf8 = rb_utf8_encoding();
2178 const char *start, *end;
2179 int n;
2180 start = (const char *)ec->last_error.error_bytes_start;
2181 end = start + ec->last_error.error_bytes_len;
2182 n = rb_enc_precise_mbclen(start, end, utf8);
2183 if (MBCLEN_CHARFOUND_P(n) &&
2184 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2185 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2186 dumped = rb_sprintf("U+%04X", cc);
2187 }
2188 }
2189 if (NIL_P(dumped))
2190 dumped = rb_str_dump(bytes);
2191 if (strcmp(ec->last_error.source_encoding,
2192 ec->source_encoding_name) == 0 &&
2193 strcmp(ec->last_error.destination_encoding,
2194 ec->destination_encoding_name) == 0) {
2195 mesg = rb_sprintf("%s from %s to %s",
2196 StringValueCStr(dumped),
2197 ec->last_error.source_encoding,
2198 ec->last_error.destination_encoding);
2199 }
2200 else {
2201 int i;
2202 mesg = rb_sprintf("%s to %s in conversion from %s",
2203 StringValueCStr(dumped),
2204 ec->last_error.destination_encoding,
2205 ec->source_encoding_name);
2206 for (i = 0; i < ec->num_trans; i++) {
2207 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2208 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2209 rb_str_catf(mesg, " to %s",
2210 ec->elems[i].tc->transcoder->dst_encoding);
2211 }
2212 }
2213 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2214 idx = rb_enc_find_index(ec->last_error.source_encoding);
2215 if (0 <= idx)
2216 rb_enc_associate_index(bytes, idx);
2217 rb_ivar_set(exc, id_error_char, bytes);
2218 goto set_encs;
2219 }
2220 return Qnil;
2221
2222 set_encs:
2223 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2224 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2225 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2226 if (0 <= idx)
2227 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2228 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2229 if (0 <= idx)
2230 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2231 return exc;
2232}
2233
2234static void
2235more_output_buffer(
2236 VALUE destination,
2237 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2238 int max_output,
2239 unsigned char **out_start_ptr,
2240 unsigned char **out_pos,
2241 unsigned char **out_stop_ptr)
2242{
2243 size_t len = (*out_pos - *out_start_ptr);
2244 size_t new_len = (len + max_output) * 2;
2245 *out_start_ptr = resize_destination(destination, len, new_len);
2246 *out_pos = *out_start_ptr + len;
2247 *out_stop_ptr = *out_start_ptr + new_len;
2248}
2249
2250static int
2251make_replacement(rb_econv_t *ec)
2252{
2253 rb_transcoding *tc;
2254 const rb_transcoder *tr;
2255 const unsigned char *replacement;
2256 const char *repl_enc;
2257 const char *ins_enc;
2258 size_t len;
2259
2260 if (ec->replacement_str)
2261 return 0;
2262
2264
2265 tc = ec->last_tc;
2266 if (*ins_enc) {
2267 tr = tc->transcoder;
2268 rb_enc_find(tr->dst_encoding);
2269 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2270 }
2271 else {
2272 replacement = (unsigned char *)"?";
2273 len = 1;
2274 repl_enc = "";
2275 }
2276
2277 ec->replacement_str = replacement;
2278 ec->replacement_len = len;
2279 ec->replacement_enc = repl_enc;
2280 ec->replacement_allocated = 0;
2281 return 0;
2282}
2283
2284int
2286 const unsigned char *str, size_t len, const char *encname)
2287{
2288 unsigned char *str2;
2289 size_t len2;
2290 const char *encname2;
2291
2293
2294 if (!*encname2 || encoding_equal(encname, encname2)) {
2295 str2 = xmalloc(len);
2296 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2297 len2 = len;
2298 encname2 = encname;
2299 }
2300 else {
2301 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2302 if (!str2)
2303 return -1;
2304 }
2305
2306 if (ec->replacement_allocated) {
2307 xfree((void *)ec->replacement_str);
2308 }
2309 ec->replacement_allocated = 1;
2310 ec->replacement_str = str2;
2311 ec->replacement_len = len2;
2312 ec->replacement_enc = encname2;
2313 return 0;
2314}
2315
2316static int
2317output_replacement_character(rb_econv_t *ec)
2318{
2319 int ret;
2320
2321 if (make_replacement(ec) == -1)
2322 return -1;
2323
2324 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2325 if (ret == -1)
2326 return -1;
2327
2328 return 0;
2329}
2330
2331#if 1
2332#define hash_fallback rb_hash_aref
2333
2334static VALUE
2335proc_fallback(VALUE fallback, VALUE c)
2336{
2337 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2338}
2339
2340static VALUE
2341method_fallback(VALUE fallback, VALUE c)
2342{
2343 return rb_method_call(1, &c, fallback);
2344}
2345
2346static VALUE
2347aref_fallback(VALUE fallback, VALUE c)
2348{
2349 return rb_funcallv_public(fallback, idAREF, 1, &c);
2350}
2351
2353 VALUE (*fallback_func)(VALUE, VALUE);
2354 VALUE fallback;
2355 VALUE rep;
2356};
2357
2358static VALUE
2359transcode_loop_fallback_try(VALUE a)
2360{
2362
2363 return args->fallback_func(args->fallback, args->rep);
2364}
2365
2366static void
2367transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2368 const unsigned char *in_stop, unsigned char *out_stop,
2369 VALUE destination,
2370 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2371 const char *src_encoding,
2372 const char *dst_encoding,
2373 int ecflags,
2374 VALUE ecopts)
2375{
2376 rb_econv_t *ec;
2377 rb_transcoding *last_tc;
2379 unsigned char *out_start = *out_pos;
2380 int max_output;
2381 VALUE exc;
2382 VALUE fallback = Qnil;
2383 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2384
2385 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2386 if (!ec)
2387 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2388
2389 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2390 fallback = rb_hash_aref(ecopts, sym_fallback);
2391 if (RB_TYPE_P(fallback, T_HASH)) {
2392 fallback_func = hash_fallback;
2393 }
2394 else if (rb_obj_is_proc(fallback)) {
2395 fallback_func = proc_fallback;
2396 }
2397 else if (rb_obj_is_method(fallback)) {
2398 fallback_func = method_fallback;
2399 }
2400 else {
2401 fallback_func = aref_fallback;
2402 }
2403 }
2404 last_tc = ec->last_tc;
2405 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2406
2407 resume:
2408 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2409
2410 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2411 VALUE rep = rb_enc_str_new(
2412 (const char *)ec->last_error.error_bytes_start,
2413 ec->last_error.error_bytes_len,
2414 rb_enc_find(ec->last_error.source_encoding));
2415
2416
2417 struct transcode_loop_fallback_args args = {
2418 .fallback_func = fallback_func,
2419 .fallback = fallback,
2420 .rep = rep,
2421 };
2422
2423 int state;
2424 rep = rb_protect(transcode_loop_fallback_try, (VALUE)&args, &state);
2425 if (state) {
2426 rb_econv_close(ec);
2427 rb_jump_tag(state);
2428 }
2429
2430 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2431 StringValue(rep);
2432 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2433 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2434 if ((int)ret == -1) {
2435 rb_econv_close(ec);
2436 rb_raise(rb_eArgError, "too big fallback string");
2437 }
2438 goto resume;
2439 }
2440 }
2441
2442 if (ret == econv_invalid_byte_sequence ||
2443 ret == econv_incomplete_input ||
2445 exc = make_econv_exception(ec);
2446 rb_econv_close(ec);
2447 rb_exc_raise(exc);
2448 }
2449
2450 if (ret == econv_destination_buffer_full) {
2451 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2452 goto resume;
2453 }
2454
2455 rb_econv_close(ec);
2456 return;
2457}
2458#else
2459/* sample transcode_loop implementation in byte-by-byte stream style */
2460static void
2461transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2462 const unsigned char *in_stop, unsigned char *out_stop,
2463 VALUE destination,
2464 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2465 const char *src_encoding,
2466 const char *dst_encoding,
2467 int ecflags,
2468 VALUE ecopts)
2469{
2470 rb_econv_t *ec;
2471 rb_transcoding *last_tc;
2473 unsigned char *out_start = *out_pos;
2474 const unsigned char *ptr;
2475 int max_output;
2476 VALUE exc;
2477
2478 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2479 if (!ec)
2480 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2481
2482 last_tc = ec->last_tc;
2483 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2484
2486 ptr = *in_pos;
2487 while (ret != econv_finished) {
2488 unsigned char input_byte;
2489 const unsigned char *p = &input_byte;
2490
2491 if (ret == econv_source_buffer_empty) {
2492 if (ptr < in_stop) {
2493 input_byte = *ptr;
2494 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2495 }
2496 else {
2497 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2498 }
2499 }
2500 else {
2501 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2502 }
2503 if (&input_byte != p)
2504 ptr += p - &input_byte;
2505 switch (ret) {
2509 exc = make_econv_exception(ec);
2510 rb_econv_close(ec);
2511 rb_exc_raise(exc);
2512 break;
2513
2515 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2516 break;
2517
2519 break;
2520
2521 case econv_finished:
2522 break;
2523 }
2524 }
2525 rb_econv_close(ec);
2526 *in_pos = in_stop;
2527 return;
2528}
2529#endif
2530
2531
2532/*
2533 * String-specific code
2534 */
2535
2536static unsigned char *
2537str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2538{
2539 rb_str_resize(destination, new_len);
2540 return (unsigned char *)RSTRING_PTR(destination);
2541}
2542
2543static int
2544econv_opts(VALUE opt, int ecflags)
2545{
2546 VALUE v;
2547 int newlineflag = 0;
2548
2549 v = rb_hash_aref(opt, sym_invalid);
2550 if (NIL_P(v)) {
2551 }
2552 else if (v==sym_replace) {
2553 ecflags |= ECONV_INVALID_REPLACE;
2554 }
2555 else {
2556 rb_raise(rb_eArgError, "unknown value for invalid character option");
2557 }
2558
2559 v = rb_hash_aref(opt, sym_undef);
2560 if (NIL_P(v)) {
2561 }
2562 else if (v==sym_replace) {
2563 ecflags |= ECONV_UNDEF_REPLACE;
2564 }
2565 else {
2566 rb_raise(rb_eArgError, "unknown value for undefined character option");
2567 }
2568
2569 v = rb_hash_aref(opt, sym_replace);
2570 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2571 ecflags |= ECONV_UNDEF_REPLACE;
2572 }
2573
2574 v = rb_hash_aref(opt, sym_xml);
2575 if (!NIL_P(v)) {
2576 if (v==sym_text) {
2578 }
2579 else if (v==sym_attr) {
2581 }
2582 else if (SYMBOL_P(v)) {
2583 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2584 }
2585 else {
2586 rb_raise(rb_eArgError, "unexpected value for xml option");
2587 }
2588 }
2589
2590#ifdef ENABLE_ECONV_NEWLINE_OPTION
2591 v = rb_hash_aref(opt, sym_newline);
2592 if (!NIL_P(v)) {
2593 newlineflag = 2;
2594 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2595 if (v == sym_universal) {
2597 }
2598 else if (v == sym_crlf) {
2600 }
2601 else if (v == sym_cr) {
2602 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2603 }
2604 else if (v == sym_lf) {
2605 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2606 }
2607 else if (SYMBOL_P(v)) {
2608 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2609 rb_sym2str(v));
2610 }
2611 else {
2612 rb_raise(rb_eArgError, "unexpected value for newline option");
2613 }
2614 }
2615#endif
2616 {
2617 int setflags = 0;
2618
2619 v = rb_hash_aref(opt, sym_universal_newline);
2620 if (RTEST(v))
2622 newlineflag |= !NIL_P(v);
2623
2624 v = rb_hash_aref(opt, sym_crlf_newline);
2625 if (RTEST(v))
2626 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2627 newlineflag |= !NIL_P(v);
2628
2629 v = rb_hash_aref(opt, sym_cr_newline);
2630 if (RTEST(v))
2631 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2632 newlineflag |= !NIL_P(v);
2633
2634 v = rb_hash_aref(opt, sym_lf_newline);
2635 if (RTEST(v))
2636 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2637 newlineflag |= !NIL_P(v);
2638
2639 switch (newlineflag) {
2640 case 1:
2641 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2642 ecflags |= setflags;
2643 break;
2644
2645 case 3:
2646 rb_warning(":newline option precedes other newline options");
2647 break;
2648 }
2649 }
2650
2651 return ecflags;
2652}
2653
2654int
2655rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2656{
2657 VALUE newhash = Qnil;
2658 VALUE v;
2659
2660 if (NIL_P(opthash)) {
2661 *opts = Qnil;
2662 return ecflags;
2663 }
2664 ecflags = econv_opts(opthash, ecflags);
2665
2666 v = rb_hash_aref(opthash, sym_replace);
2667 if (!NIL_P(v)) {
2668 StringValue(v);
2669 if (is_broken_string(v)) {
2670 VALUE dumped = rb_str_dump(v);
2671 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2672 StringValueCStr(dumped),
2673 rb_enc_name(rb_enc_get(v)));
2674 }
2675 v = rb_str_new_frozen(v);
2676 newhash = rb_hash_new();
2677 rb_hash_aset(newhash, sym_replace, v);
2678 }
2679
2680 v = rb_hash_aref(opthash, sym_fallback);
2681 if (!NIL_P(v)) {
2682 VALUE h = rb_check_hash_type(v);
2683 if (NIL_P(h)
2684 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2685 : (v = h, 1)) {
2686 if (NIL_P(newhash))
2687 newhash = rb_hash_new();
2688 rb_hash_aset(newhash, sym_fallback, v);
2689 }
2690 }
2691
2692 if (!NIL_P(newhash))
2693 rb_hash_freeze(newhash);
2694 *opts = newhash;
2695
2696 return ecflags;
2697}
2698
2699int
2701{
2702 return rb_econv_prepare_options(opthash, opts, 0);
2703}
2704
2705rb_econv_t *
2706rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2707{
2708 rb_econv_t *ec;
2709 VALUE replacement;
2710
2711 if (NIL_P(opthash)) {
2712 replacement = Qnil;
2713 }
2714 else {
2715 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2716 rb_bug("rb_econv_open_opts called with invalid opthash");
2717 replacement = rb_hash_aref(opthash, sym_replace);
2718 }
2719
2720 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2721 if (ec) {
2722 if (!NIL_P(replacement)) {
2723 int ret;
2724 rb_encoding *enc = rb_enc_get(replacement);
2725
2726 ret = rb_econv_set_replacement(ec,
2727 (const unsigned char *)RSTRING_PTR(replacement),
2728 RSTRING_LEN(replacement),
2729 rb_enc_name(enc));
2730 if (ret == -1) {
2731 rb_econv_close(ec);
2732 ec = NULL;
2733 }
2734 }
2735 }
2736 return ec; // can be NULL
2737}
2738
2739static int
2740enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2741{
2742 rb_encoding *enc;
2743 const char *n;
2744 int encidx;
2745 VALUE encval;
2746
2747 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2748 !(enc = rb_enc_from_index(encidx))) {
2749 enc = NULL;
2750 encidx = 0;
2751 n = StringValueCStr(*arg);
2752 }
2753 else {
2754 n = rb_enc_name(enc);
2755 }
2756
2757 *name_p = n;
2758 *enc_p = enc;
2759
2760 return encidx;
2761}
2762
2763static int
2764str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2765 const char **sname_p, rb_encoding **senc_p,
2766 const char **dname_p, rb_encoding **denc_p)
2767{
2768 rb_encoding *senc, *denc;
2769 const char *sname, *dname;
2770 int sencidx, dencidx;
2771
2772 dencidx = enc_arg(arg1, &dname, &denc);
2773
2774 if (NIL_P(*arg2)) {
2775 sencidx = rb_enc_get_index(str);
2776 senc = rb_enc_from_index(sencidx);
2777 sname = rb_enc_name(senc);
2778 }
2779 else {
2780 sencidx = enc_arg(arg2, &sname, &senc);
2781 }
2782
2783 *sname_p = sname;
2784 *senc_p = senc;
2785 *dname_p = dname;
2786 *denc_p = denc;
2787 return dencidx;
2788}
2789
2790static int
2791str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2792{
2793 VALUE dest;
2794 VALUE str = *self;
2795 VALUE arg1, arg2;
2796 long blen, slen;
2797 unsigned char *buf, *bp, *sp;
2798 const unsigned char *fromp;
2799 rb_encoding *senc, *denc;
2800 const char *sname, *dname;
2801 int dencidx;
2802 int explicitly_invalid_replace = TRUE;
2803
2804 rb_check_arity(argc, 0, 2);
2805
2806 if (argc == 0) {
2807 arg1 = rb_enc_default_internal();
2808 if (NIL_P(arg1)) {
2809 if (!ecflags) return -1;
2810 arg1 = rb_obj_encoding(str);
2811 }
2812 if (!(ecflags & ECONV_INVALID_MASK)) {
2813 explicitly_invalid_replace = FALSE;
2814 }
2816 }
2817 else {
2818 arg1 = argv[0];
2819 }
2820 arg2 = argc<=1 ? Qnil : argv[1];
2821 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2822
2823 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2827 if (senc && senc == denc) {
2828 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2829 VALUE rep = Qnil;
2830 if (!NIL_P(ecopts)) {
2831 rep = rb_hash_aref(ecopts, sym_replace);
2832 }
2833 dest = rb_enc_str_scrub(senc, str, rep);
2834 if (NIL_P(dest)) dest = str;
2835 *self = dest;
2836 return dencidx;
2837 }
2838 return NIL_P(arg2) ? -1 : dencidx;
2839 }
2840 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2841 if (is_ascii_string(str)) {
2842 return dencidx;
2843 }
2844 }
2845 if (encoding_equal(sname, dname)) {
2846 return NIL_P(arg2) ? -1 : dencidx;
2847 }
2848 }
2849 else {
2850 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2851 rb_encoding *utf8 = rb_utf8_encoding();
2852 str = rb_str_conv_enc(str, senc, utf8);
2853 senc = utf8;
2854 sname = "UTF-8";
2855 }
2856 if (encoding_equal(sname, dname)) {
2857 sname = "";
2858 dname = "";
2859 }
2860 }
2861
2862 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2863 slen = RSTRING_LEN(str);
2864 blen = slen + 30; /* len + margin */
2865 dest = rb_str_tmp_new(blen);
2866 bp = (unsigned char *)RSTRING_PTR(dest);
2867
2868 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2869 if (fromp != sp+slen) {
2870 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2871 }
2872 buf = (unsigned char *)RSTRING_PTR(dest);
2873 *bp = '\0';
2874 rb_str_set_len(dest, bp - buf);
2875
2876 /* set encoding */
2877 if (!denc) {
2878 dencidx = rb_define_dummy_encoding(dname);
2879 RB_GC_GUARD(arg1);
2880 RB_GC_GUARD(arg2);
2881 }
2882 *self = dest;
2883
2884 return dencidx;
2885}
2886
2887static int
2888str_transcode(int argc, VALUE *argv, VALUE *self)
2889{
2890 VALUE opt;
2891 int ecflags = 0;
2892 VALUE ecopts = Qnil;
2893
2894 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2895 if (!NIL_P(opt)) {
2896 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2897 }
2898 return str_transcode0(argc, argv, self, ecflags, ecopts);
2899}
2900
2901static inline VALUE
2902str_encode_associate(VALUE str, int encidx)
2903{
2904 int cr = 0;
2905
2906 rb_enc_associate_index(str, encidx);
2907
2908 /* transcoded string never be broken. */
2909 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2910 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2911 }
2912 else {
2914 }
2915 ENC_CODERANGE_SET(str, cr);
2916 return str;
2917}
2918
2919/*
2920 * call-seq:
2921 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2922 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2923 *
2924 * Like #encode, but applies encoding changes to +self+; returns +self+.
2925 *
2926 * Related: see {Modifying}[rdoc-ref:String@Modifying].
2927 */
2928
2929static VALUE
2930str_encode_bang(int argc, VALUE *argv, VALUE str)
2931{
2932 VALUE newstr;
2933 int encidx;
2934
2935 rb_check_frozen(str);
2936
2937 newstr = str;
2938 encidx = str_transcode(argc, argv, &newstr);
2939
2940 if (encidx < 0) return str;
2941 if (newstr == str) {
2942 rb_enc_associate_index(str, encidx);
2943 return str;
2944 }
2945 rb_str_shared_replace(str, newstr);
2946 return str_encode_associate(str, encidx);
2947}
2948
2949static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2950
2951/*
2952 * call-seq:
2953 * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2954 * encode(dst_encoding, src_encoding, **enc_opts) -> string
2955 *
2956 * :include: doc/string/encode.rdoc
2957 *
2958 */
2959
2960static VALUE
2961str_encode(int argc, VALUE *argv, VALUE str)
2962{
2963 VALUE newstr = str;
2964 int encidx = str_transcode(argc, argv, &newstr);
2965 return encoded_dup(newstr, str, encidx);
2966}
2967
2968VALUE
2969rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2970{
2971 int argc = 1;
2972 VALUE *argv = &to;
2973 VALUE newstr = str;
2974 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2975 return encoded_dup(newstr, str, encidx);
2976}
2977
2978static VALUE
2979encoded_dup(VALUE newstr, VALUE str, int encidx)
2980{
2981 if (encidx < 0) return rb_str_dup(str);
2982 if (newstr == str) {
2983 newstr = rb_str_dup(str);
2984 rb_enc_associate_index(newstr, encidx);
2985 return newstr;
2986 }
2987 else {
2988 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2989 }
2990 return str_encode_associate(newstr, encidx);
2991}
2992
2993/*
2994 * Document-class: Encoding::Converter
2995 *
2996 * Encoding conversion class.
2997 */
2998static void
2999econv_free(void *ptr)
3000{
3001 rb_econv_t *ec = ptr;
3002 rb_econv_close(ec);
3003}
3004
3005static size_t
3006econv_memsize(const void *ptr)
3007{
3008 return sizeof(rb_econv_t);
3009}
3010
3011static const rb_data_type_t econv_data_type = {
3012 "econv",
3013 {0, econv_free, econv_memsize,},
3014 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
3015};
3016
3017static VALUE
3018econv_s_allocate(VALUE klass)
3019{
3020 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
3021}
3022
3023static rb_encoding *
3024make_dummy_encoding(const char *name)
3025{
3026 rb_encoding *enc;
3027 int idx;
3028 idx = rb_define_dummy_encoding(name);
3029 enc = rb_enc_from_index(idx);
3030 return enc;
3031}
3032
3033static rb_encoding *
3034make_encoding(const char *name)
3035{
3036 rb_encoding *enc;
3037 enc = rb_enc_find(name);
3038 if (!enc) {
3039 RB_VM_LOCKING() {
3040 if (rb_enc_registered(name)) {
3041 enc = NULL;
3042 }
3043 else {
3044 enc = make_dummy_encoding(name);
3045 }
3046 }
3047 }
3048 return enc;
3049}
3050
3051static VALUE
3052make_encobj(const char *name)
3053{
3054 return rb_enc_from_encoding(make_encoding(name));
3055}
3056
3057/*
3058 * call-seq:
3059 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3060 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3061 *
3062 * Returns the corresponding ASCII compatible encoding.
3063 *
3064 * Returns nil if the argument is an ASCII compatible encoding.
3065 *
3066 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
3068 * So, no conversion undefined error occurs when converting between the two encodings.
3069 *
3070 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3071 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3072 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3073 *
3074 */
3075static VALUE
3076econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3077{
3078 const char *arg_name, *result_name;
3079 rb_encoding *arg_enc, *result_enc;
3080 VALUE enc = Qnil;
3081
3082 enc_arg(&arg, &arg_name, &arg_enc);
3083 result_name = rb_econv_asciicompat_encoding(arg_name);
3084 if (result_name) {
3085 result_enc = make_encoding(result_name);
3086 enc = rb_enc_from_encoding(result_enc);
3087 }
3088 return enc;
3089}
3090
3091static void
3092econv_args(int argc, VALUE *argv,
3093 VALUE *snamev_p, VALUE *dnamev_p,
3094 const char **sname_p, const char **dname_p,
3095 rb_encoding **senc_p, rb_encoding **denc_p,
3096 int *ecflags_p,
3097 VALUE *ecopts_p)
3098{
3099 VALUE opt, flags_v, ecopts;
3100 int sidx, didx;
3101 const char *sname, *dname;
3102 rb_encoding *senc, *denc;
3103 int ecflags;
3104
3105 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3106
3107 if (!NIL_P(flags_v)) {
3108 if (!NIL_P(opt)) {
3109 rb_error_arity(argc + 1, 2, 3);
3110 }
3111 ecflags = NUM2INT(rb_to_int(flags_v));
3112 ecopts = Qnil;
3113 }
3114 else if (!NIL_P(opt)) {
3115 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3116 }
3117 else {
3118 ecflags = 0;
3119 ecopts = Qnil;
3120 }
3121
3122 senc = NULL;
3123 sidx = rb_to_encoding_index(*snamev_p);
3124 if (0 <= sidx) {
3125 senc = rb_enc_from_index(sidx);
3126 }
3127 else {
3128 StringValue(*snamev_p);
3129 }
3130
3131 denc = NULL;
3132 didx = rb_to_encoding_index(*dnamev_p);
3133 if (0 <= didx) {
3134 denc = rb_enc_from_index(didx);
3135 }
3136 else {
3137 StringValue(*dnamev_p);
3138 }
3139
3140 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3141 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3142
3143 *sname_p = sname;
3144 *dname_p = dname;
3145 *senc_p = senc;
3146 *denc_p = denc;
3147 *ecflags_p = ecflags;
3148 *ecopts_p = ecopts;
3149}
3150
3151static int
3152decorate_convpath(VALUE convpath, int ecflags)
3153{
3154 int num_decorators;
3155 const char *decorators[MAX_ECFLAGS_DECORATORS];
3156 int i;
3157 int n, len;
3158
3159 num_decorators = decorator_names(ecflags, decorators);
3160 if (num_decorators == -1)
3161 return -1;
3162
3163 len = n = RARRAY_LENINT(convpath);
3164 if (n != 0) {
3165 VALUE pair = RARRAY_AREF(convpath, n-1);
3166 if (RB_TYPE_P(pair, T_ARRAY)) {
3167 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3168 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3169 transcoder_entry_t *entry;
3170 const rb_transcoder *tr;
3171 entry = get_transcoder_entry(sname, dname);
3172 tr = load_transcoder_entry(entry);
3173 if (!tr)
3174 return -1;
3175 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3176 tr->asciicompat_type == asciicompat_encoder) {
3177 n--;
3178 rb_ary_store(convpath, len + num_decorators - 1, pair);
3179 }
3180 }
3181 else {
3182 rb_ary_store(convpath, len + num_decorators - 1, pair);
3183 }
3184 }
3185
3186 for (i = 0; i < num_decorators; i++)
3187 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3188
3189 return 0;
3190}
3191
3192static void
3193search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3194{
3195 VALUE *ary_p = arg;
3196 VALUE v;
3197
3198 if (NIL_P(*ary_p)) {
3199 *ary_p = rb_ary_new();
3200 }
3201
3202 if (DECORATOR_P(sname, dname)) {
3203 v = rb_str_new_cstr(dname);
3204 }
3205 else {
3206 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3207 }
3208 rb_ary_store(*ary_p, depth, v);
3209}
3210
3211/*
3212 * call-seq:
3213 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3214 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3215 *
3216 * Returns a conversion path.
3217 *
3218 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3219 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3220 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3221 *
3222 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3223 * or
3224 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3225 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3226 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3227 * # "universal_newline"]
3228 *
3229 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3230 * or
3231 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3232 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3233 * # "universal_newline",
3234 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3235 */
3236static VALUE
3237econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3238{
3239 VALUE snamev, dnamev;
3240 const char *sname, *dname;
3241 rb_encoding *senc, *denc;
3242 int ecflags;
3243 VALUE ecopts;
3244 VALUE convpath;
3245
3246 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3247
3248 convpath = Qnil;
3249 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3250
3251 if (NIL_P(convpath)) {
3252 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3253 RB_GC_GUARD(snamev);
3254 RB_GC_GUARD(dnamev);
3255 rb_exc_raise(exc);
3256 }
3257
3258 if (decorate_convpath(convpath, ecflags) == -1) {
3259 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3260 RB_GC_GUARD(snamev);
3261 RB_GC_GUARD(dnamev);
3262 rb_exc_raise(exc);
3263 }
3264
3265 return convpath;
3266}
3267
3268/*
3269 * Check the existence of a conversion path.
3270 * Returns the number of converters in the conversion path.
3271 * result: >=0:success -1:failure
3272 */
3273int
3274rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3275{
3276 VALUE convpath = Qnil;
3277 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3278 &convpath);
3279 return RTEST(convpath);
3280}
3281
3283 rb_econv_t *ec;
3284 int index;
3285 int ret;
3286};
3287
3288static void
3289rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3290{
3292 int ret;
3293
3294 if (a->ret == -1)
3295 return;
3296
3297 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3298
3299 a->ret = ret;
3300 return;
3301}
3302
3303static rb_econv_t *
3304rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3305 const char **sname_p, const char **dname_p,
3306 rb_encoding **senc_p, rb_encoding**denc_p)
3307{
3308 rb_econv_t *ec;
3309 long i;
3310 int ret, first=1;
3311 VALUE elt;
3312 rb_encoding *senc = 0, *denc = 0;
3313 const char *sname, *dname;
3314
3315 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3316 DATA_PTR(self) = ec;
3317
3318 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3319 VALUE snamev, dnamev;
3320 VALUE pair;
3321 elt = rb_ary_entry(convpath, i);
3322 if (!NIL_P(pair = rb_check_array_type(elt))) {
3323 if (RARRAY_LEN(pair) != 2)
3324 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3325 snamev = rb_ary_entry(pair, 0);
3326 enc_arg(&snamev, &sname, &senc);
3327 dnamev = rb_ary_entry(pair, 1);
3328 enc_arg(&dnamev, &dname, &denc);
3329 }
3330 else {
3331 sname = "";
3332 dname = StringValueCStr(elt);
3333 }
3334 if (DECORATOR_P(sname, dname)) {
3335 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3336 if (ret == -1) {
3337 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3338 RB_GC_GUARD(snamev);
3339 RB_GC_GUARD(dnamev);
3340 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3341 }
3342 }
3343 else {
3344 int j = ec->num_trans;
3345 struct rb_econv_init_by_convpath_t arg;
3346 arg.ec = ec;
3347 arg.index = ec->num_trans;
3348 arg.ret = 0;
3349 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3350 if (ret == -1 || arg.ret == -1) {
3351 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3352 RB_GC_GUARD(snamev);
3353 RB_GC_GUARD(dnamev);
3354 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3355 }
3356 if (first) {
3357 first = 0;
3358 *senc_p = senc;
3359 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3360 }
3361 *denc_p = denc;
3362 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3363 }
3364 }
3365
3366 if (first) {
3367 *senc_p = NULL;
3368 *denc_p = NULL;
3369 *sname_p = "";
3370 *dname_p = "";
3371 }
3372
3373 ec->source_encoding_name = *sname_p;
3374 ec->destination_encoding_name = *dname_p;
3375
3376 return ec;
3377}
3378
3379/*
3380 * call-seq:
3381 * Encoding::Converter.new(source_encoding, destination_encoding)
3382 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3383 * Encoding::Converter.new(convpath)
3384 *
3385 * possible options elements:
3386 * hash form:
3387 * :invalid => nil # raise error on invalid byte sequence (default)
3388 * :invalid => :replace # replace invalid byte sequence
3389 * :undef => nil # raise error on undefined conversion (default)
3390 * :undef => :replace # replace undefined conversion
3391 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3392 * :newline => :universal # decorator for converting CRLF and CR to LF
3393 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3394 * :newline => :crlf # decorator for converting LF to CRLF
3395 * :newline => :cr # decorator for converting LF to CR
3396 * :universal_newline => true # decorator for converting CRLF and CR to LF
3397 * :crlf_newline => true # decorator for converting LF to CRLF
3398 * :cr_newline => true # decorator for converting LF to CR
3399 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3400 * :xml => :text # escape as XML CharData.
3401 * :xml => :attr # escape as XML AttValue
3402 * integer form:
3403 * Encoding::Converter::INVALID_REPLACE
3404 * Encoding::Converter::UNDEF_REPLACE
3405 * Encoding::Converter::UNDEF_HEX_CHARREF
3406 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3407 * Encoding::Converter::LF_NEWLINE_DECORATOR
3408 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3409 * Encoding::Converter::CR_NEWLINE_DECORATOR
3410 * Encoding::Converter::XML_TEXT_DECORATOR
3411 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3412 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3413 *
3414 * Encoding::Converter.new creates an instance of Encoding::Converter.
3415 *
3416 * Source_encoding and destination_encoding should be a string or
3417 * Encoding object.
3418 *
3419 * opt should be nil, a hash or an integer.
3420 *
3421 * convpath should be an array.
3422 * convpath may contain
3423 * - two-element arrays which contain encodings or encoding names, or
3424 * - strings representing decorator names.
3425 *
3426 * Encoding::Converter.new optionally takes an option.
3427 * The option should be a hash or an integer.
3428 * The option hash can contain :invalid => nil, etc.
3429 * The option integer should be logical-or of constants such as
3430 * Encoding::Converter::INVALID_REPLACE, etc.
3431 *
3432 * [:invalid => nil]
3433 * Raise error on invalid byte sequence. This is a default behavior.
3434 * [:invalid => :replace]
3435 * Replace invalid byte sequence by replacement string.
3436 * [:undef => nil]
3437 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3438 * This is a default behavior.
3439 * [:undef => :replace]
3440 * Replace undefined character in destination_encoding with replacement string.
3441 * [:replace => string]
3442 * Specify the replacement string.
3443 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3444 * [:universal_newline => true]
3445 * Convert CRLF and CR to LF.
3446 * [:crlf_newline => true]
3447 * Convert LF to CRLF.
3448 * [:cr_newline => true]
3449 * Convert LF to CR.
3450 * [:lf_newline => true]
3451 * Convert CRLF and CR to LF (when writing).
3452 * [:xml => :text]
3453 * Escape as XML CharData.
3454 * This form can be used as an HTML 4.0 #PCDATA.
3455 * - '&' -> '&amp;'
3456 * - '<' -> '&lt;'
3457 * - '>' -> '&gt;'
3458 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3459 * [:xml => :attr]
3460 * Escape as XML AttValue.
3461 * The converted result is quoted as "...".
3462 * This form can be used as an HTML 4.0 attribute value.
3463 * - '&' -> '&amp;'
3464 * - '<' -> '&lt;'
3465 * - '>' -> '&gt;'
3466 * - '"' -> '&quot;'
3467 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3468 *
3469 * Examples:
3470 * # UTF-16BE to UTF-8
3471 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3472 *
3473 * # Usually, decorators such as newline conversion are inserted last.
3474 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3475 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3476 * # "universal_newline"]
3477 *
3478 * # But, if the last encoding is ASCII incompatible,
3479 * # decorators are inserted before the last conversion.
3480 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3481 * p ec.convpath #=> ["crlf_newline",
3482 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3483 *
3484 * # Conversion path can be specified directly.
3485 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3486 * p ec.convpath #=> ["universal_newline",
3487 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3488 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3489 */
3490static VALUE
3491econv_init(int argc, VALUE *argv, VALUE self)
3492{
3493 VALUE ecopts;
3494 VALUE snamev, dnamev;
3495 const char *sname, *dname;
3496 rb_encoding *senc, *denc;
3497 rb_econv_t *ec;
3498 int ecflags;
3499 VALUE convpath;
3500
3501 if (rb_check_typeddata(self, &econv_data_type)) {
3502 rb_raise(rb_eTypeError, "already initialized");
3503 }
3504
3505 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3506 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3507 ecflags = 0;
3508 ecopts = Qnil;
3509 }
3510 else {
3511 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3512 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3513 }
3514
3515 if (!ec) {
3516 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3517 RB_GC_GUARD(snamev);
3518 RB_GC_GUARD(dnamev);
3519 rb_exc_raise(exc);
3520 }
3521
3522 if (!DECORATOR_P(sname, dname)) {
3523 if (!senc)
3524 senc = make_dummy_encoding(sname);
3525 if (!denc)
3526 denc = make_dummy_encoding(dname);
3527 RB_GC_GUARD(snamev);
3528 RB_GC_GUARD(dnamev);
3529 }
3530
3531 ec->source_encoding = senc;
3532 ec->destination_encoding = denc;
3533
3534 DATA_PTR(self) = ec;
3535
3536 return self;
3537}
3538
3539/*
3540 * call-seq:
3541 * ec.inspect -> string
3542 *
3543 * Returns a printable version of <i>ec</i>
3544 *
3545 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3546 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3547 *
3548 */
3549static VALUE
3550econv_inspect(VALUE self)
3551{
3552 const char *cname = rb_obj_classname(self);
3553 rb_econv_t *ec;
3554
3555 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3556 if (!ec)
3557 return rb_sprintf("#<%s: uninitialized>", cname);
3558 else {
3559 const char *sname = ec->source_encoding_name;
3560 const char *dname = ec->destination_encoding_name;
3561 VALUE str;
3562 str = rb_sprintf("#<%s: ", cname);
3563 econv_description(sname, dname, ec->flags, str);
3564 rb_str_cat2(str, ">");
3565 return str;
3566 }
3567}
3568
3569static rb_econv_t *
3570check_econv(VALUE self)
3571{
3572 rb_econv_t *ec;
3573
3574 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3575 if (!ec) {
3576 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3577 }
3578 return ec;
3579}
3580
3581static VALUE
3582econv_get_encoding(rb_encoding *encoding)
3583{
3584 if (!encoding)
3585 return Qnil;
3586 return rb_enc_from_encoding(encoding);
3587}
3588
3589/*
3590 * call-seq:
3591 * ec.source_encoding -> encoding
3592 *
3593 * Returns the source encoding as an Encoding object.
3594 */
3595static VALUE
3596econv_source_encoding(VALUE self)
3597{
3598 rb_econv_t *ec = check_econv(self);
3599 return econv_get_encoding(ec->source_encoding);
3600}
3601
3602/*
3603 * call-seq:
3604 * ec.destination_encoding -> encoding
3605 *
3606 * Returns the destination encoding as an Encoding object.
3607 */
3608static VALUE
3609econv_destination_encoding(VALUE self)
3610{
3611 rb_econv_t *ec = check_econv(self);
3612 return econv_get_encoding(ec->destination_encoding);
3613}
3614
3615/*
3616 * call-seq:
3617 * ec.convpath -> ary
3618 *
3619 * Returns the conversion path of ec.
3620 *
3621 * The result is an array of conversions.
3622 *
3623 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3624 * p ec.convpath
3625 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3626 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3627 * # "crlf_newline"]
3628 *
3629 * Each element of the array is a pair of encodings or a string.
3630 * A pair means an encoding conversion.
3631 * A string means a decorator.
3632 *
3633 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3634 * a converter from ISO-8859-1 to UTF-8.
3635 * "crlf_newline" means newline converter from LF to CRLF.
3636 */
3637static VALUE
3638econv_convpath(VALUE self)
3639{
3640 rb_econv_t *ec = check_econv(self);
3641 VALUE result;
3642 int i;
3643
3644 result = rb_ary_new();
3645 for (i = 0; i < ec->num_trans; i++) {
3646 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3647 VALUE v;
3648 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3649 v = rb_str_new_cstr(tr->dst_encoding);
3650 else
3651 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3652 rb_ary_push(result, v);
3653 }
3654 return result;
3655}
3656
3657/*
3658 * call-seq:
3659 * ec == other -> true or false
3660 */
3661static VALUE
3662econv_equal(VALUE self, VALUE other)
3663{
3664 rb_econv_t *ec1 = check_econv(self);
3665 rb_econv_t *ec2;
3666 int i;
3667
3668 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3669 return Qnil;
3670 }
3671 ec2 = DATA_PTR(other);
3672 if (!ec2) return Qfalse;
3673 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3674 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3675 return Qfalse;
3676 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3677 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3678 return Qfalse;
3679 if (ec1->flags != ec2->flags) return Qfalse;
3680 if (ec1->replacement_enc != ec2->replacement_enc &&
3681 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3682 return Qfalse;
3683 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3684 if (ec1->replacement_str != ec2->replacement_str &&
3685 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3686 return Qfalse;
3687
3688 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3689 for (i = 0; i < ec1->num_trans; i++) {
3690 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3691 return Qfalse;
3692 }
3693 return Qtrue;
3694}
3695
3696static VALUE
3697econv_result_to_symbol(rb_econv_result_t res)
3698{
3699 switch (res) {
3700 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3701 case econv_incomplete_input: return sym_incomplete_input;
3702 case econv_undefined_conversion: return sym_undefined_conversion;
3703 case econv_destination_buffer_full: return sym_destination_buffer_full;
3704 case econv_source_buffer_empty: return sym_source_buffer_empty;
3705 case econv_finished: return sym_finished;
3706 case econv_after_output: return sym_after_output;
3707 default: return INT2NUM(res); /* should not be reached */
3708 }
3709}
3710
3711/*
3712 * call-seq:
3713 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3714 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3715 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3716 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3717 *
3718 * possible opt elements:
3719 * hash form:
3720 * :partial_input => true # source buffer may be part of larger source
3721 * :after_output => true # stop conversion after output before input
3722 * integer form:
3723 * Encoding::Converter::PARTIAL_INPUT
3724 * Encoding::Converter::AFTER_OUTPUT
3725 *
3726 * possible results:
3727 * :invalid_byte_sequence
3728 * :incomplete_input
3729 * :undefined_conversion
3730 * :after_output
3731 * :destination_buffer_full
3732 * :source_buffer_empty
3733 * :finished
3734 *
3735 * primitive_convert converts source_buffer into destination_buffer.
3736 *
3737 * source_buffer should be a string or nil.
3738 * nil means an empty string.
3739 *
3740 * destination_buffer should be a string.
3741 *
3742 * destination_byteoffset should be an integer or nil.
3743 * nil means the end of destination_buffer.
3744 * If it is omitted, nil is assumed.
3745 *
3746 * destination_bytesize should be an integer or nil.
3747 * nil means unlimited.
3748 * If it is omitted, nil is assumed.
3749 *
3750 * opt should be nil, a hash or an integer.
3751 * nil means no flags.
3752 * If it is omitted, nil is assumed.
3753 *
3754 * primitive_convert converts the content of source_buffer from beginning
3755 * and store the result into destination_buffer.
3756 *
3757 * destination_byteoffset and destination_bytesize specify the region which
3758 * the converted result is stored.
3759 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3760 * If destination_byteoffset is nil,
3761 * destination_buffer.bytesize is used for appending the result.
3762 * destination_bytesize specifies maximum number of bytes.
3763 * If destination_bytesize is nil,
3764 * destination size is unlimited.
3765 * After conversion, destination_buffer is resized to
3766 * destination_byteoffset + actually produced number of bytes.
3767 * Also destination_buffer's encoding is set to destination_encoding.
3768 *
3769 * primitive_convert drops the converted part of source_buffer.
3770 * the dropped part is converted in destination_buffer or
3771 * buffered in Encoding::Converter object.
3772 *
3773 * primitive_convert stops conversion when one of the following conditions is met.
3774 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3775 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3776 * - unexpected end of source buffer (:incomplete_input)
3777 * this occur only when :partial_input is not specified.
3778 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3779 * - character not representable in output encoding (:undefined_conversion)
3780 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3781 * - after some output is generated, before input is done (:after_output)
3782 * this occur only when :after_output is specified.
3783 * - destination buffer is full (:destination_buffer_full)
3784 * this occur only when destination_bytesize is non-nil.
3785 * - source buffer is empty (:source_buffer_empty)
3786 * this occur only when :partial_input is specified.
3787 * - conversion is finished (:finished)
3788 *
3789 * example:
3790 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3791 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3792 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3793 *
3794 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3795 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3796 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3797 * ret = ec.primitive_convert(src, dst="", nil, 1)
3798 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3799 * ret = ec.primitive_convert(src, dst="", nil, 1)
3800 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3801 * ret = ec.primitive_convert(src, dst="", nil, 1)
3802 * p [ret, src, dst] #=> [:finished, "", "i"]
3803 *
3804 */
3805static VALUE
3806econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3807{
3808 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3809 rb_econv_t *ec = check_econv(self);
3811 const unsigned char *ip, *is;
3812 unsigned char *op, *os;
3813 long output_byteoffset, output_bytesize;
3814 unsigned long output_byteend;
3815 int flags;
3816
3817 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3818
3819 if (NIL_P(output_byteoffset_v))
3820 output_byteoffset = 0; /* dummy */
3821 else
3822 output_byteoffset = NUM2LONG(output_byteoffset_v);
3823
3824 if (NIL_P(output_bytesize_v))
3825 output_bytesize = 0; /* dummy */
3826 else
3827 output_bytesize = NUM2LONG(output_bytesize_v);
3828
3829 if (!NIL_P(flags_v)) {
3830 if (!NIL_P(opt)) {
3831 rb_error_arity(argc + 1, 2, 5);
3832 }
3833 flags = NUM2INT(rb_to_int(flags_v));
3834 }
3835 else if (!NIL_P(opt)) {
3836 VALUE v;
3837 flags = 0;
3838 v = rb_hash_aref(opt, sym_partial_input);
3839 if (RTEST(v))
3840 flags |= ECONV_PARTIAL_INPUT;
3841 v = rb_hash_aref(opt, sym_after_output);
3842 if (RTEST(v))
3843 flags |= ECONV_AFTER_OUTPUT;
3844 }
3845 else {
3846 flags = 0;
3847 }
3848
3849 StringValue(output);
3850 if (!NIL_P(input))
3851 StringValue(input);
3852 rb_str_modify(output);
3853
3854 if (NIL_P(output_bytesize_v)) {
3855 output_bytesize = rb_str_capacity(output);
3856
3857 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3858 output_bytesize = RSTRING_LEN(input);
3859 }
3860
3861 retry:
3862
3863 if (NIL_P(output_byteoffset_v))
3864 output_byteoffset = RSTRING_LEN(output);
3865
3866 if (output_byteoffset < 0)
3867 rb_raise(rb_eArgError, "negative output_byteoffset");
3868
3869 if (RSTRING_LEN(output) < output_byteoffset)
3870 rb_raise(rb_eArgError, "output_byteoffset too big");
3871
3872 if (output_bytesize < 0)
3873 rb_raise(rb_eArgError, "negative output_bytesize");
3874
3875 output_byteend = (unsigned long)output_byteoffset +
3876 (unsigned long)output_bytesize;
3877
3878 if (output_byteend < (unsigned long)output_byteoffset ||
3879 LONG_MAX < output_byteend)
3880 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3881
3882 if (rb_str_capacity(output) < output_byteend)
3883 rb_str_resize(output, output_byteend);
3884
3885 if (NIL_P(input)) {
3886 ip = is = NULL;
3887 }
3888 else {
3889 ip = (const unsigned char *)RSTRING_PTR(input);
3890 is = ip + RSTRING_LEN(input);
3891 }
3892
3893 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3894 os = op + output_bytesize;
3895
3896 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3897 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3898 if (!NIL_P(input)) {
3899 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3900 }
3901
3902 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3903 if (LONG_MAX / 2 < output_bytesize)
3904 rb_raise(rb_eArgError, "too long conversion result");
3905 output_bytesize *= 2;
3906 output_byteoffset_v = Qnil;
3907 goto retry;
3908 }
3909
3910 if (ec->destination_encoding) {
3911 rb_enc_associate(output, ec->destination_encoding);
3912 }
3913
3914 return econv_result_to_symbol(res);
3915}
3916
3917/*
3918 * call-seq:
3919 * ec.convert(source_string) -> destination_string
3920 *
3921 * Convert source_string and return destination_string.
3922 *
3923 * source_string is assumed as a part of source.
3924 * i.e. :partial_input=>true is specified internally.
3925 * finish method should be used last.
3926 *
3927 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3928 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3929 * puts ec.finish.dump #=> ""
3930 *
3931 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3932 * puts ec.convert("\xA4").dump #=> ""
3933 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3934 * puts ec.finish.dump #=> ""
3935 *
3936 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3937 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3938 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3939 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3940 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3941 *
3942 * If a conversion error occurs,
3943 * Encoding::UndefinedConversionError or
3944 * Encoding::InvalidByteSequenceError is raised.
3945 * Encoding::Converter#convert doesn't supply methods to recover or restart
3946 * from these exceptions.
3947 * When you want to handle these conversion errors,
3948 * use Encoding::Converter#primitive_convert.
3949 *
3950 */
3951static VALUE
3952econv_convert(VALUE self, VALUE source_string)
3953{
3954 VALUE ret, dst;
3955 VALUE av[5];
3956 int ac;
3957 rb_econv_t *ec = check_econv(self);
3958
3959 StringValue(source_string);
3960
3961 dst = rb_str_new(NULL, 0);
3962
3963 av[0] = rb_str_dup(source_string);
3964 av[1] = dst;
3965 av[2] = Qnil;
3966 av[3] = Qnil;
3968 ac = 5;
3969
3970 ret = econv_primitive_convert(ac, av, self);
3971
3972 if (ret == sym_invalid_byte_sequence ||
3973 ret == sym_undefined_conversion ||
3974 ret == sym_incomplete_input) {
3975 VALUE exc = make_econv_exception(ec);
3976 rb_exc_raise(exc);
3977 }
3978
3979 if (ret == sym_finished) {
3980 rb_raise(rb_eArgError, "converter already finished");
3981 }
3982
3983 if (ret != sym_source_buffer_empty) {
3984 rb_bug("unexpected result of econv_primitive_convert");
3985 }
3986
3987 return dst;
3988}
3989
3990/*
3991 * call-seq:
3992 * ec.finish -> string
3993 *
3994 * Finishes the converter.
3995 * It returns the last part of the converted string.
3996 *
3997 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3998 * p ec.convert("\u3042") #=> "\e$B$\""
3999 * p ec.finish #=> "\e(B"
4000 */
4001static VALUE
4002econv_finish(VALUE self)
4003{
4004 VALUE ret, dst;
4005 VALUE av[5];
4006 int ac;
4007 rb_econv_t *ec = check_econv(self);
4008
4009 dst = rb_str_new(NULL, 0);
4010
4011 av[0] = Qnil;
4012 av[1] = dst;
4013 av[2] = Qnil;
4014 av[3] = Qnil;
4015 av[4] = INT2FIX(0);
4016 ac = 5;
4017
4018 ret = econv_primitive_convert(ac, av, self);
4019
4020 if (ret == sym_invalid_byte_sequence ||
4021 ret == sym_undefined_conversion ||
4022 ret == sym_incomplete_input) {
4023 VALUE exc = make_econv_exception(ec);
4024 rb_exc_raise(exc);
4025 }
4026
4027 if (ret != sym_finished) {
4028 rb_bug("unexpected result of econv_primitive_convert");
4029 }
4030
4031 return dst;
4032}
4033
4034/*
4035 * call-seq:
4036 * ec.primitive_errinfo -> array
4037 *
4038 * primitive_errinfo returns important information regarding the last error
4039 * as a 5-element array:
4040 *
4041 * [result, enc1, enc2, error_bytes, readagain_bytes]
4042 *
4043 * result is the last result of primitive_convert.
4044 *
4045 * Other elements are only meaningful when result is
4046 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
4047 *
4048 * enc1 and enc2 indicate a conversion step as a pair of strings.
4049 * For example, a converter from EUC-JP to ISO-8859-1 converts
4050 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
4051 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
4052 *
4053 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4054 * error_bytes is discarded portion.
4055 * readagain_bytes is buffered portion which is read again on next conversion.
4056 *
4057 * Example:
4058 *
4059 * # \xff is invalid as EUC-JP.
4060 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4061 * ec.primitive_convert(src="\xff", dst="", nil, 10)
4062 * p ec.primitive_errinfo
4063 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4064 *
4065 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
4066 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4067 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4068 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4069 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4070 * p ec.primitive_errinfo
4071 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4072 *
4073 * # partial character is invalid
4074 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4075 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4076 * p ec.primitive_errinfo
4077 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4078 *
4079 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4080 * # partial characters.
4081 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4082 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4083 * p ec.primitive_errinfo
4084 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4085 *
4086 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4087 * # no low surrogate after high surrogate (\xd8\x00).
4088 * # It is detected by 3rd byte (\00) which is part of next character.
4089 * # So the high surrogate (\xd8\x00) is discarded and
4090 * # the 3rd byte is read again later.
4091 * # Since the byte is buffered in ec, it is dropped from src.
4092 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4093 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4094 * p ec.primitive_errinfo
4095 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4096 * p src
4097 * #=> "@"
4098 *
4099 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4100 * # The problem is detected by 4th byte.
4101 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4102 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4103 * p ec.primitive_errinfo
4104 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4105 * p src
4106 * #=> ""
4107 *
4108 */
4109static VALUE
4110econv_primitive_errinfo(VALUE self)
4111{
4112 rb_econv_t *ec = check_econv(self);
4113
4114 VALUE ary;
4115
4116 ary = rb_ary_new2(5);
4117
4118 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4119 rb_ary_store(ary, 4, Qnil);
4120
4121 if (ec->last_error.source_encoding)
4122 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4123
4124 if (ec->last_error.destination_encoding)
4125 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4126
4127 if (ec->last_error.error_bytes_start) {
4128 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4129 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4130 }
4131
4132 return ary;
4133}
4134
4135/*
4136 * call-seq:
4137 * ec.insert_output(string) -> nil
4138 *
4139 * Inserts string into the encoding converter.
4140 * The string will be converted to the destination encoding and
4141 * output on later conversions.
4142 *
4143 * If the destination encoding is stateful,
4144 * string is converted according to the state and the state is updated.
4145 *
4146 * This method should be used only when a conversion error occurs.
4147 *
4148 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4149 * src = "HIRAGANA LETTER A is \u{3042}."
4150 * dst = ""
4151 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4152 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4153 * ec.insert_output("<err>")
4154 * p ec.primitive_convert(src, dst) #=> :finished
4155 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4156 *
4157 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4158 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4159 * dst = ""
4160 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4161 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4162 * ec.insert_output "?" # state change required to output "?".
4163 * p ec.primitive_convert(src, dst) #=> :finished
4164 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4165 *
4166 */
4167static VALUE
4168econv_insert_output(VALUE self, VALUE string)
4169{
4170 const char *insert_enc;
4171
4172 int ret;
4173
4174 rb_econv_t *ec = check_econv(self);
4175
4176 StringValue(string);
4177 insert_enc = rb_econv_encoding_to_insert_output(ec);
4178 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4179
4180 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4181 if (ret == -1) {
4182 rb_raise(rb_eArgError, "too big string");
4183 }
4184
4185 return Qnil;
4186}
4187
4188/*
4189 * call-seq:
4190 * ec.putback -> string
4191 * ec.putback(max_numbytes) -> string
4192 *
4193 * Put back the bytes which will be converted.
4194 *
4195 * The bytes are caused by invalid_byte_sequence error.
4196 * When invalid_byte_sequence error, some bytes are discarded and
4197 * some bytes are buffered to be converted later.
4198 * The latter bytes can be put back.
4199 * It can be observed by
4200 * Encoding::InvalidByteSequenceError#readagain_bytes and
4201 * Encoding::Converter#primitive_errinfo.
4202 *
4203 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4204 * src = "\x00\xd8\x61\x00"
4205 * dst = ""
4206 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4207 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4208 * p ec.putback #=> "a\x00"
4209 * p ec.putback #=> "" # no more bytes to put back
4210 *
4211 */
4212static VALUE
4213econv_putback(int argc, VALUE *argv, VALUE self)
4214{
4215 rb_econv_t *ec = check_econv(self);
4216 int n;
4217 int putbackable;
4218 VALUE str, max;
4219
4220 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4221 n = rb_econv_putbackable(ec);
4222 }
4223 else {
4224 n = NUM2INT(max);
4225 putbackable = rb_econv_putbackable(ec);
4226 if (putbackable < n)
4227 n = putbackable;
4228 }
4229
4230 str = rb_str_new(NULL, n);
4231 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4232
4233 if (ec->source_encoding) {
4234 rb_enc_associate(str, ec->source_encoding);
4235 }
4236
4237 return str;
4238}
4239
4240/*
4241 * call-seq:
4242 * ec.last_error -> exception or nil
4243 *
4244 * Returns an exception object for the last conversion.
4245 * Returns nil if the last conversion did not produce an error.
4246 *
4247 * "error" means that
4248 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4249 * Encoding::Converter#convert and
4250 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4251 * Encoding::Converter#primitive_convert.
4252 *
4253 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4254 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4255 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4256 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4257 * p ec.last_error #=> nil
4258 *
4259 */
4260static VALUE
4261econv_last_error(VALUE self)
4262{
4263 rb_econv_t *ec = check_econv(self);
4264 VALUE exc;
4265
4266 exc = make_econv_exception(ec);
4267 if (NIL_P(exc))
4268 return Qnil;
4269 return exc;
4270}
4271
4272/*
4273 * call-seq:
4274 * ec.replacement -> string
4275 *
4276 * Returns the replacement string.
4277 *
4278 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4279 * p ec.replacement #=> "?"
4280 *
4281 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4282 * p ec.replacement #=> "\uFFFD"
4283 */
4284static VALUE
4285econv_get_replacement(VALUE self)
4286{
4287 rb_econv_t *ec = check_econv(self);
4288 int ret;
4289 rb_encoding *enc;
4290
4291 ret = make_replacement(ec);
4292 if (ret == -1) {
4293 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4294 }
4295
4296 enc = rb_enc_find(ec->replacement_enc);
4297 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4298}
4299
4300/*
4301 * call-seq:
4302 * ec.replacement = string
4303 *
4304 * Sets the replacement string.
4305 *
4306 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4307 * ec.replacement = "<undef>"
4308 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4309 */
4310static VALUE
4311econv_set_replacement(VALUE self, VALUE arg)
4312{
4313 rb_econv_t *ec = check_econv(self);
4314 VALUE string = arg;
4315 int ret;
4316 rb_encoding *enc;
4317
4318 StringValue(string);
4319 enc = rb_enc_get(string);
4320
4321 ret = rb_econv_set_replacement(ec,
4322 (const unsigned char *)RSTRING_PTR(string),
4323 RSTRING_LEN(string),
4324 rb_enc_name(enc));
4325
4326 if (ret == -1) {
4327 /* xxx: rb_eInvalidByteSequenceError? */
4328 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4329 }
4330
4331 return arg;
4332}
4333
4334VALUE
4336{
4337 return make_econv_exception(ec);
4338}
4339
4340void
4342{
4343 VALUE exc;
4344
4345 exc = make_econv_exception(ec);
4346 if (NIL_P(exc))
4347 return;
4348 rb_exc_raise(exc);
4349}
4350
4351/*
4352 * call-seq:
4353 * ecerr.source_encoding_name -> string
4354 *
4355 * Returns the source encoding name as a string.
4356 */
4357static VALUE
4358ecerr_source_encoding_name(VALUE self)
4359{
4360 return rb_attr_get(self, id_source_encoding_name);
4361}
4362
4363/*
4364 * call-seq:
4365 * ecerr.source_encoding -> encoding
4366 *
4367 * Returns the source encoding as an encoding object.
4368 *
4369 * Note that the result may not be equal to the source encoding of
4370 * the encoding converter if the conversion has multiple steps.
4371 *
4372 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4373 * begin
4374 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4375 * rescue Encoding::UndefinedConversionError
4376 * p $!.source_encoding #=> #<Encoding:UTF-8>
4377 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4378 * p $!.source_encoding_name #=> "UTF-8"
4379 * p $!.destination_encoding_name #=> "EUC-JP"
4380 * end
4381 *
4382 */
4383static VALUE
4384ecerr_source_encoding(VALUE self)
4385{
4386 return rb_attr_get(self, id_source_encoding);
4387}
4388
4389/*
4390 * call-seq:
4391 * ecerr.destination_encoding_name -> string
4392 *
4393 * Returns the destination encoding name as a string.
4394 */
4395static VALUE
4396ecerr_destination_encoding_name(VALUE self)
4397{
4398 return rb_attr_get(self, id_destination_encoding_name);
4399}
4400
4401/*
4402 * call-seq:
4403 * ecerr.destination_encoding -> encoding
4404 *
4405 * Returns the destination encoding as an encoding object.
4406 */
4407static VALUE
4408ecerr_destination_encoding(VALUE self)
4409{
4410 return rb_attr_get(self, id_destination_encoding);
4411}
4412
4413/*
4414 * call-seq:
4415 * ecerr.error_char -> string
4416 *
4417 * Returns the one-character string which cause Encoding::UndefinedConversionError.
4418 *
4419 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4420 * begin
4421 * ec.convert("\xa0")
4422 * rescue Encoding::UndefinedConversionError
4423 * puts $!.error_char.dump #=> "\xC2\xA0"
4424 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4425 * end
4426 *
4427 */
4428static VALUE
4429ecerr_error_char(VALUE self)
4430{
4431 return rb_attr_get(self, id_error_char);
4432}
4433
4434/*
4435 * call-seq:
4436 * ecerr.error_bytes -> string
4437 *
4438 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4439 *
4440 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4441 * begin
4442 * ec.convert("abc\xA1\xFFdef")
4443 * rescue Encoding::InvalidByteSequenceError
4444 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4445 * puts $!.error_bytes.dump #=> "\xA1"
4446 * puts $!.readagain_bytes.dump #=> "\xFF"
4447 * end
4448 */
4449static VALUE
4450ecerr_error_bytes(VALUE self)
4451{
4452 return rb_attr_get(self, id_error_bytes);
4453}
4454
4455/*
4456 * call-seq:
4457 * ecerr.readagain_bytes -> string
4458 *
4459 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4460 */
4461static VALUE
4462ecerr_readagain_bytes(VALUE self)
4463{
4464 return rb_attr_get(self, id_readagain_bytes);
4465}
4466
4467/*
4468 * call-seq:
4469 * ecerr.incomplete_input? -> true or false
4470 *
4471 * Returns true if the invalid byte sequence error is caused by
4472 * premature end of string.
4473 *
4474 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4475 *
4476 * begin
4477 * ec.convert("abc\xA1z")
4478 * rescue Encoding::InvalidByteSequenceError
4479 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4480 * p $!.incomplete_input? #=> false
4481 * end
4482 *
4483 * begin
4484 * ec.convert("abc\xA1")
4485 * ec.finish
4486 * rescue Encoding::InvalidByteSequenceError
4487 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4488 * p $!.incomplete_input? #=> true
4489 * end
4490 */
4491static VALUE
4492ecerr_incomplete_input(VALUE self)
4493{
4494 return rb_attr_get(self, id_incomplete_input);
4495}
4496
4497/*
4498 * Document-class: Encoding::UndefinedConversionError
4499 *
4500 * Raised by Encoding and String methods when a transcoding operation
4501 * fails.
4502 */
4503
/*
 * Document-class: Encoding::InvalidByteSequenceError
 *
 * Raised by Encoding and String methods when the string being
 * transcoded contains a byte invalid for either the source or
 * target encoding.
 */
4511
4512/*
4513 * Document-class: Encoding::ConverterNotFoundError
4514 *
4515 * Raised by transcoding methods when a named encoding does not
4516 * correspond with a known converter.
4517 */
4518
4519void
4520Init_transcode(void)
4521{
4522 transcoder_table = st_init_strcasetable();
4523
4524 id_destination_encoding = rb_intern_const("destination_encoding");
4525 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4526 id_error_bytes = rb_intern_const("error_bytes");
4527 id_error_char = rb_intern_const("error_char");
4528 id_incomplete_input = rb_intern_const("incomplete_input");
4529 id_readagain_bytes = rb_intern_const("readagain_bytes");
4530 id_source_encoding = rb_intern_const("source_encoding");
4531 id_source_encoding_name = rb_intern_const("source_encoding_name");
4532
4533 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4534 sym_undef = ID2SYM(rb_intern_const("undef"));
4535 sym_replace = ID2SYM(rb_intern_const("replace"));
4536 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4537 sym_xml = ID2SYM(rb_intern_const("xml"));
4538 sym_text = ID2SYM(rb_intern_const("text"));
4539 sym_attr = ID2SYM(rb_intern_const("attr"));
4540
4541 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4542 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4543 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4544 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4545 sym_finished = ID2SYM(rb_intern_const("finished"));
4546 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4547 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4548 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4549 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4550 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4551 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4552 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4553
4554#ifdef ENABLE_ECONV_NEWLINE_OPTION
4555 sym_newline = ID2SYM(rb_intern_const("newline"));
4556 sym_universal = ID2SYM(rb_intern_const("universal"));
4557 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4558 sym_cr = ID2SYM(rb_intern_const("cr"));
4559 sym_lf = ID2SYM(rb_intern_const("lf"));
4560#endif
4561
4562 InitVM(transcode);
4563}
4564
4565void
4566InitVM_transcode(void)
4567{
4568 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4569 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4570 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4571
4572 rb_define_method(rb_cString, "encode", str_encode, -1);
4573 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4574
4575 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4576 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4577 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4578 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4579 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4580 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4581 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4582 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4583 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4584 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4585 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4586 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4587 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4588 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4589 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4590 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4591 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4592 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4593 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4594
4595 /*
4596 *Mask for invalid byte sequences
4597 */
4598 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4599
4600 /*
4601 * Replace invalid byte sequences
4602 */
4603 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4604
4605 /*
4606 * Mask for a valid character in the source encoding but no related
4607 * character(s) in destination encoding.
4608 */
4609 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4610
4611 /*
4612 * Replace byte sequences that are undefined in the destination encoding.
4613 */
4614 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4615
4616 /*
4617 * Replace byte sequences that are undefined in the destination encoding
4618 * with an XML hexadecimal character reference. This is valid for XML
4619 * conversion.
4620 */
4621 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4622
4623 /*
4624 * Indicates the source may be part of a larger string. See
4625 * primitive_convert for an example.
4626 */
4627 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4628
4629 /*
4630 * Stop converting after some output is complete but before all of the
4631 * input was consumed. See primitive_convert for an example.
4632 */
4633 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4634
4635 /*
4636 * Decorator for converting CRLF and CR to LF
4637 */
4638 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4639
4640 /*
4641 * Decorator for converting CRLF and CR to LF when writing
4642 */
4643 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4644
4645 /*
4646 * Decorator for converting LF to CRLF
4647 */
4648 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4649
4650 /*
4651 * Decorator for converting LF to CR
4652 */
4653 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4654
4655 /*
4656 * Escape as XML CharData
4657 */
4658 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4659
4660 /*
4661 * Escape as XML AttValue
4662 */
4663 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4664
4665 /*
4666 * Escape as XML AttValue
4667 */
4668 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4669
4670 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4671 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4672 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4673 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4674 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4675
4676 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4677 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4678 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4679 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4680 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4681 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4682 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4683
4684 Init_newline();
4685}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:1639
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3262
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1674
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition memory.h:403
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:400
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition error.c:1380
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1397
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby's string instead of C's.
Definition error.c:1481
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1436
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:497
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:60
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1332
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:937
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:821
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2655
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2123
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1542
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2700
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1780
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3274
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1106
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1947
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1938
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1824
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1626
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition transcode.c:1959
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2706
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:2005
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:2022
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1988
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2969
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4335
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4341
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1953
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1875
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1791
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2285
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1168
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1006
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1676
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2575
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:120
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1736
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1778
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:991
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1508
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1986
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3377
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2734
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7337
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1708
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5751
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1986
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3361
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:384
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_AREF(a, i)
Definition rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:520
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:455
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:515
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:231
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition st.h:79
Definition string.c:8263
Definition transcode.c:177
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376