Ruby 3.5.0dev (2025-08-27 revision fb6e3a80009a744a4e0b75660f1ce6da65e20e6c)
transcode.c (fb6e3a80009a744a4e0b75660f1ce6da65e20e6c)
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "internal/encoding.h"
23#include "ruby/encoding.h"
24#include "vm_sync.h"
25
26#include "transcode_data.h"
27#include "id.h"
28
29#define ENABLE_ECONV_NEWLINE_OPTION 1
30
31/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
32static VALUE rb_eUndefinedConversionError;
33static VALUE rb_eInvalidByteSequenceError;
34static VALUE rb_eConverterNotFoundError;
35
36VALUE rb_cEncodingConverter;
37
38static ID id_destination_encoding;
39static ID id_destination_encoding_name;
40static ID id_error_bytes;
41static ID id_error_char;
42static ID id_incomplete_input;
43static ID id_readagain_bytes;
44static ID id_source_encoding;
45static ID id_source_encoding_name;
46
47static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
48static VALUE sym_xml, sym_text, sym_attr;
49static VALUE sym_universal_newline;
50static VALUE sym_crlf_newline;
51static VALUE sym_cr_newline;
52static VALUE sym_lf_newline;
53#ifdef ENABLE_ECONV_NEWLINE_OPTION
54static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
55#endif
56static VALUE sym_partial_input;
57
58static VALUE sym_invalid_byte_sequence;
59static VALUE sym_undefined_conversion;
60static VALUE sym_destination_buffer_full;
61static VALUE sym_source_buffer_empty;
62static VALUE sym_finished;
63static VALUE sym_after_output;
64static VALUE sym_incomplete_input;
65
66static unsigned char *
67allocate_converted_string(const char *sname, const char *dname,
68 const unsigned char *str, size_t len,
69 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
70 size_t *dst_len_ptr);
71
72/* dynamic structure, one per conversion (similar to iconv_t) */
73/* may carry conversion state (e.g. for iso-2022-jp) */
74typedef struct rb_transcoding {
75 const rb_transcoder *transcoder;
76
77 int flags;
78
79 int resume_position;
80 unsigned int next_table;
81 VALUE next_info;
82 unsigned char next_byte;
83 unsigned int output_index;
84
85 ssize_t recognized_len; /* already interpreted */
86 ssize_t readagain_len; /* not yet interpreted */
87 union {
88 unsigned char ary[8]; /* max_input <= sizeof(ary) */
89 unsigned char *ptr; /* length: max_input */
90 } readbuf; /* recognized_len + readagain_len used */
91
92 ssize_t writebuf_off;
93 ssize_t writebuf_len;
94 union {
95 unsigned char ary[8]; /* max_output <= sizeof(ary) */
96 unsigned char *ptr; /* length: max_output */
97 } writebuf;
98
99 union rb_transcoding_state_t { /* opaque data for stateful encoding */
100 void *ptr;
101 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
102 double dummy_for_alignment;
103 } state;
/* Read buffer of `tc`: the embedded array when the transcoder's max_input
 * fits in it, otherwise the separately allocated pointer. */
105#define TRANSCODING_READBUF(tc) \
106 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
107 (tc)->readbuf.ary : \
108 (tc)->readbuf.ptr)
/* Write buffer of `tc`, selected the same way using max_output. */
109#define TRANSCODING_WRITEBUF(tc) \
110 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
111 (tc)->writebuf.ary : \
112 (tc)->writebuf.ptr)
/* Capacity in bytes of the buffer TRANSCODING_WRITEBUF(tc) selects:
 * the embedded array size, or max_output for the out-of-line buffer. */
113#define TRANSCODING_WRITEBUF_SIZE(tc) \
114 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
115 sizeof((tc)->writebuf.ary) : \
116 (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored inline in the `state` union. */
117#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* State storage of `tc`: the inline array when state_size fits in the
 * union, otherwise the pointer member. */
118#define TRANSCODING_STATE(tc) \
119 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
120 (tc)->state.ary : \
121 (tc)->state.ptr)
122
122
123typedef struct {
124 struct rb_transcoding *tc;
125 unsigned char *out_buf_start;
126 unsigned char *out_data_start;
127 unsigned char *out_data_end;
128 unsigned char *out_buf_end;
129 rb_econv_result_t last_result;
131
133 int flags;
134 int started; /* bool */
135
136 const char *source_encoding_name;
137 const char *destination_encoding_name;
138
139 const unsigned char *replacement_str;
140 size_t replacement_len;
141 const char *replacement_enc;
142
143 unsigned char *in_buf_start;
144 unsigned char *in_data_start;
145 unsigned char *in_data_end;
146 unsigned char *in_buf_end;
147 rb_econv_elem_t *elems;
148 int replacement_allocated; /* bool */
149 int num_allocated;
150 int num_trans;
151 int num_finished;
152 struct rb_transcoding *last_tc;
153
154 /* last error */
155 struct {
156 rb_econv_result_t result;
157 struct rb_transcoding *error_tc;
158 const char *source_encoding;
159 const char *destination_encoding;
160 const unsigned char *error_bytes_start;
161 size_t error_bytes_len;
162 size_t readagain_len;
163 } last_error;
164
165 /* The following fields are only for Encoding::Converter.
166 * rb_econv_open sets them to NULL. */
167 rb_encoding *source_encoding;
168 rb_encoding *destination_encoding;
169};
170
171/*
172 * Dispatch data and logic
173 */
174
/* True when `sname` is the empty string; `dname` is not examined. */
175#define DECORATOR_P(sname, dname) (*(sname) == '\0')
176
177typedef struct {
178 const char *sname;
179 const char *dname;
180 const char *lib; /* null means no need to load a library */
181 const rb_transcoder *transcoder;
183
184static st_table *transcoder_table;
185
186static int
187free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
188{
189 xfree((void *)val);
190 return ST_DELETE;
191}
192
193static int
194free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
195{
196 st_foreach((void *)val, free_inner_transcode_i, 0);
197 st_free_table((void *)val);
198 return ST_DELETE;
199}
200
201void
202rb_free_transcoder_table(void)
203{
204 st_foreach(transcoder_table, free_transcode_i, 0);
205 st_free_table(transcoder_table);
206}
207
208static transcoder_entry_t *
209make_transcoder_entry(const char *sname, const char *dname)
210{
211 st_data_t val;
212 st_table *table2;
213
214 RB_VM_LOCKING() {
215 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
216 val = (st_data_t)st_init_strcasetable();
217 st_add_direct(transcoder_table, (st_data_t)sname, val);
218 }
219 table2 = (st_table *)val;
220 if (!st_lookup(table2, (st_data_t)dname, &val)) {
222 entry->sname = sname;
223 entry->dname = dname;
224 entry->lib = NULL;
225 entry->transcoder = NULL;
226 val = (st_data_t)entry;
227 st_add_direct(table2, (st_data_t)dname, val);
228 }
229 }
230 return (transcoder_entry_t *)val;
231}
232
233static transcoder_entry_t *
234get_transcoder_entry(const char *sname, const char *dname)
235{
236 st_data_t val = 0;
237 st_table *table2;
238 RB_VM_LOCKING() {
239 if (st_lookup(transcoder_table, (st_data_t)sname, &val)) {
240 table2 = (st_table *)val;
241 if (!st_lookup(table2, (st_data_t)dname, &val)) {
242 val = 0;
243 }
244 }
245 }
246 return (transcoder_entry_t *)val;
247}
248
249void
250rb_register_transcoder(const rb_transcoder *tr)
251{
252 const char *const sname = tr->src_encoding;
253 const char *const dname = tr->dst_encoding;
254
255 transcoder_entry_t *entry;
256
257 RB_VM_LOCKING() {
258 entry = make_transcoder_entry(sname, dname);
259 if (entry->transcoder) {
260 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
261 sname, dname);
262 }
263 entry->transcoder = tr;
264 }
265}
266
267static void
268declare_transcoder(const char *sname, const char *dname, const char *lib)
269{
270 transcoder_entry_t *entry;
271
272 entry = make_transcoder_entry(sname, dname);
273 entry->lib = lib;
274}
275
276static const char transcoder_lib_prefix[] = "enc/trans/";
277
278void
279rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
280{
281 if (!lib) {
282 rb_raise(rb_eArgError, "invalid library name - (null)");
283 }
284 declare_transcoder(enc1, enc2, lib);
285}
286
/* Nonzero when STRCASECMP reports the two encoding names as equal
 * (presumably a case-insensitive comparison -- confirm against STRCASECMP). */
287#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
288
289typedef struct search_path_queue_tag {
290 struct search_path_queue_tag *next;
291 const char *enc;
293
294typedef struct {
295 st_table *visited;
296 search_path_queue_t *queue;
297 search_path_queue_t **queue_last_ptr;
298 const char *base_enc;
300
301static int
302transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
303{
304 const char *dname = (const char *)key;
307
308 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
309 return ST_CONTINUE;
310 }
311
313 q->enc = dname;
314 q->next = NULL;
315 *bfs->queue_last_ptr = q;
316 bfs->queue_last_ptr = &q->next;
317
318 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
319 return ST_CONTINUE;
320}
321
322static int
323transcode_search_path(const char *sname, const char *dname,
324 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
325 void *arg)
326{
329 st_data_t val;
330 st_table *table2;
331 int pathlen = -1;
332 bool found = false;
333 bool lookup_res;
334
335 if (encoding_equal(sname, dname))
336 return -1;
337
339 q->enc = sname;
340 q->next = NULL;
341 bfs.queue_last_ptr = &q->next;
342 bfs.queue = q;
343
344 bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop
345 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
346
347 RB_VM_LOCKING() {
348 while (bfs.queue) {
349 q = bfs.queue;
350 bfs.queue = q->next;
351 if (!bfs.queue) {
352 bfs.queue_last_ptr = &bfs.queue;
353 }
354
355 lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2
356 if (!lookup_res) {
357 xfree(q);
358 continue;
359 }
360 table2 = (st_table *)val;
361
362 if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv
363 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
364 xfree(q);
365 found = true;
366 break;
367 }
368
369 bfs.base_enc = q->enc;
370 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
371
372 bfs.base_enc = NULL;
373 xfree(q);
374 }
375 }
376
377 while (bfs.queue) {
378 q = bfs.queue;
379 bfs.queue = q->next;
380 xfree(q);
381 }
382
383 if (found) {
384 const char *enc = dname;
385 int depth;
386 pathlen = 0;
387 while (1) {
388 st_lookup(bfs.visited, (st_data_t)enc, &val);
389 if (!val)
390 break;
391 pathlen++;
392 enc = (const char *)val;
393 }
394 depth = pathlen;
395 enc = dname;
396 while (1) {
397 st_lookup(bfs.visited, (st_data_t)enc, &val);
398 if (!val)
399 break;
400 callback((const char *)val, enc, --depth, arg);
401 enc = (const char *)val;
402 }
403 }
404
405 st_free_table(bfs.visited);
406
407 return pathlen; /* is -1 if not found */
408}
409
410int rb_require_internal_silent(VALUE fname);
411
412static const rb_transcoder *
413load_transcoder_entry(transcoder_entry_t *entry)
414{
415 ASSERT_vm_unlocking();
416 if (entry->transcoder)
417 return entry->transcoder;
418
419 if (entry->lib) {
420 const char *const lib = entry->lib;
421 const size_t len = strlen(lib);
422 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
423 const VALUE fn = rb_str_new(0, total_len);
424 char *const path = RSTRING_PTR(fn);
425
426 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
427 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
428 rb_str_set_len(fn, total_len);
429 OBJ_FREEZE(fn);
430 rb_require_internal_silent(fn); // Sets entry->transcoder
431 }
432
433 if (entry->transcoder)
434 return entry->transcoder;
435
436 return NULL;
437}
438
/*
 * Picks the default replacement character for encoding `encname`.
 *
 * For "UTF-8" (compared with encoding_equal) the result is the 3-byte
 * sequence EF BF BD tagged as "UTF-8"; for every other name it is "?"
 * tagged as "US-ASCII".
 *
 * len_ret          receives the byte length of the returned sequence.
 * repl_encname_ptr receives the encoding name of the returned sequence.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
453
454/*
455 * Transcoding engine logic
456 */
457
458static const unsigned char *
459transcode_char_start(rb_transcoding *tc,
460 const unsigned char *in_start,
461 const unsigned char *inchar_start,
462 const unsigned char *in_p,
463 size_t *char_len_ptr)
464{
465 const unsigned char *ptr;
466 if (inchar_start - in_start < tc->recognized_len) {
467 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
468 inchar_start, unsigned char, in_p - inchar_start);
469 ptr = TRANSCODING_READBUF(tc);
470 }
471 else {
472 ptr = inchar_start - tc->recognized_len;
473 }
474 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
475 return ptr;
476}
477
479transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
480 const unsigned char *in_stop, unsigned char *out_stop,
481 rb_transcoding *tc,
482 const int opt)
483{
484 const rb_transcoder *tr = tc->transcoder;
485 int unitlen = tr->input_unit_length;
486 ssize_t readagain_len = 0;
487
488 const unsigned char *inchar_start;
489 const unsigned char *in_p;
490
491 unsigned char *out_p;
492
493 in_p = inchar_start = *in_pos;
494
495 out_p = *out_pos;
496
497#define SUSPEND(ret, num) \
498 do { \
499 tc->resume_position = (num); \
500 if (0 < in_p - inchar_start) \
501 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
502 inchar_start, unsigned char, in_p - inchar_start); \
503 *in_pos = in_p; \
504 *out_pos = out_p; \
505 tc->recognized_len += in_p - inchar_start; \
506 if (readagain_len) { \
507 tc->recognized_len -= readagain_len; \
508 tc->readagain_len = readagain_len; \
509 } \
510 return (ret); \
511 resume_label ## num:; \
512 } while (0)
513#define SUSPEND_OBUF(num) \
514 do { \
515 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
516 } while (0)
517
518#define SUSPEND_AFTER_OUTPUT(num) \
519 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
520 SUSPEND(econv_after_output, num); \
521 }
522
523#define next_table (tc->next_table)
524#define next_info (tc->next_info)
525#define next_byte (tc->next_byte)
526#define writebuf_len (tc->writebuf_len)
527#define writebuf_off (tc->writebuf_off)
528
529 switch (tc->resume_position) {
530 case 0: break;
531 case 1: goto resume_label1;
532 case 2: goto resume_label2;
533 case 3: goto resume_label3;
534 case 4: goto resume_label4;
535 case 5: goto resume_label5;
536 case 6: goto resume_label6;
537 case 7: goto resume_label7;
538 case 8: goto resume_label8;
539 case 9: goto resume_label9;
540 case 10: goto resume_label10;
541 case 11: goto resume_label11;
542 case 12: goto resume_label12;
543 case 13: goto resume_label13;
544 case 14: goto resume_label14;
545 case 15: goto resume_label15;
546 case 16: goto resume_label16;
547 case 17: goto resume_label17;
548 case 18: goto resume_label18;
549 case 19: goto resume_label19;
550 case 20: goto resume_label20;
551 case 21: goto resume_label21;
552 case 22: goto resume_label22;
553 case 23: goto resume_label23;
554 case 24: goto resume_label24;
555 case 25: goto resume_label25;
556 case 26: goto resume_label26;
557 case 27: goto resume_label27;
558 case 28: goto resume_label28;
559 case 29: goto resume_label29;
560 case 30: goto resume_label30;
561 case 31: goto resume_label31;
562 case 32: goto resume_label32;
563 case 33: goto resume_label33;
564 case 34: goto resume_label34;
565 }
566
567 while (1) {
568 inchar_start = in_p;
569 tc->recognized_len = 0;
570 next_table = tr->conv_tree_start;
571
572 SUSPEND_AFTER_OUTPUT(24);
573
574 if (in_stop <= in_p) {
575 if (!(opt & ECONV_PARTIAL_INPUT))
576 break;
577 SUSPEND(econv_source_buffer_empty, 7);
578 continue;
579 }
580
581#define BYTE_ADDR(index) (tr->byte_array + (index))
582#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
583#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
584#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
585#define BL_MIN_BYTE (BL_BASE[0])
586#define BL_MAX_BYTE (BL_BASE[1])
587#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
588#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
589
590 next_byte = (unsigned char)*in_p++;
591 follow_byte:
592 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
593 next_info = INVALID;
594 else {
595 next_info = (VALUE)BL_ACTION(next_byte);
596 }
597 follow_info:
598 switch (next_info & 0x1F) {
599 case NOMAP:
600 {
601 const unsigned char *p = inchar_start;
602 writebuf_off = 0;
603 while (p < in_p) {
604 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
605 }
606 writebuf_len = writebuf_off;
607 writebuf_off = 0;
608 while (writebuf_off < writebuf_len) {
609 SUSPEND_OBUF(3);
610 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
611 }
612 }
613 continue;
614 case 0x00: case 0x04: case 0x08: case 0x0C:
615 case 0x10: case 0x14: case 0x18: case 0x1C:
616 SUSPEND_AFTER_OUTPUT(25);
617 while (in_p >= in_stop) {
618 if (!(opt & ECONV_PARTIAL_INPUT))
619 goto incomplete;
620 SUSPEND(econv_source_buffer_empty, 5);
621 }
622 next_byte = (unsigned char)*in_p++;
623 next_table = (unsigned int)next_info;
624 goto follow_byte;
625 case ZERObt: /* drop input */
626 continue;
627 case ONEbt:
628 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
629 continue;
630 case TWObt:
631 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
632 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
633 continue;
634 case THREEbt:
635 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
636 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
637 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
638 continue;
639 case FOURbt:
640 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
641 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
642 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
643 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
644 continue;
645 case GB4bt:
646 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
647 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
648 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
649 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
650 continue;
651 case STR1:
652 tc->output_index = 0;
653 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
654 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
655 tc->output_index++;
656 }
657 continue;
658 case FUNii:
659 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
660 goto follow_info;
661 case FUNsi:
662 {
663 const unsigned char *char_start;
664 size_t char_len;
665 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
667 goto follow_info;
668 }
669 case FUNio:
670 SUSPEND_OBUF(13);
671 if (tr->max_output <= out_stop - out_p)
672 out_p += tr->func_io(TRANSCODING_STATE(tc),
673 next_info, out_p, out_stop - out_p);
674 else {
675 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
676 next_info,
677 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
678 writebuf_off = 0;
679 while (writebuf_off < writebuf_len) {
680 SUSPEND_OBUF(20);
681 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
682 }
683 }
684 break;
685 case FUNso:
686 {
687 const unsigned char *char_start;
688 size_t char_len;
689 SUSPEND_OBUF(14);
690 if (tr->max_output <= out_stop - out_p) {
691 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
692 out_p += tr->func_so(TRANSCODING_STATE(tc),
693 char_start, (size_t)char_len,
694 out_p, out_stop - out_p);
695 }
696 else {
697 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
698 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
699 char_start, (size_t)char_len,
700 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
701 writebuf_off = 0;
702 while (writebuf_off < writebuf_len) {
703 SUSPEND_OBUF(22);
704 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
705 }
706 }
707 break;
708 }
709 case FUNsio:
710 {
711 const unsigned char *char_start;
712 size_t char_len;
713 SUSPEND_OBUF(33);
714 if (tr->max_output <= out_stop - out_p) {
715 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
716 out_p += tr->func_sio(TRANSCODING_STATE(tc),
717 char_start, (size_t)char_len, next_info,
718 out_p, out_stop - out_p);
719 }
720 else {
721 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
722 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
723 char_start, (size_t)char_len, next_info,
724 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
725 writebuf_off = 0;
726 while (writebuf_off < writebuf_len) {
727 SUSPEND_OBUF(34);
728 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
729 }
730 }
731 break;
732 }
733 case INVALID:
734 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
735 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
736 SUSPEND_AFTER_OUTPUT(26);
737 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
738 in_p = in_stop;
739 SUSPEND(econv_source_buffer_empty, 8);
740 }
741 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
742 in_p = in_stop;
743 }
744 else {
745 in_p = inchar_start + (unitlen - tc->recognized_len);
746 }
747 }
748 else {
749 ssize_t invalid_len; /* including the last byte which causes invalid */
750 ssize_t discard_len;
751 invalid_len = tc->recognized_len + (in_p - inchar_start);
752 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
753 readagain_len = invalid_len - discard_len;
754 }
755 goto invalid;
756 case UNDEF:
757 goto undef;
758 default:
759 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
760 }
761 continue;
762
763 invalid:
764 SUSPEND(econv_invalid_byte_sequence, 1);
765 continue;
766
767 incomplete:
768 SUSPEND(econv_incomplete_input, 27);
769 continue;
770
771 undef:
772 SUSPEND(econv_undefined_conversion, 2);
773 continue;
774 }
775
776 /* cleanup */
777 if (tr->finish_func) {
778 SUSPEND_OBUF(4);
779 if (tr->max_output <= out_stop - out_p) {
780 out_p += tr->finish_func(TRANSCODING_STATE(tc),
781 out_p, out_stop - out_p);
782 }
783 else {
784 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
785 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
786 writebuf_off = 0;
787 while (writebuf_off < writebuf_len) {
788 SUSPEND_OBUF(23);
789 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
790 }
791 }
792 }
793 while (1)
794 SUSPEND(econv_finished, 6);
795#undef SUSPEND
796#undef next_table
797#undef next_info
798#undef next_byte
799#undef writebuf_len
800#undef writebuf_off
801}
802
804transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
805 const unsigned char *in_stop, unsigned char *out_stop,
806 rb_transcoding *tc,
807 const int opt)
808{
809 if (tc->readagain_len) {
810 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
811 const unsigned char *readagain_pos = readagain_buf;
812 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
814
815 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
816 unsigned char, tc->readagain_len);
817 tc->readagain_len = 0;
818 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
819 if (res != econv_source_buffer_empty) {
820 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
821 readagain_pos, unsigned char, readagain_stop - readagain_pos);
822 tc->readagain_len += readagain_stop - readagain_pos;
823 return res;
824 }
825 }
826 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
827}
828
829static rb_transcoding *
830rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
831{
832 rb_transcoding *tc;
833
834 tc = ALLOC(rb_transcoding);
835 tc->transcoder = tr;
836 tc->flags = flags;
837 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
838 tc->state.ptr = xmalloc(tr->state_size);
839 if (tr->state_init_func) {
840 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
841 }
842 tc->resume_position = 0;
843 tc->recognized_len = 0;
844 tc->readagain_len = 0;
845 tc->writebuf_len = 0;
846 tc->writebuf_off = 0;
847 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
848 tc->readbuf.ptr = xmalloc(tr->max_input);
849 }
850 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
851 tc->writebuf.ptr = xmalloc(tr->max_output);
852 }
853 return tc;
854}
855
857rb_transcoding_convert(rb_transcoding *tc,
858 const unsigned char **input_ptr, const unsigned char *input_stop,
859 unsigned char **output_ptr, unsigned char *output_stop,
860 int flags)
861{
862 return transcode_restartable(
863 input_ptr, output_ptr,
864 input_stop, output_stop,
865 tc, flags);
866}
867
868static void
869rb_transcoding_close(rb_transcoding *tc)
870{
871 const rb_transcoder *tr = tc->transcoder;
872 if (tr->state_fini_func) {
873 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
874 }
875 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
876 xfree(tc->state.ptr);
877 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
878 xfree(tc->readbuf.ptr);
879 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
880 xfree(tc->writebuf.ptr);
881 xfree(tc);
882}
883
884static size_t
885rb_transcoding_memsize(rb_transcoding *tc)
886{
887 size_t size = sizeof(rb_transcoding);
888 const rb_transcoder *tr = tc->transcoder;
889
890 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
891 size += tr->state_size;
892 }
893 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
894 size += tr->max_input;
895 }
896 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
897 size += tr->max_output;
898 }
899 return size;
900}
901
902static rb_econv_t *
903rb_econv_alloc(int n_hint)
904{
905 rb_econv_t *ec;
906
907 if (n_hint <= 0)
908 n_hint = 1;
909
910 ec = ALLOC(rb_econv_t);
911 ec->flags = 0;
912 ec->source_encoding_name = NULL;
913 ec->destination_encoding_name = NULL;
914 ec->started = 0;
915 ec->replacement_str = NULL;
916 ec->replacement_len = 0;
917 ec->replacement_enc = NULL;
918 ec->replacement_allocated = 0;
919 ec->in_buf_start = NULL;
920 ec->in_data_start = NULL;
921 ec->in_data_end = NULL;
922 ec->in_buf_end = NULL;
923 ec->num_allocated = n_hint;
924 ec->num_trans = 0;
925 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
926 ec->num_finished = 0;
927 ec->last_tc = NULL;
928 ec->last_error.result = econv_source_buffer_empty;
929 ec->last_error.error_tc = NULL;
930 ec->last_error.source_encoding = NULL;
931 ec->last_error.destination_encoding = NULL;
932 ec->last_error.error_bytes_start = NULL;
933 ec->last_error.error_bytes_len = 0;
934 ec->last_error.readagain_len = 0;
935 ec->source_encoding = NULL;
936 ec->destination_encoding = NULL;
937 return ec;
938}
939
940static int
941rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
942{
943 int n, j;
944 int bufsize = 4096;
945 unsigned char *p;
946
947 if (ec->num_trans == ec->num_allocated) {
948 n = ec->num_allocated * 2;
949 REALLOC_N(ec->elems, rb_econv_elem_t, n);
950 ec->num_allocated = n;
951 }
952
953 p = xmalloc(bufsize);
954
955 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
956
957 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
958 ec->elems[i].out_buf_start = p;
959 ec->elems[i].out_buf_end = p + bufsize;
960 ec->elems[i].out_data_start = p;
961 ec->elems[i].out_data_end = p;
962 ec->elems[i].last_result = econv_source_buffer_empty;
963
964 ec->num_trans++;
965
966 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
967 for (j = ec->num_trans-1; i <= j; j--) {
968 rb_transcoding *tc = ec->elems[j].tc;
969 const rb_transcoder *tr2 = tc->transcoder;
970 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
971 ec->last_tc = tc;
972 break;
973 }
974 }
975
976 return 0;
977}
978
979static rb_econv_t *
980rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
981{
982 rb_econv_t *ec;
983 int i, ret;
984
985 for (i = 0; i < n; i++) {
986 const rb_transcoder *tr;
987 tr = load_transcoder_entry(entries[i]);
988 if (!tr)
989 return NULL;
990 }
991
992 ec = rb_econv_alloc(n);
993
994 for (i = 0; i < n; i++) {
995 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
996 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
997 if (ret == -1) {
998 rb_econv_close(ec);
999 return NULL;
1000 }
1001 }
1002
1003 return ec;
1004}
1005
1007 transcoder_entry_t **entries;
1008 int num_additional;
1009};
1010
1011static void
1012trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1013{
1014 struct trans_open_t *toarg = arg;
1015
1016 if (!toarg->entries) {
1017 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
1018 }
1019 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1020}
1021
1022static rb_econv_t *
1023rb_econv_open0(const char *sname, const char *dname, int ecflags)
1024{
1025 transcoder_entry_t **entries = NULL;
1026 int num_trans;
1027 rb_econv_t *ec;
1028
1029 // loads encodings if not loaded already
1030 if (*sname) rb_enc_find_index(sname);
1031 if (*dname) rb_enc_find_index(dname);
1032
1033 if (*sname == '\0' && *dname == '\0') {
1034 num_trans = 0;
1035 entries = NULL;
1036 sname = dname = "";
1037 }
1038 else {
1039 struct trans_open_t toarg;
1040 toarg.entries = NULL;
1041 toarg.num_additional = 0;
1042 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1043 entries = toarg.entries;
1044 if (num_trans < 0) {
1045 xfree(entries);
1046 return NULL;
1047 }
1048 }
1049
1050 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1051 xfree(entries);
1052 if (!ec)
1053 return NULL;
1054
1055 ec->flags = ecflags;
1056 ec->source_encoding_name = sname;
1057 ec->destination_encoding_name = dname;
1058
1059 return ec;
1060}
1061
/* Size of the decorator-name array that rb_econv_open passes to decorator_names. */
1062#define MAX_ECFLAGS_DECORATORS 32
1063
1064static int
1065decorator_names(int ecflags, const char **decorators_ret)
1066{
1067 int num_decorators;
1068
1069 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1074 case 0:
1075 break;
1076 default:
1077 return -1;
1078 }
1079
1080 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1082 return -1;
1083
1084 num_decorators = 0;
1085
1086 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1087 decorators_ret[num_decorators++] = "xml_text_escape";
1089 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1090 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1091 decorators_ret[num_decorators++] = "xml_attr_quote";
1092
1093 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1094 decorators_ret[num_decorators++] = "crlf_newline";
1095 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1096 decorators_ret[num_decorators++] = "cr_newline";
1097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1098 decorators_ret[num_decorators++] = "lf_newline";
1100 decorators_ret[num_decorators++] = "universal_newline";
1101
1102 return num_decorators;
1103}
1104
1105rb_econv_t *
1106rb_econv_open(const char *sname, const char *dname, int ecflags)
1107{
1108 rb_econv_t *ec;
1109 int num_decorators;
1110 const char *decorators[MAX_ECFLAGS_DECORATORS];
1111 int i;
1112
1113 num_decorators = decorator_names(ecflags, decorators);
1114 if (num_decorators == -1)
1115 return NULL;
1116
1117 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1118 if (ec) {
1119 for (i = 0; i < num_decorators; i++) {
1120 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1121 rb_econv_close(ec);
1122 ec = NULL;
1123 break;
1124 }
1125 }
1126 }
1127
1128 if (ec) {
1129 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1130 }
1131 return ec; // can be NULL
1132}
1133
1134static int
1135trans_sweep(rb_econv_t *ec,
1136 const unsigned char **input_ptr, const unsigned char *input_stop,
1137 unsigned char **output_ptr, unsigned char *output_stop,
1138 int flags,
1139 int start)
1140{
1141 int try;
1142 int i, f;
1143
1144 const unsigned char **ipp, *is, *iold;
1145 unsigned char **opp, *os, *oold;
1147
1148 try = 1;
1149 while (try) {
1150 try = 0;
1151 for (i = start; i < ec->num_trans; i++) {
1152 rb_econv_elem_t *te = &ec->elems[i];
1153
1154 if (i == 0) {
1155 ipp = input_ptr;
1156 is = input_stop;
1157 }
1158 else {
1159 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1160 ipp = (const unsigned char **)&prev_te->out_data_start;
1161 is = prev_te->out_data_end;
1162 }
1163
1164 if (i == ec->num_trans-1) {
1165 opp = output_ptr;
1166 os = output_stop;
1167 }
1168 else {
1169 if (te->out_buf_start != te->out_data_start) {
1170 ssize_t len = te->out_data_end - te->out_data_start;
1171 ssize_t off = te->out_data_start - te->out_buf_start;
1172 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1173 te->out_data_start = te->out_buf_start;
1174 te->out_data_end -= off;
1175 }
1176 opp = &te->out_data_end;
1177 os = te->out_buf_end;
1178 }
1179
1180 f = flags;
1181 if (ec->num_finished != i)
1183 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1184 start = 1;
1185 flags &= ~ECONV_AFTER_OUTPUT;
1186 }
1187 if (i != 0)
1188 f &= ~ECONV_AFTER_OUTPUT;
1189 iold = *ipp;
1190 oold = *opp;
1191 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1192 if (iold != *ipp || oold != *opp)
1193 try = 1;
1194
1195 switch (res) {
1199 case econv_after_output:
1200 return i;
1201
1204 break;
1205
1206 case econv_finished:
1207 ec->num_finished = i+1;
1208 break;
1209 }
1210 }
1211 }
1212 return -1;
1213}
1214
1215static rb_econv_result_t
1216rb_trans_conv(rb_econv_t *ec,
1217 const unsigned char **input_ptr, const unsigned char *input_stop,
1218 unsigned char **output_ptr, unsigned char *output_stop,
1219 int flags,
1220 int *result_position_ptr)
1221{
1222 int i;
1223 int needreport_index;
1224 int sweep_start;
1225
1226 unsigned char empty_buf;
1227 unsigned char *empty_ptr = &empty_buf;
1228
1229 if (!input_ptr) {
1230 input_ptr = (const unsigned char **)&empty_ptr;
1231 input_stop = empty_ptr;
1232 }
1233
1234 if (!output_ptr) {
1235 output_ptr = &empty_ptr;
1236 output_stop = empty_ptr;
1237 }
1238
1239 if (ec->elems[0].last_result == econv_after_output)
1240 ec->elems[0].last_result = econv_source_buffer_empty;
1241
1242 for (i = ec->num_trans-1; 0 <= i; i--) {
1243 switch (ec->elems[i].last_result) {
1247 case econv_after_output:
1248 case econv_finished:
1249 sweep_start = i+1;
1250 goto found_needreport;
1251
1254 break;
1255
1256 default:
1257 rb_bug("unexpected transcode last result");
1258 }
1259 }
1260
1261 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1262
1263 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1264 (flags & ECONV_AFTER_OUTPUT)) {
1266
1267 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1269 result_position_ptr);
1270
1271 if (res == econv_source_buffer_empty)
1272 return econv_after_output;
1273 return res;
1274 }
1275
1276 sweep_start = 0;
1277
1278 found_needreport:
1279
1280 do {
1281 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1282 sweep_start = needreport_index + 1;
1283 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1284
1285 for (i = ec->num_trans-1; 0 <= i; i--) {
1286 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1287 rb_econv_result_t res = ec->elems[i].last_result;
1288 if (res == econv_invalid_byte_sequence ||
1289 res == econv_incomplete_input ||
1291 res == econv_after_output) {
1292 ec->elems[i].last_result = econv_source_buffer_empty;
1293 }
1294 if (result_position_ptr)
1295 *result_position_ptr = i;
1296 return res;
1297 }
1298 }
1299 if (result_position_ptr)
1300 *result_position_ptr = -1;
1302}
1303
1304static rb_econv_result_t
1305rb_econv_convert0(rb_econv_t *ec,
1306 const unsigned char **input_ptr, const unsigned char *input_stop,
1307 unsigned char **output_ptr, unsigned char *output_stop,
1308 int flags)
1309{
1311 int result_position;
1312 int has_output = 0;
1313
1314 memset(&ec->last_error, 0, sizeof(ec->last_error));
1315
1316 if (ec->num_trans == 0) {
1317 size_t len;
1318 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1319 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1320 len = output_stop - *output_ptr;
1321 memcpy(*output_ptr, ec->in_data_start, len);
1322 *output_ptr = output_stop;
1323 ec->in_data_start += len;
1325 goto gotresult;
1326 }
1327 len = ec->in_data_end - ec->in_data_start;
1328 memcpy(*output_ptr, ec->in_data_start, len);
1329 *output_ptr += len;
1330 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1331 if (flags & ECONV_AFTER_OUTPUT) {
1332 res = econv_after_output;
1333 goto gotresult;
1334 }
1335 }
1336 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1337 len = output_stop - *output_ptr;
1338 }
1339 else {
1340 len = input_stop - *input_ptr;
1341 }
1342 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1343 *(*output_ptr)++ = *(*input_ptr)++;
1344 res = econv_after_output;
1345 goto gotresult;
1346 }
1347 memcpy(*output_ptr, *input_ptr, len);
1348 *output_ptr += len;
1349 *input_ptr += len;
1350 if (*input_ptr != input_stop)
1352 else if (flags & ECONV_PARTIAL_INPUT)
1354 else
1355 res = econv_finished;
1356 goto gotresult;
1357 }
1358
1359 if (ec->elems[ec->num_trans-1].out_data_start) {
1360 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1361 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1362 if (data_start != data_end) {
1363 size_t len;
1364 if (output_stop - *output_ptr < data_end - data_start) {
1365 len = output_stop - *output_ptr;
1366 memcpy(*output_ptr, data_start, len);
1367 *output_ptr = output_stop;
1368 ec->elems[ec->num_trans-1].out_data_start += len;
1370 goto gotresult;
1371 }
1372 len = data_end - data_start;
1373 memcpy(*output_ptr, data_start, len);
1374 *output_ptr += len;
1375 ec->elems[ec->num_trans-1].out_data_start =
1376 ec->elems[ec->num_trans-1].out_data_end =
1377 ec->elems[ec->num_trans-1].out_buf_start;
1378 has_output = 1;
1379 }
1380 }
1381
1382 if (ec->in_buf_start &&
1383 ec->in_data_start != ec->in_data_end) {
1384 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1385 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1386 if (res != econv_source_buffer_empty)
1387 goto gotresult;
1388 }
1389
1390 if (has_output &&
1391 (flags & ECONV_AFTER_OUTPUT) &&
1392 *input_ptr != input_stop) {
1393 input_stop = *input_ptr;
1394 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1395 if (res == econv_source_buffer_empty)
1396 res = econv_after_output;
1397 }
1398 else if ((flags & ECONV_AFTER_OUTPUT) ||
1399 ec->num_trans == 1) {
1400 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1401 }
1402 else {
1403 flags |= ECONV_AFTER_OUTPUT;
1404 do {
1405 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1406 } while (res == econv_after_output);
1407 }
1408
1409 gotresult:
1410 ec->last_error.result = res;
1411 if (res == econv_invalid_byte_sequence ||
1412 res == econv_incomplete_input ||
1414 rb_transcoding *error_tc = ec->elems[result_position].tc;
1415 ec->last_error.error_tc = error_tc;
1416 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1417 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1418 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1419 ec->last_error.error_bytes_len = error_tc->recognized_len;
1420 ec->last_error.readagain_len = error_tc->readagain_len;
1421 }
1422
1423 return res;
1424}
1425
1426static int output_replacement_character(rb_econv_t *ec);
1427
1428static int
1429output_hex_charref(rb_econv_t *ec)
1430{
1431 int ret;
1432 unsigned char utfbuf[1024];
1433 const unsigned char *utf;
1434 size_t utf_len;
1435 int utf_allocated = 0;
1436 char charef_buf[16];
1437 const unsigned char *p;
1438
1439 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1440 utf = ec->last_error.error_bytes_start;
1441 utf_len = ec->last_error.error_bytes_len;
1442 }
1443 else {
1444 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1445 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1446 utfbuf, sizeof(utfbuf),
1447 &utf_len);
1448 if (!utf)
1449 return -1;
1450 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1451 utf_allocated = 1;
1452 }
1453
1454 if (utf_len % 4 != 0)
1455 goto fail;
1456
1457 p = utf;
1458 while (4 <= utf_len) {
1459 unsigned int u = 0;
1460 u += p[0] << 24;
1461 u += p[1] << 16;
1462 u += p[2] << 8;
1463 u += p[3];
1464 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1465
1466 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1467 if (ret == -1)
1468 goto fail;
1469
1470 p += 4;
1471 utf_len -= 4;
1472 }
1473
1474 if (utf_allocated)
1475 xfree((void *)utf);
1476 return 0;
1477
1478 fail:
1479 if (utf_allocated)
1480 xfree((void *)utf);
1481 return -1;
1482}
1483
1486 const unsigned char **input_ptr, const unsigned char *input_stop,
1487 unsigned char **output_ptr, unsigned char *output_stop,
1488 int flags)
1489{
1491
1492 unsigned char empty_buf;
1493 unsigned char *empty_ptr = &empty_buf;
1494
1495 ec->started = 1;
1496
1497 if (!input_ptr) {
1498 input_ptr = (const unsigned char **)&empty_ptr;
1499 input_stop = empty_ptr;
1500 }
1501
1502 if (!output_ptr) {
1503 output_ptr = &empty_ptr;
1504 output_stop = empty_ptr;
1505 }
1506
1507 resume:
1508 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1509
1510 if (ret == econv_invalid_byte_sequence ||
1511 ret == econv_incomplete_input) {
1512 /* deal with invalid byte sequence */
1513 /* todo: add more alternative behaviors */
1514 switch (ec->flags & ECONV_INVALID_MASK) {
1516 if (output_replacement_character(ec) == 0)
1517 goto resume;
1518 }
1519 }
1520
1521 if (ret == econv_undefined_conversion) {
1522 /* valid character in source encoding
1523 * but no related character(s) in destination encoding */
1524 /* todo: add more alternative behaviors */
1525 switch (ec->flags & ECONV_UNDEF_MASK) {
1527 if (output_replacement_character(ec) == 0)
1528 goto resume;
1529 break;
1530
1532 if (output_hex_charref(ec) == 0)
1533 goto resume;
1534 break;
1535 }
1536 }
1537
1538 return ret;
1539}
1540
1541const char *
1543{
1544 rb_transcoding *tc = ec->last_tc;
1545 const rb_transcoder *tr;
1546
1547 if (tc == NULL)
1548 return "";
1549
1550 tr = tc->transcoder;
1551
1552 if (tr->asciicompat_type == asciicompat_encoder)
1553 return tr->src_encoding;
1554 return tr->dst_encoding;
1555}
1556
1557static unsigned char *
1558allocate_converted_string(const char *sname, const char *dname,
1559 const unsigned char *str, size_t len,
1560 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1561 size_t *dst_len_ptr)
1562{
1563 unsigned char *dst_str;
1564 size_t dst_len;
1565 size_t dst_bufsize;
1566
1567 rb_econv_t *ec;
1569
1570 const unsigned char *sp;
1571 unsigned char *dp;
1572
1573 if (caller_dst_buf)
1574 dst_bufsize = caller_dst_bufsize;
1575 else if (len == 0)
1576 dst_bufsize = 1;
1577 else
1578 dst_bufsize = len;
1579
1580 ec = rb_econv_open(sname, dname, 0);
1581 if (ec == NULL)
1582 return NULL;
1583 if (caller_dst_buf)
1584 dst_str = caller_dst_buf;
1585 else
1586 dst_str = xmalloc(dst_bufsize);
1587 dst_len = 0;
1588 sp = str;
1589 dp = dst_str+dst_len;
1590 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1591 dst_len = dp - dst_str;
1592 while (res == econv_destination_buffer_full) {
1593 if (SIZE_MAX/2 < dst_bufsize) {
1594 goto fail;
1595 }
1596 dst_bufsize *= 2;
1597 if (dst_str == caller_dst_buf) {
1598 unsigned char *tmp;
1599 tmp = xmalloc(dst_bufsize);
1600 memcpy(tmp, dst_str, dst_bufsize/2);
1601 dst_str = tmp;
1602 }
1603 else {
1604 dst_str = xrealloc(dst_str, dst_bufsize);
1605 }
1606 dp = dst_str+dst_len;
1607 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1608 dst_len = dp - dst_str;
1609 }
1610 if (res != econv_finished) {
1611 goto fail;
1612 }
1613 rb_econv_close(ec);
1614 *dst_len_ptr = dst_len;
1615 return dst_str;
1616
1617 fail:
1618 if (dst_str != caller_dst_buf)
1619 xfree(dst_str);
1620 rb_econv_close(ec);
1621 return NULL;
1622}
1623
1624/* result: 0:success -1:failure */
1625int
1627 const unsigned char *str, size_t len, const char *str_encoding)
1628{
1629 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1630 unsigned char insert_buf[4096];
1631 const unsigned char *insert_str = NULL;
1632 size_t insert_len;
1633
1634 int last_trans_index;
1635 rb_transcoding *tc;
1636
1637 unsigned char **buf_start_p;
1638 unsigned char **data_start_p;
1639 unsigned char **data_end_p;
1640 unsigned char **buf_end_p;
1641
1642 size_t need;
1643
1644 ec->started = 1;
1645
1646 if (len == 0)
1647 return 0;
1648
1649 if (encoding_equal(insert_encoding, str_encoding)) {
1650 insert_str = str;
1651 insert_len = len;
1652 }
1653 else {
1654 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1655 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1656 if (insert_str == NULL)
1657 return -1;
1658 }
1659
1660 need = insert_len;
1661
1662 last_trans_index = ec->num_trans-1;
1663 if (ec->num_trans == 0) {
1664 tc = NULL;
1665 buf_start_p = &ec->in_buf_start;
1666 data_start_p = &ec->in_data_start;
1667 data_end_p = &ec->in_data_end;
1668 buf_end_p = &ec->in_buf_end;
1669 }
1670 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1671 tc = ec->elems[last_trans_index].tc;
1672 need += tc->readagain_len;
1673 if (need < insert_len)
1674 goto fail;
1675 if (last_trans_index == 0) {
1676 buf_start_p = &ec->in_buf_start;
1677 data_start_p = &ec->in_data_start;
1678 data_end_p = &ec->in_data_end;
1679 buf_end_p = &ec->in_buf_end;
1680 }
1681 else {
1682 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1683 buf_start_p = &ee->out_buf_start;
1684 data_start_p = &ee->out_data_start;
1685 data_end_p = &ee->out_data_end;
1686 buf_end_p = &ee->out_buf_end;
1687 }
1688 }
1689 else {
1690 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1691 buf_start_p = &ee->out_buf_start;
1692 data_start_p = &ee->out_data_start;
1693 data_end_p = &ee->out_data_end;
1694 buf_end_p = &ee->out_buf_end;
1695 tc = ec->elems[last_trans_index].tc;
1696 }
1697
1698 if (*buf_start_p == NULL) {
1699 unsigned char *buf = xmalloc(need);
1700 *buf_start_p = buf;
1701 *data_start_p = buf;
1702 *data_end_p = buf;
1703 *buf_end_p = buf+need;
1704 }
1705 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1706 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1707 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1708 *data_start_p = *buf_start_p;
1709 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1710 unsigned char *buf;
1711 size_t s = (*data_end_p - *buf_start_p) + need;
1712 if (s < need)
1713 goto fail;
1714 buf = xrealloc(*buf_start_p, s);
1715 *data_start_p = buf;
1716 *data_end_p = buf + (*data_end_p - *buf_start_p);
1717 *buf_start_p = buf;
1718 *buf_end_p = buf + s;
1719 }
1720 }
1721
1722 memcpy(*data_end_p, insert_str, insert_len);
1723 *data_end_p += insert_len;
1724 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1725 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1726 *data_end_p += tc->readagain_len;
1727 tc->readagain_len = 0;
1728 }
1729
1730 if (insert_str != str && insert_str != insert_buf)
1731 xfree((void*)insert_str);
1732 return 0;
1733
1734 fail:
1735 if (insert_str != str && insert_str != insert_buf)
1736 xfree((void*)insert_str);
1737 return -1;
1738}
1739
1740void
1742{
1743 int i;
1744
1745 if (ec->replacement_allocated) {
1746 xfree((void *)ec->replacement_str);
1747 }
1748 for (i = 0; i < ec->num_trans; i++) {
1749 rb_transcoding_close(ec->elems[i].tc);
1750 xfree(ec->elems[i].out_buf_start);
1751 }
1752 xfree(ec->in_buf_start);
1753 xfree(ec->elems);
1754 xfree(ec);
1755}
1756
1757size_t
1758rb_econv_memsize(rb_econv_t *ec)
1759{
1760 size_t size = sizeof(rb_econv_t);
1761 int i;
1762
1763 if (ec->replacement_allocated) {
1764 size += ec->replacement_len;
1765 }
1766 for (i = 0; i < ec->num_trans; i++) {
1767 size += rb_transcoding_memsize(ec->elems[i].tc);
1768
1769 if (ec->elems[i].out_buf_start) {
1770 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1771 }
1772 }
1773 size += ec->in_buf_end - ec->in_buf_start;
1774 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1775
1776 return size;
1777}
1778
1779int
1781{
1782 if (ec->num_trans == 0)
1783 return 0;
1784#if SIZEOF_SIZE_T > SIZEOF_INT
1785 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1786#endif
1787 return (int)ec->elems[0].tc->readagain_len;
1788}
1789
1790void
1791rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1792{
1793 rb_transcoding *tc;
1794 if (ec->num_trans == 0 || n == 0)
1795 return;
1796 tc = ec->elems[0].tc;
1797 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1798 tc->readagain_len -= n;
1799}
1800
/* Argument block passed to asciicompat_encoding_i() through st_foreach(). */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;   /* result; stays NULL when none is found */
    const char *ascii_incompat_name; /* encoding name being looked up */
};
1805
1806static int
1807asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1808{
1809 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1810 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1811 const rb_transcoder *tr;
1812
1813 if (DECORATOR_P(entry->sname, entry->dname))
1814 return ST_CONTINUE;
1815 tr = load_transcoder_entry(entry);
1816 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1817 data->ascii_compat_name = tr->dst_encoding;
1818 return ST_STOP;
1819 }
1820 return ST_CONTINUE;
1821}
1822
1823const char *
1824rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1825{
1826 st_data_t v;
1827 st_table *table2;
1828 struct asciicompat_encoding_t data = {0};
1829
1830 unsigned int lev;
1831 RB_VM_LOCK_ENTER_LEV(&lev);
1832 {
1833 if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) {
1834 table2 = (st_table *)v;
1835 /*
1836 * Assumption:
1837 * There is at most one transcoder for
1838 * converting from ASCII incompatible encoding.
1839 *
1840 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1841 */
1842 if (table2->num_entries == 1) {
1843 data.ascii_incompat_name = ascii_incompat_name;
1844 data.ascii_compat_name = NULL;
1845 if (rb_multi_ractor_p()) {
1846 /*
1847 * We need to unlock in case `load_transcoder_entry` actually loads the encoding
1848 * and table2 could be inserted into when we unlock.
1849 */
1850 st_table *dup_table2 = st_copy(table2);
1851 RB_VM_LOCK_LEAVE_LEV(&lev);
1852 st_foreach(dup_table2, asciicompat_encoding_i, (st_data_t)&data);
1853 st_free_table(dup_table2);
1854 RB_VM_LOCK_ENTER_LEV(&lev);
1855 }
1856 else {
1857 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1858 }
1859 }
1860
1861 }
1862 }
1863 RB_VM_LOCK_LEAVE_LEV(&lev);
1864
1865 return data.ascii_compat_name; // can be NULL
1866}
1867
1868/*
1869 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1870 *
1871 * If the result of the conversion is not compatible with the encoding of
1872 * `dst`, `dst` may not be valid encoding.
1873 */
1874VALUE
1875rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1876{
1877 unsigned const char *sp, *se;
1878 unsigned char *ds, *dp, *de;
1880 int max_output;
1881 enum ruby_coderange_type coderange;
1882 rb_encoding *dst_enc = ec->destination_encoding;
1883
1884 if (NIL_P(dst)) {
1885 dst = rb_str_buf_new(len);
1886 if (dst_enc) {
1887 rb_enc_associate(dst, dst_enc);
1888 }
1889 coderange = ENC_CODERANGE_7BIT; // scan from the start
1890 }
1891 else {
1892 dst_enc = rb_enc_get(dst);
1893 coderange = rb_enc_str_coderange(dst);
1894 }
1895
1896 if (ec->last_tc)
1897 max_output = ec->last_tc->transcoder->max_output;
1898 else
1899 max_output = 1;
1900
1901 do {
1902 int cr;
1903 long dlen = RSTRING_LEN(dst);
1904 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1905 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1906 if (LONG_MAX < new_capa)
1907 rb_raise(rb_eArgError, "too long string");
1908 rb_str_modify_expand(dst, new_capa - dlen);
1909 }
1910 sp = (const unsigned char *)ss;
1911 se = sp + len;
1912 ds = (unsigned char *)RSTRING_PTR(dst);
1913 de = ds + rb_str_capacity(dst);
1914 dp = ds += dlen;
1915 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1916 switch (coderange) {
1917 case ENC_CODERANGE_7BIT:
1919 cr = (int)coderange;
1920 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1921 coderange = cr;
1922 ENC_CODERANGE_SET(dst, coderange);
1923 break;
1926 break;
1927 }
1928 len -= (const char *)sp - ss;
1929 ss = (const char *)sp;
1930 rb_str_set_len(dst, dlen + (dp - ds));
1932 } while (res == econv_destination_buffer_full);
1933
1934 return dst;
1935}
1936
1937VALUE
1938rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1939{
1940 src = rb_str_new_frozen(src);
1941 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1942 RB_GC_GUARD(src);
1943 return dst;
1944}
1945
1946VALUE
1948{
1949 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1950}
1951
1952VALUE
1953rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1954{
1955 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1956}
1957
1958VALUE
1960{
1961 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1962}
1963
1964static int
1965rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1966{
1967 transcoder_entry_t *entry;
1968 const rb_transcoder *tr = NULL;
1969
1970 if (ec->started != 0)
1971 return -1;
1972
1973 entry = get_transcoder_entry(sname, dname);
1974 if (entry) {
1975 tr = load_transcoder_entry(entry);
1976 }
1977
1978 return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1;
1979}
1980
1981static int
1982rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1983{
1984 return rb_econv_add_converter(ec, "", decorator_name, n);
1985}
1986
1987int
1988rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1989{
1990 const rb_transcoder *tr;
1991
1992 if (ec->num_trans == 0)
1993 return rb_econv_decorate_at(ec, decorator_name, 0);
1994
1995 tr = ec->elems[0].tc->transcoder;
1996
1997 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1998 tr->asciicompat_type == asciicompat_decoder)
1999 return rb_econv_decorate_at(ec, decorator_name, 1);
2000
2001 return rb_econv_decorate_at(ec, decorator_name, 0);
2002}
2003
2004int
2005rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
2006{
2007 const rb_transcoder *tr;
2008
2009 if (ec->num_trans == 0)
2010 return rb_econv_decorate_at(ec, decorator_name, 0);
2011
2012 tr = ec->elems[ec->num_trans-1].tc->transcoder;
2013
2014 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
2015 tr->asciicompat_type == asciicompat_encoder)
2016 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
2017
2018 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
2019}
2020
2021void
2023{
2024 const char *dname = 0;
2025
2026 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2028 dname = "universal_newline";
2029 break;
2031 dname = "crlf_newline";
2032 break;
2034 dname = "cr_newline";
2035 break;
2037 dname = "lf_newline";
2038 break;
2039 }
2040
2041 if (dname) {
2042 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2043 int num_trans = ec->num_trans;
2044 int i, j = 0;
2045
2046 for (i=0; i < num_trans; i++) {
2047 if (transcoder == ec->elems[i].tc->transcoder) {
2048 rb_transcoding_close(ec->elems[i].tc);
2049 xfree(ec->elems[i].out_buf_start);
2050 ec->num_trans--;
2051 }
2052 else
2053 ec->elems[j++] = ec->elems[i];
2054 }
2055 }
2056
2057 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2058}
2059
2060static VALUE
2061econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2062{
2063 int has_description = 0;
2064
2065 if (NIL_P(mesg))
2066 mesg = rb_str_new(NULL, 0);
2067
2068 if (*sname != '\0' || *dname != '\0') {
2069 if (*sname == '\0')
2070 rb_str_cat2(mesg, dname);
2071 else if (*dname == '\0')
2072 rb_str_cat2(mesg, sname);
2073 else
2074 rb_str_catf(mesg, "%s to %s", sname, dname);
2075 has_description = 1;
2076 }
2077
2078 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2082 const char *pre = "";
2083 if (has_description)
2084 rb_str_cat2(mesg, " with ");
2085 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2086 rb_str_cat2(mesg, pre); pre = ",";
2087 rb_str_cat2(mesg, "universal_newline");
2088 }
2089 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2090 rb_str_cat2(mesg, pre); pre = ",";
2091 rb_str_cat2(mesg, "crlf_newline");
2092 }
2093 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2094 rb_str_cat2(mesg, pre); pre = ",";
2095 rb_str_cat2(mesg, "cr_newline");
2096 }
2097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2098 rb_str_cat2(mesg, pre); pre = ",";
2099 rb_str_cat2(mesg, "lf_newline");
2100 }
2101 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2102 rb_str_cat2(mesg, pre); pre = ",";
2103 rb_str_cat2(mesg, "xml_text");
2104 }
2105 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2106 rb_str_cat2(mesg, pre); pre = ",";
2107 rb_str_cat2(mesg, "xml_attr_content");
2108 }
2109 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2110 rb_str_cat2(mesg, pre); pre = ",";
2111 rb_str_cat2(mesg, "xml_attr_quote");
2112 }
2113 has_description = 1;
2114 }
2115 if (!has_description) {
2116 rb_str_cat2(mesg, "no-conversion");
2117 }
2118
2119 return mesg;
2120}
2121
2122VALUE
2123rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2124{
2125 VALUE mesg, exc;
2126 mesg = rb_str_new_cstr("code converter not found (");
2127 econv_description(sname, dname, ecflags, mesg);
2128 rb_str_cat2(mesg, ")");
2129 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2130 return exc;
2131}
2132
2133static VALUE
2134make_econv_exception(rb_econv_t *ec)
2135{
2136 VALUE mesg, exc;
2137 if (ec->last_error.result == econv_invalid_byte_sequence ||
2138 ec->last_error.result == econv_incomplete_input) {
2139 const char *err = (const char *)ec->last_error.error_bytes_start;
2140 size_t error_len = ec->last_error.error_bytes_len;
2141 VALUE bytes = rb_str_new(err, error_len);
2142 VALUE dumped = rb_str_dump(bytes);
2143 size_t readagain_len = ec->last_error.readagain_len;
2144 VALUE bytes2 = Qnil;
2145 VALUE dumped2;
2146 if (ec->last_error.result == econv_incomplete_input) {
2147 mesg = rb_sprintf("incomplete %s on %s",
2148 StringValueCStr(dumped),
2149 ec->last_error.source_encoding);
2150 }
2151 else if (readagain_len) {
2152 bytes2 = rb_str_new(err+error_len, readagain_len);
2153 dumped2 = rb_str_dump(bytes2);
2154 mesg = rb_sprintf("%s followed by %s on %s",
2155 StringValueCStr(dumped),
2156 StringValueCStr(dumped2),
2157 ec->last_error.source_encoding);
2158 }
2159 else {
2160 mesg = rb_sprintf("%s on %s",
2161 StringValueCStr(dumped),
2162 ec->last_error.source_encoding);
2163 }
2164
2165 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2166 rb_ivar_set(exc, id_error_bytes, bytes);
2167 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2168 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2169 goto set_encs;
2170 }
2171 if (ec->last_error.result == econv_undefined_conversion) {
2172 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2173 ec->last_error.error_bytes_len);
2174 VALUE dumped = Qnil;
2175 int idx;
2176 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2177 rb_encoding *utf8 = rb_utf8_encoding();
2178 const char *start, *end;
2179 int n;
2180 start = (const char *)ec->last_error.error_bytes_start;
2181 end = start + ec->last_error.error_bytes_len;
2182 n = rb_enc_precise_mbclen(start, end, utf8);
2183 if (MBCLEN_CHARFOUND_P(n) &&
2184 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2185 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2186 dumped = rb_sprintf("U+%04X", cc);
2187 }
2188 }
2189 if (NIL_P(dumped))
2190 dumped = rb_str_dump(bytes);
2191 if (strcmp(ec->last_error.source_encoding,
2192 ec->source_encoding_name) == 0 &&
2193 strcmp(ec->last_error.destination_encoding,
2194 ec->destination_encoding_name) == 0) {
2195 mesg = rb_sprintf("%s from %s to %s",
2196 StringValueCStr(dumped),
2197 ec->last_error.source_encoding,
2198 ec->last_error.destination_encoding);
2199 }
2200 else {
2201 int i;
2202 mesg = rb_sprintf("%s to %s in conversion from %s",
2203 StringValueCStr(dumped),
2204 ec->last_error.destination_encoding,
2205 ec->source_encoding_name);
2206 for (i = 0; i < ec->num_trans; i++) {
2207 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2208 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2209 rb_str_catf(mesg, " to %s",
2210 ec->elems[i].tc->transcoder->dst_encoding);
2211 }
2212 }
2213 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2214 idx = rb_enc_find_index(ec->last_error.source_encoding);
2215 if (0 <= idx)
2216 rb_enc_associate_index(bytes, idx);
2217 rb_ivar_set(exc, id_error_char, bytes);
2218 goto set_encs;
2219 }
2220 return Qnil;
2221
2222 set_encs:
2223 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2224 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2225 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2226 if (0 <= idx)
2227 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2228 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2229 if (0 <= idx)
2230 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2231 return exc;
2232}
2233
2234static void
2235more_output_buffer(
2236 VALUE destination,
2237 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2238 int max_output,
2239 unsigned char **out_start_ptr,
2240 unsigned char **out_pos,
2241 unsigned char **out_stop_ptr)
2242{
2243 size_t len = (*out_pos - *out_start_ptr);
2244 size_t new_len = (len + max_output) * 2;
2245 *out_start_ptr = resize_destination(destination, len, new_len);
2246 *out_pos = *out_start_ptr + len;
2247 *out_stop_ptr = *out_start_ptr + new_len;
2248}
2249
2250static int
2251make_replacement(rb_econv_t *ec)
2252{
2253 rb_transcoding *tc;
2254 const rb_transcoder *tr;
2255 const unsigned char *replacement;
2256 const char *repl_enc;
2257 const char *ins_enc;
2258 size_t len;
2259
2260 if (ec->replacement_str)
2261 return 0;
2262
2264
2265 tc = ec->last_tc;
2266 if (*ins_enc) {
2267 tr = tc->transcoder;
2268 rb_enc_find(tr->dst_encoding);
2269 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2270 }
2271 else {
2272 replacement = (unsigned char *)"?";
2273 len = 1;
2274 repl_enc = "";
2275 }
2276
2277 ec->replacement_str = replacement;
2278 ec->replacement_len = len;
2279 ec->replacement_enc = repl_enc;
2280 ec->replacement_allocated = 0;
2281 return 0;
2282}
2283
2284int
2286 const unsigned char *str, size_t len, const char *encname)
2287{
2288 unsigned char *str2;
2289 size_t len2;
2290 const char *encname2;
2291
2293
2294 if (!*encname2 || encoding_equal(encname, encname2)) {
2295 str2 = xmalloc(len);
2296 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2297 len2 = len;
2298 encname2 = encname;
2299 }
2300 else {
2301 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2302 if (!str2)
2303 return -1;
2304 }
2305
2306 if (ec->replacement_allocated) {
2307 xfree((void *)ec->replacement_str);
2308 }
2309 ec->replacement_allocated = 1;
2310 ec->replacement_str = str2;
2311 ec->replacement_len = len2;
2312 ec->replacement_enc = encname2;
2313 return 0;
2314}
2315
2316static int
2317output_replacement_character(rb_econv_t *ec)
2318{
2319 int ret;
2320
2321 if (make_replacement(ec) == -1)
2322 return -1;
2323
2324 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2325 if (ret == -1)
2326 return -1;
2327
2328 return 0;
2329}
2330
2331#if 1
2332#define hash_fallback rb_hash_aref
2333
2334static VALUE
2335proc_fallback(VALUE fallback, VALUE c)
2336{
2337 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2338}
2339
2340static VALUE
2341method_fallback(VALUE fallback, VALUE c)
2342{
2343 return rb_method_call(1, &c, fallback);
2344}
2345
2346static VALUE
2347aref_fallback(VALUE fallback, VALUE c)
2348{
2349 return rb_funcallv_public(fallback, idAREF, 1, &c);
2350}
2351
2352static void
2353transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2354 const unsigned char *in_stop, unsigned char *out_stop,
2355 VALUE destination,
2356 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2357 const char *src_encoding,
2358 const char *dst_encoding,
2359 int ecflags,
2360 VALUE ecopts)
2361{
2362 rb_econv_t *ec;
2363 rb_transcoding *last_tc;
2365 unsigned char *out_start = *out_pos;
2366 int max_output;
2367 VALUE exc;
2368 VALUE fallback = Qnil;
2369 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2370
2371 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2372 if (!ec)
2373 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2374
2375 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2376 fallback = rb_hash_aref(ecopts, sym_fallback);
2377 if (RB_TYPE_P(fallback, T_HASH)) {
2378 fallback_func = hash_fallback;
2379 }
2380 else if (rb_obj_is_proc(fallback)) {
2381 fallback_func = proc_fallback;
2382 }
2383 else if (rb_obj_is_method(fallback)) {
2384 fallback_func = method_fallback;
2385 }
2386 else {
2387 fallback_func = aref_fallback;
2388 }
2389 }
2390 last_tc = ec->last_tc;
2391 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2392
2393 resume:
2394 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2395
2396 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2397 VALUE rep = rb_enc_str_new(
2398 (const char *)ec->last_error.error_bytes_start,
2399 ec->last_error.error_bytes_len,
2400 rb_enc_find(ec->last_error.source_encoding));
2401 rep = (*fallback_func)(fallback, rep);
2402 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2403 StringValue(rep);
2404 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2405 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2406 if ((int)ret == -1) {
2407 rb_raise(rb_eArgError, "too big fallback string");
2408 }
2409 goto resume;
2410 }
2411 }
2412
2413 if (ret == econv_invalid_byte_sequence ||
2414 ret == econv_incomplete_input ||
2416 exc = make_econv_exception(ec);
2417 rb_econv_close(ec);
2418 rb_exc_raise(exc);
2419 }
2420
2421 if (ret == econv_destination_buffer_full) {
2422 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2423 goto resume;
2424 }
2425
2426 rb_econv_close(ec);
2427 return;
2428}
2429#else
2430/* sample transcode_loop implementation in byte-by-byte stream style */
2431static void
2432transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2433 const unsigned char *in_stop, unsigned char *out_stop,
2434 VALUE destination,
2435 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2436 const char *src_encoding,
2437 const char *dst_encoding,
2438 int ecflags,
2439 VALUE ecopts)
2440{
2441 rb_econv_t *ec;
2442 rb_transcoding *last_tc;
2444 unsigned char *out_start = *out_pos;
2445 const unsigned char *ptr;
2446 int max_output;
2447 VALUE exc;
2448
2449 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2450 if (!ec)
2451 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2452
2453 last_tc = ec->last_tc;
2454 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2455
2457 ptr = *in_pos;
2458 while (ret != econv_finished) {
2459 unsigned char input_byte;
2460 const unsigned char *p = &input_byte;
2461
2462 if (ret == econv_source_buffer_empty) {
2463 if (ptr < in_stop) {
2464 input_byte = *ptr;
2465 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2466 }
2467 else {
2468 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2469 }
2470 }
2471 else {
2472 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2473 }
2474 if (&input_byte != p)
2475 ptr += p - &input_byte;
2476 switch (ret) {
2480 exc = make_econv_exception(ec);
2481 rb_econv_close(ec);
2482 rb_exc_raise(exc);
2483 break;
2484
2486 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2487 break;
2488
2490 break;
2491
2492 case econv_finished:
2493 break;
2494 }
2495 }
2496 rb_econv_close(ec);
2497 *in_pos = in_stop;
2498 return;
2499}
2500#endif
2501
2502
2503/*
2504 * String-specific code
2505 */
2506
2507static unsigned char *
2508str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2509{
2510 rb_str_resize(destination, new_len);
2511 return (unsigned char *)RSTRING_PTR(destination);
2512}
2513
2514static int
2515econv_opts(VALUE opt, int ecflags)
2516{
2517 VALUE v;
2518 int newlineflag = 0;
2519
2520 v = rb_hash_aref(opt, sym_invalid);
2521 if (NIL_P(v)) {
2522 }
2523 else if (v==sym_replace) {
2524 ecflags |= ECONV_INVALID_REPLACE;
2525 }
2526 else {
2527 rb_raise(rb_eArgError, "unknown value for invalid character option");
2528 }
2529
2530 v = rb_hash_aref(opt, sym_undef);
2531 if (NIL_P(v)) {
2532 }
2533 else if (v==sym_replace) {
2534 ecflags |= ECONV_UNDEF_REPLACE;
2535 }
2536 else {
2537 rb_raise(rb_eArgError, "unknown value for undefined character option");
2538 }
2539
2540 v = rb_hash_aref(opt, sym_replace);
2541 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2542 ecflags |= ECONV_UNDEF_REPLACE;
2543 }
2544
2545 v = rb_hash_aref(opt, sym_xml);
2546 if (!NIL_P(v)) {
2547 if (v==sym_text) {
2549 }
2550 else if (v==sym_attr) {
2552 }
2553 else if (SYMBOL_P(v)) {
2554 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2555 }
2556 else {
2557 rb_raise(rb_eArgError, "unexpected value for xml option");
2558 }
2559 }
2560
2561#ifdef ENABLE_ECONV_NEWLINE_OPTION
2562 v = rb_hash_aref(opt, sym_newline);
2563 if (!NIL_P(v)) {
2564 newlineflag = 2;
2565 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2566 if (v == sym_universal) {
2568 }
2569 else if (v == sym_crlf) {
2571 }
2572 else if (v == sym_cr) {
2573 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2574 }
2575 else if (v == sym_lf) {
2576 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2577 }
2578 else if (SYMBOL_P(v)) {
2579 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2580 rb_sym2str(v));
2581 }
2582 else {
2583 rb_raise(rb_eArgError, "unexpected value for newline option");
2584 }
2585 }
2586#endif
2587 {
2588 int setflags = 0;
2589
2590 v = rb_hash_aref(opt, sym_universal_newline);
2591 if (RTEST(v))
2593 newlineflag |= !NIL_P(v);
2594
2595 v = rb_hash_aref(opt, sym_crlf_newline);
2596 if (RTEST(v))
2597 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2598 newlineflag |= !NIL_P(v);
2599
2600 v = rb_hash_aref(opt, sym_cr_newline);
2601 if (RTEST(v))
2602 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2603 newlineflag |= !NIL_P(v);
2604
2605 v = rb_hash_aref(opt, sym_lf_newline);
2606 if (RTEST(v))
2607 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2608 newlineflag |= !NIL_P(v);
2609
2610 switch (newlineflag) {
2611 case 1:
2612 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2613 ecflags |= setflags;
2614 break;
2615
2616 case 3:
2617 rb_warning(":newline option precedes other newline options");
2618 break;
2619 }
2620 }
2621
2622 return ecflags;
2623}
2624
2625int
2626rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2627{
2628 VALUE newhash = Qnil;
2629 VALUE v;
2630
2631 if (NIL_P(opthash)) {
2632 *opts = Qnil;
2633 return ecflags;
2634 }
2635 ecflags = econv_opts(opthash, ecflags);
2636
2637 v = rb_hash_aref(opthash, sym_replace);
2638 if (!NIL_P(v)) {
2639 StringValue(v);
2640 if (is_broken_string(v)) {
2641 VALUE dumped = rb_str_dump(v);
2642 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2643 StringValueCStr(dumped),
2644 rb_enc_name(rb_enc_get(v)));
2645 }
2646 v = rb_str_new_frozen(v);
2647 newhash = rb_hash_new();
2648 rb_hash_aset(newhash, sym_replace, v);
2649 }
2650
2651 v = rb_hash_aref(opthash, sym_fallback);
2652 if (!NIL_P(v)) {
2653 VALUE h = rb_check_hash_type(v);
2654 if (NIL_P(h)
2655 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2656 : (v = h, 1)) {
2657 if (NIL_P(newhash))
2658 newhash = rb_hash_new();
2659 rb_hash_aset(newhash, sym_fallback, v);
2660 }
2661 }
2662
2663 if (!NIL_P(newhash))
2664 rb_hash_freeze(newhash);
2665 *opts = newhash;
2666
2667 return ecflags;
2668}
2669
2670int
2672{
2673 return rb_econv_prepare_options(opthash, opts, 0);
2674}
2675
2676rb_econv_t *
2677rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2678{
2679 rb_econv_t *ec;
2680 VALUE replacement;
2681
2682 if (NIL_P(opthash)) {
2683 replacement = Qnil;
2684 }
2685 else {
2686 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2687 rb_bug("rb_econv_open_opts called with invalid opthash");
2688 replacement = rb_hash_aref(opthash, sym_replace);
2689 }
2690
2691 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2692 if (ec) {
2693 if (!NIL_P(replacement)) {
2694 int ret;
2695 rb_encoding *enc = rb_enc_get(replacement);
2696
2697 ret = rb_econv_set_replacement(ec,
2698 (const unsigned char *)RSTRING_PTR(replacement),
2699 RSTRING_LEN(replacement),
2700 rb_enc_name(enc));
2701 if (ret == -1) {
2702 rb_econv_close(ec);
2703 ec = NULL;
2704 }
2705 }
2706 }
2707 return ec; // can be NULL
2708}
2709
2710static int
2711enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2712{
2713 rb_encoding *enc;
2714 const char *n;
2715 int encidx;
2716 VALUE encval;
2717
2718 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2719 !(enc = rb_enc_from_index(encidx))) {
2720 enc = NULL;
2721 encidx = 0;
2722 n = StringValueCStr(*arg);
2723 }
2724 else {
2725 n = rb_enc_name(enc);
2726 }
2727
2728 *name_p = n;
2729 *enc_p = enc;
2730
2731 return encidx;
2732}
2733
2734static int
2735str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2736 const char **sname_p, rb_encoding **senc_p,
2737 const char **dname_p, rb_encoding **denc_p)
2738{
2739 rb_encoding *senc, *denc;
2740 const char *sname, *dname;
2741 int sencidx, dencidx;
2742
2743 dencidx = enc_arg(arg1, &dname, &denc);
2744
2745 if (NIL_P(*arg2)) {
2746 sencidx = rb_enc_get_index(str);
2747 senc = rb_enc_from_index(sencidx);
2748 sname = rb_enc_name(senc);
2749 }
2750 else {
2751 sencidx = enc_arg(arg2, &sname, &senc);
2752 }
2753
2754 *sname_p = sname;
2755 *senc_p = senc;
2756 *dname_p = dname;
2757 *denc_p = denc;
2758 return dencidx;
2759}
2760
2761static int
2762str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2763{
2764 VALUE dest;
2765 VALUE str = *self;
2766 VALUE arg1, arg2;
2767 long blen, slen;
2768 unsigned char *buf, *bp, *sp;
2769 const unsigned char *fromp;
2770 rb_encoding *senc, *denc;
2771 const char *sname, *dname;
2772 int dencidx;
2773 int explicitly_invalid_replace = TRUE;
2774
2775 rb_check_arity(argc, 0, 2);
2776
2777 if (argc == 0) {
2778 arg1 = rb_enc_default_internal();
2779 if (NIL_P(arg1)) {
2780 if (!ecflags) return -1;
2781 arg1 = rb_obj_encoding(str);
2782 }
2783 if (!(ecflags & ECONV_INVALID_MASK)) {
2784 explicitly_invalid_replace = FALSE;
2785 }
2787 }
2788 else {
2789 arg1 = argv[0];
2790 }
2791 arg2 = argc<=1 ? Qnil : argv[1];
2792 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2793
2794 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2798 if (senc && senc == denc) {
2799 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2800 VALUE rep = Qnil;
2801 if (!NIL_P(ecopts)) {
2802 rep = rb_hash_aref(ecopts, sym_replace);
2803 }
2804 dest = rb_enc_str_scrub(senc, str, rep);
2805 if (NIL_P(dest)) dest = str;
2806 *self = dest;
2807 return dencidx;
2808 }
2809 return NIL_P(arg2) ? -1 : dencidx;
2810 }
2811 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2812 if (is_ascii_string(str)) {
2813 return dencidx;
2814 }
2815 }
2816 if (encoding_equal(sname, dname)) {
2817 return NIL_P(arg2) ? -1 : dencidx;
2818 }
2819 }
2820 else {
2821 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2822 rb_encoding *utf8 = rb_utf8_encoding();
2823 str = rb_str_conv_enc(str, senc, utf8);
2824 senc = utf8;
2825 sname = "UTF-8";
2826 }
2827 if (encoding_equal(sname, dname)) {
2828 sname = "";
2829 dname = "";
2830 }
2831 }
2832
2833 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2834 slen = RSTRING_LEN(str);
2835 blen = slen + 30; /* len + margin */
2836 dest = rb_str_tmp_new(blen);
2837 bp = (unsigned char *)RSTRING_PTR(dest);
2838
2839 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2840 if (fromp != sp+slen) {
2841 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2842 }
2843 buf = (unsigned char *)RSTRING_PTR(dest);
2844 *bp = '\0';
2845 rb_str_set_len(dest, bp - buf);
2846
2847 /* set encoding */
2848 if (!denc) {
2849 dencidx = rb_define_dummy_encoding(dname);
2850 RB_GC_GUARD(arg1);
2851 RB_GC_GUARD(arg2);
2852 }
2853 *self = dest;
2854
2855 return dencidx;
2856}
2857
2858static int
2859str_transcode(int argc, VALUE *argv, VALUE *self)
2860{
2861 VALUE opt;
2862 int ecflags = 0;
2863 VALUE ecopts = Qnil;
2864
2865 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2866 if (!NIL_P(opt)) {
2867 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2868 }
2869 return str_transcode0(argc, argv, self, ecflags, ecopts);
2870}
2871
2872static inline VALUE
2873str_encode_associate(VALUE str, int encidx)
2874{
2875 int cr = 0;
2876
2877 rb_enc_associate_index(str, encidx);
2878
2879 /* transcoded string never be broken. */
2880 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2881 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2882 }
2883 else {
2885 }
2886 ENC_CODERANGE_SET(str, cr);
2887 return str;
2888}
2889
2890/*
2891 * call-seq:
2892 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2893 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2894 *
2895 * Like #encode, but applies encoding changes to +self+; returns +self+.
2896 *
2897 * Related: see {Modifying}[rdoc-ref:String@Modifying].
2898 */
2899
2900static VALUE
2901str_encode_bang(int argc, VALUE *argv, VALUE str)
2902{
2903 VALUE newstr;
2904 int encidx;
2905
2906 rb_check_frozen(str);
2907
2908 newstr = str;
2909 encidx = str_transcode(argc, argv, &newstr);
2910
2911 if (encidx < 0) return str;
2912 if (newstr == str) {
2913 rb_enc_associate_index(str, encidx);
2914 return str;
2915 }
2916 rb_str_shared_replace(str, newstr);
2917 return str_encode_associate(str, encidx);
2918}
2919
2920static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2921
2922/*
2923 * call-seq:
2924 * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2925 * encode(dst_encoding, src_encoding, **enc_opts) -> string
2926 *
2927 * :include: doc/string/encode.rdoc
2928 *
2929 */
2930
2931static VALUE
2932str_encode(int argc, VALUE *argv, VALUE str)
2933{
2934 VALUE newstr = str;
2935 int encidx = str_transcode(argc, argv, &newstr);
2936 return encoded_dup(newstr, str, encidx);
2937}
2938
2939VALUE
2940rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2941{
2942 int argc = 1;
2943 VALUE *argv = &to;
2944 VALUE newstr = str;
2945 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2946 return encoded_dup(newstr, str, encidx);
2947}
2948
2949static VALUE
2950encoded_dup(VALUE newstr, VALUE str, int encidx)
2951{
2952 if (encidx < 0) return rb_str_dup(str);
2953 if (newstr == str) {
2954 newstr = rb_str_dup(str);
2955 rb_enc_associate_index(newstr, encidx);
2956 return newstr;
2957 }
2958 else {
2959 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2960 }
2961 return str_encode_associate(newstr, encidx);
2962}
2963
2964/*
2965 * Document-class: Encoding::Converter
2966 *
2967 * Encoding conversion class.
2968 */
2969static void
2970econv_free(void *ptr)
2971{
2972 rb_econv_t *ec = ptr;
2973 rb_econv_close(ec);
2974}
2975
2976static size_t
2977econv_memsize(const void *ptr)
2978{
2979 return sizeof(rb_econv_t);
2980}
2981
2982static const rb_data_type_t econv_data_type = {
2983 "econv",
2984 {0, econv_free, econv_memsize,},
2985 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2986};
2987
2988static VALUE
2989econv_s_allocate(VALUE klass)
2990{
2991 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2992}
2993
2994static rb_encoding *
2995make_dummy_encoding(const char *name)
2996{
2997 rb_encoding *enc;
2998 int idx;
2999 idx = rb_define_dummy_encoding(name);
3000 enc = rb_enc_from_index(idx);
3001 return enc;
3002}
3003
3004static rb_encoding *
3005make_encoding(const char *name)
3006{
3007 rb_encoding *enc;
3008 enc = rb_enc_find(name);
3009 if (!enc) {
3010 RB_VM_LOCKING() {
3011 if (rb_enc_registered(name)) {
3012 enc = NULL;
3013 }
3014 else {
3015 enc = make_dummy_encoding(name);
3016 }
3017 }
3018 }
3019 return enc;
3020}
3021
3022static VALUE
3023make_encobj(const char *name)
3024{
3025 return rb_enc_from_encoding(make_encoding(name));
3026}
3027
3028/*
3029 * call-seq:
3030 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3031 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3032 *
3033 * Returns the corresponding ASCII compatible encoding.
3034 *
3035 * Returns nil if the argument is an ASCII compatible encoding.
3036 *
 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
3039 * So, no conversion undefined error occurs when converting between the two encodings.
3040 *
3041 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3042 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3043 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3044 *
3045 */
3046static VALUE
3047econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3048{
3049 const char *arg_name, *result_name;
3050 rb_encoding *arg_enc, *result_enc;
3051 VALUE enc = Qnil;
3052
3053 enc_arg(&arg, &arg_name, &arg_enc);
3054 result_name = rb_econv_asciicompat_encoding(arg_name);
3055 if (result_name) {
3056 result_enc = make_encoding(result_name);
3057 enc = rb_enc_from_encoding(result_enc);
3058 }
3059 return enc;
3060}
3061
3062static void
3063econv_args(int argc, VALUE *argv,
3064 VALUE *snamev_p, VALUE *dnamev_p,
3065 const char **sname_p, const char **dname_p,
3066 rb_encoding **senc_p, rb_encoding **denc_p,
3067 int *ecflags_p,
3068 VALUE *ecopts_p)
3069{
3070 VALUE opt, flags_v, ecopts;
3071 int sidx, didx;
3072 const char *sname, *dname;
3073 rb_encoding *senc, *denc;
3074 int ecflags;
3075
3076 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3077
3078 if (!NIL_P(flags_v)) {
3079 if (!NIL_P(opt)) {
3080 rb_error_arity(argc + 1, 2, 3);
3081 }
3082 ecflags = NUM2INT(rb_to_int(flags_v));
3083 ecopts = Qnil;
3084 }
3085 else if (!NIL_P(opt)) {
3086 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3087 }
3088 else {
3089 ecflags = 0;
3090 ecopts = Qnil;
3091 }
3092
3093 senc = NULL;
3094 sidx = rb_to_encoding_index(*snamev_p);
3095 if (0 <= sidx) {
3096 senc = rb_enc_from_index(sidx);
3097 }
3098 else {
3099 StringValue(*snamev_p);
3100 }
3101
3102 denc = NULL;
3103 didx = rb_to_encoding_index(*dnamev_p);
3104 if (0 <= didx) {
3105 denc = rb_enc_from_index(didx);
3106 }
3107 else {
3108 StringValue(*dnamev_p);
3109 }
3110
3111 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3112 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3113
3114 *sname_p = sname;
3115 *dname_p = dname;
3116 *senc_p = senc;
3117 *denc_p = denc;
3118 *ecflags_p = ecflags;
3119 *ecopts_p = ecopts;
3120}
3121
3122static int
3123decorate_convpath(VALUE convpath, int ecflags)
3124{
3125 int num_decorators;
3126 const char *decorators[MAX_ECFLAGS_DECORATORS];
3127 int i;
3128 int n, len;
3129
3130 num_decorators = decorator_names(ecflags, decorators);
3131 if (num_decorators == -1)
3132 return -1;
3133
3134 len = n = RARRAY_LENINT(convpath);
3135 if (n != 0) {
3136 VALUE pair = RARRAY_AREF(convpath, n-1);
3137 if (RB_TYPE_P(pair, T_ARRAY)) {
3138 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3139 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3140 transcoder_entry_t *entry;
3141 const rb_transcoder *tr;
3142 entry = get_transcoder_entry(sname, dname);
3143 tr = load_transcoder_entry(entry);
3144 if (!tr)
3145 return -1;
3146 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3147 tr->asciicompat_type == asciicompat_encoder) {
3148 n--;
3149 rb_ary_store(convpath, len + num_decorators - 1, pair);
3150 }
3151 }
3152 else {
3153 rb_ary_store(convpath, len + num_decorators - 1, pair);
3154 }
3155 }
3156
3157 for (i = 0; i < num_decorators; i++)
3158 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3159
3160 return 0;
3161}
3162
3163static void
3164search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3165{
3166 VALUE *ary_p = arg;
3167 VALUE v;
3168
3169 if (NIL_P(*ary_p)) {
3170 *ary_p = rb_ary_new();
3171 }
3172
3173 if (DECORATOR_P(sname, dname)) {
3174 v = rb_str_new_cstr(dname);
3175 }
3176 else {
3177 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3178 }
3179 rb_ary_store(*ary_p, depth, v);
3180}
3181
3182/*
3183 * call-seq:
3184 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3185 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3186 *
3187 * Returns a conversion path.
3188 *
3189 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3190 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3191 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3192 *
3193 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3194 * or
3195 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3196 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3197 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3198 * # "universal_newline"]
3199 *
3200 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3201 * or
3202 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3203 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3204 * # "universal_newline",
3205 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3206 */
3207static VALUE
3208econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3209{
3210 VALUE snamev, dnamev;
3211 const char *sname, *dname;
3212 rb_encoding *senc, *denc;
3213 int ecflags;
3214 VALUE ecopts;
3215 VALUE convpath;
3216
3217 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3218
3219 convpath = Qnil;
3220 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3221
3222 if (NIL_P(convpath)) {
3223 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3224 RB_GC_GUARD(snamev);
3225 RB_GC_GUARD(dnamev);
3226 rb_exc_raise(exc);
3227 }
3228
3229 if (decorate_convpath(convpath, ecflags) == -1) {
3230 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3231 RB_GC_GUARD(snamev);
3232 RB_GC_GUARD(dnamev);
3233 rb_exc_raise(exc);
3234 }
3235
3236 return convpath;
3237}
3238
/*
 * Check the existence of a conversion path.
 * Returns non-zero if a conversion path from from_encoding to
 * to_encoding exists, 0 otherwise.
 */
3244int
3245rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3246{
3247 VALUE convpath = Qnil;
3248 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3249 &convpath);
3250 return RTEST(convpath);
3251}
3252
3254 rb_econv_t *ec;
3255 int index;
3256 int ret;
3257};
3258
3259static void
3260rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3261{
3263 int ret;
3264
3265 if (a->ret == -1)
3266 return;
3267
3268 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3269
3270 a->ret = ret;
3271 return;
3272}
3273
3274static rb_econv_t *
3275rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3276 const char **sname_p, const char **dname_p,
3277 rb_encoding **senc_p, rb_encoding**denc_p)
3278{
3279 rb_econv_t *ec;
3280 long i;
3281 int ret, first=1;
3282 VALUE elt;
3283 rb_encoding *senc = 0, *denc = 0;
3284 const char *sname, *dname;
3285
3286 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3287 DATA_PTR(self) = ec;
3288
3289 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3290 VALUE snamev, dnamev;
3291 VALUE pair;
3292 elt = rb_ary_entry(convpath, i);
3293 if (!NIL_P(pair = rb_check_array_type(elt))) {
3294 if (RARRAY_LEN(pair) != 2)
3295 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3296 snamev = rb_ary_entry(pair, 0);
3297 enc_arg(&snamev, &sname, &senc);
3298 dnamev = rb_ary_entry(pair, 1);
3299 enc_arg(&dnamev, &dname, &denc);
3300 }
3301 else {
3302 sname = "";
3303 dname = StringValueCStr(elt);
3304 }
3305 if (DECORATOR_P(sname, dname)) {
3306 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3307 if (ret == -1) {
3308 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3309 RB_GC_GUARD(snamev);
3310 RB_GC_GUARD(dnamev);
3311 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3312 }
3313 }
3314 else {
3315 int j = ec->num_trans;
3316 struct rb_econv_init_by_convpath_t arg;
3317 arg.ec = ec;
3318 arg.index = ec->num_trans;
3319 arg.ret = 0;
3320 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3321 if (ret == -1 || arg.ret == -1) {
3322 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3323 RB_GC_GUARD(snamev);
3324 RB_GC_GUARD(dnamev);
3325 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3326 }
3327 if (first) {
3328 first = 0;
3329 *senc_p = senc;
3330 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3331 }
3332 *denc_p = denc;
3333 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3334 }
3335 }
3336
3337 if (first) {
3338 *senc_p = NULL;
3339 *denc_p = NULL;
3340 *sname_p = "";
3341 *dname_p = "";
3342 }
3343
3344 ec->source_encoding_name = *sname_p;
3345 ec->destination_encoding_name = *dname_p;
3346
3347 return ec;
3348}
3349
3350/*
3351 * call-seq:
3352 * Encoding::Converter.new(source_encoding, destination_encoding)
3353 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3354 * Encoding::Converter.new(convpath)
3355 *
3356 * possible options elements:
3357 * hash form:
3358 * :invalid => nil # raise error on invalid byte sequence (default)
3359 * :invalid => :replace # replace invalid byte sequence
3360 * :undef => nil # raise error on undefined conversion (default)
3361 * :undef => :replace # replace undefined conversion
3362 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3363 * :newline => :universal # decorator for converting CRLF and CR to LF
3364 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3365 * :newline => :crlf # decorator for converting LF to CRLF
3366 * :newline => :cr # decorator for converting LF to CR
3367 * :universal_newline => true # decorator for converting CRLF and CR to LF
3368 * :crlf_newline => true # decorator for converting LF to CRLF
3369 * :cr_newline => true # decorator for converting LF to CR
3370 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3371 * :xml => :text # escape as XML CharData.
3372 * :xml => :attr # escape as XML AttValue
3373 * integer form:
3374 * Encoding::Converter::INVALID_REPLACE
3375 * Encoding::Converter::UNDEF_REPLACE
3376 * Encoding::Converter::UNDEF_HEX_CHARREF
3377 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3378 * Encoding::Converter::LF_NEWLINE_DECORATOR
3379 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3380 * Encoding::Converter::CR_NEWLINE_DECORATOR
3381 * Encoding::Converter::XML_TEXT_DECORATOR
3382 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3383 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3384 *
3385 * Encoding::Converter.new creates an instance of Encoding::Converter.
3386 *
3387 * Source_encoding and destination_encoding should be a string or
3388 * Encoding object.
3389 *
3390 * opt should be nil, a hash or an integer.
3391 *
3392 * convpath should be an array.
3393 * convpath may contain
3394 * - two-element arrays which contain encodings or encoding names, or
3395 * - strings representing decorator names.
3396 *
3397 * Encoding::Converter.new optionally takes an option.
3398 * The option should be a hash or an integer.
3399 * The option hash can contain :invalid => nil, etc.
3400 * The option integer should be logical-or of constants such as
3401 * Encoding::Converter::INVALID_REPLACE, etc.
3402 *
3403 * [:invalid => nil]
3404 * Raise error on invalid byte sequence. This is a default behavior.
3405 * [:invalid => :replace]
3406 * Replace invalid byte sequence by replacement string.
3407 * [:undef => nil]
3408 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3409 * This is a default behavior.
3410 * [:undef => :replace]
3411 * Replace undefined character in destination_encoding with replacement string.
3412 * [:replace => string]
3413 * Specify the replacement string.
3414 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3415 * [:universal_newline => true]
3416 * Convert CRLF and CR to LF.
3417 * [:crlf_newline => true]
3418 * Convert LF to CRLF.
3419 * [:cr_newline => true]
3420 * Convert LF to CR.
3421 * [:lf_newline => true]
3422 * Convert CRLF and CR to LF (when writing).
3423 * [:xml => :text]
3424 * Escape as XML CharData.
3425 * This form can be used as an HTML 4.0 #PCDATA.
3426 * - '&' -> '&amp;'
3427 * - '<' -> '&lt;'
3428 * - '>' -> '&gt;'
3429 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3430 * [:xml => :attr]
3431 * Escape as XML AttValue.
3432 * The converted result is quoted as "...".
3433 * This form can be used as an HTML 4.0 attribute value.
3434 * - '&' -> '&amp;'
3435 * - '<' -> '&lt;'
3436 * - '>' -> '&gt;'
3437 * - '"' -> '&quot;'
3438 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3439 *
3440 * Examples:
3441 * # UTF-16BE to UTF-8
3442 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3443 *
3444 * # Usually, decorators such as newline conversion are inserted last.
3445 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3446 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3447 * # "universal_newline"]
3448 *
3449 * # But, if the last encoding is ASCII incompatible,
3450 * # decorators are inserted before the last conversion.
3451 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3452 * p ec.convpath #=> ["crlf_newline",
3453 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3454 *
3455 * # Conversion path can be specified directly.
3456 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3457 * p ec.convpath #=> ["universal_newline",
3458 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3459 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3460 */
3461static VALUE
3462econv_init(int argc, VALUE *argv, VALUE self)
3463{
3464 VALUE ecopts;
3465 VALUE snamev, dnamev;
3466 const char *sname, *dname;
3467 rb_encoding *senc, *denc;
3468 rb_econv_t *ec;
3469 int ecflags;
3470 VALUE convpath;
3471
3472 if (rb_check_typeddata(self, &econv_data_type)) {
3473 rb_raise(rb_eTypeError, "already initialized");
3474 }
3475
3476 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3477 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3478 ecflags = 0;
3479 ecopts = Qnil;
3480 }
3481 else {
3482 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3483 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3484 }
3485
3486 if (!ec) {
3487 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3488 RB_GC_GUARD(snamev);
3489 RB_GC_GUARD(dnamev);
3490 rb_exc_raise(exc);
3491 }
3492
3493 if (!DECORATOR_P(sname, dname)) {
3494 if (!senc)
3495 senc = make_dummy_encoding(sname);
3496 if (!denc)
3497 denc = make_dummy_encoding(dname);
3498 RB_GC_GUARD(snamev);
3499 RB_GC_GUARD(dnamev);
3500 }
3501
3502 ec->source_encoding = senc;
3503 ec->destination_encoding = denc;
3504
3505 DATA_PTR(self) = ec;
3506
3507 return self;
3508}
3509
3510/*
3511 * call-seq:
3512 * ec.inspect -> string
3513 *
3514 * Returns a printable version of <i>ec</i>
3515 *
3516 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3517 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3518 *
3519 */
3520static VALUE
3521econv_inspect(VALUE self)
3522{
3523 const char *cname = rb_obj_classname(self);
3524 rb_econv_t *ec;
3525
3526 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3527 if (!ec)
3528 return rb_sprintf("#<%s: uninitialized>", cname);
3529 else {
3530 const char *sname = ec->source_encoding_name;
3531 const char *dname = ec->destination_encoding_name;
3532 VALUE str;
3533 str = rb_sprintf("#<%s: ", cname);
3534 econv_description(sname, dname, ec->flags, str);
3535 rb_str_cat2(str, ">");
3536 return str;
3537 }
3538}
3539
3540static rb_econv_t *
3541check_econv(VALUE self)
3542{
3543 rb_econv_t *ec;
3544
3545 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3546 if (!ec) {
3547 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3548 }
3549 return ec;
3550}
3551
3552static VALUE
3553econv_get_encoding(rb_encoding *encoding)
3554{
3555 if (!encoding)
3556 return Qnil;
3557 return rb_enc_from_encoding(encoding);
3558}
3559
3560/*
3561 * call-seq:
3562 * ec.source_encoding -> encoding
3563 *
3564 * Returns the source encoding as an Encoding object.
3565 */
3566static VALUE
3567econv_source_encoding(VALUE self)
3568{
3569 rb_econv_t *ec = check_econv(self);
3570 return econv_get_encoding(ec->source_encoding);
3571}
3572
3573/*
3574 * call-seq:
3575 * ec.destination_encoding -> encoding
3576 *
3577 * Returns the destination encoding as an Encoding object.
3578 */
3579static VALUE
3580econv_destination_encoding(VALUE self)
3581{
3582 rb_econv_t *ec = check_econv(self);
3583 return econv_get_encoding(ec->destination_encoding);
3584}
3585
3586/*
3587 * call-seq:
3588 * ec.convpath -> ary
3589 *
3590 * Returns the conversion path of ec.
3591 *
3592 * The result is an array of conversions.
3593 *
3594 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3595 * p ec.convpath
3596 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3597 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3598 * # "crlf_newline"]
3599 *
3600 * Each element of the array is a pair of encodings or a string.
3601 * A pair means an encoding conversion.
3602 * A string means a decorator.
3603 *
3604 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3605 * a converter from ISO-8859-1 to UTF-8.
3606 * "crlf_newline" means newline converter from LF to CRLF.
3607 */
3608static VALUE
3609econv_convpath(VALUE self)
3610{
3611 rb_econv_t *ec = check_econv(self);
3612 VALUE result;
3613 int i;
3614
3615 result = rb_ary_new();
3616 for (i = 0; i < ec->num_trans; i++) {
3617 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3618 VALUE v;
3619 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3620 v = rb_str_new_cstr(tr->dst_encoding);
3621 else
3622 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3623 rb_ary_push(result, v);
3624 }
3625 return result;
3626}
3627
3628/*
3629 * call-seq:
3630 * ec == other -> true or false
3631 */
3632static VALUE
3633econv_equal(VALUE self, VALUE other)
3634{
3635 rb_econv_t *ec1 = check_econv(self);
3636 rb_econv_t *ec2;
3637 int i;
3638
3639 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3640 return Qnil;
3641 }
3642 ec2 = DATA_PTR(other);
3643 if (!ec2) return Qfalse;
3644 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3645 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3646 return Qfalse;
3647 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3648 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3649 return Qfalse;
3650 if (ec1->flags != ec2->flags) return Qfalse;
3651 if (ec1->replacement_enc != ec2->replacement_enc &&
3652 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3653 return Qfalse;
3654 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3655 if (ec1->replacement_str != ec2->replacement_str &&
3656 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3657 return Qfalse;
3658
3659 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3660 for (i = 0; i < ec1->num_trans; i++) {
3661 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3662 return Qfalse;
3663 }
3664 return Qtrue;
3665}
3666
3667static VALUE
3668econv_result_to_symbol(rb_econv_result_t res)
3669{
3670 switch (res) {
3671 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3672 case econv_incomplete_input: return sym_incomplete_input;
3673 case econv_undefined_conversion: return sym_undefined_conversion;
3674 case econv_destination_buffer_full: return sym_destination_buffer_full;
3675 case econv_source_buffer_empty: return sym_source_buffer_empty;
3676 case econv_finished: return sym_finished;
3677 case econv_after_output: return sym_after_output;
3678 default: return INT2NUM(res); /* should not be reached */
3679 }
3680}
3681
3682/*
3683 * call-seq:
3684 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3685 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3686 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3687 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3688 *
3689 * possible opt elements:
3690 * hash form:
3691 * :partial_input => true # source buffer may be part of larger source
3692 * :after_output => true # stop conversion after output before input
3693 * integer form:
3694 * Encoding::Converter::PARTIAL_INPUT
3695 * Encoding::Converter::AFTER_OUTPUT
3696 *
3697 * possible results:
3698 * :invalid_byte_sequence
3699 * :incomplete_input
3700 * :undefined_conversion
3701 * :after_output
3702 * :destination_buffer_full
3703 * :source_buffer_empty
3704 * :finished
3705 *
3706 * primitive_convert converts source_buffer into destination_buffer.
3707 *
3708 * source_buffer should be a string or nil.
3709 * nil means an empty string.
3710 *
3711 * destination_buffer should be a string.
3712 *
3713 * destination_byteoffset should be an integer or nil.
3714 * nil means the end of destination_buffer.
3715 * If it is omitted, nil is assumed.
3716 *
3717 * destination_bytesize should be an integer or nil.
3718 * nil means unlimited.
3719 * If it is omitted, nil is assumed.
3720 *
3721 * opt should be nil, a hash or an integer.
3722 * nil means no flags.
3723 * If it is omitted, nil is assumed.
3724 *
3725 * primitive_convert converts the content of source_buffer from the beginning
3726 * and stores the result into destination_buffer.
3727 *
3728 * destination_byteoffset and destination_bytesize specify the region in which
3729 * the converted result is stored.
3730 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3731 * If destination_byteoffset is nil,
3732 * destination_buffer.bytesize is used for appending the result.
3733 * destination_bytesize specifies maximum number of bytes.
3734 * If destination_bytesize is nil,
3735 * destination size is unlimited.
3736 * After conversion, destination_buffer is resized to
3737 * destination_byteoffset + actually produced number of bytes.
3738 * Also destination_buffer's encoding is set to destination_encoding.
3739 *
3740 * primitive_convert drops the converted part of source_buffer.
3741 * The dropped part is either converted into destination_buffer or
3742 * buffered in the Encoding::Converter object.
3743 *
3744 * primitive_convert stops conversion when one of the following conditions is met.
3745 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3746 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3747 * - unexpected end of source buffer (:incomplete_input)
3748 * this occurs only when :partial_input is not specified.
3749 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3750 * - character not representable in output encoding (:undefined_conversion)
3751 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3752 * - after some output is generated, before input is done (:after_output)
3753 * this occurs only when :after_output is specified.
3754 * - destination buffer is full (:destination_buffer_full)
3755 * this occurs only when destination_bytesize is non-nil.
3756 * - source buffer is empty (:source_buffer_empty)
3757 * this occurs only when :partial_input is specified.
3758 * - conversion is finished (:finished)
3759 *
3760 * example:
3761 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3762 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3763 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3764 *
3765 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3766 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3767 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3768 * ret = ec.primitive_convert(src, dst="", nil, 1)
3769 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3770 * ret = ec.primitive_convert(src, dst="", nil, 1)
3771 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3772 * ret = ec.primitive_convert(src, dst="", nil, 1)
3773 * p [ret, src, dst] #=> [:finished, "", "i"]
3774 *
3775 */
3776static VALUE
3777econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3778{
3779 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3780 rb_econv_t *ec = check_econv(self);
3782 const unsigned char *ip, *is;
3783 unsigned char *op, *os;
3784 long output_byteoffset, output_bytesize;
3785 unsigned long output_byteend;
3786 int flags;
3787
3788 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3789
3790 if (NIL_P(output_byteoffset_v))
3791 output_byteoffset = 0; /* dummy */
3792 else
3793 output_byteoffset = NUM2LONG(output_byteoffset_v);
3794
3795 if (NIL_P(output_bytesize_v))
3796 output_bytesize = 0; /* dummy */
3797 else
3798 output_bytesize = NUM2LONG(output_bytesize_v);
3799
3800 if (!NIL_P(flags_v)) {
3801 if (!NIL_P(opt)) {
3802 rb_error_arity(argc + 1, 2, 5);
3803 }
3804 flags = NUM2INT(rb_to_int(flags_v));
3805 }
3806 else if (!NIL_P(opt)) {
3807 VALUE v;
3808 flags = 0;
3809 v = rb_hash_aref(opt, sym_partial_input);
3810 if (RTEST(v))
3811 flags |= ECONV_PARTIAL_INPUT;
3812 v = rb_hash_aref(opt, sym_after_output);
3813 if (RTEST(v))
3814 flags |= ECONV_AFTER_OUTPUT;
3815 }
3816 else {
3817 flags = 0;
3818 }
3819
3820 StringValue(output);
3821 if (!NIL_P(input))
3822 StringValue(input);
3823 rb_str_modify(output);
3824
3825 if (NIL_P(output_bytesize_v)) {
3826 output_bytesize = rb_str_capacity(output);
3827
3828 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3829 output_bytesize = RSTRING_LEN(input);
3830 }
3831
3832 retry:
3833
3834 if (NIL_P(output_byteoffset_v))
3835 output_byteoffset = RSTRING_LEN(output);
3836
3837 if (output_byteoffset < 0)
3838 rb_raise(rb_eArgError, "negative output_byteoffset");
3839
3840 if (RSTRING_LEN(output) < output_byteoffset)
3841 rb_raise(rb_eArgError, "output_byteoffset too big");
3842
3843 if (output_bytesize < 0)
3844 rb_raise(rb_eArgError, "negative output_bytesize");
3845
3846 output_byteend = (unsigned long)output_byteoffset +
3847 (unsigned long)output_bytesize;
3848
3849 if (output_byteend < (unsigned long)output_byteoffset ||
3850 LONG_MAX < output_byteend)
3851 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3852
3853 if (rb_str_capacity(output) < output_byteend)
3854 rb_str_resize(output, output_byteend);
3855
3856 if (NIL_P(input)) {
3857 ip = is = NULL;
3858 }
3859 else {
3860 ip = (const unsigned char *)RSTRING_PTR(input);
3861 is = ip + RSTRING_LEN(input);
3862 }
3863
3864 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3865 os = op + output_bytesize;
3866
3867 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3868 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3869 if (!NIL_P(input)) {
3870 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3871 }
3872
3873 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3874 if (LONG_MAX / 2 < output_bytesize)
3875 rb_raise(rb_eArgError, "too long conversion result");
3876 output_bytesize *= 2;
3877 output_byteoffset_v = Qnil;
3878 goto retry;
3879 }
3880
3881 if (ec->destination_encoding) {
3882 rb_enc_associate(output, ec->destination_encoding);
3883 }
3884
3885 return econv_result_to_symbol(res);
3886}
3887
3888/*
3889 * call-seq:
3890 * ec.convert(source_string) -> destination_string
3891 *
3892 * Convert source_string and return destination_string.
3893 *
3894 * source_string is assumed as a part of source.
3895 * i.e. :partial_input=>true is specified internally.
3896 * finish method should be used last.
3897 *
3898 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3899 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3900 * puts ec.finish.dump #=> ""
3901 *
3902 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3903 * puts ec.convert("\xA4").dump #=> ""
3904 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3905 * puts ec.finish.dump #=> ""
3906 *
3907 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3908 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3909 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3910 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3911 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3912 *
3913 * If a conversion error occurs,
3914 * Encoding::UndefinedConversionError or
3915 * Encoding::InvalidByteSequenceError is raised.
3916 * Encoding::Converter#convert doesn't supply methods to recover or restart
3917 * from these exceptions.
3918 * When you want to handle these conversion errors,
3919 * use Encoding::Converter#primitive_convert.
3920 *
3921 */
3922static VALUE
3923econv_convert(VALUE self, VALUE source_string)
3924{
3925 VALUE ret, dst;
3926 VALUE av[5];
3927 int ac;
3928 rb_econv_t *ec = check_econv(self);
3929
3930 StringValue(source_string);
3931
3932 dst = rb_str_new(NULL, 0);
3933
3934 av[0] = rb_str_dup(source_string);
3935 av[1] = dst;
3936 av[2] = Qnil;
3937 av[3] = Qnil;
3939 ac = 5;
3940
3941 ret = econv_primitive_convert(ac, av, self);
3942
3943 if (ret == sym_invalid_byte_sequence ||
3944 ret == sym_undefined_conversion ||
3945 ret == sym_incomplete_input) {
3946 VALUE exc = make_econv_exception(ec);
3947 rb_exc_raise(exc);
3948 }
3949
3950 if (ret == sym_finished) {
3951 rb_raise(rb_eArgError, "converter already finished");
3952 }
3953
3954 if (ret != sym_source_buffer_empty) {
3955 rb_bug("unexpected result of econv_primitive_convert");
3956 }
3957
3958 return dst;
3959}
3960
3961/*
3962 * call-seq:
3963 * ec.finish -> string
3964 *
3965 * Finishes the converter.
3966 * It returns the last part of the converted string.
3967 *
3968 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3969 * p ec.convert("\u3042") #=> "\e$B$\""
3970 * p ec.finish #=> "\e(B"
3971 */
3972static VALUE
3973econv_finish(VALUE self)
3974{
3975 VALUE ret, dst;
3976 VALUE av[5];
3977 int ac;
3978 rb_econv_t *ec = check_econv(self);
3979
3980 dst = rb_str_new(NULL, 0);
3981
3982 av[0] = Qnil;
3983 av[1] = dst;
3984 av[2] = Qnil;
3985 av[3] = Qnil;
3986 av[4] = INT2FIX(0);
3987 ac = 5;
3988
3989 ret = econv_primitive_convert(ac, av, self);
3990
3991 if (ret == sym_invalid_byte_sequence ||
3992 ret == sym_undefined_conversion ||
3993 ret == sym_incomplete_input) {
3994 VALUE exc = make_econv_exception(ec);
3995 rb_exc_raise(exc);
3996 }
3997
3998 if (ret != sym_finished) {
3999 rb_bug("unexpected result of econv_primitive_convert");
4000 }
4001
4002 return dst;
4003}
4004
4005/*
4006 * call-seq:
4007 * ec.primitive_errinfo -> array
4008 *
4009 * primitive_errinfo returns important information regarding the last error
4010 * as a 5-element array:
4011 *
4012 * [result, enc1, enc2, error_bytes, readagain_bytes]
4013 *
4014 * result is the last result of primitive_convert.
4015 *
4016 * Other elements are only meaningful when result is
4017 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
4018 *
4019 * enc1 and enc2 indicate a conversion step as a pair of strings.
4020 * For example, a converter from EUC-JP to ISO-8859-1 converts
4021 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
4022 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
4023 *
4024 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4025 * error_bytes is the discarded portion.
4026 * readagain_bytes is the buffered portion which is read again on the next conversion.
4027 *
4028 * Example:
4029 *
4030 * # \xff is invalid as EUC-JP.
4031 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4032 * ec.primitive_convert(src="\xff", dst="", nil, 10)
4033 * p ec.primitive_errinfo
4034 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4035 *
4036 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
4037 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4038 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4039 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4040 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4041 * p ec.primitive_errinfo
4042 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4043 *
4044 * # partial character is invalid
4045 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4046 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4047 * p ec.primitive_errinfo
4048 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4049 *
4050 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4051 * # partial characters.
4052 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4053 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4054 * p ec.primitive_errinfo
4055 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4056 *
4057 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4058 * # no low surrogate after high surrogate (\xd8\x00).
4059 * # It is detected by the 3rd byte (\x00) which is part of the next character.
4060 * # So the high surrogate (\xd8\x00) is discarded and
4061 * # the 3rd byte is read again later.
4062 * # Since the byte is buffered in ec, it is dropped from src.
4063 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4064 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4065 * p ec.primitive_errinfo
4066 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4067 * p src
4068 * #=> "@"
4069 *
4070 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4071 * # The problem is detected by 4th byte.
4072 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4073 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4074 * p ec.primitive_errinfo
4075 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4076 * p src
4077 * #=> ""
4078 *
4079 */
4080static VALUE
4081econv_primitive_errinfo(VALUE self)
4082{
4083 rb_econv_t *ec = check_econv(self);
4084
4085 VALUE ary;
4086
4087 ary = rb_ary_new2(5);
4088
4089 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4090 rb_ary_store(ary, 4, Qnil);
4091
4092 if (ec->last_error.source_encoding)
4093 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4094
4095 if (ec->last_error.destination_encoding)
4096 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4097
4098 if (ec->last_error.error_bytes_start) {
4099 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4100 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4101 }
4102
4103 return ary;
4104}
4105
4106/*
4107 * call-seq:
4108 * ec.insert_output(string) -> nil
4109 *
4110 * Inserts string into the encoding converter.
4111 * The string will be converted to the destination encoding and
4112 * output on later conversions.
4113 *
4114 * If the destination encoding is stateful,
4115 * string is converted according to the state and the state is updated.
4116 *
4117 * This method should be used only when a conversion error occurs.
4118 *
4119 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4120 * src = "HIRAGANA LETTER A is \u{3042}."
4121 * dst = ""
4122 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4123 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4124 * ec.insert_output("<err>")
4125 * p ec.primitive_convert(src, dst) #=> :finished
4126 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4127 *
4128 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4129 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4130 * dst = ""
4131 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4132 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4133 * ec.insert_output "?" # state change required to output "?".
4134 * p ec.primitive_convert(src, dst) #=> :finished
4135 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4136 *
4137 */
4138static VALUE
4139econv_insert_output(VALUE self, VALUE string)
4140{
4141 const char *insert_enc;
4142
4143 int ret;
4144
4145 rb_econv_t *ec = check_econv(self);
4146
4147 StringValue(string);
4148 insert_enc = rb_econv_encoding_to_insert_output(ec);
4149 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4150
4151 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4152 if (ret == -1) {
4153 rb_raise(rb_eArgError, "too big string");
4154 }
4155
4156 return Qnil;
4157}
4158
4159/*
4160 * call-seq:
4161 * ec.putback -> string
4162 * ec.putback(max_numbytes) -> string
4163 *
4164 * Put back the bytes which will be converted.
4165 *
4166 * The bytes are left over from an invalid_byte_sequence error.
4167 * When an invalid_byte_sequence error occurs, some bytes are discarded and
4168 * some bytes are buffered to be converted later.
4169 * The latter bytes can be put back.
4170 * It can be observed by
4171 * Encoding::InvalidByteSequenceError#readagain_bytes and
4172 * Encoding::Converter#primitive_errinfo.
4173 *
4174 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4175 * src = "\x00\xd8\x61\x00"
4176 * dst = ""
4177 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4178 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4179 * p ec.putback #=> "a\x00"
4180 * p ec.putback #=> "" # no more bytes to put back
4181 *
4182 */
4183static VALUE
4184econv_putback(int argc, VALUE *argv, VALUE self)
4185{
4186 rb_econv_t *ec = check_econv(self);
4187 int n;
4188 int putbackable;
4189 VALUE str, max;
4190
4191 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4192 n = rb_econv_putbackable(ec);
4193 }
4194 else {
4195 n = NUM2INT(max);
4196 putbackable = rb_econv_putbackable(ec);
4197 if (putbackable < n)
4198 n = putbackable;
4199 }
4200
4201 str = rb_str_new(NULL, n);
4202 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4203
4204 if (ec->source_encoding) {
4205 rb_enc_associate(str, ec->source_encoding);
4206 }
4207
4208 return str;
4209}
4210
4211/*
4212 * call-seq:
4213 * ec.last_error -> exception or nil
4214 *
4215 * Returns an exception object for the last conversion.
4216 * Returns nil if the last conversion did not produce an error.
4217 *
4218 * "error" means
4219 * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
4220 * Encoding::Converter#convert, and
4221 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
4222 * Encoding::Converter#primitive_convert.
4223 *
4224 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4225 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4226 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4227 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4228 * p ec.last_error #=> nil
4229 *
4230 */
4231static VALUE
4232econv_last_error(VALUE self)
4233{
4234 rb_econv_t *ec = check_econv(self);
4235 VALUE exc;
4236
4237 exc = make_econv_exception(ec);
4238 if (NIL_P(exc))
4239 return Qnil;
4240 return exc;
4241}
4242
4243/*
4244 * call-seq:
4245 * ec.replacement -> string
4246 *
4247 * Returns the replacement string.
4248 *
4249 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4250 * p ec.replacement #=> "?"
4251 *
4252 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4253 * p ec.replacement #=> "\uFFFD"
4254 */
4255static VALUE
4256econv_get_replacement(VALUE self)
4257{
4258 rb_econv_t *ec = check_econv(self);
4259 int ret;
4260 rb_encoding *enc;
4261
4262 ret = make_replacement(ec);
4263 if (ret == -1) {
4264 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4265 }
4266
4267 enc = rb_enc_find(ec->replacement_enc);
4268 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4269}
4270
4271/*
4272 * call-seq:
4273 * ec.replacement = string
4274 *
4275 * Sets the replacement string.
4276 *
4277 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4278 * ec.replacement = "<undef>"
4279 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4280 */
4281static VALUE
4282econv_set_replacement(VALUE self, VALUE arg)
4283{
4284 rb_econv_t *ec = check_econv(self);
4285 VALUE string = arg;
4286 int ret;
4287 rb_encoding *enc;
4288
4289 StringValue(string);
4290 enc = rb_enc_get(string);
4291
4292 ret = rb_econv_set_replacement(ec,
4293 (const unsigned char *)RSTRING_PTR(string),
4294 RSTRING_LEN(string),
4295 rb_enc_name(enc));
4296
4297 if (ret == -1) {
4298 /* xxx: rb_eInvalidByteSequenceError? */
4299 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4300 }
4301
4302 return arg;
4303}
4304
4305VALUE
4307{
4308 return make_econv_exception(ec);
4309}
4310
4311void
4313{
4314 VALUE exc;
4315
4316 exc = make_econv_exception(ec);
4317 if (NIL_P(exc))
4318 return;
4319 rb_exc_raise(exc);
4320}
4321
4322/*
4323 * call-seq:
4324 * ecerr.source_encoding_name -> string
4325 *
4326 * Returns the source encoding name as a string.
4327 */
4328static VALUE
4329ecerr_source_encoding_name(VALUE self)
4330{
4331 return rb_attr_get(self, id_source_encoding_name);
4332}
4333
4334/*
4335 * call-seq:
4336 * ecerr.source_encoding -> encoding
4337 *
4338 * Returns the source encoding as an encoding object.
4339 *
4340 * Note that the result may not be equal to the source encoding of
4341 * the encoding converter if the conversion has multiple steps.
4342 *
4343 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4344 * begin
4345 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4346 * rescue Encoding::UndefinedConversionError
4347 * p $!.source_encoding #=> #<Encoding:UTF-8>
4348 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4349 * p $!.source_encoding_name #=> "UTF-8"
4350 * p $!.destination_encoding_name #=> "EUC-JP"
4351 * end
4352 *
4353 */
4354static VALUE
4355ecerr_source_encoding(VALUE self)
4356{
4357 return rb_attr_get(self, id_source_encoding);
4358}
4359
4360/*
4361 * call-seq:
4362 * ecerr.destination_encoding_name -> string
4363 *
4364 * Returns the destination encoding name as a string.
4365 */
4366static VALUE
4367ecerr_destination_encoding_name(VALUE self)
4368{
4369 return rb_attr_get(self, id_destination_encoding_name);
4370}
4371
4372/*
4373 * call-seq:
4374 * ecerr.destination_encoding -> encoding
4375 *
4376 * Returns the destination encoding as an encoding object.
4377 */
4378static VALUE
4379ecerr_destination_encoding(VALUE self)
4380{
4381 return rb_attr_get(self, id_destination_encoding);
4382}
4383
4384/*
4385 * call-seq:
4386 * ecerr.error_char -> string
4387 *
4388 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4389 *
4390 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4391 * begin
4392 * ec.convert("\xa0")
4393 * rescue Encoding::UndefinedConversionError
4394 * puts $!.error_char.dump #=> "\xC2\xA0"
4395 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4396 * end
4397 *
4398 */
4399static VALUE
4400ecerr_error_char(VALUE self)
4401{
4402 return rb_attr_get(self, id_error_char);
4403}
4404
4405/*
4406 * call-seq:
4407 * ecerr.error_bytes -> string
4408 *
4409 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4410 *
4411 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4412 * begin
4413 * ec.convert("abc\xA1\xFFdef")
4414 * rescue Encoding::InvalidByteSequenceError
4415 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4416 * puts $!.error_bytes.dump #=> "\xA1"
4417 * puts $!.readagain_bytes.dump #=> "\xFF"
4418 * end
4419 */
4420static VALUE
4421ecerr_error_bytes(VALUE self)
4422{
4423 return rb_attr_get(self, id_error_bytes);
4424}
4425
4426/*
4427 * call-seq:
4428 * ecerr.readagain_bytes -> string
4429 *
4430 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4431 */
4432static VALUE
4433ecerr_readagain_bytes(VALUE self)
4434{
4435 return rb_attr_get(self, id_readagain_bytes);
4436}
4437
4438/*
4439 * call-seq:
4440 * ecerr.incomplete_input? -> true or false
4441 *
4442 * Returns true if the invalid byte sequence error is caused by
4443 * premature end of string.
4444 *
4445 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4446 *
4447 * begin
4448 * ec.convert("abc\xA1z")
4449 * rescue Encoding::InvalidByteSequenceError
4450 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4451 * p $!.incomplete_input? #=> false
4452 * end
4453 *
4454 * begin
4455 * ec.convert("abc\xA1")
4456 * ec.finish
4457 * rescue Encoding::InvalidByteSequenceError
4458 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4459 * p $!.incomplete_input? #=> true
4460 * end
4461 */
4462static VALUE
4463ecerr_incomplete_input(VALUE self)
4464{
4465 return rb_attr_get(self, id_incomplete_input);
4466}
4467
4468/*
4469 * Document-class: Encoding::UndefinedConversionError
4470 *
4471 * Raised by Encoding and String methods when a transcoding operation
4472 * fails.
4473 */
4474
/*
 * Document-class: Encoding::InvalidByteSequenceError
 *
 * Raised by Encoding and String methods when the string being
 * transcoded contains a byte invalid for either the source or
 * target encoding.
 */
4482
4483/*
4484 * Document-class: Encoding::ConverterNotFoundError
4485 *
4486 * Raised by transcoding methods when a named encoding does not
4487 * correspond with a known converter.
4488 */
4489
4490void
4491Init_transcode(void)
4492{
4493 transcoder_table = st_init_strcasetable();
4494
4495 id_destination_encoding = rb_intern_const("destination_encoding");
4496 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4497 id_error_bytes = rb_intern_const("error_bytes");
4498 id_error_char = rb_intern_const("error_char");
4499 id_incomplete_input = rb_intern_const("incomplete_input");
4500 id_readagain_bytes = rb_intern_const("readagain_bytes");
4501 id_source_encoding = rb_intern_const("source_encoding");
4502 id_source_encoding_name = rb_intern_const("source_encoding_name");
4503
4504 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4505 sym_undef = ID2SYM(rb_intern_const("undef"));
4506 sym_replace = ID2SYM(rb_intern_const("replace"));
4507 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4508 sym_xml = ID2SYM(rb_intern_const("xml"));
4509 sym_text = ID2SYM(rb_intern_const("text"));
4510 sym_attr = ID2SYM(rb_intern_const("attr"));
4511
4512 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4513 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4514 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4515 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4516 sym_finished = ID2SYM(rb_intern_const("finished"));
4517 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4518 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4519 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4520 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4521 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4522 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4523 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4524
4525#ifdef ENABLE_ECONV_NEWLINE_OPTION
4526 sym_newline = ID2SYM(rb_intern_const("newline"));
4527 sym_universal = ID2SYM(rb_intern_const("universal"));
4528 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4529 sym_cr = ID2SYM(rb_intern_const("cr"));
4530 sym_lf = ID2SYM(rb_intern_const("lf"));
4531#endif
4532
4533 InitVM(transcode);
4534}
4535
4536void
4537InitVM_transcode(void)
4538{
4539 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4540 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4541 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4542
4543 rb_define_method(rb_cString, "encode", str_encode, -1);
4544 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4545
4546 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4547 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4548 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4549 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4550 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4551 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4552 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4553 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4554 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4555 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4556 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4557 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4558 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4559 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4560 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4561 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4562 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4563 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4564 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4565
4566 /*
4567 *Mask for invalid byte sequences
4568 */
4569 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4570
4571 /*
4572 * Replace invalid byte sequences
4573 */
4574 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4575
4576 /*
4577 * Mask for a valid character in the source encoding but no related
4578 * character(s) in destination encoding.
4579 */
4580 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4581
4582 /*
4583 * Replace byte sequences that are undefined in the destination encoding.
4584 */
4585 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4586
4587 /*
4588 * Replace byte sequences that are undefined in the destination encoding
4589 * with an XML hexadecimal character reference. This is valid for XML
4590 * conversion.
4591 */
4592 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4593
4594 /*
4595 * Indicates the source may be part of a larger string. See
4596 * primitive_convert for an example.
4597 */
4598 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4599
4600 /*
4601 * Stop converting after some output is complete but before all of the
4602 * input was consumed. See primitive_convert for an example.
4603 */
4604 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4605
4606 /*
4607 * Decorator for converting CRLF and CR to LF
4608 */
4609 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4610
4611 /*
4612 * Decorator for converting CRLF and CR to LF when writing
4613 */
4614 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4615
4616 /*
4617 * Decorator for converting LF to CRLF
4618 */
4619 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4620
4621 /*
4622 * Decorator for converting LF to CR
4623 */
4624 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4625
4626 /*
4627 * Escape as XML CharData
4628 */
4629 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4630
4631 /*
4632 * Escape as XML AttValue
4633 */
4634 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4635
4636 /*
4637 * Escape as XML AttValue
4638 */
4639 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4640
4641 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4642 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4643 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4644 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4645 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4646
4647 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4648 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4649 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4650 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4651 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4652 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4653 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4654
4655 Init_newline();
4656}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:1510
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1675
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition memory.h:403
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:400
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition error.c:1380
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1397
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby string instead of a C string.
Definition error.c:1481
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1436
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:497
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:243
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:59
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3221
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1317
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:932
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:816
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2626
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2123
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1542
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2671
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1780
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3245
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1106
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1947
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1938
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1824
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1626
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes a Ruby string instead of a C pointer.
Definition transcode.c:1959
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2677
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:2005
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:2022
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1988
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2940
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4306
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4312
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1953
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1875
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1791
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2285
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1168
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1008
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1678
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2577
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:120
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1711
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1753
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:986
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1493
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1956
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3347
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2704
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7418
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1683
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5820
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1949
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3341
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:384
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_AREF(a, i)
Definition rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:523
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:458
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:513
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:231
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:204
Definition st.h:79
Definition string.c:8362
Definition transcode.c:177
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376