Ruby 3.5.0dev (2025-11-03 revision 4a3d8346a6d0e068508631541f6bc43e8b154ea1)
transcode.c (4a3d8346a6d0e068508631541f6bc43e8b154ea1)
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "internal/encoding.h"
23#include "ruby/encoding.h"
24#include "vm_sync.h"
25
26#include "transcode_data.h"
27#include "id.h"
28
29#define ENABLE_ECONV_NEWLINE_OPTION 1
30
31/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
32static VALUE rb_eUndefinedConversionError;
33static VALUE rb_eInvalidByteSequenceError;
34static VALUE rb_eConverterNotFoundError;
35
36VALUE rb_cEncodingConverter;
37
38static ID id_destination_encoding;
39static ID id_destination_encoding_name;
40static ID id_error_bytes;
41static ID id_error_char;
42static ID id_incomplete_input;
43static ID id_readagain_bytes;
44static ID id_source_encoding;
45static ID id_source_encoding_name;
46
47static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
48static VALUE sym_xml, sym_text, sym_attr;
49static VALUE sym_universal_newline;
50static VALUE sym_crlf_newline;
51static VALUE sym_cr_newline;
52static VALUE sym_lf_newline;
53#ifdef ENABLE_ECONV_NEWLINE_OPTION
54static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
55#endif
56static VALUE sym_partial_input;
57
58static VALUE sym_invalid_byte_sequence;
59static VALUE sym_undefined_conversion;
60static VALUE sym_destination_buffer_full;
61static VALUE sym_source_buffer_empty;
62static VALUE sym_finished;
63static VALUE sym_after_output;
64static VALUE sym_incomplete_input;
65
66static unsigned char *
67allocate_converted_string(const char *sname, const char *dname,
68 const unsigned char *str, size_t len,
69 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
70 size_t *dst_len_ptr);
71
72/* dynamic structure, one per conversion (similar to iconv_t) */
73/* may carry conversion state (e.g. for iso-2022-jp) */
74typedef struct rb_transcoding {
75 const rb_transcoder *transcoder;
76
77 int flags;
78
79 int resume_position;
80 unsigned int next_table;
81 VALUE next_info;
82 unsigned char next_byte;
83 unsigned int output_index;
84
85 ssize_t recognized_len; /* already interpreted */
86 ssize_t readagain_len; /* not yet interpreted */
87 union {
88 unsigned char ary[8]; /* max_input <= sizeof(ary) */
89 unsigned char *ptr; /* length: max_input */
90 } readbuf; /* recognized_len + readagain_len used */
91
92 ssize_t writebuf_off;
93 ssize_t writebuf_len;
94 union {
95 unsigned char ary[8]; /* max_output <= sizeof(ary) */
96 unsigned char *ptr; /* length: max_output */
97 } writebuf;
98
99 union rb_transcoding_state_t { /* opaque data for stateful encoding */
100 void *ptr;
101 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
102 double dummy_for_alignment;
103 } state;
/*
 * Buffer accessors for rb_transcoding.
 *
 * The read/write/state storage is embedded in the struct when the
 * transcoder's requirement fits in the embedded array, and held through
 * the union's pointer member otherwise.  These macros select the right
 * member.
 */
#define TRANSCODING_READBUF(tc) \
    ((int)sizeof((tc)->readbuf.ary) < (tc)->transcoder->max_input ? \
     (tc)->readbuf.ptr : \
     (tc)->readbuf.ary)
#define TRANSCODING_WRITEBUF(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (tc)->writebuf.ptr : \
     (tc)->writebuf.ary)
/* Capacity in bytes of the buffer that TRANSCODING_WRITEBUF(tc) yields. */
#define TRANSCODING_WRITEBUF_SIZE(tc) \
    ((int)sizeof((tc)->writebuf.ary) < (tc)->transcoder->max_output ? \
     (size_t)(tc)->transcoder->max_output : \
     sizeof((tc)->writebuf.ary))
/* Largest state_size that still fits in the embedded state storage. */
#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
#define TRANSCODING_STATE(tc) \
    ((int)sizeof((tc)->state) < (tc)->transcoder->state_size ? \
     (tc)->state.ptr : \
     (tc)->state.ary)
122
123typedef struct {
124 struct rb_transcoding *tc;
125 unsigned char *out_buf_start;
126 unsigned char *out_data_start;
127 unsigned char *out_data_end;
128 unsigned char *out_buf_end;
129 rb_econv_result_t last_result;
131
133 int flags;
134 int started; /* bool */
135
136 const char *source_encoding_name;
137 const char *destination_encoding_name;
138
139 const unsigned char *replacement_str;
140 size_t replacement_len;
141 const char *replacement_enc;
142
143 unsigned char *in_buf_start;
144 unsigned char *in_data_start;
145 unsigned char *in_data_end;
146 unsigned char *in_buf_end;
147 rb_econv_elem_t *elems;
148 int replacement_allocated; /* bool */
149 int num_allocated;
150 int num_trans;
151 int num_finished;
152 struct rb_transcoding *last_tc;
153
154 /* last error */
155 struct {
156 rb_econv_result_t result;
157 struct rb_transcoding *error_tc;
158 const char *source_encoding;
159 const char *destination_encoding;
160 const unsigned char *error_bytes_start;
161 size_t error_bytes_len;
162 size_t readagain_len;
163 } last_error;
164
165 /* The following fields are only for Encoding::Converter.
166 * rb_econv_open set them NULL. */
167 rb_encoding *source_encoding;
168 rb_encoding *destination_encoding;
169};
170
171/*
172 * Dispatch data and logic
173 */
174
175#define DECORATOR_P(sname, dname) (*(sname) == '\0')
176
177typedef struct {
178 const char *sname;
179 const char *dname;
180 const char *lib; /* null means no need to load a library */
181 const rb_transcoder *transcoder;
183
184static st_table *transcoder_table;
185
186static int
187free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
188{
189 xfree((void *)val);
190 return ST_DELETE;
191}
192
193static int
194free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
195{
196 st_foreach((void *)val, free_inner_transcode_i, 0);
197 st_free_table((void *)val);
198 return ST_DELETE;
199}
200
201void
202rb_free_transcoder_table(void)
203{
204 st_foreach(transcoder_table, free_transcode_i, 0);
205 st_free_table(transcoder_table);
206}
207
208static transcoder_entry_t *
209make_transcoder_entry(const char *sname, const char *dname)
210{
211 st_data_t val;
212 st_table *table2;
213
214 RB_VM_LOCKING() {
215 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
216 val = (st_data_t)st_init_strcasetable();
217 st_add_direct(transcoder_table, (st_data_t)sname, val);
218 }
219 table2 = (st_table *)val;
220 if (!st_lookup(table2, (st_data_t)dname, &val)) {
222 entry->sname = sname;
223 entry->dname = dname;
224 entry->lib = NULL;
225 entry->transcoder = NULL;
226 val = (st_data_t)entry;
227 st_add_direct(table2, (st_data_t)dname, val);
228 }
229 }
230 return (transcoder_entry_t *)val;
231}
232
233static transcoder_entry_t *
234get_transcoder_entry(const char *sname, const char *dname)
235{
236 st_data_t val = 0;
237 st_table *table2;
238 RB_VM_LOCKING() {
239 if (st_lookup(transcoder_table, (st_data_t)sname, &val)) {
240 table2 = (st_table *)val;
241 if (!st_lookup(table2, (st_data_t)dname, &val)) {
242 val = 0;
243 }
244 }
245 }
246 return (transcoder_entry_t *)val;
247}
248
249void
250rb_register_transcoder(const rb_transcoder *tr)
251{
252 const char *const sname = tr->src_encoding;
253 const char *const dname = tr->dst_encoding;
254
255 transcoder_entry_t *entry;
256
257 RB_VM_LOCKING() {
258 entry = make_transcoder_entry(sname, dname);
259 if (entry->transcoder) {
260 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
261 sname, dname);
262 }
263 entry->transcoder = tr;
264 }
265}
266
267static void
268declare_transcoder(const char *sname, const char *dname, const char *lib)
269{
270 transcoder_entry_t *entry;
271
272 entry = make_transcoder_entry(sname, dname);
273 entry->lib = lib;
274}
275
276static const char transcoder_lib_prefix[] = "enc/trans/";
277
278void
279rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
280{
281 if (!lib) {
282 rb_raise(rb_eArgError, "invalid library name - (null)");
283 }
284 declare_transcoder(enc1, enc2, lib);
285}
286
287#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
288
289typedef struct search_path_queue_tag {
290 struct search_path_queue_tag *next;
291 const char *enc;
293
294typedef struct {
295 st_table *visited;
296 search_path_queue_t *queue;
297 search_path_queue_t **queue_last_ptr;
298 const char *base_enc;
300
301static int
302transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
303{
304 const char *dname = (const char *)key;
307
308 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
309 return ST_CONTINUE;
310 }
311
313 q->enc = dname;
314 q->next = NULL;
315 *bfs->queue_last_ptr = q;
316 bfs->queue_last_ptr = &q->next;
317
318 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
319 return ST_CONTINUE;
320}
321
322static int
323transcode_search_path(const char *sname, const char *dname,
324 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
325 void *arg)
326{
329 st_data_t val;
330 st_table *table2;
331 int pathlen = -1;
332 bool found = false;
333 bool lookup_res;
334
335 if (encoding_equal(sname, dname))
336 return -1;
337
339 q->enc = sname;
340 q->next = NULL;
341 bfs.queue_last_ptr = &q->next;
342 bfs.queue = q;
343
344 bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop
345 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
346
347 RB_VM_LOCKING() {
348 while (bfs.queue) {
349 q = bfs.queue;
350 bfs.queue = q->next;
351 if (!bfs.queue) {
352 bfs.queue_last_ptr = &bfs.queue;
353 }
354
355 lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2
356 if (!lookup_res) {
357 xfree(q);
358 continue;
359 }
360 table2 = (st_table *)val;
361
362 if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv
363 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
364 xfree(q);
365 found = true;
366 break;
367 }
368
369 bfs.base_enc = q->enc;
370 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
371
372 bfs.base_enc = NULL;
373 xfree(q);
374 }
375 }
376
377 while (bfs.queue) {
378 q = bfs.queue;
379 bfs.queue = q->next;
380 xfree(q);
381 }
382
383 if (found) {
384 const char *enc = dname;
385 int depth;
386 pathlen = 0;
387 while (1) {
388 st_lookup(bfs.visited, (st_data_t)enc, &val);
389 if (!val)
390 break;
391 pathlen++;
392 enc = (const char *)val;
393 }
394 depth = pathlen;
395 enc = dname;
396 while (1) {
397 st_lookup(bfs.visited, (st_data_t)enc, &val);
398 if (!val)
399 break;
400 callback((const char *)val, enc, --depth, arg);
401 enc = (const char *)val;
402 }
403 }
404
405 st_free_table(bfs.visited);
406
407 return pathlen; /* is -1 if not found */
408}
409
410int rb_require_internal_silent(VALUE fname);
411
412static const rb_transcoder *
413load_transcoder_entry(transcoder_entry_t *entry)
414{
415 ASSERT_vm_unlocking();
416 if (entry->transcoder)
417 return entry->transcoder;
418
419 if (entry->lib) {
420 const char *const lib = entry->lib;
421 const size_t len = strlen(lib);
422 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
423 const VALUE fn = rb_str_new(0, total_len);
424 char *const path = RSTRING_PTR(fn);
425
426 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
427 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
428 rb_str_set_len(fn, total_len);
429 OBJ_FREEZE(fn);
430 rb_require_internal_silent(fn); // Sets entry->transcoder
431 }
432
433 if (entry->transcoder)
434 return entry->transcoder;
435
436 return NULL;
437}
438
/*
 * Picks the default replacement string for encname.
 *
 * For "UTF-8" (compared case-insensitively) this is the 3-byte sequence
 * EF BF BD tagged as "UTF-8"; for anything else it is "?" tagged as
 * "US-ASCII".  The byte length goes to *len_ret and the tag to
 * *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
453
454/*
455 * Transcoding engine logic
456 */
457
458static const unsigned char *
459transcode_char_start(rb_transcoding *tc,
460 const unsigned char *in_start,
461 const unsigned char *inchar_start,
462 const unsigned char *in_p,
463 size_t *char_len_ptr)
464{
465 const unsigned char *ptr;
466 if (inchar_start - in_start < tc->recognized_len) {
467 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
468 inchar_start, unsigned char, in_p - inchar_start);
469 ptr = TRANSCODING_READBUF(tc);
470 }
471 else {
472 ptr = inchar_start - tc->recognized_len;
473 }
474 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
475 return ptr;
476}
477
479transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
480 const unsigned char *in_stop, unsigned char *out_stop,
481 rb_transcoding *tc,
482 const int opt)
483{
484 const rb_transcoder *tr = tc->transcoder;
485 int unitlen = tr->input_unit_length;
486 ssize_t readagain_len = 0;
487
488 const unsigned char *inchar_start;
489 const unsigned char *in_p;
490
491 unsigned char *out_p;
492
493 in_p = inchar_start = *in_pos;
494
495 out_p = *out_pos;
496
497#define SUSPEND(ret, num) \
498 do { \
499 tc->resume_position = (num); \
500 if (0 < in_p - inchar_start) \
501 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
502 inchar_start, unsigned char, in_p - inchar_start); \
503 *in_pos = in_p; \
504 *out_pos = out_p; \
505 tc->recognized_len += in_p - inchar_start; \
506 if (readagain_len) { \
507 tc->recognized_len -= readagain_len; \
508 tc->readagain_len = readagain_len; \
509 } \
510 return (ret); \
511 resume_label ## num:; \
512 } while (0)
513#define SUSPEND_OBUF(num) \
514 do { \
515 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
516 } while (0)
517
518#define SUSPEND_AFTER_OUTPUT(num) \
519 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
520 SUSPEND(econv_after_output, num); \
521 }
522
523#define next_table (tc->next_table)
524#define next_info (tc->next_info)
525#define next_byte (tc->next_byte)
526#define writebuf_len (tc->writebuf_len)
527#define writebuf_off (tc->writebuf_off)
528
529 switch (tc->resume_position) {
530 case 0: break;
531 case 1: goto resume_label1;
532 case 2: goto resume_label2;
533 case 3: goto resume_label3;
534 case 4: goto resume_label4;
535 case 5: goto resume_label5;
536 case 6: goto resume_label6;
537 case 7: goto resume_label7;
538 case 8: goto resume_label8;
539 case 9: goto resume_label9;
540 case 10: goto resume_label10;
541 case 11: goto resume_label11;
542 case 12: goto resume_label12;
543 case 13: goto resume_label13;
544 case 14: goto resume_label14;
545 case 15: goto resume_label15;
546 case 16: goto resume_label16;
547 case 17: goto resume_label17;
548 case 18: goto resume_label18;
549 case 19: goto resume_label19;
550 case 20: goto resume_label20;
551 case 21: goto resume_label21;
552 case 22: goto resume_label22;
553 case 23: goto resume_label23;
554 case 24: goto resume_label24;
555 case 25: goto resume_label25;
556 case 26: goto resume_label26;
557 case 27: goto resume_label27;
558 case 28: goto resume_label28;
559 case 29: goto resume_label29;
560 case 30: goto resume_label30;
561 case 31: goto resume_label31;
562 case 32: goto resume_label32;
563 case 33: goto resume_label33;
564 case 34: goto resume_label34;
565 }
566
567 while (1) {
568 inchar_start = in_p;
569 tc->recognized_len = 0;
570 next_table = tr->conv_tree_start;
571
572 SUSPEND_AFTER_OUTPUT(24);
573
574 if (in_stop <= in_p) {
575 if (!(opt & ECONV_PARTIAL_INPUT))
576 break;
577 SUSPEND(econv_source_buffer_empty, 7);
578 continue;
579 }
580
581#define BYTE_ADDR(index) (tr->byte_array + (index))
582#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
583#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
584#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
585#define BL_MIN_BYTE (BL_BASE[0])
586#define BL_MAX_BYTE (BL_BASE[1])
587#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
588#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
589
590 next_byte = (unsigned char)*in_p++;
591 follow_byte:
592 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
593 next_info = INVALID;
594 else {
595 next_info = (VALUE)BL_ACTION(next_byte);
596 }
597 follow_info:
598 switch (next_info & 0x1F) {
599 case NOMAP:
600 {
601 const unsigned char *p = inchar_start;
602 writebuf_off = 0;
603 while (p < in_p) {
604 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
605 }
606 writebuf_len = writebuf_off;
607 writebuf_off = 0;
608 while (writebuf_off < writebuf_len) {
609 SUSPEND_OBUF(3);
610 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
611 }
612 }
613 continue;
614 case 0x00: case 0x04: case 0x08: case 0x0C:
615 case 0x10: case 0x14: case 0x18: case 0x1C:
616 SUSPEND_AFTER_OUTPUT(25);
617 while (in_p >= in_stop) {
618 if (!(opt & ECONV_PARTIAL_INPUT))
619 goto incomplete;
620 SUSPEND(econv_source_buffer_empty, 5);
621 }
622 next_byte = (unsigned char)*in_p++;
623 next_table = (unsigned int)next_info;
624 goto follow_byte;
625 case ZERObt: /* drop input */
626 continue;
627 case ONEbt:
628 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
629 continue;
630 case TWObt:
631 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
632 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
633 continue;
634 case THREEbt:
635 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
636 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
637 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
638 continue;
639 case FOURbt:
640 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
641 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
642 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
643 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
644 continue;
645 case GB4bt:
646 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
647 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
648 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
649 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
650 continue;
651 case STR1:
652 tc->output_index = 0;
653 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
654 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
655 tc->output_index++;
656 }
657 continue;
658 case FUNii:
659 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
660 goto follow_info;
661 case FUNsi:
662 {
663 const unsigned char *char_start;
664 size_t char_len;
665 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
667 goto follow_info;
668 }
669 case FUNio:
670 SUSPEND_OBUF(13);
671 if (tr->max_output <= out_stop - out_p)
672 out_p += tr->func_io(TRANSCODING_STATE(tc),
673 next_info, out_p, out_stop - out_p);
674 else {
675 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
676 next_info,
677 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
678 writebuf_off = 0;
679 while (writebuf_off < writebuf_len) {
680 SUSPEND_OBUF(20);
681 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
682 }
683 }
684 break;
685 case FUNso:
686 {
687 const unsigned char *char_start;
688 size_t char_len;
689 SUSPEND_OBUF(14);
690 if (tr->max_output <= out_stop - out_p) {
691 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
692 out_p += tr->func_so(TRANSCODING_STATE(tc),
693 char_start, (size_t)char_len,
694 out_p, out_stop - out_p);
695 }
696 else {
697 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
698 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
699 char_start, (size_t)char_len,
700 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
701 writebuf_off = 0;
702 while (writebuf_off < writebuf_len) {
703 SUSPEND_OBUF(22);
704 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
705 }
706 }
707 break;
708 }
709 case FUNsio:
710 {
711 const unsigned char *char_start;
712 size_t char_len;
713 SUSPEND_OBUF(33);
714 if (tr->max_output <= out_stop - out_p) {
715 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
716 out_p += tr->func_sio(TRANSCODING_STATE(tc),
717 char_start, (size_t)char_len, next_info,
718 out_p, out_stop - out_p);
719 }
720 else {
721 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
722 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
723 char_start, (size_t)char_len, next_info,
724 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
725 writebuf_off = 0;
726 while (writebuf_off < writebuf_len) {
727 SUSPEND_OBUF(34);
728 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
729 }
730 }
731 break;
732 }
733 case INVALID:
734 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
735 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
736 SUSPEND_AFTER_OUTPUT(26);
737 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
738 in_p = in_stop;
739 SUSPEND(econv_source_buffer_empty, 8);
740 }
741 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
742 in_p = in_stop;
743 }
744 else {
745 in_p = inchar_start + (unitlen - tc->recognized_len);
746 }
747 }
748 else {
749 ssize_t invalid_len; /* including the last byte which causes invalid */
750 ssize_t discard_len;
751 invalid_len = tc->recognized_len + (in_p - inchar_start);
752 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
753 readagain_len = invalid_len - discard_len;
754 }
755 goto invalid;
756 case UNDEF:
757 goto undef;
758 default:
759 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
760 }
761 continue;
762
763 invalid:
764 SUSPEND(econv_invalid_byte_sequence, 1);
765 continue;
766
767 incomplete:
768 SUSPEND(econv_incomplete_input, 27);
769 continue;
770
771 undef:
772 SUSPEND(econv_undefined_conversion, 2);
773 continue;
774 }
775
776 /* cleanup */
777 if (tr->finish_func) {
778 SUSPEND_OBUF(4);
779 if (tr->max_output <= out_stop - out_p) {
780 out_p += tr->finish_func(TRANSCODING_STATE(tc),
781 out_p, out_stop - out_p);
782 }
783 else {
784 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
785 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
786 writebuf_off = 0;
787 while (writebuf_off < writebuf_len) {
788 SUSPEND_OBUF(23);
789 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
790 }
791 }
792 }
793 while (1)
794 SUSPEND(econv_finished, 6);
795#undef SUSPEND
796#undef next_table
797#undef next_info
798#undef next_byte
799#undef writebuf_len
800#undef writebuf_off
801}
802
804transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
805 const unsigned char *in_stop, unsigned char *out_stop,
806 rb_transcoding *tc,
807 const int opt)
808{
809 if (tc->readagain_len) {
810 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
811 const unsigned char *readagain_pos = readagain_buf;
812 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
814
815 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
816 unsigned char, tc->readagain_len);
817 tc->readagain_len = 0;
818 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
819 if (res != econv_source_buffer_empty) {
820 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
821 readagain_pos, unsigned char, readagain_stop - readagain_pos);
822 tc->readagain_len += readagain_stop - readagain_pos;
823 return res;
824 }
825 }
826 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
827}
828
829static rb_transcoding *
830rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
831{
832 rb_transcoding *tc;
833
834 tc = ALLOC(rb_transcoding);
835 tc->transcoder = tr;
836 tc->flags = flags;
837 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
838 tc->state.ptr = xmalloc(tr->state_size);
839 if (tr->state_init_func) {
840 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
841 }
842 tc->resume_position = 0;
843 tc->recognized_len = 0;
844 tc->readagain_len = 0;
845 tc->writebuf_len = 0;
846 tc->writebuf_off = 0;
847 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
848 tc->readbuf.ptr = xmalloc(tr->max_input);
849 }
850 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
851 tc->writebuf.ptr = xmalloc(tr->max_output);
852 }
853 return tc;
854}
855
857rb_transcoding_convert(rb_transcoding *tc,
858 const unsigned char **input_ptr, const unsigned char *input_stop,
859 unsigned char **output_ptr, unsigned char *output_stop,
860 int flags)
861{
862 return transcode_restartable(
863 input_ptr, output_ptr,
864 input_stop, output_stop,
865 tc, flags);
866}
867
868static void
869rb_transcoding_close(rb_transcoding *tc)
870{
871 const rb_transcoder *tr = tc->transcoder;
872 if (tr->state_fini_func) {
873 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
874 }
875 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
876 xfree(tc->state.ptr);
877 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
878 xfree(tc->readbuf.ptr);
879 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
880 xfree(tc->writebuf.ptr);
881 xfree(tc);
882}
883
884static size_t
885rb_transcoding_memsize(rb_transcoding *tc)
886{
887 size_t size = sizeof(rb_transcoding);
888 const rb_transcoder *tr = tc->transcoder;
889
890 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
891 size += tr->state_size;
892 }
893 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
894 size += tr->max_input;
895 }
896 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
897 size += tr->max_output;
898 }
899 return size;
900}
901
902static rb_econv_t *
903rb_econv_alloc(int n_hint)
904{
905 rb_econv_t *ec;
906
907 if (n_hint <= 0)
908 n_hint = 1;
909
910 ec = ALLOC(rb_econv_t);
911 ec->flags = 0;
912 ec->source_encoding_name = NULL;
913 ec->destination_encoding_name = NULL;
914 ec->started = 0;
915 ec->replacement_str = NULL;
916 ec->replacement_len = 0;
917 ec->replacement_enc = NULL;
918 ec->replacement_allocated = 0;
919 ec->in_buf_start = NULL;
920 ec->in_data_start = NULL;
921 ec->in_data_end = NULL;
922 ec->in_buf_end = NULL;
923 ec->num_allocated = n_hint;
924 ec->num_trans = 0;
925 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
926 ec->num_finished = 0;
927 ec->last_tc = NULL;
928 ec->last_error.result = econv_source_buffer_empty;
929 ec->last_error.error_tc = NULL;
930 ec->last_error.source_encoding = NULL;
931 ec->last_error.destination_encoding = NULL;
932 ec->last_error.error_bytes_start = NULL;
933 ec->last_error.error_bytes_len = 0;
934 ec->last_error.readagain_len = 0;
935 ec->source_encoding = NULL;
936 ec->destination_encoding = NULL;
937 return ec;
938}
939
940static int
941rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
942{
943 int n, j;
944 int bufsize = 4096;
945 unsigned char *p;
946
947 if (ec->num_trans == ec->num_allocated) {
948 n = ec->num_allocated * 2;
949 REALLOC_N(ec->elems, rb_econv_elem_t, n);
950 ec->num_allocated = n;
951 }
952
953 p = xmalloc(bufsize);
954
955 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
956
957 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
958 ec->elems[i].out_buf_start = p;
959 ec->elems[i].out_buf_end = p + bufsize;
960 ec->elems[i].out_data_start = p;
961 ec->elems[i].out_data_end = p;
962 ec->elems[i].last_result = econv_source_buffer_empty;
963
964 ec->num_trans++;
965
966 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
967 for (j = ec->num_trans-1; i <= j; j--) {
968 rb_transcoding *tc = ec->elems[j].tc;
969 const rb_transcoder *tr2 = tc->transcoder;
970 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
971 ec->last_tc = tc;
972 break;
973 }
974 }
975
976 return 0;
977}
978
979static rb_econv_t *
980rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
981{
982 rb_econv_t *ec;
983 int i, ret;
984
985 for (i = 0; i < n; i++) {
986 const rb_transcoder *tr;
987 tr = load_transcoder_entry(entries[i]);
988 if (!tr)
989 return NULL;
990 }
991
992 ec = rb_econv_alloc(n);
993
994 for (i = 0; i < n; i++) {
995 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
996 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
997 if (ret == -1) {
998 rb_econv_close(ec);
999 return NULL;
1000 }
1001 }
1002
1003 return ec;
1004}
1005
1007 transcoder_entry_t **entries;
1008 int num_additional;
1009};
1010
1011static void
1012trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1013{
1014 struct trans_open_t *toarg = arg;
1015
1016 if (!toarg->entries) {
1017 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
1018 }
1019 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1020}
1021
1022static rb_econv_t *
1023rb_econv_open0(const char *sname, const char *dname, int ecflags)
1024{
1025 transcoder_entry_t **entries = NULL;
1026 int num_trans;
1027 rb_econv_t *ec;
1028
1029 // loads encodings if not loaded already
1030 if (*sname) rb_enc_find_index(sname);
1031 if (*dname) rb_enc_find_index(dname);
1032
1033 if (*sname == '\0' && *dname == '\0') {
1034 num_trans = 0;
1035 entries = NULL;
1036 sname = dname = "";
1037 }
1038 else {
1039 struct trans_open_t toarg;
1040 toarg.entries = NULL;
1041 toarg.num_additional = 0;
1042 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1043 entries = toarg.entries;
1044 if (num_trans < 0) {
1045 xfree(entries);
1046 return NULL;
1047 }
1048 }
1049
1050 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1051 xfree(entries);
1052 if (!ec)
1053 return NULL;
1054
1055 ec->flags = ecflags;
1056 ec->source_encoding_name = sname;
1057 ec->destination_encoding_name = dname;
1058
1059 return ec;
1060}
1061
1062#define MAX_ECFLAGS_DECORATORS 32
1063
1064static int
1065decorator_names(int ecflags, const char **decorators_ret)
1066{
1067 int num_decorators;
1068
1069 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1074 case 0:
1075 break;
1076 default:
1077 return -1;
1078 }
1079
1080 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1082 return -1;
1083
1084 num_decorators = 0;
1085
1086 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1087 decorators_ret[num_decorators++] = "xml_text_escape";
1089 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1090 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1091 decorators_ret[num_decorators++] = "xml_attr_quote";
1092
1093 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1094 decorators_ret[num_decorators++] = "crlf_newline";
1095 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1096 decorators_ret[num_decorators++] = "cr_newline";
1097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1098 decorators_ret[num_decorators++] = "lf_newline";
1100 decorators_ret[num_decorators++] = "universal_newline";
1101
1102 return num_decorators;
1103}
1104
1105rb_econv_t *
1106rb_econv_open(const char *sname, const char *dname, int ecflags)
1107{
1108 rb_econv_t *ec;
1109 int num_decorators;
1110 const char *decorators[MAX_ECFLAGS_DECORATORS];
1111 int i;
1112
1113 num_decorators = decorator_names(ecflags, decorators);
1114 if (num_decorators == -1)
1115 return NULL;
1116
1117 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1118 if (ec) {
1119 for (i = 0; i < num_decorators; i++) {
1120 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1121 rb_econv_close(ec);
1122 ec = NULL;
1123 break;
1124 }
1125 }
1126 }
1127
1128 if (ec) {
1129 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1130 }
1131 return ec; // can be NULL
1132}
1133
1134static int
1135trans_sweep(rb_econv_t *ec,
1136 const unsigned char **input_ptr, const unsigned char *input_stop,
1137 unsigned char **output_ptr, unsigned char *output_stop,
1138 int flags,
1139 int start)
1140{
1141 int try;
1142 int i, f;
1143
1144 const unsigned char **ipp, *is, *iold;
1145 unsigned char **opp, *os, *oold;
1147
1148 try = 1;
1149 while (try) {
1150 try = 0;
1151 for (i = start; i < ec->num_trans; i++) {
1152 rb_econv_elem_t *te = &ec->elems[i];
1153
1154 if (i == 0) {
1155 ipp = input_ptr;
1156 is = input_stop;
1157 }
1158 else {
1159 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1160 ipp = (const unsigned char **)&prev_te->out_data_start;
1161 is = prev_te->out_data_end;
1162 }
1163
1164 if (i == ec->num_trans-1) {
1165 opp = output_ptr;
1166 os = output_stop;
1167 }
1168 else {
1169 if (te->out_buf_start != te->out_data_start) {
1170 ssize_t len = te->out_data_end - te->out_data_start;
1171 ssize_t off = te->out_data_start - te->out_buf_start;
1172 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1173 te->out_data_start = te->out_buf_start;
1174 te->out_data_end -= off;
1175 }
1176 opp = &te->out_data_end;
1177 os = te->out_buf_end;
1178 }
1179
1180 f = flags;
1181 if (ec->num_finished != i)
1183 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1184 start = 1;
1185 flags &= ~ECONV_AFTER_OUTPUT;
1186 }
1187 if (i != 0)
1188 f &= ~ECONV_AFTER_OUTPUT;
1189 iold = *ipp;
1190 oold = *opp;
1191 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1192 if (iold != *ipp || oold != *opp)
1193 try = 1;
1194
1195 switch (res) {
1199 case econv_after_output:
1200 return i;
1201
1204 break;
1205
1206 case econv_finished:
1207 ec->num_finished = i+1;
1208 break;
1209 }
1210 }
1211 }
1212 return -1;
1213}
1214
1215static rb_econv_result_t
1216rb_trans_conv(rb_econv_t *ec,
1217 const unsigned char **input_ptr, const unsigned char *input_stop,
1218 unsigned char **output_ptr, unsigned char *output_stop,
1219 int flags,
1220 int *result_position_ptr)
1221{
1222 int i;
1223 int needreport_index;
1224 int sweep_start;
1225
1226 unsigned char empty_buf;
1227 unsigned char *empty_ptr = &empty_buf;
1228
1229 if (!input_ptr) {
1230 input_ptr = (const unsigned char **)&empty_ptr;
1231 input_stop = empty_ptr;
1232 }
1233
1234 if (!output_ptr) {
1235 output_ptr = &empty_ptr;
1236 output_stop = empty_ptr;
1237 }
1238
1239 if (ec->elems[0].last_result == econv_after_output)
1240 ec->elems[0].last_result = econv_source_buffer_empty;
1241
1242 for (i = ec->num_trans-1; 0 <= i; i--) {
1243 switch (ec->elems[i].last_result) {
1247 case econv_after_output:
1248 case econv_finished:
1249 sweep_start = i+1;
1250 goto found_needreport;
1251
1254 break;
1255
1256 default:
1257 rb_bug("unexpected transcode last result");
1258 }
1259 }
1260
1261 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1262
1263 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1264 (flags & ECONV_AFTER_OUTPUT)) {
1266
1267 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1269 result_position_ptr);
1270
1271 if (res == econv_source_buffer_empty)
1272 return econv_after_output;
1273 return res;
1274 }
1275
1276 sweep_start = 0;
1277
1278 found_needreport:
1279
1280 do {
1281 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1282 sweep_start = needreport_index + 1;
1283 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1284
1285 for (i = ec->num_trans-1; 0 <= i; i--) {
1286 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1287 rb_econv_result_t res = ec->elems[i].last_result;
1288 if (res == econv_invalid_byte_sequence ||
1289 res == econv_incomplete_input ||
1291 res == econv_after_output) {
1292 ec->elems[i].last_result = econv_source_buffer_empty;
1293 }
1294 if (result_position_ptr)
1295 *result_position_ptr = i;
1296 return res;
1297 }
1298 }
1299 if (result_position_ptr)
1300 *result_position_ptr = -1;
1302}
1303
1304static rb_econv_result_t
1305rb_econv_convert0(rb_econv_t *ec,
1306 const unsigned char **input_ptr, const unsigned char *input_stop,
1307 unsigned char **output_ptr, unsigned char *output_stop,
1308 int flags)
1309{
1311 int result_position;
1312 int has_output = 0;
1313
1314 memset(&ec->last_error, 0, sizeof(ec->last_error));
1315
1316 if (ec->num_trans == 0) {
1317 size_t len;
1318 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1319 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1320 len = output_stop - *output_ptr;
1321 memcpy(*output_ptr, ec->in_data_start, len);
1322 *output_ptr = output_stop;
1323 ec->in_data_start += len;
1325 goto gotresult;
1326 }
1327 len = ec->in_data_end - ec->in_data_start;
1328 memcpy(*output_ptr, ec->in_data_start, len);
1329 *output_ptr += len;
1330 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1331 if (flags & ECONV_AFTER_OUTPUT) {
1332 res = econv_after_output;
1333 goto gotresult;
1334 }
1335 }
1336 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1337 len = output_stop - *output_ptr;
1338 }
1339 else {
1340 len = input_stop - *input_ptr;
1341 }
1342 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1343 *(*output_ptr)++ = *(*input_ptr)++;
1344 res = econv_after_output;
1345 goto gotresult;
1346 }
1347 memcpy(*output_ptr, *input_ptr, len);
1348 *output_ptr += len;
1349 *input_ptr += len;
1350 if (*input_ptr != input_stop)
1352 else if (flags & ECONV_PARTIAL_INPUT)
1354 else
1355 res = econv_finished;
1356 goto gotresult;
1357 }
1358
1359 if (ec->elems[ec->num_trans-1].out_data_start) {
1360 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1361 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1362 if (data_start != data_end) {
1363 size_t len;
1364 if (output_stop - *output_ptr < data_end - data_start) {
1365 len = output_stop - *output_ptr;
1366 memcpy(*output_ptr, data_start, len);
1367 *output_ptr = output_stop;
1368 ec->elems[ec->num_trans-1].out_data_start += len;
1370 goto gotresult;
1371 }
1372 len = data_end - data_start;
1373 memcpy(*output_ptr, data_start, len);
1374 *output_ptr += len;
1375 ec->elems[ec->num_trans-1].out_data_start =
1376 ec->elems[ec->num_trans-1].out_data_end =
1377 ec->elems[ec->num_trans-1].out_buf_start;
1378 has_output = 1;
1379 }
1380 }
1381
1382 if (ec->in_buf_start &&
1383 ec->in_data_start != ec->in_data_end) {
1384 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1385 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1386 if (res != econv_source_buffer_empty)
1387 goto gotresult;
1388 }
1389
1390 if (has_output &&
1391 (flags & ECONV_AFTER_OUTPUT) &&
1392 *input_ptr != input_stop) {
1393 input_stop = *input_ptr;
1394 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1395 if (res == econv_source_buffer_empty)
1396 res = econv_after_output;
1397 }
1398 else if ((flags & ECONV_AFTER_OUTPUT) ||
1399 ec->num_trans == 1) {
1400 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1401 }
1402 else {
1403 flags |= ECONV_AFTER_OUTPUT;
1404 do {
1405 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1406 } while (res == econv_after_output);
1407 }
1408
1409 gotresult:
1410 ec->last_error.result = res;
1411 if (res == econv_invalid_byte_sequence ||
1412 res == econv_incomplete_input ||
1414 rb_transcoding *error_tc = ec->elems[result_position].tc;
1415 ec->last_error.error_tc = error_tc;
1416 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1417 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1418 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1419 ec->last_error.error_bytes_len = error_tc->recognized_len;
1420 ec->last_error.readagain_len = error_tc->readagain_len;
1421 }
1422
1423 return res;
1424}
1425
1426static int output_replacement_character(rb_econv_t *ec);
1427
1428static int
1429output_hex_charref(rb_econv_t *ec)
1430{
1431 int ret;
1432 unsigned char utfbuf[1024];
1433 const unsigned char *utf;
1434 size_t utf_len;
1435 int utf_allocated = 0;
1436 char charef_buf[16];
1437 const unsigned char *p;
1438
1439 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1440 utf = ec->last_error.error_bytes_start;
1441 utf_len = ec->last_error.error_bytes_len;
1442 }
1443 else {
1444 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1445 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1446 utfbuf, sizeof(utfbuf),
1447 &utf_len);
1448 if (!utf)
1449 return -1;
1450 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1451 utf_allocated = 1;
1452 }
1453
1454 if (utf_len % 4 != 0)
1455 goto fail;
1456
1457 p = utf;
1458 while (4 <= utf_len) {
1459 unsigned int u = 0;
1460 u += p[0] << 24;
1461 u += p[1] << 16;
1462 u += p[2] << 8;
1463 u += p[3];
1464 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1465
1466 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1467 if (ret == -1)
1468 goto fail;
1469
1470 p += 4;
1471 utf_len -= 4;
1472 }
1473
1474 if (utf_allocated)
1475 xfree((void *)utf);
1476 return 0;
1477
1478 fail:
1479 if (utf_allocated)
1480 xfree((void *)utf);
1481 return -1;
1482}
1483
1486 const unsigned char **input_ptr, const unsigned char *input_stop,
1487 unsigned char **output_ptr, unsigned char *output_stop,
1488 int flags)
1489{
1491
1492 unsigned char empty_buf;
1493 unsigned char *empty_ptr = &empty_buf;
1494
1495 ec->started = 1;
1496
1497 if (!input_ptr) {
1498 input_ptr = (const unsigned char **)&empty_ptr;
1499 input_stop = empty_ptr;
1500 }
1501
1502 if (!output_ptr) {
1503 output_ptr = &empty_ptr;
1504 output_stop = empty_ptr;
1505 }
1506
1507 resume:
1508 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1509
1510 if (ret == econv_invalid_byte_sequence ||
1511 ret == econv_incomplete_input) {
1512 /* deal with invalid byte sequence */
1513 /* todo: add more alternative behaviors */
1514 switch (ec->flags & ECONV_INVALID_MASK) {
1516 if (output_replacement_character(ec) == 0)
1517 goto resume;
1518 }
1519 }
1520
1521 if (ret == econv_undefined_conversion) {
1522 /* valid character in source encoding
1523 * but no related character(s) in destination encoding */
1524 /* todo: add more alternative behaviors */
1525 switch (ec->flags & ECONV_UNDEF_MASK) {
1527 if (output_replacement_character(ec) == 0)
1528 goto resume;
1529 break;
1530
1532 if (output_hex_charref(ec) == 0)
1533 goto resume;
1534 break;
1535 }
1536 }
1537
1538 return ret;
1539}
1540
1541const char *
1543{
1544 rb_transcoding *tc = ec->last_tc;
1545 const rb_transcoder *tr;
1546
1547 if (tc == NULL)
1548 return "";
1549
1550 tr = tc->transcoder;
1551
1552 if (tr->asciicompat_type == asciicompat_encoder)
1553 return tr->src_encoding;
1554 return tr->dst_encoding;
1555}
1556
1557static unsigned char *
1558allocate_converted_string(const char *sname, const char *dname,
1559 const unsigned char *str, size_t len,
1560 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1561 size_t *dst_len_ptr)
1562{
1563 unsigned char *dst_str;
1564 size_t dst_len;
1565 size_t dst_bufsize;
1566
1567 rb_econv_t *ec;
1569
1570 const unsigned char *sp;
1571 unsigned char *dp;
1572
1573 if (caller_dst_buf)
1574 dst_bufsize = caller_dst_bufsize;
1575 else if (len == 0)
1576 dst_bufsize = 1;
1577 else
1578 dst_bufsize = len;
1579
1580 ec = rb_econv_open(sname, dname, 0);
1581 if (ec == NULL)
1582 return NULL;
1583 if (caller_dst_buf)
1584 dst_str = caller_dst_buf;
1585 else
1586 dst_str = xmalloc(dst_bufsize);
1587 dst_len = 0;
1588 sp = str;
1589 dp = dst_str+dst_len;
1590 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1591 dst_len = dp - dst_str;
1592 while (res == econv_destination_buffer_full) {
1593 if (SIZE_MAX/2 < dst_bufsize) {
1594 goto fail;
1595 }
1596 dst_bufsize *= 2;
1597 if (dst_str == caller_dst_buf) {
1598 unsigned char *tmp;
1599 tmp = xmalloc(dst_bufsize);
1600 memcpy(tmp, dst_str, dst_bufsize/2);
1601 dst_str = tmp;
1602 }
1603 else {
1604 dst_str = xrealloc(dst_str, dst_bufsize);
1605 }
1606 dp = dst_str+dst_len;
1607 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1608 dst_len = dp - dst_str;
1609 }
1610 if (res != econv_finished) {
1611 goto fail;
1612 }
1613 rb_econv_close(ec);
1614 *dst_len_ptr = dst_len;
1615 return dst_str;
1616
1617 fail:
1618 if (dst_str != caller_dst_buf)
1619 xfree(dst_str);
1620 rb_econv_close(ec);
1621 return NULL;
1622}
1623
1624/* result: 0:success -1:failure */
1625int
1627 const unsigned char *str, size_t len, const char *str_encoding)
1628{
1629 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1630 unsigned char insert_buf[4096];
1631 const unsigned char *insert_str = NULL;
1632 size_t insert_len;
1633
1634 int last_trans_index;
1635 rb_transcoding *tc;
1636
1637 unsigned char **buf_start_p;
1638 unsigned char **data_start_p;
1639 unsigned char **data_end_p;
1640 unsigned char **buf_end_p;
1641
1642 size_t need;
1643
1644 ec->started = 1;
1645
1646 if (len == 0)
1647 return 0;
1648
1649 if (encoding_equal(insert_encoding, str_encoding)) {
1650 insert_str = str;
1651 insert_len = len;
1652 }
1653 else {
1654 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1655 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1656 if (insert_str == NULL)
1657 return -1;
1658 }
1659
1660 need = insert_len;
1661
1662 last_trans_index = ec->num_trans-1;
1663 if (ec->num_trans == 0) {
1664 tc = NULL;
1665 buf_start_p = &ec->in_buf_start;
1666 data_start_p = &ec->in_data_start;
1667 data_end_p = &ec->in_data_end;
1668 buf_end_p = &ec->in_buf_end;
1669 }
1670 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1671 tc = ec->elems[last_trans_index].tc;
1672 need += tc->readagain_len;
1673 if (need < insert_len)
1674 goto fail;
1675 if (last_trans_index == 0) {
1676 buf_start_p = &ec->in_buf_start;
1677 data_start_p = &ec->in_data_start;
1678 data_end_p = &ec->in_data_end;
1679 buf_end_p = &ec->in_buf_end;
1680 }
1681 else {
1682 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1683 buf_start_p = &ee->out_buf_start;
1684 data_start_p = &ee->out_data_start;
1685 data_end_p = &ee->out_data_end;
1686 buf_end_p = &ee->out_buf_end;
1687 }
1688 }
1689 else {
1690 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1691 buf_start_p = &ee->out_buf_start;
1692 data_start_p = &ee->out_data_start;
1693 data_end_p = &ee->out_data_end;
1694 buf_end_p = &ee->out_buf_end;
1695 tc = ec->elems[last_trans_index].tc;
1696 }
1697
1698 if (*buf_start_p == NULL) {
1699 unsigned char *buf = xmalloc(need);
1700 *buf_start_p = buf;
1701 *data_start_p = buf;
1702 *data_end_p = buf;
1703 *buf_end_p = buf+need;
1704 }
1705 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1706 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1707 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1708 *data_start_p = *buf_start_p;
1709 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1710 unsigned char *buf;
1711 size_t s = (*data_end_p - *buf_start_p) + need;
1712 if (s < need)
1713 goto fail;
1714 buf = xrealloc(*buf_start_p, s);
1715 *data_start_p = buf;
1716 *data_end_p = buf + (*data_end_p - *buf_start_p);
1717 *buf_start_p = buf;
1718 *buf_end_p = buf + s;
1719 }
1720 }
1721
1722 memcpy(*data_end_p, insert_str, insert_len);
1723 *data_end_p += insert_len;
1724 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1725 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1726 *data_end_p += tc->readagain_len;
1727 tc->readagain_len = 0;
1728 }
1729
1730 if (insert_str != str && insert_str != insert_buf)
1731 xfree((void*)insert_str);
1732 return 0;
1733
1734 fail:
1735 if (insert_str != str && insert_str != insert_buf)
1736 xfree((void*)insert_str);
1737 return -1;
1738}
1739
1740void
1742{
1743 int i;
1744
1745 if (ec->replacement_allocated) {
1746 xfree((void *)ec->replacement_str);
1747 }
1748 for (i = 0; i < ec->num_trans; i++) {
1749 rb_transcoding_close(ec->elems[i].tc);
1750 xfree(ec->elems[i].out_buf_start);
1751 }
1752 xfree(ec->in_buf_start);
1753 xfree(ec->elems);
1754 xfree(ec);
1755}
1756
1757size_t
1758rb_econv_memsize(rb_econv_t *ec)
1759{
1760 size_t size = sizeof(rb_econv_t);
1761 int i;
1762
1763 if (ec->replacement_allocated) {
1764 size += ec->replacement_len;
1765 }
1766 for (i = 0; i < ec->num_trans; i++) {
1767 size += rb_transcoding_memsize(ec->elems[i].tc);
1768
1769 if (ec->elems[i].out_buf_start) {
1770 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1771 }
1772 }
1773 size += ec->in_buf_end - ec->in_buf_start;
1774 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1775
1776 return size;
1777}
1778
1779int
1781{
1782 if (ec->num_trans == 0)
1783 return 0;
1784#if SIZEOF_SIZE_T > SIZEOF_INT
1785 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1786#endif
1787 return (int)ec->elems[0].tc->readagain_len;
1788}
1789
1790void
1791rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1792{
1793 rb_transcoding *tc;
1794 if (ec->num_trans == 0 || n == 0)
1795 return;
1796 tc = ec->elems[0].tc;
1797 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1798 tc->readagain_len -= n;
1799}
1800
/* Argument block for asciicompat_encoding_i(). */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;   /* result; set by the iterator when a decoder is found */
    const char *ascii_incompat_name; /* encoding name being looked up */
};
1805
1806static int
1807asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1808{
1809 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1810 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1811 const rb_transcoder *tr;
1812
1813 if (DECORATOR_P(entry->sname, entry->dname))
1814 return ST_CONTINUE;
1815 tr = load_transcoder_entry(entry);
1816 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1817 data->ascii_compat_name = tr->dst_encoding;
1818 return ST_STOP;
1819 }
1820 return ST_CONTINUE;
1821}
1822
1823const char *
1824rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1825{
1826 st_data_t v;
1827 st_table *table2;
1828 struct asciicompat_encoding_t data = {0};
1829
1830 unsigned int lev;
1831 RB_VM_LOCK_ENTER_LEV(&lev);
1832 {
1833 if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) {
1834 table2 = (st_table *)v;
1835 /*
1836 * Assumption:
1837 * There is at most one transcoder for
1838 * converting from ASCII incompatible encoding.
1839 *
1840 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1841 */
1842 if (table2->num_entries == 1) {
1843 data.ascii_incompat_name = ascii_incompat_name;
1844 data.ascii_compat_name = NULL;
1845 if (rb_multi_ractor_p()) {
1846 /*
1847 * We need to unlock in case `load_transcoder_entry` actually loads the encoding
1848 * and table2 could be inserted into when we unlock.
1849 */
1850 st_table *dup_table2 = st_copy(table2);
1851 RB_VM_LOCK_LEAVE_LEV(&lev);
1852 st_foreach(dup_table2, asciicompat_encoding_i, (st_data_t)&data);
1853 st_free_table(dup_table2);
1854 RB_VM_LOCK_ENTER_LEV(&lev);
1855 }
1856 else {
1857 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1858 }
1859 }
1860
1861 }
1862 }
1863 RB_VM_LOCK_LEAVE_LEV(&lev);
1864
1865 return data.ascii_compat_name; // can be NULL
1866}
1867
1868/*
1869 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1870 *
1871 * If the result of the conversion is not compatible with the encoding of
1872 * `dst`, `dst` may not be valid encoding.
1873 */
1874VALUE
1875rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1876{
1877 unsigned const char *sp, *se;
1878 unsigned char *ds, *dp, *de;
1880 int max_output;
1881 enum ruby_coderange_type coderange;
1882 rb_encoding *dst_enc = ec->destination_encoding;
1883
1884 if (NIL_P(dst)) {
1885 dst = rb_str_buf_new(len);
1886 if (dst_enc) {
1887 rb_enc_associate(dst, dst_enc);
1888 }
1889 coderange = ENC_CODERANGE_7BIT; // scan from the start
1890 }
1891 else {
1892 dst_enc = rb_enc_get(dst);
1893 coderange = rb_enc_str_coderange(dst);
1894 }
1895
1896 if (ec->last_tc)
1897 max_output = ec->last_tc->transcoder->max_output;
1898 else
1899 max_output = 1;
1900
1901 do {
1902 int cr;
1903 long dlen = RSTRING_LEN(dst);
1904 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1905 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1906 if (LONG_MAX < new_capa)
1907 rb_raise(rb_eArgError, "too long string");
1908 rb_str_modify_expand(dst, new_capa - dlen);
1909 }
1910 sp = (const unsigned char *)ss;
1911 se = sp + len;
1912 ds = (unsigned char *)RSTRING_PTR(dst);
1913 de = ds + rb_str_capacity(dst);
1914 dp = ds += dlen;
1915 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1916 switch (coderange) {
1917 case ENC_CODERANGE_7BIT:
1919 cr = (int)coderange;
1920 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1921 coderange = cr;
1922 ENC_CODERANGE_SET(dst, coderange);
1923 break;
1926 break;
1927 }
1928 len -= (const char *)sp - ss;
1929 ss = (const char *)sp;
1930 rb_str_set_len(dst, dlen + (dp - ds));
1932 } while (res == econv_destination_buffer_full);
1933
1934 return dst;
1935}
1936
1937VALUE
1938rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1939{
1940 src = rb_str_new_frozen(src);
1941 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1942 RB_GC_GUARD(src);
1943 return dst;
1944}
1945
1946VALUE
1948{
1949 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1950}
1951
1952VALUE
1953rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1954{
1955 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1956}
1957
1958VALUE
1960{
1961 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1962}
1963
1964static int
1965rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1966{
1967 transcoder_entry_t *entry;
1968 const rb_transcoder *tr = NULL;
1969
1970 if (ec->started != 0)
1971 return -1;
1972
1973 entry = get_transcoder_entry(sname, dname);
1974 if (entry) {
1975 tr = load_transcoder_entry(entry);
1976 }
1977
1978 return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1;
1979}
1980
1981static int
1982rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1983{
1984 return rb_econv_add_converter(ec, "", decorator_name, n);
1985}
1986
1987int
1988rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1989{
1990 const rb_transcoder *tr;
1991
1992 if (ec->num_trans == 0)
1993 return rb_econv_decorate_at(ec, decorator_name, 0);
1994
1995 tr = ec->elems[0].tc->transcoder;
1996
1997 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1998 tr->asciicompat_type == asciicompat_decoder)
1999 return rb_econv_decorate_at(ec, decorator_name, 1);
2000
2001 return rb_econv_decorate_at(ec, decorator_name, 0);
2002}
2003
2004int
2005rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
2006{
2007 const rb_transcoder *tr;
2008
2009 if (ec->num_trans == 0)
2010 return rb_econv_decorate_at(ec, decorator_name, 0);
2011
2012 tr = ec->elems[ec->num_trans-1].tc->transcoder;
2013
2014 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
2015 tr->asciicompat_type == asciicompat_encoder)
2016 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
2017
2018 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
2019}
2020
2021void
2023{
2024 const char *dname = 0;
2025
2026 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2028 dname = "universal_newline";
2029 break;
2031 dname = "crlf_newline";
2032 break;
2034 dname = "cr_newline";
2035 break;
2037 dname = "lf_newline";
2038 break;
2039 }
2040
2041 if (dname) {
2042 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2043 int num_trans = ec->num_trans;
2044 int i, j = 0;
2045
2046 for (i=0; i < num_trans; i++) {
2047 if (transcoder == ec->elems[i].tc->transcoder) {
2048 rb_transcoding_close(ec->elems[i].tc);
2049 xfree(ec->elems[i].out_buf_start);
2050 ec->num_trans--;
2051 }
2052 else
2053 ec->elems[j++] = ec->elems[i];
2054 }
2055 }
2056
2057 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2058}
2059
2060static VALUE
2061econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2062{
2063 int has_description = 0;
2064
2065 if (NIL_P(mesg))
2066 mesg = rb_str_new(NULL, 0);
2067
2068 if (*sname != '\0' || *dname != '\0') {
2069 if (*sname == '\0')
2070 rb_str_cat2(mesg, dname);
2071 else if (*dname == '\0')
2072 rb_str_cat2(mesg, sname);
2073 else
2074 rb_str_catf(mesg, "%s to %s", sname, dname);
2075 has_description = 1;
2076 }
2077
2078 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2082 const char *pre = "";
2083 if (has_description)
2084 rb_str_cat2(mesg, " with ");
2085 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2086 rb_str_cat2(mesg, pre); pre = ",";
2087 rb_str_cat2(mesg, "universal_newline");
2088 }
2089 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2090 rb_str_cat2(mesg, pre); pre = ",";
2091 rb_str_cat2(mesg, "crlf_newline");
2092 }
2093 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2094 rb_str_cat2(mesg, pre); pre = ",";
2095 rb_str_cat2(mesg, "cr_newline");
2096 }
2097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2098 rb_str_cat2(mesg, pre); pre = ",";
2099 rb_str_cat2(mesg, "lf_newline");
2100 }
2101 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2102 rb_str_cat2(mesg, pre); pre = ",";
2103 rb_str_cat2(mesg, "xml_text");
2104 }
2105 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2106 rb_str_cat2(mesg, pre); pre = ",";
2107 rb_str_cat2(mesg, "xml_attr_content");
2108 }
2109 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2110 rb_str_cat2(mesg, pre); pre = ",";
2111 rb_str_cat2(mesg, "xml_attr_quote");
2112 }
2113 has_description = 1;
2114 }
2115 if (!has_description) {
2116 rb_str_cat2(mesg, "no-conversion");
2117 }
2118
2119 return mesg;
2120}
2121
2122VALUE
2123rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2124{
2125 VALUE mesg, exc;
2126 mesg = rb_str_new_cstr("code converter not found (");
2127 econv_description(sname, dname, ecflags, mesg);
2128 rb_str_cat2(mesg, ")");
2129 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2130 return exc;
2131}
2132
2133static VALUE
2134make_econv_exception(rb_econv_t *ec)
2135{
2136 VALUE mesg, exc;
2137 if (ec->last_error.result == econv_invalid_byte_sequence ||
2138 ec->last_error.result == econv_incomplete_input) {
2139 const char *err = (const char *)ec->last_error.error_bytes_start;
2140 size_t error_len = ec->last_error.error_bytes_len;
2141 VALUE bytes = rb_str_new(err, error_len);
2142 VALUE dumped = rb_str_dump(bytes);
2143 size_t readagain_len = ec->last_error.readagain_len;
2144 VALUE bytes2 = Qnil;
2145 VALUE dumped2;
2146 if (ec->last_error.result == econv_incomplete_input) {
2147 mesg = rb_sprintf("incomplete %s on %s",
2148 StringValueCStr(dumped),
2149 ec->last_error.source_encoding);
2150 }
2151 else if (readagain_len) {
2152 bytes2 = rb_str_new(err+error_len, readagain_len);
2153 dumped2 = rb_str_dump(bytes2);
2154 mesg = rb_sprintf("%s followed by %s on %s",
2155 StringValueCStr(dumped),
2156 StringValueCStr(dumped2),
2157 ec->last_error.source_encoding);
2158 }
2159 else {
2160 mesg = rb_sprintf("%s on %s",
2161 StringValueCStr(dumped),
2162 ec->last_error.source_encoding);
2163 }
2164
2165 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2166 rb_ivar_set(exc, id_error_bytes, bytes);
2167 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2168 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2169 goto set_encs;
2170 }
2171 if (ec->last_error.result == econv_undefined_conversion) {
2172 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2173 ec->last_error.error_bytes_len);
2174 VALUE dumped = Qnil;
2175 int idx;
2176 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2177 rb_encoding *utf8 = rb_utf8_encoding();
2178 const char *start, *end;
2179 int n;
2180 start = (const char *)ec->last_error.error_bytes_start;
2181 end = start + ec->last_error.error_bytes_len;
2182 n = rb_enc_precise_mbclen(start, end, utf8);
2183 if (MBCLEN_CHARFOUND_P(n) &&
2184 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2185 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2186 dumped = rb_sprintf("U+%04X", cc);
2187 }
2188 }
2189 if (NIL_P(dumped))
2190 dumped = rb_str_dump(bytes);
2191 if (strcmp(ec->last_error.source_encoding,
2192 ec->source_encoding_name) == 0 &&
2193 strcmp(ec->last_error.destination_encoding,
2194 ec->destination_encoding_name) == 0) {
2195 mesg = rb_sprintf("%s from %s to %s",
2196 StringValueCStr(dumped),
2197 ec->last_error.source_encoding,
2198 ec->last_error.destination_encoding);
2199 }
2200 else {
2201 int i;
2202 mesg = rb_sprintf("%s to %s in conversion from %s",
2203 StringValueCStr(dumped),
2204 ec->last_error.destination_encoding,
2205 ec->source_encoding_name);
2206 for (i = 0; i < ec->num_trans; i++) {
2207 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2208 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2209 rb_str_catf(mesg, " to %s",
2210 ec->elems[i].tc->transcoder->dst_encoding);
2211 }
2212 }
2213 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2214 idx = rb_enc_find_index(ec->last_error.source_encoding);
2215 if (0 <= idx)
2216 rb_enc_associate_index(bytes, idx);
2217 rb_ivar_set(exc, id_error_char, bytes);
2218 goto set_encs;
2219 }
2220 return Qnil;
2221
2222 set_encs:
2223 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2224 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2225 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2226 if (0 <= idx)
2227 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2228 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2229 if (0 <= idx)
2230 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2231 return exc;
2232}
2233
2234static void
2235more_output_buffer(
2236 VALUE destination,
2237 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2238 int max_output,
2239 unsigned char **out_start_ptr,
2240 unsigned char **out_pos,
2241 unsigned char **out_stop_ptr)
2242{
2243 size_t len = (*out_pos - *out_start_ptr);
2244 size_t new_len = (len + max_output) * 2;
2245 *out_start_ptr = resize_destination(destination, len, new_len);
2246 *out_pos = *out_start_ptr + len;
2247 *out_stop_ptr = *out_start_ptr + new_len;
2248}
2249
2250static int
2251make_replacement(rb_econv_t *ec)
2252{
2253 rb_transcoding *tc;
2254 const rb_transcoder *tr;
2255 const unsigned char *replacement;
2256 const char *repl_enc;
2257 const char *ins_enc;
2258 size_t len;
2259
2260 if (ec->replacement_str)
2261 return 0;
2262
2264
2265 tc = ec->last_tc;
2266 if (*ins_enc) {
2267 tr = tc->transcoder;
2268 rb_enc_find(tr->dst_encoding);
2269 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2270 }
2271 else {
2272 replacement = (unsigned char *)"?";
2273 len = 1;
2274 repl_enc = "";
2275 }
2276
2277 ec->replacement_str = replacement;
2278 ec->replacement_len = len;
2279 ec->replacement_enc = repl_enc;
2280 ec->replacement_allocated = 0;
2281 return 0;
2282}
2283
2284int
2286 const unsigned char *str, size_t len, const char *encname)
2287{
2288 unsigned char *str2;
2289 size_t len2;
2290 const char *encname2;
2291
2293
2294 if (!*encname2 || encoding_equal(encname, encname2)) {
2295 str2 = xmalloc(len);
2296 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2297 len2 = len;
2298 encname2 = encname;
2299 }
2300 else {
2301 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2302 if (!str2)
2303 return -1;
2304 }
2305
2306 if (ec->replacement_allocated) {
2307 xfree((void *)ec->replacement_str);
2308 }
2309 ec->replacement_allocated = 1;
2310 ec->replacement_str = str2;
2311 ec->replacement_len = len2;
2312 ec->replacement_enc = encname2;
2313 return 0;
2314}
2315
2316static int
2317output_replacement_character(rb_econv_t *ec)
2318{
2319 int ret;
2320
2321 if (make_replacement(ec) == -1)
2322 return -1;
2323
2324 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2325 if (ret == -1)
2326 return -1;
2327
2328 return 0;
2329}
2330
2331#if 1
2332#define hash_fallback rb_hash_aref
2333
2334static VALUE
2335proc_fallback(VALUE fallback, VALUE c)
2336{
2337 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2338}
2339
2340static VALUE
2341method_fallback(VALUE fallback, VALUE c)
2342{
2343 return rb_method_call(1, &c, fallback);
2344}
2345
2346static VALUE
2347aref_fallback(VALUE fallback, VALUE c)
2348{
2349 return rb_funcallv_public(fallback, idAREF, 1, &c);
2350}
2351
2353 VALUE (*fallback_func)(VALUE, VALUE);
2354 VALUE fallback;
2355 VALUE rep;
2356};
2357
2358static VALUE
2359transcode_loop_fallback_try(VALUE a)
2360{
2362
2363 VALUE ret = args->fallback_func(args->fallback, args->rep);
2364
2365 if (!UNDEF_P(ret) && !NIL_P(ret)) {
2366 StringValue(ret);
2367 }
2368
2369 return ret;
2370}
2371
2372static void
2373transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2374 const unsigned char *in_stop, unsigned char *out_stop,
2375 VALUE destination,
2376 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2377 const char *src_encoding,
2378 const char *dst_encoding,
2379 int ecflags,
2380 VALUE ecopts)
2381{
2382 rb_econv_t *ec;
2383 rb_transcoding *last_tc;
2385 unsigned char *out_start = *out_pos;
2386 int max_output;
2387 VALUE exc;
2388 VALUE fallback = Qnil;
2389 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2390
2391 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2392 if (!ec)
2393 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2394
2395 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2396 fallback = rb_hash_aref(ecopts, sym_fallback);
2397 if (RB_TYPE_P(fallback, T_HASH)) {
2398 fallback_func = hash_fallback;
2399 }
2400 else if (rb_obj_is_proc(fallback)) {
2401 fallback_func = proc_fallback;
2402 }
2403 else if (rb_obj_is_method(fallback)) {
2404 fallback_func = method_fallback;
2405 }
2406 else {
2407 fallback_func = aref_fallback;
2408 }
2409 }
2410 last_tc = ec->last_tc;
2411 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2412
2413 resume:
2414 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2415
2416 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2417 VALUE rep = rb_enc_str_new(
2418 (const char *)ec->last_error.error_bytes_start,
2419 ec->last_error.error_bytes_len,
2420 rb_enc_find(ec->last_error.source_encoding));
2421
2422
2423 struct transcode_loop_fallback_args args = {
2424 .fallback_func = fallback_func,
2425 .fallback = fallback,
2426 .rep = rep,
2427 };
2428
2429 int state;
2430 rep = rb_protect(transcode_loop_fallback_try, (VALUE)&args, &state);
2431 if (state) {
2432 rb_econv_close(ec);
2433 rb_jump_tag(state);
2434 }
2435
2436 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2437 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2438 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2439 if ((int)ret == -1) {
2440 rb_econv_close(ec);
2441 rb_raise(rb_eArgError, "too big fallback string");
2442 }
2443 goto resume;
2444 }
2445 }
2446
2447 if (ret == econv_invalid_byte_sequence ||
2448 ret == econv_incomplete_input ||
2450 exc = make_econv_exception(ec);
2451 rb_econv_close(ec);
2452 rb_exc_raise(exc);
2453 }
2454
2455 if (ret == econv_destination_buffer_full) {
2456 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2457 goto resume;
2458 }
2459
2460 rb_econv_close(ec);
2461 return;
2462}
2463#else
2464/* sample transcode_loop implementation in byte-by-byte stream style */
2465static void
2466transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2467 const unsigned char *in_stop, unsigned char *out_stop,
2468 VALUE destination,
2469 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2470 const char *src_encoding,
2471 const char *dst_encoding,
2472 int ecflags,
2473 VALUE ecopts)
2474{
2475 rb_econv_t *ec;
2476 rb_transcoding *last_tc;
2478 unsigned char *out_start = *out_pos;
2479 const unsigned char *ptr;
2480 int max_output;
2481 VALUE exc;
2482
2483 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2484 if (!ec)
2485 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2486
2487 last_tc = ec->last_tc;
2488 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2489
2491 ptr = *in_pos;
2492 while (ret != econv_finished) {
2493 unsigned char input_byte;
2494 const unsigned char *p = &input_byte;
2495
2496 if (ret == econv_source_buffer_empty) {
2497 if (ptr < in_stop) {
2498 input_byte = *ptr;
2499 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2500 }
2501 else {
2502 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2503 }
2504 }
2505 else {
2506 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2507 }
2508 if (&input_byte != p)
2509 ptr += p - &input_byte;
2510 switch (ret) {
2514 exc = make_econv_exception(ec);
2515 rb_econv_close(ec);
2516 rb_exc_raise(exc);
2517 break;
2518
2520 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2521 break;
2522
2524 break;
2525
2526 case econv_finished:
2527 break;
2528 }
2529 }
2530 rb_econv_close(ec);
2531 *in_pos = in_stop;
2532 return;
2533}
2534#endif
2535
2536
2537/*
2538 * String-specific code
2539 */
2540
2541static unsigned char *
2542str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2543{
2544 rb_str_resize(destination, new_len);
2545 return (unsigned char *)RSTRING_PTR(destination);
2546}
2547
2548static int
2549econv_opts(VALUE opt, int ecflags)
2550{
2551 VALUE v;
2552 int newlineflag = 0;
2553
2554 v = rb_hash_aref(opt, sym_invalid);
2555 if (NIL_P(v)) {
2556 }
2557 else if (v==sym_replace) {
2558 ecflags |= ECONV_INVALID_REPLACE;
2559 }
2560 else {
2561 rb_raise(rb_eArgError, "unknown value for invalid character option");
2562 }
2563
2564 v = rb_hash_aref(opt, sym_undef);
2565 if (NIL_P(v)) {
2566 }
2567 else if (v==sym_replace) {
2568 ecflags |= ECONV_UNDEF_REPLACE;
2569 }
2570 else {
2571 rb_raise(rb_eArgError, "unknown value for undefined character option");
2572 }
2573
2574 v = rb_hash_aref(opt, sym_replace);
2575 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2576 ecflags |= ECONV_UNDEF_REPLACE;
2577 }
2578
2579 v = rb_hash_aref(opt, sym_xml);
2580 if (!NIL_P(v)) {
2581 if (v==sym_text) {
2583 }
2584 else if (v==sym_attr) {
2586 }
2587 else if (SYMBOL_P(v)) {
2588 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2589 }
2590 else {
2591 rb_raise(rb_eArgError, "unexpected value for xml option");
2592 }
2593 }
2594
2595#ifdef ENABLE_ECONV_NEWLINE_OPTION
2596 v = rb_hash_aref(opt, sym_newline);
2597 if (!NIL_P(v)) {
2598 newlineflag = 2;
2599 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2600 if (v == sym_universal) {
2602 }
2603 else if (v == sym_crlf) {
2605 }
2606 else if (v == sym_cr) {
2607 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2608 }
2609 else if (v == sym_lf) {
2610 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2611 }
2612 else if (SYMBOL_P(v)) {
2613 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2614 rb_sym2str(v));
2615 }
2616 else {
2617 rb_raise(rb_eArgError, "unexpected value for newline option");
2618 }
2619 }
2620#endif
2621 {
2622 int setflags = 0;
2623
2624 v = rb_hash_aref(opt, sym_universal_newline);
2625 if (RTEST(v))
2627 newlineflag |= !NIL_P(v);
2628
2629 v = rb_hash_aref(opt, sym_crlf_newline);
2630 if (RTEST(v))
2631 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2632 newlineflag |= !NIL_P(v);
2633
2634 v = rb_hash_aref(opt, sym_cr_newline);
2635 if (RTEST(v))
2636 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2637 newlineflag |= !NIL_P(v);
2638
2639 v = rb_hash_aref(opt, sym_lf_newline);
2640 if (RTEST(v))
2641 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2642 newlineflag |= !NIL_P(v);
2643
2644 switch (newlineflag) {
2645 case 1:
2646 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2647 ecflags |= setflags;
2648 break;
2649
2650 case 3:
2651 rb_warning(":newline option precedes other newline options");
2652 break;
2653 }
2654 }
2655
2656 return ecflags;
2657}
2658
2659int
2660rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2661{
2662 VALUE newhash = Qnil;
2663 VALUE v;
2664
2665 if (NIL_P(opthash)) {
2666 *opts = Qnil;
2667 return ecflags;
2668 }
2669 ecflags = econv_opts(opthash, ecflags);
2670
2671 v = rb_hash_aref(opthash, sym_replace);
2672 if (!NIL_P(v)) {
2673 StringValue(v);
2674 if (is_broken_string(v)) {
2675 VALUE dumped = rb_str_dump(v);
2676 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2677 StringValueCStr(dumped),
2678 rb_enc_name(rb_enc_get(v)));
2679 }
2680 v = rb_str_new_frozen(v);
2681 newhash = rb_hash_new();
2682 rb_hash_aset(newhash, sym_replace, v);
2683 }
2684
2685 v = rb_hash_aref(opthash, sym_fallback);
2686 if (!NIL_P(v)) {
2687 VALUE h = rb_check_hash_type(v);
2688 if (NIL_P(h)
2689 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2690 : (v = h, 1)) {
2691 if (NIL_P(newhash))
2692 newhash = rb_hash_new();
2693 rb_hash_aset(newhash, sym_fallback, v);
2694 }
2695 }
2696
2697 if (!NIL_P(newhash))
2698 rb_hash_freeze(newhash);
2699 *opts = newhash;
2700
2701 return ecflags;
2702}
2703
2704int
2706{
2707 return rb_econv_prepare_options(opthash, opts, 0);
2708}
2709
2710rb_econv_t *
2711rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2712{
2713 rb_econv_t *ec;
2714 VALUE replacement;
2715
2716 if (NIL_P(opthash)) {
2717 replacement = Qnil;
2718 }
2719 else {
2720 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2721 rb_bug("rb_econv_open_opts called with invalid opthash");
2722 replacement = rb_hash_aref(opthash, sym_replace);
2723 }
2724
2725 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2726 if (ec) {
2727 if (!NIL_P(replacement)) {
2728 int ret;
2729 rb_encoding *enc = rb_enc_get(replacement);
2730
2731 ret = rb_econv_set_replacement(ec,
2732 (const unsigned char *)RSTRING_PTR(replacement),
2733 RSTRING_LEN(replacement),
2734 rb_enc_name(enc));
2735 if (ret == -1) {
2736 rb_econv_close(ec);
2737 ec = NULL;
2738 }
2739 }
2740 }
2741 return ec; // can be NULL
2742}
2743
2744static int
2745enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2746{
2747 rb_encoding *enc;
2748 const char *n;
2749 int encidx;
2750 VALUE encval;
2751
2752 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2753 !(enc = rb_enc_from_index(encidx))) {
2754 enc = NULL;
2755 encidx = 0;
2756 n = StringValueCStr(*arg);
2757 }
2758 else {
2759 n = rb_enc_name(enc);
2760 }
2761
2762 *name_p = n;
2763 *enc_p = enc;
2764
2765 return encidx;
2766}
2767
2768static int
2769str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2770 const char **sname_p, rb_encoding **senc_p,
2771 const char **dname_p, rb_encoding **denc_p)
2772{
2773 rb_encoding *senc, *denc;
2774 const char *sname, *dname;
2775 int sencidx, dencidx;
2776
2777 dencidx = enc_arg(arg1, &dname, &denc);
2778
2779 if (NIL_P(*arg2)) {
2780 sencidx = rb_enc_get_index(str);
2781 senc = rb_enc_from_index(sencidx);
2782 sname = rb_enc_name(senc);
2783 }
2784 else {
2785 sencidx = enc_arg(arg2, &sname, &senc);
2786 }
2787
2788 *sname_p = sname;
2789 *senc_p = senc;
2790 *dname_p = dname;
2791 *denc_p = denc;
2792 return dencidx;
2793}
2794
2795static int
2796str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2797{
2798 VALUE dest;
2799 VALUE str = *self;
2800 VALUE arg1, arg2;
2801 long blen, slen;
2802 unsigned char *buf, *bp, *sp;
2803 const unsigned char *fromp;
2804 rb_encoding *senc, *denc;
2805 const char *sname, *dname;
2806 int dencidx;
2807 int explicitly_invalid_replace = TRUE;
2808
2809 rb_check_arity(argc, 0, 2);
2810
2811 if (argc == 0) {
2812 arg1 = rb_enc_default_internal();
2813 if (NIL_P(arg1)) {
2814 if (!ecflags) return -1;
2815 arg1 = rb_obj_encoding(str);
2816 }
2817 if (!(ecflags & ECONV_INVALID_MASK)) {
2818 explicitly_invalid_replace = FALSE;
2819 }
2821 }
2822 else {
2823 arg1 = argv[0];
2824 }
2825 arg2 = argc<=1 ? Qnil : argv[1];
2826 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2827
2828 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2832 if (senc && senc == denc) {
2833 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2834 VALUE rep = Qnil;
2835 if (!NIL_P(ecopts)) {
2836 rep = rb_hash_aref(ecopts, sym_replace);
2837 }
2838 dest = rb_enc_str_scrub(senc, str, rep);
2839 if (NIL_P(dest)) dest = str;
2840 *self = dest;
2841 return dencidx;
2842 }
2843 return NIL_P(arg2) ? -1 : dencidx;
2844 }
2845 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2846 if (is_ascii_string(str)) {
2847 return dencidx;
2848 }
2849 }
2850 if (encoding_equal(sname, dname)) {
2851 return NIL_P(arg2) ? -1 : dencidx;
2852 }
2853 }
2854 else {
2855 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2856 rb_encoding *utf8 = rb_utf8_encoding();
2857 str = rb_str_conv_enc(str, senc, utf8);
2858 senc = utf8;
2859 sname = "UTF-8";
2860 }
2861 if (encoding_equal(sname, dname)) {
2862 sname = "";
2863 dname = "";
2864 }
2865 }
2866
2867 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2868 slen = RSTRING_LEN(str);
2869 blen = slen + 30; /* len + margin */
2870 dest = rb_str_tmp_new(blen);
2871 bp = (unsigned char *)RSTRING_PTR(dest);
2872
2873 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2874 if (fromp != sp+slen) {
2875 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2876 }
2877 buf = (unsigned char *)RSTRING_PTR(dest);
2878 *bp = '\0';
2879 rb_str_set_len(dest, bp - buf);
2880
2881 /* set encoding */
2882 if (!denc) {
2883 dencidx = rb_define_dummy_encoding(dname);
2884 RB_GC_GUARD(arg1);
2885 RB_GC_GUARD(arg2);
2886 }
2887 *self = dest;
2888
2889 return dencidx;
2890}
2891
2892static int
2893str_transcode(int argc, VALUE *argv, VALUE *self)
2894{
2895 VALUE opt;
2896 int ecflags = 0;
2897 VALUE ecopts = Qnil;
2898
2899 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2900 if (!NIL_P(opt)) {
2901 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2902 }
2903 return str_transcode0(argc, argv, self, ecflags, ecopts);
2904}
2905
2906static inline VALUE
2907str_encode_associate(VALUE str, int encidx)
2908{
2909 int cr = 0;
2910
2911 rb_enc_associate_index(str, encidx);
2912
2913 /* transcoded string never be broken. */
2914 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2915 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2916 }
2917 else {
2919 }
2920 ENC_CODERANGE_SET(str, cr);
2921 return str;
2922}
2923
2924/*
2925 * call-seq:
2926 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2927 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2928 *
2929 * Like #encode, but applies encoding changes to +self+; returns +self+.
2930 *
2931 * Related: see {Modifying}[rdoc-ref:String@Modifying].
2932 */
2933
2934static VALUE
2935str_encode_bang(int argc, VALUE *argv, VALUE str)
2936{
2937 VALUE newstr;
2938 int encidx;
2939
2940 rb_check_frozen(str);
2941
2942 newstr = str;
2943 encidx = str_transcode(argc, argv, &newstr);
2944
2945 if (encidx < 0) return str;
2946 if (newstr == str) {
2947 rb_enc_associate_index(str, encidx);
2948 return str;
2949 }
2950 rb_str_shared_replace(str, newstr);
2951 return str_encode_associate(str, encidx);
2952}
2953
2954static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2955
2956/*
2957 * call-seq:
2958 * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2959 * encode(dst_encoding, src_encoding, **enc_opts) -> string
2960 *
2961 * :include: doc/string/encode.rdoc
2962 *
2963 */
2964
2965static VALUE
2966str_encode(int argc, VALUE *argv, VALUE str)
2967{
2968 VALUE newstr = str;
2969 int encidx = str_transcode(argc, argv, &newstr);
2970 return encoded_dup(newstr, str, encidx);
2971}
2972
2973VALUE
2974rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2975{
2976 int argc = 1;
2977 VALUE *argv = &to;
2978 VALUE newstr = str;
2979 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2980 return encoded_dup(newstr, str, encidx);
2981}
2982
2983static VALUE
2984encoded_dup(VALUE newstr, VALUE str, int encidx)
2985{
2986 if (encidx < 0) return rb_str_dup(str);
2987 if (newstr == str) {
2988 newstr = rb_str_dup(str);
2989 rb_enc_associate_index(newstr, encidx);
2990 return newstr;
2991 }
2992 else {
2993 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2994 }
2995 return str_encode_associate(newstr, encidx);
2996}
2997
2998/*
2999 * Document-class: Encoding::Converter
3000 *
3001 * Encoding conversion class.
3002 */
3003static void
3004econv_free(void *ptr)
3005{
3006 rb_econv_t *ec = ptr;
3007 rb_econv_close(ec);
3008}
3009
3010static size_t
3011econv_memsize(const void *ptr)
3012{
3013 return sizeof(rb_econv_t);
3014}
3015
3016static const rb_data_type_t econv_data_type = {
3017 "econv",
3018 {0, econv_free, econv_memsize,},
3019 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
3020};
3021
3022static VALUE
3023econv_s_allocate(VALUE klass)
3024{
3025 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
3026}
3027
3028static rb_encoding *
3029make_dummy_encoding(const char *name)
3030{
3031 rb_encoding *enc;
3032 int idx;
3033 idx = rb_define_dummy_encoding(name);
3034 enc = rb_enc_from_index(idx);
3035 return enc;
3036}
3037
3038static rb_encoding *
3039make_encoding(const char *name)
3040{
3041 rb_encoding *enc;
3042 enc = rb_enc_find(name);
3043 if (!enc) {
3044 RB_VM_LOCKING() {
3045 if (rb_enc_registered(name)) {
3046 enc = NULL;
3047 }
3048 else {
3049 enc = make_dummy_encoding(name);
3050 }
3051 }
3052 }
3053 return enc;
3054}
3055
3056static VALUE
3057make_encobj(const char *name)
3058{
3059 return rb_enc_from_encoding(make_encoding(name));
3060}
3061
3062/*
3063 * call-seq:
3064 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3065 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3066 *
3067 * Returns the corresponding ASCII compatible encoding.
3068 *
3069 * Returns nil if the argument is an ASCII compatible encoding.
3070 *
3071 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
3072 * can represents exactly the same characters as the given ASCII incompatible encoding.
3073 * So, no conversion undefined error occurs when converting between the two encodings.
3074 *
3075 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3076 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3077 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3078 *
3079 */
3080static VALUE
3081econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3082{
3083 const char *arg_name, *result_name;
3084 rb_encoding *arg_enc, *result_enc;
3085 VALUE enc = Qnil;
3086
3087 enc_arg(&arg, &arg_name, &arg_enc);
3088 result_name = rb_econv_asciicompat_encoding(arg_name);
3089 if (result_name) {
3090 result_enc = make_encoding(result_name);
3091 enc = rb_enc_from_encoding(result_enc);
3092 }
3093 return enc;
3094}
3095
3096static void
3097econv_args(int argc, VALUE *argv,
3098 VALUE *snamev_p, VALUE *dnamev_p,
3099 const char **sname_p, const char **dname_p,
3100 rb_encoding **senc_p, rb_encoding **denc_p,
3101 int *ecflags_p,
3102 VALUE *ecopts_p)
3103{
3104 VALUE opt, flags_v, ecopts;
3105 int sidx, didx;
3106 const char *sname, *dname;
3107 rb_encoding *senc, *denc;
3108 int ecflags;
3109
3110 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3111
3112 if (!NIL_P(flags_v)) {
3113 if (!NIL_P(opt)) {
3114 rb_error_arity(argc + 1, 2, 3);
3115 }
3116 ecflags = NUM2INT(rb_to_int(flags_v));
3117 ecopts = Qnil;
3118 }
3119 else if (!NIL_P(opt)) {
3120 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3121 }
3122 else {
3123 ecflags = 0;
3124 ecopts = Qnil;
3125 }
3126
3127 senc = NULL;
3128 sidx = rb_to_encoding_index(*snamev_p);
3129 if (0 <= sidx) {
3130 senc = rb_enc_from_index(sidx);
3131 }
3132 else {
3133 StringValue(*snamev_p);
3134 }
3135
3136 denc = NULL;
3137 didx = rb_to_encoding_index(*dnamev_p);
3138 if (0 <= didx) {
3139 denc = rb_enc_from_index(didx);
3140 }
3141 else {
3142 StringValue(*dnamev_p);
3143 }
3144
3145 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3146 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3147
3148 *sname_p = sname;
3149 *dname_p = dname;
3150 *senc_p = senc;
3151 *denc_p = denc;
3152 *ecflags_p = ecflags;
3153 *ecopts_p = ecopts;
3154}
3155
3156static int
3157decorate_convpath(VALUE convpath, int ecflags)
3158{
3159 int num_decorators;
3160 const char *decorators[MAX_ECFLAGS_DECORATORS];
3161 int i;
3162 int n, len;
3163
3164 num_decorators = decorator_names(ecflags, decorators);
3165 if (num_decorators == -1)
3166 return -1;
3167
3168 len = n = RARRAY_LENINT(convpath);
3169 if (n != 0) {
3170 VALUE pair = RARRAY_AREF(convpath, n-1);
3171 if (RB_TYPE_P(pair, T_ARRAY)) {
3172 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3173 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3174 transcoder_entry_t *entry;
3175 const rb_transcoder *tr;
3176 entry = get_transcoder_entry(sname, dname);
3177 tr = load_transcoder_entry(entry);
3178 if (!tr)
3179 return -1;
3180 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3181 tr->asciicompat_type == asciicompat_encoder) {
3182 n--;
3183 rb_ary_store(convpath, len + num_decorators - 1, pair);
3184 }
3185 }
3186 else {
3187 rb_ary_store(convpath, len + num_decorators - 1, pair);
3188 }
3189 }
3190
3191 for (i = 0; i < num_decorators; i++)
3192 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3193
3194 return 0;
3195}
3196
3197static void
3198search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3199{
3200 VALUE *ary_p = arg;
3201 VALUE v;
3202
3203 if (NIL_P(*ary_p)) {
3204 *ary_p = rb_ary_new();
3205 }
3206
3207 if (DECORATOR_P(sname, dname)) {
3208 v = rb_str_new_cstr(dname);
3209 }
3210 else {
3211 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3212 }
3213 rb_ary_store(*ary_p, depth, v);
3214}
3215
3216/*
3217 * call-seq:
3218 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3219 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3220 *
3221 * Returns a conversion path.
3222 *
3223 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3224 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3225 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3226 *
3227 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3228 * or
3229 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3230 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3231 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3232 * # "universal_newline"]
3233 *
3234 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3235 * or
3236 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3237 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3238 * # "universal_newline",
3239 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3240 */
3241static VALUE
3242econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3243{
3244 VALUE snamev, dnamev;
3245 const char *sname, *dname;
3246 rb_encoding *senc, *denc;
3247 int ecflags;
3248 VALUE ecopts;
3249 VALUE convpath;
3250
3251 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3252
3253 convpath = Qnil;
3254 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3255
3256 if (NIL_P(convpath)) {
3257 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3258 RB_GC_GUARD(snamev);
3259 RB_GC_GUARD(dnamev);
3260 rb_exc_raise(exc);
3261 }
3262
3263 if (decorate_convpath(convpath, ecflags) == -1) {
3264 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3265 RB_GC_GUARD(snamev);
3266 RB_GC_GUARD(dnamev);
3267 rb_exc_raise(exc);
3268 }
3269
3270 return convpath;
3271}
3272
3273/*
3274 * Check the existence of a conversion path.
3275 * Returns the number of converters in the conversion path.
3276 * result: >=0:success -1:failure
3277 */
3278int
3279rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3280{
3281 VALUE convpath = Qnil;
3282 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3283 &convpath);
3284 return RTEST(convpath);
3285}
3286
3288 rb_econv_t *ec;
3289 int index;
3290 int ret;
3291};
3292
3293static void
3294rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3295{
3297 int ret;
3298
3299 if (a->ret == -1)
3300 return;
3301
3302 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3303
3304 a->ret = ret;
3305 return;
3306}
3307
3308static rb_econv_t *
3309rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3310 const char **sname_p, const char **dname_p,
3311 rb_encoding **senc_p, rb_encoding**denc_p)
3312{
3313 rb_econv_t *ec;
3314 long i;
3315 int ret, first=1;
3316 VALUE elt;
3317 rb_encoding *senc = 0, *denc = 0;
3318 const char *sname, *dname;
3319
3320 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3321 DATA_PTR(self) = ec;
3322
3323 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3324 VALUE snamev, dnamev;
3325 VALUE pair;
3326 elt = rb_ary_entry(convpath, i);
3327 if (!NIL_P(pair = rb_check_array_type(elt))) {
3328 if (RARRAY_LEN(pair) != 2)
3329 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3330 snamev = rb_ary_entry(pair, 0);
3331 enc_arg(&snamev, &sname, &senc);
3332 dnamev = rb_ary_entry(pair, 1);
3333 enc_arg(&dnamev, &dname, &denc);
3334 }
3335 else {
3336 sname = "";
3337 dname = StringValueCStr(elt);
3338 }
3339 if (DECORATOR_P(sname, dname)) {
3340 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3341 if (ret == -1) {
3342 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3343 RB_GC_GUARD(snamev);
3344 RB_GC_GUARD(dnamev);
3345 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3346 }
3347 }
3348 else {
3349 int j = ec->num_trans;
3350 struct rb_econv_init_by_convpath_t arg;
3351 arg.ec = ec;
3352 arg.index = ec->num_trans;
3353 arg.ret = 0;
3354 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3355 if (ret == -1 || arg.ret == -1) {
3356 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3357 RB_GC_GUARD(snamev);
3358 RB_GC_GUARD(dnamev);
3359 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3360 }
3361 if (first) {
3362 first = 0;
3363 *senc_p = senc;
3364 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3365 }
3366 *denc_p = denc;
3367 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3368 }
3369 }
3370
3371 if (first) {
3372 *senc_p = NULL;
3373 *denc_p = NULL;
3374 *sname_p = "";
3375 *dname_p = "";
3376 }
3377
3378 ec->source_encoding_name = *sname_p;
3379 ec->destination_encoding_name = *dname_p;
3380
3381 return ec;
3382}
3383
3384/*
3385 * call-seq:
3386 * Encoding::Converter.new(source_encoding, destination_encoding)
3387 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3388 * Encoding::Converter.new(convpath)
3389 *
3390 * possible options elements:
3391 * hash form:
3392 * :invalid => nil # raise error on invalid byte sequence (default)
3393 * :invalid => :replace # replace invalid byte sequence
3394 * :undef => nil # raise error on undefined conversion (default)
3395 * :undef => :replace # replace undefined conversion
3396 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3397 * :newline => :universal # decorator for converting CRLF and CR to LF
3398 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3399 * :newline => :crlf # decorator for converting LF to CRLF
3400 * :newline => :cr # decorator for converting LF to CR
3401 * :universal_newline => true # decorator for converting CRLF and CR to LF
3402 * :crlf_newline => true # decorator for converting LF to CRLF
3403 * :cr_newline => true # decorator for converting LF to CR
3404 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3405 * :xml => :text # escape as XML CharData.
3406 * :xml => :attr # escape as XML AttValue
3407 * integer form:
3408 * Encoding::Converter::INVALID_REPLACE
3409 * Encoding::Converter::UNDEF_REPLACE
3410 * Encoding::Converter::UNDEF_HEX_CHARREF
3411 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3412 * Encoding::Converter::LF_NEWLINE_DECORATOR
3413 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3414 * Encoding::Converter::CR_NEWLINE_DECORATOR
3415 * Encoding::Converter::XML_TEXT_DECORATOR
3416 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3417 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3418 *
3419 * Encoding::Converter.new creates an instance of Encoding::Converter.
3420 *
3421 * Source_encoding and destination_encoding should be a string or
3422 * Encoding object.
3423 *
3424 * opt should be nil, a hash or an integer.
3425 *
3426 * convpath should be an array.
3427 * convpath may contain
3428 * - two-element arrays which contain encodings or encoding names, or
3429 * - strings representing decorator names.
3430 *
3431 * Encoding::Converter.new optionally takes an option.
3432 * The option should be a hash or an integer.
3433 * The option hash can contain :invalid => nil, etc.
3434 * The option integer should be logical-or of constants such as
3435 * Encoding::Converter::INVALID_REPLACE, etc.
3436 *
3437 * [:invalid => nil]
3438 * Raise error on invalid byte sequence. This is a default behavior.
3439 * [:invalid => :replace]
3440 * Replace invalid byte sequence by replacement string.
3441 * [:undef => nil]
3442 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3443 * This is a default behavior.
3444 * [:undef => :replace]
3445 * Replace undefined character in destination_encoding with replacement string.
3446 * [:replace => string]
3447 * Specify the replacement string.
3448 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3449 * [:universal_newline => true]
3450 * Convert CRLF and CR to LF.
3451 * [:crlf_newline => true]
3452 * Convert LF to CRLF.
3453 * [:cr_newline => true]
3454 * Convert LF to CR.
3455 * [:lf_newline => true]
3456 * Convert CRLF and CR to LF (when writing).
3457 * [:xml => :text]
3458 * Escape as XML CharData.
3459 * This form can be used as an HTML 4.0 #PCDATA.
3460 * - '&' -> '&amp;'
3461 * - '<' -> '&lt;'
3462 * - '>' -> '&gt;'
3463 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3464 * [:xml => :attr]
3465 * Escape as XML AttValue.
3466 * The converted result is quoted as "...".
3467 * This form can be used as an HTML 4.0 attribute value.
3468 * - '&' -> '&amp;'
3469 * - '<' -> '&lt;'
3470 * - '>' -> '&gt;'
3471 * - '"' -> '&quot;'
3472 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3473 *
3474 * Examples:
3475 * # UTF-16BE to UTF-8
3476 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3477 *
3478 * # Usually, decorators such as newline conversion are inserted last.
3479 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3480 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3481 * # "universal_newline"]
3482 *
3483 * # But, if the last encoding is ASCII incompatible,
3484 * # decorators are inserted before the last conversion.
3485 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3486 * p ec.convpath #=> ["crlf_newline",
3487 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3488 *
3489 * # Conversion path can be specified directly.
3490 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3491 * p ec.convpath #=> ["universal_newline",
3492 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3493 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3494 */
3495static VALUE
3496econv_init(int argc, VALUE *argv, VALUE self)
3497{
3498 VALUE ecopts;
3499 VALUE snamev, dnamev;
3500 const char *sname, *dname;
3501 rb_encoding *senc, *denc;
3502 rb_econv_t *ec;
3503 int ecflags;
3504 VALUE convpath;
3505
3506 if (rb_check_typeddata(self, &econv_data_type)) {
3507 rb_raise(rb_eTypeError, "already initialized");
3508 }
3509
3510 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3511 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3512 ecflags = 0;
3513 ecopts = Qnil;
3514 }
3515 else {
3516 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3517 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3518 }
3519
3520 if (!ec) {
3521 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3522 RB_GC_GUARD(snamev);
3523 RB_GC_GUARD(dnamev);
3524 rb_exc_raise(exc);
3525 }
3526
3527 if (!DECORATOR_P(sname, dname)) {
3528 if (!senc)
3529 senc = make_dummy_encoding(sname);
3530 if (!denc)
3531 denc = make_dummy_encoding(dname);
3532 RB_GC_GUARD(snamev);
3533 RB_GC_GUARD(dnamev);
3534 }
3535
3536 ec->source_encoding = senc;
3537 ec->destination_encoding = denc;
3538
3539 DATA_PTR(self) = ec;
3540
3541 return self;
3542}
3543
3544/*
3545 * call-seq:
3546 * ec.inspect -> string
3547 *
3548 * Returns a printable version of <i>ec</i>
3549 *
3550 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3551 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3552 *
3553 */
3554static VALUE
3555econv_inspect(VALUE self)
3556{
3557 const char *cname = rb_obj_classname(self);
3558 rb_econv_t *ec;
3559
3560 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3561 if (!ec)
3562 return rb_sprintf("#<%s: uninitialized>", cname);
3563 else {
3564 const char *sname = ec->source_encoding_name;
3565 const char *dname = ec->destination_encoding_name;
3566 VALUE str;
3567 str = rb_sprintf("#<%s: ", cname);
3568 econv_description(sname, dname, ec->flags, str);
3569 rb_str_cat2(str, ">");
3570 return str;
3571 }
3572}
3573
3574static rb_econv_t *
3575check_econv(VALUE self)
3576{
3577 rb_econv_t *ec;
3578
3579 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3580 if (!ec) {
3581 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3582 }
3583 return ec;
3584}
3585
3586static VALUE
3587econv_get_encoding(rb_encoding *encoding)
3588{
3589 if (!encoding)
3590 return Qnil;
3591 return rb_enc_from_encoding(encoding);
3592}
3593
3594/*
3595 * call-seq:
3596 * ec.source_encoding -> encoding
3597 *
3598 * Returns the source encoding as an Encoding object.
3599 */
3600static VALUE
3601econv_source_encoding(VALUE self)
3602{
3603 rb_econv_t *ec = check_econv(self);
3604 return econv_get_encoding(ec->source_encoding);
3605}
3606
3607/*
3608 * call-seq:
3609 * ec.destination_encoding -> encoding
3610 *
3611 * Returns the destination encoding as an Encoding object.
3612 */
3613static VALUE
3614econv_destination_encoding(VALUE self)
3615{
3616 rb_econv_t *ec = check_econv(self);
3617 return econv_get_encoding(ec->destination_encoding);
3618}
3619
3620/*
3621 * call-seq:
3622 * ec.convpath -> ary
3623 *
3624 * Returns the conversion path of ec.
3625 *
3626 * The result is an array of conversions.
3627 *
3628 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3629 * p ec.convpath
3630 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3631 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3632 * # "crlf_newline"]
3633 *
3634 * Each element of the array is a pair of encodings or a string.
3635 * A pair means an encoding conversion.
3636 * A string means a decorator.
3637 *
3638 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3639 * a converter from ISO-8859-1 to UTF-8.
3640 * "crlf_newline" means newline converter from LF to CRLF.
3641 */
3642static VALUE
3643econv_convpath(VALUE self)
3644{
3645 rb_econv_t *ec = check_econv(self);
3646 VALUE result;
3647 int i;
3648
3649 result = rb_ary_new();
3650 for (i = 0; i < ec->num_trans; i++) {
3651 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3652 VALUE v;
3653 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3654 v = rb_str_new_cstr(tr->dst_encoding);
3655 else
3656 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3657 rb_ary_push(result, v);
3658 }
3659 return result;
3660}
3661
3662/*
3663 * call-seq:
3664 * ec == other -> true or false
3665 */
3666static VALUE
3667econv_equal(VALUE self, VALUE other)
3668{
3669 rb_econv_t *ec1 = check_econv(self);
3670 rb_econv_t *ec2;
3671 int i;
3672
3673 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3674 return Qnil;
3675 }
3676 ec2 = DATA_PTR(other);
3677 if (!ec2) return Qfalse;
3678 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3679 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3680 return Qfalse;
3681 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3682 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3683 return Qfalse;
3684 if (ec1->flags != ec2->flags) return Qfalse;
3685 if (ec1->replacement_enc != ec2->replacement_enc &&
3686 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3687 return Qfalse;
3688 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3689 if (ec1->replacement_str != ec2->replacement_str &&
3690 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3691 return Qfalse;
3692
3693 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3694 for (i = 0; i < ec1->num_trans; i++) {
3695 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3696 return Qfalse;
3697 }
3698 return Qtrue;
3699}
3700
3701static VALUE
3702econv_result_to_symbol(rb_econv_result_t res)
3703{
3704 switch (res) {
3705 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3706 case econv_incomplete_input: return sym_incomplete_input;
3707 case econv_undefined_conversion: return sym_undefined_conversion;
3708 case econv_destination_buffer_full: return sym_destination_buffer_full;
3709 case econv_source_buffer_empty: return sym_source_buffer_empty;
3710 case econv_finished: return sym_finished;
3711 case econv_after_output: return sym_after_output;
3712 default: return INT2NUM(res); /* should not be reached */
3713 }
3714}
3715
3716/*
3717 * call-seq:
3718 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3719 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3720 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3721 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3722 *
3723 * possible opt elements:
3724 * hash form:
3725 * :partial_input => true # source buffer may be part of larger source
3726 * :after_output => true # stop conversion after output before input
3727 * integer form:
3728 * Encoding::Converter::PARTIAL_INPUT
3729 * Encoding::Converter::AFTER_OUTPUT
3730 *
3731 * possible results:
3732 * :invalid_byte_sequence
3733 * :incomplete_input
3734 * :undefined_conversion
3735 * :after_output
3736 * :destination_buffer_full
3737 * :source_buffer_empty
3738 * :finished
3739 *
3740 * primitive_convert converts source_buffer into destination_buffer.
3741 *
3742 * source_buffer should be a string or nil.
3743 * nil means an empty string.
3744 *
3745 * destination_buffer should be a string.
3746 *
3747 * destination_byteoffset should be an integer or nil.
3748 * nil means the end of destination_buffer.
3749 * If it is omitted, nil is assumed.
3750 *
3751 * destination_bytesize should be an integer or nil.
3752 * nil means unlimited.
3753 * If it is omitted, nil is assumed.
3754 *
3755 * opt should be nil, a hash or an integer.
3756 * nil means no flags.
3757 * If it is omitted, nil is assumed.
3758 *
3759 * primitive_convert converts the content of source_buffer from beginning
3760 * and store the result into destination_buffer.
3761 *
3762 * destination_byteoffset and destination_bytesize specify the region which
3763 * the converted result is stored.
3764 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3765 * If destination_byteoffset is nil,
3766 * destination_buffer.bytesize is used for appending the result.
3767 * destination_bytesize specifies maximum number of bytes.
3768 * If destination_bytesize is nil,
3769 * destination size is unlimited.
3770 * After conversion, destination_buffer is resized to
3771 * destination_byteoffset + actually produced number of bytes.
3772 * Also destination_buffer's encoding is set to destination_encoding.
3773 *
3774 * primitive_convert drops the converted part of source_buffer.
3775 * the dropped part is converted in destination_buffer or
3776 * buffered in Encoding::Converter object.
3777 *
 * primitive_convert stops conversion when one of the following conditions is met.
 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
 *   +primitive_errinfo+ and +last_error+ methods return the detail of the error.
 * - unexpected end of source buffer (:incomplete_input)
 *   this occurs only when :partial_input is not specified.
 *   +primitive_errinfo+ and +last_error+ methods return the detail of the error.
 * - character not representable in output encoding (:undefined_conversion)
 *   +primitive_errinfo+ and +last_error+ methods return the detail of the error.
 * - after some output is generated, before input is done (:after_output)
 *   this occurs only when :after_output is specified.
 * - destination buffer is full (:destination_buffer_full)
 *   this occurs only when destination_bytesize is non-nil.
 * - source buffer is empty (:source_buffer_empty)
 *   this occurs only when :partial_input is specified.
3792 * - conversion is finished (:finished)
3793 *
3794 * example:
3795 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3796 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3797 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3798 *
3799 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3800 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3801 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3802 * ret = ec.primitive_convert(src, dst="", nil, 1)
3803 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3804 * ret = ec.primitive_convert(src, dst="", nil, 1)
3805 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3806 * ret = ec.primitive_convert(src, dst="", nil, 1)
3807 * p [ret, src, dst] #=> [:finished, "", "i"]
3808 *
3809 */
3810static VALUE
3811econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3812{
3813 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3814 rb_econv_t *ec = check_econv(self);
3816 const unsigned char *ip, *is;
3817 unsigned char *op, *os;
3818 long output_byteoffset, output_bytesize;
3819 unsigned long output_byteend;
3820 int flags;
3821
3822 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3823
3824 if (NIL_P(output_byteoffset_v))
3825 output_byteoffset = 0; /* dummy */
3826 else
3827 output_byteoffset = NUM2LONG(output_byteoffset_v);
3828
3829 if (NIL_P(output_bytesize_v))
3830 output_bytesize = 0; /* dummy */
3831 else
3832 output_bytesize = NUM2LONG(output_bytesize_v);
3833
3834 if (!NIL_P(flags_v)) {
3835 if (!NIL_P(opt)) {
3836 rb_error_arity(argc + 1, 2, 5);
3837 }
3838 flags = NUM2INT(rb_to_int(flags_v));
3839 }
3840 else if (!NIL_P(opt)) {
3841 VALUE v;
3842 flags = 0;
3843 v = rb_hash_aref(opt, sym_partial_input);
3844 if (RTEST(v))
3845 flags |= ECONV_PARTIAL_INPUT;
3846 v = rb_hash_aref(opt, sym_after_output);
3847 if (RTEST(v))
3848 flags |= ECONV_AFTER_OUTPUT;
3849 }
3850 else {
3851 flags = 0;
3852 }
3853
3854 StringValue(output);
3855 if (!NIL_P(input))
3856 StringValue(input);
3857 rb_str_modify(output);
3858
3859 if (NIL_P(output_bytesize_v)) {
3860 output_bytesize = rb_str_capacity(output);
3861
3862 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3863 output_bytesize = RSTRING_LEN(input);
3864 }
3865
3866 retry:
3867
3868 if (NIL_P(output_byteoffset_v))
3869 output_byteoffset = RSTRING_LEN(output);
3870
3871 if (output_byteoffset < 0)
3872 rb_raise(rb_eArgError, "negative output_byteoffset");
3873
3874 if (RSTRING_LEN(output) < output_byteoffset)
3875 rb_raise(rb_eArgError, "output_byteoffset too big");
3876
3877 if (output_bytesize < 0)
3878 rb_raise(rb_eArgError, "negative output_bytesize");
3879
3880 output_byteend = (unsigned long)output_byteoffset +
3881 (unsigned long)output_bytesize;
3882
3883 if (output_byteend < (unsigned long)output_byteoffset ||
3884 LONG_MAX < output_byteend)
3885 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3886
3887 if (rb_str_capacity(output) < output_byteend)
3888 rb_str_resize(output, output_byteend);
3889
3890 if (NIL_P(input)) {
3891 ip = is = NULL;
3892 }
3893 else {
3894 ip = (const unsigned char *)RSTRING_PTR(input);
3895 is = ip + RSTRING_LEN(input);
3896 }
3897
3898 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3899 os = op + output_bytesize;
3900
3901 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3902 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3903 if (!NIL_P(input)) {
3904 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3905 }
3906
3907 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3908 if (LONG_MAX / 2 < output_bytesize)
3909 rb_raise(rb_eArgError, "too long conversion result");
3910 output_bytesize *= 2;
3911 output_byteoffset_v = Qnil;
3912 goto retry;
3913 }
3914
3915 if (ec->destination_encoding) {
3916 rb_enc_associate(output, ec->destination_encoding);
3917 }
3918
3919 return econv_result_to_symbol(res);
3920}
3921
3922/*
3923 * call-seq:
3924 * ec.convert(source_string) -> destination_string
3925 *
3926 * Convert source_string and return destination_string.
3927 *
3928 * source_string is assumed as a part of source.
3929 * i.e. :partial_input=>true is specified internally.
3930 * finish method should be used last.
3931 *
3932 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3933 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3934 * puts ec.finish.dump #=> ""
3935 *
3936 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3937 * puts ec.convert("\xA4").dump #=> ""
3938 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3939 * puts ec.finish.dump #=> ""
3940 *
3941 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3942 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3943 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3944 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3945 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3946 *
 * If a conversion error occurs,
3948 * Encoding::UndefinedConversionError or
3949 * Encoding::InvalidByteSequenceError is raised.
3950 * Encoding::Converter#convert doesn't supply methods to recover or restart
3951 * from these exceptions.
3952 * When you want to handle these conversion errors,
3953 * use Encoding::Converter#primitive_convert.
3954 *
3955 */
3956static VALUE
3957econv_convert(VALUE self, VALUE source_string)
3958{
3959 VALUE ret, dst;
3960 VALUE av[5];
3961 int ac;
3962 rb_econv_t *ec = check_econv(self);
3963
3964 StringValue(source_string);
3965
3966 dst = rb_str_new(NULL, 0);
3967
3968 av[0] = rb_str_dup(source_string);
3969 av[1] = dst;
3970 av[2] = Qnil;
3971 av[3] = Qnil;
3973 ac = 5;
3974
3975 ret = econv_primitive_convert(ac, av, self);
3976
3977 if (ret == sym_invalid_byte_sequence ||
3978 ret == sym_undefined_conversion ||
3979 ret == sym_incomplete_input) {
3980 VALUE exc = make_econv_exception(ec);
3981 rb_exc_raise(exc);
3982 }
3983
3984 if (ret == sym_finished) {
3985 rb_raise(rb_eArgError, "converter already finished");
3986 }
3987
3988 if (ret != sym_source_buffer_empty) {
3989 rb_bug("unexpected result of econv_primitive_convert");
3990 }
3991
3992 return dst;
3993}
3994
3995/*
3996 * call-seq:
3997 * ec.finish -> string
3998 *
3999 * Finishes the converter.
4000 * It returns the last part of the converted string.
4001 *
4002 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4003 * p ec.convert("\u3042") #=> "\e$B$\""
4004 * p ec.finish #=> "\e(B"
4005 */
4006static VALUE
4007econv_finish(VALUE self)
4008{
4009 VALUE ret, dst;
4010 VALUE av[5];
4011 int ac;
4012 rb_econv_t *ec = check_econv(self);
4013
4014 dst = rb_str_new(NULL, 0);
4015
4016 av[0] = Qnil;
4017 av[1] = dst;
4018 av[2] = Qnil;
4019 av[3] = Qnil;
4020 av[4] = INT2FIX(0);
4021 ac = 5;
4022
4023 ret = econv_primitive_convert(ac, av, self);
4024
4025 if (ret == sym_invalid_byte_sequence ||
4026 ret == sym_undefined_conversion ||
4027 ret == sym_incomplete_input) {
4028 VALUE exc = make_econv_exception(ec);
4029 rb_exc_raise(exc);
4030 }
4031
4032 if (ret != sym_finished) {
4033 rb_bug("unexpected result of econv_primitive_convert");
4034 }
4035
4036 return dst;
4037}
4038
4039/*
4040 * call-seq:
4041 * ec.primitive_errinfo -> array
4042 *
4043 * primitive_errinfo returns important information regarding the last error
4044 * as a 5-element array:
4045 *
4046 * [result, enc1, enc2, error_bytes, readagain_bytes]
4047 *
4048 * result is the last result of primitive_convert.
4049 *
4050 * Other elements are only meaningful when result is
4051 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
4052 *
4053 * enc1 and enc2 indicate a conversion step as a pair of strings.
4054 * For example, a converter from EUC-JP to ISO-8859-1 converts
4055 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
4056 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
4057 *
4058 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4059 * error_bytes is discarded portion.
4060 * readagain_bytes is buffered portion which is read again on next conversion.
4061 *
4062 * Example:
4063 *
4064 * # \xff is invalid as EUC-JP.
4065 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4066 * ec.primitive_convert(src="\xff", dst="", nil, 10)
4067 * p ec.primitive_errinfo
4068 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4069 *
4070 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4072 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4073 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4074 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4075 * p ec.primitive_errinfo
4076 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4077 *
4078 * # partial character is invalid
4079 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4080 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4081 * p ec.primitive_errinfo
4082 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4083 *
4084 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4085 * # partial characters.
4086 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4087 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4088 * p ec.primitive_errinfo
4089 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4090 *
4091 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4092 * # no low surrogate after high surrogate (\xd8\x00).
4093 * # It is detected by 3rd byte (\00) which is part of next character.
4094 * # So the high surrogate (\xd8\x00) is discarded and
4095 * # the 3rd byte is read again later.
4096 * # Since the byte is buffered in ec, it is dropped from src.
4097 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4098 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4099 * p ec.primitive_errinfo
4100 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4101 * p src
4102 * #=> "@"
4103 *
4104 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4105 * # The problem is detected by 4th byte.
4106 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4107 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4108 * p ec.primitive_errinfo
4109 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4110 * p src
4111 * #=> ""
4112 *
4113 */
4114static VALUE
4115econv_primitive_errinfo(VALUE self)
4116{
4117 rb_econv_t *ec = check_econv(self);
4118
4119 VALUE ary;
4120
4121 ary = rb_ary_new2(5);
4122
4123 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4124 rb_ary_store(ary, 4, Qnil);
4125
4126 if (ec->last_error.source_encoding)
4127 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4128
4129 if (ec->last_error.destination_encoding)
4130 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4131
4132 if (ec->last_error.error_bytes_start) {
4133 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4134 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4135 }
4136
4137 return ary;
4138}
4139
4140/*
4141 * call-seq:
4142 * ec.insert_output(string) -> nil
4143 *
4144 * Inserts string into the encoding converter.
4145 * The string will be converted to the destination encoding and
4146 * output on later conversions.
4147 *
4148 * If the destination encoding is stateful,
4149 * string is converted according to the state and the state is updated.
4150 *
4151 * This method should be used only when a conversion error occurs.
4152 *
4153 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4154 * src = "HIRAGANA LETTER A is \u{3042}."
4155 * dst = ""
4156 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4157 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4158 * ec.insert_output("<err>")
4159 * p ec.primitive_convert(src, dst) #=> :finished
4160 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4161 *
4162 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4163 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4164 * dst = ""
4165 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4166 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4167 * ec.insert_output "?" # state change required to output "?".
4168 * p ec.primitive_convert(src, dst) #=> :finished
4169 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4170 *
4171 */
4172static VALUE
4173econv_insert_output(VALUE self, VALUE string)
4174{
4175 const char *insert_enc;
4176
4177 int ret;
4178
4179 rb_econv_t *ec = check_econv(self);
4180
4181 StringValue(string);
4182 insert_enc = rb_econv_encoding_to_insert_output(ec);
4183 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4184
4185 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4186 if (ret == -1) {
4187 rb_raise(rb_eArgError, "too big string");
4188 }
4189
4190 return Qnil;
4191}
4192
4193/*
4194 * call-seq:
4195 * ec.putback -> string
4196 * ec.putback(max_numbytes) -> string
4197 *
4198 * Put back the bytes which will be converted.
4199 *
4200 * The bytes are caused by invalid_byte_sequence error.
4201 * When invalid_byte_sequence error, some bytes are discarded and
4202 * some bytes are buffered to be converted later.
4203 * The latter bytes can be put back.
4204 * It can be observed by
4205 * Encoding::InvalidByteSequenceError#readagain_bytes and
4206 * Encoding::Converter#primitive_errinfo.
4207 *
4208 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4209 * src = "\x00\xd8\x61\x00"
4210 * dst = ""
4211 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4212 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4213 * p ec.putback #=> "a\x00"
4214 * p ec.putback #=> "" # no more bytes to put back
4215 *
4216 */
4217static VALUE
4218econv_putback(int argc, VALUE *argv, VALUE self)
4219{
4220 rb_econv_t *ec = check_econv(self);
4221 int n;
4222 int putbackable;
4223 VALUE str, max;
4224
4225 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4226 n = rb_econv_putbackable(ec);
4227 }
4228 else {
4229 n = NUM2INT(max);
4230 putbackable = rb_econv_putbackable(ec);
4231 if (putbackable < n)
4232 n = putbackable;
4233 }
4234
4235 str = rb_str_new(NULL, n);
4236 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4237
4238 if (ec->source_encoding) {
4239 rb_enc_associate(str, ec->source_encoding);
4240 }
4241
4242 return str;
4243}
4244
4245/*
4246 * call-seq:
4247 * ec.last_error -> exception or nil
4248 *
4249 * Returns an exception object for the last conversion.
4250 * Returns nil if the last conversion did not produce an error.
4251 *
4252 * "error" means that
4253 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4254 * Encoding::Converter#convert and
4255 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4256 * Encoding::Converter#primitive_convert.
4257 *
4258 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4259 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4260 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4261 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4262 * p ec.last_error #=> nil
4263 *
4264 */
4265static VALUE
4266econv_last_error(VALUE self)
4267{
4268 rb_econv_t *ec = check_econv(self);
4269 VALUE exc;
4270
4271 exc = make_econv_exception(ec);
4272 if (NIL_P(exc))
4273 return Qnil;
4274 return exc;
4275}
4276
4277/*
4278 * call-seq:
4279 * ec.replacement -> string
4280 *
4281 * Returns the replacement string.
4282 *
4283 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4284 * p ec.replacement #=> "?"
4285 *
4286 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4287 * p ec.replacement #=> "\uFFFD"
4288 */
4289static VALUE
4290econv_get_replacement(VALUE self)
4291{
4292 rb_econv_t *ec = check_econv(self);
4293 int ret;
4294 rb_encoding *enc;
4295
4296 ret = make_replacement(ec);
4297 if (ret == -1) {
4298 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4299 }
4300
4301 enc = rb_enc_find(ec->replacement_enc);
4302 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4303}
4304
4305/*
4306 * call-seq:
4307 * ec.replacement = string
4308 *
4309 * Sets the replacement string.
4310 *
4311 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4312 * ec.replacement = "<undef>"
4313 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4314 */
4315static VALUE
4316econv_set_replacement(VALUE self, VALUE arg)
4317{
4318 rb_econv_t *ec = check_econv(self);
4319 VALUE string = arg;
4320 int ret;
4321 rb_encoding *enc;
4322
4323 StringValue(string);
4324 enc = rb_enc_get(string);
4325
4326 ret = rb_econv_set_replacement(ec,
4327 (const unsigned char *)RSTRING_PTR(string),
4328 RSTRING_LEN(string),
4329 rb_enc_name(enc));
4330
4331 if (ret == -1) {
4332 /* xxx: rb_eInvalidByteSequenceError? */
4333 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4334 }
4335
4336 return arg;
4337}
4338
4339VALUE
4341{
4342 return make_econv_exception(ec);
4343}
4344
4345void
4347{
4348 VALUE exc;
4349
4350 exc = make_econv_exception(ec);
4351 if (NIL_P(exc))
4352 return;
4353 rb_exc_raise(exc);
4354}
4355
4356/*
4357 * call-seq:
4358 * ecerr.source_encoding_name -> string
4359 *
4360 * Returns the source encoding name as a string.
4361 */
4362static VALUE
4363ecerr_source_encoding_name(VALUE self)
4364{
4365 return rb_attr_get(self, id_source_encoding_name);
4366}
4367
4368/*
4369 * call-seq:
4370 * ecerr.source_encoding -> encoding
4371 *
4372 * Returns the source encoding as an encoding object.
4373 *
4374 * Note that the result may not be equal to the source encoding of
4375 * the encoding converter if the conversion has multiple steps.
4376 *
4377 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4378 * begin
4379 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4380 * rescue Encoding::UndefinedConversionError
4381 * p $!.source_encoding #=> #<Encoding:UTF-8>
4382 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4383 * p $!.source_encoding_name #=> "UTF-8"
4384 * p $!.destination_encoding_name #=> "EUC-JP"
4385 * end
4386 *
4387 */
4388static VALUE
4389ecerr_source_encoding(VALUE self)
4390{
4391 return rb_attr_get(self, id_source_encoding);
4392}
4393
4394/*
4395 * call-seq:
4396 * ecerr.destination_encoding_name -> string
4397 *
4398 * Returns the destination encoding name as a string.
4399 */
4400static VALUE
4401ecerr_destination_encoding_name(VALUE self)
4402{
4403 return rb_attr_get(self, id_destination_encoding_name);
4404}
4405
4406/*
4407 * call-seq:
 *   ecerr.destination_encoding -> encoding
4409 *
4410 * Returns the destination encoding as an encoding object.
4411 */
4412static VALUE
4413ecerr_destination_encoding(VALUE self)
4414{
4415 return rb_attr_get(self, id_destination_encoding);
4416}
4417
4418/*
4419 * call-seq:
4420 * ecerr.error_char -> string
4421 *
 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4423 *
4424 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4425 * begin
4426 * ec.convert("\xa0")
4427 * rescue Encoding::UndefinedConversionError
4428 * puts $!.error_char.dump #=> "\xC2\xA0"
4429 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4430 * end
4431 *
4432 */
4433static VALUE
4434ecerr_error_char(VALUE self)
4435{
4436 return rb_attr_get(self, id_error_char);
4437}
4438
4439/*
4440 * call-seq:
4441 * ecerr.error_bytes -> string
4442 *
4443 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4444 *
4445 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4446 * begin
4447 * ec.convert("abc\xA1\xFFdef")
4448 * rescue Encoding::InvalidByteSequenceError
4449 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4450 * puts $!.error_bytes.dump #=> "\xA1"
4451 * puts $!.readagain_bytes.dump #=> "\xFF"
4452 * end
4453 */
4454static VALUE
4455ecerr_error_bytes(VALUE self)
4456{
4457 return rb_attr_get(self, id_error_bytes);
4458}
4459
4460/*
4461 * call-seq:
4462 * ecerr.readagain_bytes -> string
4463 *
4464 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4465 */
4466static VALUE
4467ecerr_readagain_bytes(VALUE self)
4468{
4469 return rb_attr_get(self, id_readagain_bytes);
4470}
4471
4472/*
4473 * call-seq:
4474 * ecerr.incomplete_input? -> true or false
4475 *
4476 * Returns true if the invalid byte sequence error is caused by
4477 * premature end of string.
4478 *
4479 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4480 *
4481 * begin
4482 * ec.convert("abc\xA1z")
4483 * rescue Encoding::InvalidByteSequenceError
4484 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4485 * p $!.incomplete_input? #=> false
4486 * end
4487 *
4488 * begin
4489 * ec.convert("abc\xA1")
4490 * ec.finish
4491 * rescue Encoding::InvalidByteSequenceError
4492 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4493 * p $!.incomplete_input? #=> true
4494 * end
4495 */
4496static VALUE
4497ecerr_incomplete_input(VALUE self)
4498{
4499 return rb_attr_get(self, id_incomplete_input);
4500}
4501
4502/*
4503 * Document-class: Encoding::UndefinedConversionError
4504 *
4505 * Raised by Encoding and String methods when a transcoding operation
4506 * fails.
4507 */
4508
4509/*
4510 * Document-class: Encoding::InvalidByteSequenceError
4511 *
4512 * Raised by Encoding and String methods when the string being
4513 * transcoded contains a byte invalid for either the source or
4514 * target encoding.
4515 */
4516
4517/*
4518 * Document-class: Encoding::ConverterNotFoundError
4519 *
4520 * Raised by transcoding methods when a named encoding does not
4521 * correspond with a known converter.
4522 */
4523
4524void
4525Init_transcode(void)
4526{
4527 transcoder_table = st_init_strcasetable();
4528
4529 id_destination_encoding = rb_intern_const("destination_encoding");
4530 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4531 id_error_bytes = rb_intern_const("error_bytes");
4532 id_error_char = rb_intern_const("error_char");
4533 id_incomplete_input = rb_intern_const("incomplete_input");
4534 id_readagain_bytes = rb_intern_const("readagain_bytes");
4535 id_source_encoding = rb_intern_const("source_encoding");
4536 id_source_encoding_name = rb_intern_const("source_encoding_name");
4537
4538 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4539 sym_undef = ID2SYM(rb_intern_const("undef"));
4540 sym_replace = ID2SYM(rb_intern_const("replace"));
4541 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4542 sym_xml = ID2SYM(rb_intern_const("xml"));
4543 sym_text = ID2SYM(rb_intern_const("text"));
4544 sym_attr = ID2SYM(rb_intern_const("attr"));
4545
4546 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4547 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4548 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4549 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4550 sym_finished = ID2SYM(rb_intern_const("finished"));
4551 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4552 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4553 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4554 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4555 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4556 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4557 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4558
4559#ifdef ENABLE_ECONV_NEWLINE_OPTION
4560 sym_newline = ID2SYM(rb_intern_const("newline"));
4561 sym_universal = ID2SYM(rb_intern_const("universal"));
4562 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4563 sym_cr = ID2SYM(rb_intern_const("cr"));
4564 sym_lf = ID2SYM(rb_intern_const("lf"));
4565#endif
4566
4567 InitVM(transcode);
4568}
4569
4570void
4571InitVM_transcode(void)
4572{
4573 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4574 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4575 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4576
4577 rb_define_method(rb_cString, "encode", str_encode, -1);
4578 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4579
4580 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4581 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4582 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4583 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4584 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4585 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4586 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4587 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4588 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4589 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4590 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4591 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4592 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4593 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4594 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4595 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4596 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4597 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4598 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4599
4600 /*
4601 *Mask for invalid byte sequences
4602 */
4603 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4604
4605 /*
4606 * Replace invalid byte sequences
4607 */
4608 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4609
4610 /*
4611 * Mask for a valid character in the source encoding but no related
4612 * character(s) in destination encoding.
4613 */
4614 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4615
4616 /*
4617 * Replace byte sequences that are undefined in the destination encoding.
4618 */
4619 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4620
4621 /*
4622 * Replace byte sequences that are undefined in the destination encoding
4623 * with an XML hexadecimal character reference. This is valid for XML
4624 * conversion.
4625 */
4626 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4627
4628 /*
4629 * Indicates the source may be part of a larger string. See
4630 * primitive_convert for an example.
4631 */
4632 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4633
4634 /*
4635 * Stop converting after some output is complete but before all of the
4636 * input was consumed. See primitive_convert for an example.
4637 */
4638 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4639
4640 /*
4641 * Decorator for converting CRLF and CR to LF
4642 */
4643 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4644
4645 /*
4646 * Decorator for converting CRLF and CR to LF when writing
4647 */
4648 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4649
4650 /*
4651 * Decorator for converting LF to CRLF
4652 */
4653 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4654
4655 /*
4656 * Decorator for converting LF to CR
4657 */
4658 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4659
4660 /*
4661 * Escape as XML CharData
4662 */
4663 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4664
4665 /*
4666 * Escape as XML AttValue
4667 */
4668 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4669
4670 /*
4671 * Escape as XML AttValue
4672 */
4673 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4674
4675 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4676 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4677 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4678 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4679 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4680
4681 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4682 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4683 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4684 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4685 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4686 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4687 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4688
4689 Init_newline();
4690}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:1634
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3252
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1674
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition memory.h:403
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:400
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:683
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition error.c:1380
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1397
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby's string instead of C's.
Definition error.c:1481
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1436
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:497
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:265
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:60
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3262
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1336
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:941
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:825
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2660
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2123
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1542
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2705
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be represented in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1780
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3279
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1106
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1947
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1938
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1824
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1626
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition transcode.c:1959
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2711
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:2005
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:2022
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1988
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2974
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4340
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4346
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1953
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1741
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1875
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1791
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2285
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1168
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1006
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1676
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2575
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:120
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1740
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1497
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1782
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:995
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1512
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1990
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3381
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2738
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7342
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1712
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1513
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5755
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1986
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3361
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:993
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:384
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_AREF(a, i)
Definition rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:520
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:455
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:515
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:231
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:202
Definition st.h:79
Definition string.c:8268
Definition transcode.c:177
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376