Ruby 4.1.0dev (2026-02-28 revision 85b04616269660e2d06441e2a833edad4491564d)
transcode.c (85b04616269660e2d06441e2a833edad4491564d)
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/gc.h"
20#include "internal/object.h"
21#include "internal/string.h"
22#include "internal/transcode.h"
23#include "internal/encoding.h"
24#include "ruby/encoding.h"
25#include "vm_sync.h"
26
27#include "transcode_data.h"
28#include "id.h"
29
30#define ENABLE_ECONV_NEWLINE_OPTION 1
31
32/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
33static VALUE rb_eUndefinedConversionError;
34static VALUE rb_eInvalidByteSequenceError;
35static VALUE rb_eConverterNotFoundError;
36
37VALUE rb_cEncodingConverter;
38
39static ID id_destination_encoding;
40static ID id_destination_encoding_name;
41static ID id_error_bytes;
42static ID id_error_char;
43static ID id_incomplete_input;
44static ID id_readagain_bytes;
45static ID id_source_encoding;
46static ID id_source_encoding_name;
47
48static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
49static VALUE sym_xml, sym_text, sym_attr;
50static VALUE sym_universal_newline;
51static VALUE sym_crlf_newline;
52static VALUE sym_cr_newline;
53static VALUE sym_lf_newline;
54#ifdef ENABLE_ECONV_NEWLINE_OPTION
55static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
56#endif
57static VALUE sym_partial_input;
58
59static VALUE sym_invalid_byte_sequence;
60static VALUE sym_undefined_conversion;
61static VALUE sym_destination_buffer_full;
62static VALUE sym_source_buffer_empty;
63static VALUE sym_finished;
64static VALUE sym_after_output;
65static VALUE sym_incomplete_input;
66
67static unsigned char *
68allocate_converted_string(const char *sname, const char *dname,
69 const unsigned char *str, size_t len,
70 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
71 size_t *dst_len_ptr, size_t *dst_bufsize_ptr);
72
73/* dynamic structure, one per conversion (similar to iconv_t) */
74/* may carry conversion state (e.g. for iso-2022-jp) */
75typedef struct rb_transcoding {
76 const rb_transcoder *transcoder;
77
78 int flags;
79
80 int resume_position;
81 unsigned int next_table;
82 VALUE next_info;
83 unsigned char next_byte;
84 unsigned int output_index;
85
86 ssize_t recognized_len; /* already interpreted */
87 ssize_t readagain_len; /* not yet interpreted */
88 union {
89 unsigned char ary[8]; /* max_input <= sizeof(ary) */
90 unsigned char *ptr; /* length: max_input */
91 } readbuf; /* recognized_len + readagain_len used */
92
93 ssize_t writebuf_off;
94 ssize_t writebuf_len;
95 union {
96 unsigned char ary[8]; /* max_output <= sizeof(ary) */
97 unsigned char *ptr; /* length: max_output */
98 } writebuf;
99
100 union rb_transcoding_state_t { /* opaque data for stateful encoding */
101 void *ptr;
102 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
103 double dummy_for_alignment;
104 } state;
/* Read buffer of tc: the embedded array when the transcoder's max_input
 * fits into it, otherwise the separately allocated pointer. */
106#define TRANSCODING_READBUF(tc) \
107 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
108 (tc)->readbuf.ary : \
109 (tc)->readbuf.ptr)
/* Write buffer of tc, selected the same way using max_output. */
110#define TRANSCODING_WRITEBUF(tc) \
111 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
112 (tc)->writebuf.ary : \
113 (tc)->writebuf.ptr)
/* Capacity in bytes of the buffer returned by TRANSCODING_WRITEBUF(tc). */
114#define TRANSCODING_WRITEBUF_SIZE(tc) \
115 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
116 sizeof((tc)->writebuf.ary) : \
117 (size_t)(tc)->transcoder->max_output)
/* Largest state_size that is stored inline in the state union. */
118#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* State storage of tc: inline bytes when state_size fits into the union,
 * otherwise the separately allocated pointer. */
119#define TRANSCODING_STATE(tc) \
120 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
121 (tc)->state.ary : \
122 (tc)->state.ptr)
123
124typedef struct {
125 struct rb_transcoding *tc;
126 unsigned char *out_buf_start;
127 unsigned char *out_data_start;
128 unsigned char *out_data_end;
129 unsigned char *out_buf_end;
130 rb_econv_result_t last_result;
132
134 int flags;
135 int started; /* bool */
136
137 const char *source_encoding_name;
138 const char *destination_encoding_name;
139
140 const unsigned char *replacement_str;
141 size_t replacement_len;
142 size_t replacement_bufsize;
143 const char *replacement_enc;
144
145 unsigned char *in_buf_start;
146 unsigned char *in_data_start;
147 unsigned char *in_data_end;
148 unsigned char *in_buf_end;
149 rb_econv_elem_t *elems;
150 int replacement_allocated; /* bool */
151 int num_allocated;
152 int num_trans;
153 int num_finished;
154 struct rb_transcoding *last_tc;
155
156 /* last error */
157 struct {
158 rb_econv_result_t result;
159 struct rb_transcoding *error_tc;
160 const char *source_encoding;
161 const char *destination_encoding;
162 const unsigned char *error_bytes_start;
163 size_t error_bytes_len;
164 size_t readagain_len;
165 } last_error;
166
167 /* The following fields are only for Encoding::Converter.
168 * rb_econv_open set them NULL. */
169 rb_encoding *source_encoding;
170 rb_encoding *destination_encoding;
171};
172
173/*
174 * Dispatch data and logic
175 */
176
177#define DECORATOR_P(sname, dname) (*(sname) == '\0')
178
179typedef struct {
180 const char *sname;
181 const char *dname;
182 const char *lib; /* null means no need to load a library */
183 const rb_transcoder *transcoder;
185
186static st_table *transcoder_table;
187
188static int
189free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
190{
191 SIZED_FREE((transcoder_entry_t *)val);
192 return ST_DELETE;
193}
194
195static int
196free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
197{
198 st_foreach((void *)val, free_inner_transcode_i, 0);
199 st_free_table((void *)val);
200 return ST_DELETE;
201}
202
203void
204rb_free_transcoder_table(void)
205{
206 st_foreach(transcoder_table, free_transcode_i, 0);
207 st_free_table(transcoder_table);
208}
209
210static transcoder_entry_t *
211make_transcoder_entry(const char *sname, const char *dname)
212{
213 st_data_t val;
214 st_table *table2;
215
216 RB_VM_LOCKING() {
217 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
218 val = (st_data_t)st_init_strcasetable();
219 st_add_direct(transcoder_table, (st_data_t)sname, val);
220 }
221 table2 = (st_table *)val;
222 if (!st_lookup(table2, (st_data_t)dname, &val)) {
224 entry->sname = sname;
225 entry->dname = dname;
226 entry->lib = NULL;
227 entry->transcoder = NULL;
228 val = (st_data_t)entry;
229 st_add_direct(table2, (st_data_t)dname, val);
230 }
231 }
232 return (transcoder_entry_t *)val;
233}
234
235static transcoder_entry_t *
236get_transcoder_entry(const char *sname, const char *dname)
237{
238 st_data_t val = 0;
239 st_table *table2;
240 RB_VM_LOCKING() {
241 if (st_lookup(transcoder_table, (st_data_t)sname, &val)) {
242 table2 = (st_table *)val;
243 if (!st_lookup(table2, (st_data_t)dname, &val)) {
244 val = 0;
245 }
246 }
247 }
248 return (transcoder_entry_t *)val;
249}
250
251void
252rb_register_transcoder(const rb_transcoder *tr)
253{
254 const char *const sname = tr->src_encoding;
255 const char *const dname = tr->dst_encoding;
256
257 transcoder_entry_t *entry;
258
259 RB_VM_LOCKING() {
260 entry = make_transcoder_entry(sname, dname);
261 if (entry->transcoder) {
262 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
263 sname, dname);
264 }
265 entry->transcoder = tr;
266 }
267}
268
269static void
270declare_transcoder(const char *sname, const char *dname, const char *lib)
271{
272 transcoder_entry_t *entry;
273
274 entry = make_transcoder_entry(sname, dname);
275 entry->lib = lib;
276}
277
278static const char transcoder_lib_prefix[] = "enc/trans/";
279
280void
281rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
282{
283 if (!lib) {
284 rb_raise(rb_eArgError, "invalid library name - (null)");
285 }
286 declare_transcoder(enc1, enc2, lib);
287}
288
289#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
290
291typedef struct search_path_queue_tag {
292 struct search_path_queue_tag *next;
293 const char *enc;
295
296typedef struct {
297 st_table *visited;
298 search_path_queue_t *queue;
299 search_path_queue_t **queue_last_ptr;
300 const char *base_enc;
302
303static int
304transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
305{
306 const char *dname = (const char *)key;
309
310 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
311 return ST_CONTINUE;
312 }
313
315 q->enc = dname;
316 q->next = NULL;
317 *bfs->queue_last_ptr = q;
318 bfs->queue_last_ptr = &q->next;
319
320 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
321 return ST_CONTINUE;
322}
323
324static int
325transcode_search_path(const char *sname, const char *dname,
326 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
327 void *arg)
328{
331 st_data_t val;
332 st_table *table2;
333 int pathlen = -1;
334 bool found = false;
335 bool lookup_res;
336
337 if (encoding_equal(sname, dname))
338 return -1;
339
341 q->enc = sname;
342 q->next = NULL;
343 bfs.queue_last_ptr = &q->next;
344 bfs.queue = q;
345
346 bfs.visited = st_init_strcasetable(); // due to base encodings, we need to do search in a loop
347 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
348
349 RB_VM_LOCKING() {
350 while (bfs.queue) {
351 q = bfs.queue;
352 bfs.queue = q->next;
353 if (!bfs.queue) {
354 bfs.queue_last_ptr = &bfs.queue;
355 }
356
357 lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val); // src => table2
358 if (!lookup_res) {
359 SIZED_FREE(q);
360 continue;
361 }
362 table2 = (st_table *)val;
363
364 if (st_lookup(table2, (st_data_t)dname, &val)) { // dest => econv
365 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
366 SIZED_FREE(q);
367 found = true;
368 break;
369 }
370
371 bfs.base_enc = q->enc;
372 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
373
374 bfs.base_enc = NULL;
375 SIZED_FREE(q);
376 }
377 }
378
379 while (bfs.queue) {
380 q = bfs.queue;
381 bfs.queue = q->next;
382 SIZED_FREE(q);
383 }
384
385 if (found) {
386 const char *enc = dname;
387 int depth;
388 pathlen = 0;
389 while (1) {
390 st_lookup(bfs.visited, (st_data_t)enc, &val);
391 if (!val)
392 break;
393 pathlen++;
394 enc = (const char *)val;
395 }
396 depth = pathlen;
397 enc = dname;
398 while (1) {
399 st_lookup(bfs.visited, (st_data_t)enc, &val);
400 if (!val)
401 break;
402 callback((const char *)val, enc, --depth, arg);
403 enc = (const char *)val;
404 }
405 }
406
407 st_free_table(bfs.visited);
408
409 return pathlen; /* is -1 if not found */
410}
411
412int rb_require_internal_silent(VALUE fname);
413
414static const rb_transcoder *
415load_transcoder_entry(transcoder_entry_t *entry)
416{
417 ASSERT_vm_unlocking();
418 if (entry->transcoder)
419 return entry->transcoder;
420
421 if (entry->lib) {
422 const char *const lib = entry->lib;
423 const size_t len = strlen(lib);
424 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
425 const VALUE fn = rb_str_new(0, total_len);
426 char *const path = RSTRING_PTR(fn);
427
428 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
429 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
430 rb_str_set_len(fn, total_len);
431 OBJ_FREEZE(fn);
432 rb_require_internal_silent(fn); // Sets entry->transcoder
433 }
434
435 if (entry->transcoder)
436 return entry->transcoder;
437
438 return NULL;
439}
440
/*
 * Pick the default replacement character for the encoding encname.
 * UTF-8 gets the three bytes EF BF BD; every other encoding gets "?",
 * expressed in US-ASCII.  The byte length is stored in *len_ret and the
 * encoding name of the returned bytes in *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    if (!encoding_equal(encname, "UTF-8")) {
        *len_ret = 1;
        *repl_encname_ptr = "US-ASCII";
        return "?";
    }

    *len_ret = 3;
    *repl_encname_ptr = "UTF-8";
    return "\xEF\xBF\xBD";
}
455
456/*
457 * Transcoding engine logic
458 */
459
460static const unsigned char *
461transcode_char_start(rb_transcoding *tc,
462 const unsigned char *in_start,
463 const unsigned char *inchar_start,
464 const unsigned char *in_p,
465 size_t *char_len_ptr)
466{
467 const unsigned char *ptr;
468 if (inchar_start - in_start < tc->recognized_len) {
469 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
470 inchar_start, unsigned char, in_p - inchar_start);
471 ptr = TRANSCODING_READBUF(tc);
472 }
473 else {
474 ptr = inchar_start - tc->recognized_len;
475 }
476 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
477 return ptr;
478}
479
481transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
482 const unsigned char *in_stop, unsigned char *out_stop,
483 rb_transcoding *tc,
484 const int opt)
485{
486 const rb_transcoder *tr = tc->transcoder;
487 int unitlen = tr->input_unit_length;
488 ssize_t readagain_len = 0;
489
490 const unsigned char *inchar_start;
491 const unsigned char *in_p;
492
493 unsigned char *out_p;
494
495 in_p = inchar_start = *in_pos;
496
497 out_p = *out_pos;
498
499#define SUSPEND(ret, num) \
500 do { \
501 tc->resume_position = (num); \
502 if (0 < in_p - inchar_start) \
503 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
504 inchar_start, unsigned char, in_p - inchar_start); \
505 *in_pos = in_p; \
506 *out_pos = out_p; \
507 tc->recognized_len += in_p - inchar_start; \
508 if (readagain_len) { \
509 tc->recognized_len -= readagain_len; \
510 tc->readagain_len = readagain_len; \
511 } \
512 return (ret); \
513 resume_label ## num:; \
514 } while (0)
515#define SUSPEND_OBUF(num) \
516 do { \
517 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
518 } while (0)
519
520#define SUSPEND_AFTER_OUTPUT(num) \
521 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
522 SUSPEND(econv_after_output, num); \
523 }
524
525#define next_table (tc->next_table)
526#define next_info (tc->next_info)
527#define next_byte (tc->next_byte)
528#define writebuf_len (tc->writebuf_len)
529#define writebuf_off (tc->writebuf_off)
530
531 switch (tc->resume_position) {
532 case 0: break;
533 case 1: goto resume_label1;
534 case 2: goto resume_label2;
535 case 3: goto resume_label3;
536 case 4: goto resume_label4;
537 case 5: goto resume_label5;
538 case 6: goto resume_label6;
539 case 7: goto resume_label7;
540 case 8: goto resume_label8;
541 case 9: goto resume_label9;
542 case 10: goto resume_label10;
543 case 11: goto resume_label11;
544 case 12: goto resume_label12;
545 case 13: goto resume_label13;
546 case 14: goto resume_label14;
547 case 15: goto resume_label15;
548 case 16: goto resume_label16;
549 case 17: goto resume_label17;
550 case 18: goto resume_label18;
551 case 19: goto resume_label19;
552 case 20: goto resume_label20;
553 case 21: goto resume_label21;
554 case 22: goto resume_label22;
555 case 23: goto resume_label23;
556 case 24: goto resume_label24;
557 case 25: goto resume_label25;
558 case 26: goto resume_label26;
559 case 27: goto resume_label27;
560 case 28: goto resume_label28;
561 case 29: goto resume_label29;
562 case 30: goto resume_label30;
563 case 31: goto resume_label31;
564 case 32: goto resume_label32;
565 case 33: goto resume_label33;
566 case 34: goto resume_label34;
567 }
568
569 while (1) {
570 inchar_start = in_p;
571 tc->recognized_len = 0;
572 next_table = tr->conv_tree_start;
573
574 SUSPEND_AFTER_OUTPUT(24);
575
576 if (in_stop <= in_p) {
577 if (!(opt & ECONV_PARTIAL_INPUT))
578 break;
579 SUSPEND(econv_source_buffer_empty, 7);
580 continue;
581 }
582
583#define BYTE_ADDR(index) (tr->byte_array + (index))
584#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
585#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
586#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
587#define BL_MIN_BYTE (BL_BASE[0])
588#define BL_MAX_BYTE (BL_BASE[1])
589#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
590#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
591
592 next_byte = (unsigned char)*in_p++;
593 follow_byte:
594 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
595 next_info = INVALID;
596 else {
597 next_info = (VALUE)BL_ACTION(next_byte);
598 }
599 follow_info:
600 switch (next_info & 0x1F) {
601 case NOMAP:
602 {
603 const unsigned char *p = inchar_start;
604 writebuf_off = 0;
605 while (p < in_p) {
606 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
607 }
608 writebuf_len = writebuf_off;
609 writebuf_off = 0;
610 while (writebuf_off < writebuf_len) {
611 SUSPEND_OBUF(3);
612 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
613 }
614 }
615 continue;
616 case 0x00: case 0x04: case 0x08: case 0x0C:
617 case 0x10: case 0x14: case 0x18: case 0x1C:
618 SUSPEND_AFTER_OUTPUT(25);
619 while (in_p >= in_stop) {
620 if (!(opt & ECONV_PARTIAL_INPUT))
621 goto incomplete;
622 SUSPEND(econv_source_buffer_empty, 5);
623 }
624 next_byte = (unsigned char)*in_p++;
625 next_table = (unsigned int)next_info;
626 goto follow_byte;
627 case ZERObt: /* drop input */
628 continue;
629 case ONEbt:
630 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
631 continue;
632 case TWObt:
633 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
634 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
635 continue;
636 case THREEbt:
637 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
638 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
639 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
640 continue;
641 case FOURbt:
642 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
643 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
644 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
645 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
646 continue;
647 case GB4bt:
648 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
649 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
650 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
651 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
652 continue;
653 case STR1:
654 tc->output_index = 0;
655 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
656 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
657 tc->output_index++;
658 }
659 continue;
660 case FUNii:
661 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
662 goto follow_info;
663 case FUNsi:
664 {
665 const unsigned char *char_start;
666 size_t char_len;
667 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
668 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
669 goto follow_info;
670 }
671 case FUNio:
672 SUSPEND_OBUF(13);
673 if (tr->max_output <= out_stop - out_p)
674 out_p += tr->func_io(TRANSCODING_STATE(tc),
675 next_info, out_p, out_stop - out_p);
676 else {
677 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
678 next_info,
679 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
680 writebuf_off = 0;
681 while (writebuf_off < writebuf_len) {
682 SUSPEND_OBUF(20);
683 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
684 }
685 }
686 break;
687 case FUNso:
688 {
689 const unsigned char *char_start;
690 size_t char_len;
691 SUSPEND_OBUF(14);
692 if (tr->max_output <= out_stop - out_p) {
693 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
694 out_p += tr->func_so(TRANSCODING_STATE(tc),
695 char_start, (size_t)char_len,
696 out_p, out_stop - out_p);
697 }
698 else {
699 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
700 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
701 char_start, (size_t)char_len,
702 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
703 writebuf_off = 0;
704 while (writebuf_off < writebuf_len) {
705 SUSPEND_OBUF(22);
706 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
707 }
708 }
709 break;
710 }
711 case FUNsio:
712 {
713 const unsigned char *char_start;
714 size_t char_len;
715 SUSPEND_OBUF(33);
716 if (tr->max_output <= out_stop - out_p) {
717 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
718 out_p += tr->func_sio(TRANSCODING_STATE(tc),
719 char_start, (size_t)char_len, next_info,
720 out_p, out_stop - out_p);
721 }
722 else {
723 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
724 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
725 char_start, (size_t)char_len, next_info,
726 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
727 writebuf_off = 0;
728 while (writebuf_off < writebuf_len) {
729 SUSPEND_OBUF(34);
730 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
731 }
732 }
733 break;
734 }
735 case INVALID:
736 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
737 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
738 SUSPEND_AFTER_OUTPUT(26);
739 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
740 in_p = in_stop;
741 SUSPEND(econv_source_buffer_empty, 8);
742 }
743 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
744 in_p = in_stop;
745 }
746 else {
747 in_p = inchar_start + (unitlen - tc->recognized_len);
748 }
749 }
750 else {
751 ssize_t invalid_len; /* including the last byte which causes invalid */
752 ssize_t discard_len;
753 invalid_len = tc->recognized_len + (in_p - inchar_start);
754 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
755 readagain_len = invalid_len - discard_len;
756 }
757 goto invalid;
758 case UNDEF:
759 goto undef;
760 default:
761 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
762 }
763 continue;
764
765 invalid:
766 SUSPEND(econv_invalid_byte_sequence, 1);
767 continue;
768
769 incomplete:
770 SUSPEND(econv_incomplete_input, 27);
771 continue;
772
773 undef:
774 SUSPEND(econv_undefined_conversion, 2);
775 continue;
776 }
777
778 /* cleanup */
779 if (tr->finish_func) {
780 SUSPEND_OBUF(4);
781 if (tr->max_output <= out_stop - out_p) {
782 out_p += tr->finish_func(TRANSCODING_STATE(tc),
783 out_p, out_stop - out_p);
784 }
785 else {
786 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
787 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
788 writebuf_off = 0;
789 while (writebuf_off < writebuf_len) {
790 SUSPEND_OBUF(23);
791 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
792 }
793 }
794 }
795 while (1)
796 SUSPEND(econv_finished, 6);
797#undef SUSPEND
798#undef next_table
799#undef next_info
800#undef next_byte
801#undef writebuf_len
802#undef writebuf_off
803}
804
806transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
807 const unsigned char *in_stop, unsigned char *out_stop,
808 rb_transcoding *tc,
809 const int opt)
810{
811 if (tc->readagain_len) {
812 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
813 const unsigned char *readagain_pos = readagain_buf;
814 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
816
817 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
818 unsigned char, tc->readagain_len);
819 tc->readagain_len = 0;
820 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
821 if (res != econv_source_buffer_empty) {
822 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
823 readagain_pos, unsigned char, readagain_stop - readagain_pos);
824 tc->readagain_len += readagain_stop - readagain_pos;
825 return res;
826 }
827 }
828 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
829}
830
831static rb_transcoding *
832rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
833{
834 rb_transcoding *tc;
835
836 tc = ALLOC(rb_transcoding);
837 tc->transcoder = tr;
838 tc->flags = flags;
839 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
840 tc->state.ptr = xmalloc(tr->state_size);
841 if (tr->state_init_func) {
842 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
843 }
844 tc->resume_position = 0;
845 tc->recognized_len = 0;
846 tc->readagain_len = 0;
847 tc->writebuf_len = 0;
848 tc->writebuf_off = 0;
849 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
850 tc->readbuf.ptr = xmalloc(tr->max_input);
851 }
852 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
853 tc->writebuf.ptr = xmalloc(tr->max_output);
854 }
855 return tc;
856}
857
859rb_transcoding_convert(rb_transcoding *tc,
860 const unsigned char **input_ptr, const unsigned char *input_stop,
861 unsigned char **output_ptr, unsigned char *output_stop,
862 int flags)
863{
864 return transcode_restartable(
865 input_ptr, output_ptr,
866 input_stop, output_stop,
867 tc, flags);
868}
869
870static void
871rb_transcoding_close(rb_transcoding *tc)
872{
873 const rb_transcoder *tr = tc->transcoder;
874 if (tr->state_fini_func) {
875 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
876 }
877 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
878 ruby_sized_xfree(tc->state.ptr, tr->state_size);
879 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
880 ruby_sized_xfree(tc->readbuf.ptr, tr->max_input);
881 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
882 ruby_sized_xfree(tc->writebuf.ptr, tr->max_output);
883 SIZED_FREE(tc);
884}
885
886static size_t
887rb_transcoding_memsize(rb_transcoding *tc)
888{
889 size_t size = sizeof(rb_transcoding);
890 const rb_transcoder *tr = tc->transcoder;
891
892 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
893 size += tr->state_size;
894 }
895 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
896 size += tr->max_input;
897 }
898 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
899 size += tr->max_output;
900 }
901 return size;
902}
903
904static rb_econv_t *
905rb_econv_alloc(int n_hint)
906{
907 rb_econv_t *ec;
908
909 if (n_hint <= 0)
910 n_hint = 1;
911
912 ec = ALLOC(rb_econv_t);
913 ec->flags = 0;
914 ec->source_encoding_name = NULL;
915 ec->destination_encoding_name = NULL;
916 ec->started = 0;
917 ec->replacement_str = NULL;
918 ec->replacement_len = 0;
919 ec->replacement_bufsize = 0;
920 ec->replacement_enc = NULL;
921 ec->replacement_allocated = 0;
922 ec->in_buf_start = NULL;
923 ec->in_data_start = NULL;
924 ec->in_data_end = NULL;
925 ec->in_buf_end = NULL;
926 ec->num_allocated = n_hint;
927 ec->num_trans = 0;
928 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
929 ec->num_finished = 0;
930 ec->last_tc = NULL;
931 ec->last_error.result = econv_source_buffer_empty;
932 ec->last_error.error_tc = NULL;
933 ec->last_error.source_encoding = NULL;
934 ec->last_error.destination_encoding = NULL;
935 ec->last_error.error_bytes_start = NULL;
936 ec->last_error.error_bytes_len = 0;
937 ec->last_error.readagain_len = 0;
938 ec->source_encoding = NULL;
939 ec->destination_encoding = NULL;
940 return ec;
941}
942
943static int
944rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
945{
946 int n, j;
947 int bufsize = 4096;
948 unsigned char *p;
949
950 if (ec->num_trans == ec->num_allocated) {
951 n = ec->num_allocated * 2;
952 SIZED_REALLOC_N(ec->elems, rb_econv_elem_t, n, ec->num_allocated);
953 ec->num_allocated = n;
954 }
955
956 p = xmalloc(bufsize);
957
958 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
959
960 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
961 ec->elems[i].out_buf_start = p;
962 ec->elems[i].out_buf_end = p + bufsize;
963 ec->elems[i].out_data_start = p;
964 ec->elems[i].out_data_end = p;
965 ec->elems[i].last_result = econv_source_buffer_empty;
966
967 ec->num_trans++;
968
969 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
970 for (j = ec->num_trans-1; i <= j; j--) {
971 rb_transcoding *tc = ec->elems[j].tc;
972 const rb_transcoder *tr2 = tc->transcoder;
973 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
974 ec->last_tc = tc;
975 break;
976 }
977 }
978
979 return 0;
980}
981
982static rb_econv_t *
983rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
984{
985 rb_econv_t *ec;
986 int i, ret;
987
988 for (i = 0; i < n; i++) {
989 const rb_transcoder *tr;
990 tr = load_transcoder_entry(entries[i]);
991 if (!tr)
992 return NULL;
993 }
994
995 ec = rb_econv_alloc(n);
996
997 for (i = 0; i < n; i++) {
998 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
999 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
1000 if (ret == -1) {
1001 rb_econv_close(ec);
1002 return NULL;
1003 }
1004 }
1005
1006 return ec;
1007}
1008
1010 transcoder_entry_t **entries;
1011};
1012
1013static void
1014trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1015{
1016 struct trans_open_t *toarg = arg;
1017
1018 if (!toarg->entries) {
1019 toarg->entries = ALLOC_N(transcoder_entry_t *, depth + 1);
1020 }
1021 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1022}
1023
1024static rb_econv_t *
1025rb_econv_open0(const char *sname, const char *dname, int ecflags)
1026{
1027 transcoder_entry_t **entries = NULL;
1028 int num_trans;
1029 rb_econv_t *ec;
1030
1031 // loads encodings if not loaded already
1032 if (*sname) rb_enc_find_index(sname);
1033 if (*dname) rb_enc_find_index(dname);
1034
1035 if (*sname == '\0' && *dname == '\0') {
1036 num_trans = 0;
1037 entries = NULL;
1038 sname = dname = "";
1039 }
1040 else {
1041 struct trans_open_t toarg = {0};
1042 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1043 entries = toarg.entries;
1044 if (num_trans < 0) {
1045 SIZED_FREE_N(entries, num_trans);
1046 return NULL;
1047 }
1048 }
1049
1050 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1051 SIZED_FREE_N(entries, num_trans);
1052 if (!ec)
1053 return NULL;
1054
1055 ec->flags = ecflags;
1056 ec->source_encoding_name = sname;
1057 ec->destination_encoding_name = dname;
1058
1059 return ec;
1060}
1061
1062#define MAX_ECFLAGS_DECORATORS 32
1063
1064static int
1065decorator_names(int ecflags, const char **decorators_ret)
1066{
1067 int num_decorators;
1068
1069 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1074 case 0:
1075 break;
1076 default:
1077 return -1;
1078 }
1079
1080 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1082 return -1;
1083
1084 num_decorators = 0;
1085
1086 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1087 decorators_ret[num_decorators++] = "xml_text_escape";
1089 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1090 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1091 decorators_ret[num_decorators++] = "xml_attr_quote";
1092
1093 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1094 decorators_ret[num_decorators++] = "crlf_newline";
1095 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1096 decorators_ret[num_decorators++] = "cr_newline";
1097 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1098 decorators_ret[num_decorators++] = "lf_newline";
1100 decorators_ret[num_decorators++] = "universal_newline";
1101
1102 return num_decorators;
1103}
1104
1105rb_econv_t *
1106rb_econv_open(const char *sname, const char *dname, int ecflags)
1107{
1108 rb_econv_t *ec;
1109 int num_decorators;
1110 const char *decorators[MAX_ECFLAGS_DECORATORS];
1111 int i;
1112
1113 num_decorators = decorator_names(ecflags, decorators);
1114 if (num_decorators == -1)
1115 return NULL;
1116
1117 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1118 if (ec) {
1119 for (i = 0; i < num_decorators; i++) {
1120 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1121 rb_econv_close(ec);
1122 ec = NULL;
1123 break;
1124 }
1125 }
1126 }
1127
1128 if (ec) {
1129 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1130 }
1131 return ec; // can be NULL
1132}
1133
1134static int
1135trans_sweep(rb_econv_t *ec,
1136 const unsigned char **input_ptr, const unsigned char *input_stop,
1137 unsigned char **output_ptr, unsigned char *output_stop,
1138 int flags,
1139 int start)
1140{
1141 int try;
1142 int i, f;
1143
1144 const unsigned char **ipp, *is, *iold;
1145 unsigned char **opp, *os, *oold;
1147
1148 try = 1;
1149 while (try) {
1150 try = 0;
1151 for (i = start; i < ec->num_trans; i++) {
1152 rb_econv_elem_t *te = &ec->elems[i];
1153
1154 if (i == 0) {
1155 ipp = input_ptr;
1156 is = input_stop;
1157 }
1158 else {
1159 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1160 ipp = (const unsigned char **)&prev_te->out_data_start;
1161 is = prev_te->out_data_end;
1162 }
1163
1164 if (i == ec->num_trans-1) {
1165 opp = output_ptr;
1166 os = output_stop;
1167 }
1168 else {
1169 if (te->out_buf_start != te->out_data_start) {
1170 ssize_t len = te->out_data_end - te->out_data_start;
1171 ssize_t off = te->out_data_start - te->out_buf_start;
1172 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1173 te->out_data_start = te->out_buf_start;
1174 te->out_data_end -= off;
1175 }
1176 opp = &te->out_data_end;
1177 os = te->out_buf_end;
1178 }
1179
1180 f = flags;
1181 if (ec->num_finished != i)
1183 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1184 start = 1;
1185 flags &= ~ECONV_AFTER_OUTPUT;
1186 }
1187 if (i != 0)
1188 f &= ~ECONV_AFTER_OUTPUT;
1189 iold = *ipp;
1190 oold = *opp;
1191 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1192 if (iold != *ipp || oold != *opp)
1193 try = 1;
1194
1195 switch (res) {
1199 case econv_after_output:
1200 return i;
1201
1204 break;
1205
1206 case econv_finished:
1207 ec->num_finished = i+1;
1208 break;
1209 }
1210 }
1211 }
1212 return -1;
1213}
1214
1215static rb_econv_result_t
1216rb_trans_conv(rb_econv_t *ec,
1217 const unsigned char **input_ptr, const unsigned char *input_stop,
1218 unsigned char **output_ptr, unsigned char *output_stop,
1219 int flags,
1220 int *result_position_ptr)
1221{
1222 int i;
1223 int needreport_index;
1224 int sweep_start;
1225
1226 unsigned char empty_buf;
1227 unsigned char *empty_ptr = &empty_buf;
1228
1229 if (!input_ptr) {
1230 input_ptr = (const unsigned char **)&empty_ptr;
1231 input_stop = empty_ptr;
1232 }
1233
1234 if (!output_ptr) {
1235 output_ptr = &empty_ptr;
1236 output_stop = empty_ptr;
1237 }
1238
1239 if (ec->elems[0].last_result == econv_after_output)
1240 ec->elems[0].last_result = econv_source_buffer_empty;
1241
1242 for (i = ec->num_trans-1; 0 <= i; i--) {
1243 switch (ec->elems[i].last_result) {
1247 case econv_after_output:
1248 case econv_finished:
1249 sweep_start = i+1;
1250 goto found_needreport;
1251
1254 break;
1255
1256 default:
1257 rb_bug("unexpected transcode last result");
1258 }
1259 }
1260
1261 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1262
1263 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1264 (flags & ECONV_AFTER_OUTPUT)) {
1266
1267 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1269 result_position_ptr);
1270
1271 if (res == econv_source_buffer_empty)
1272 return econv_after_output;
1273 return res;
1274 }
1275
1276 sweep_start = 0;
1277
1278 found_needreport:
1279
1280 do {
1281 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1282 sweep_start = needreport_index + 1;
1283 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1284
1285 for (i = ec->num_trans-1; 0 <= i; i--) {
1286 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1287 rb_econv_result_t res = ec->elems[i].last_result;
1288 if (res == econv_invalid_byte_sequence ||
1289 res == econv_incomplete_input ||
1291 res == econv_after_output) {
1292 ec->elems[i].last_result = econv_source_buffer_empty;
1293 }
1294 if (result_position_ptr)
1295 *result_position_ptr = i;
1296 return res;
1297 }
1298 }
1299 if (result_position_ptr)
1300 *result_position_ptr = -1;
1302}
1303
1304static rb_econv_result_t
1305rb_econv_convert0(rb_econv_t *ec,
1306 const unsigned char **input_ptr, const unsigned char *input_stop,
1307 unsigned char **output_ptr, unsigned char *output_stop,
1308 int flags)
1309{
1311 int result_position;
1312 int has_output = 0;
1313
1314 memset(&ec->last_error, 0, sizeof(ec->last_error));
1315
1316 if (ec->num_trans == 0) {
1317 size_t len;
1318 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1319 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1320 len = output_stop - *output_ptr;
1321 memcpy(*output_ptr, ec->in_data_start, len);
1322 *output_ptr = output_stop;
1323 ec->in_data_start += len;
1325 goto gotresult;
1326 }
1327 len = ec->in_data_end - ec->in_data_start;
1328 memcpy(*output_ptr, ec->in_data_start, len);
1329 *output_ptr += len;
1330 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1331 if (flags & ECONV_AFTER_OUTPUT) {
1332 res = econv_after_output;
1333 goto gotresult;
1334 }
1335 }
1336 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1337 len = output_stop - *output_ptr;
1338 }
1339 else {
1340 len = input_stop - *input_ptr;
1341 }
1342 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1343 *(*output_ptr)++ = *(*input_ptr)++;
1344 res = econv_after_output;
1345 goto gotresult;
1346 }
1347 memcpy(*output_ptr, *input_ptr, len);
1348 *output_ptr += len;
1349 *input_ptr += len;
1350 if (*input_ptr != input_stop)
1352 else if (flags & ECONV_PARTIAL_INPUT)
1354 else
1355 res = econv_finished;
1356 goto gotresult;
1357 }
1358
1359 if (ec->elems[ec->num_trans-1].out_data_start) {
1360 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1361 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1362 if (data_start != data_end) {
1363 size_t len;
1364 if (output_stop - *output_ptr < data_end - data_start) {
1365 len = output_stop - *output_ptr;
1366 memcpy(*output_ptr, data_start, len);
1367 *output_ptr = output_stop;
1368 ec->elems[ec->num_trans-1].out_data_start += len;
1370 goto gotresult;
1371 }
1372 len = data_end - data_start;
1373 memcpy(*output_ptr, data_start, len);
1374 *output_ptr += len;
1375 ec->elems[ec->num_trans-1].out_data_start =
1376 ec->elems[ec->num_trans-1].out_data_end =
1377 ec->elems[ec->num_trans-1].out_buf_start;
1378 has_output = 1;
1379 }
1380 }
1381
1382 if (ec->in_buf_start &&
1383 ec->in_data_start != ec->in_data_end) {
1384 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1385 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1386 if (res != econv_source_buffer_empty)
1387 goto gotresult;
1388 }
1389
1390 if (has_output &&
1391 (flags & ECONV_AFTER_OUTPUT) &&
1392 *input_ptr != input_stop) {
1393 input_stop = *input_ptr;
1394 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1395 if (res == econv_source_buffer_empty)
1396 res = econv_after_output;
1397 }
1398 else if ((flags & ECONV_AFTER_OUTPUT) ||
1399 ec->num_trans == 1) {
1400 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1401 }
1402 else {
1403 flags |= ECONV_AFTER_OUTPUT;
1404 do {
1405 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1406 } while (res == econv_after_output);
1407 }
1408
1409 gotresult:
1410 ec->last_error.result = res;
1411 if (res == econv_invalid_byte_sequence ||
1412 res == econv_incomplete_input ||
1414 rb_transcoding *error_tc = ec->elems[result_position].tc;
1415 ec->last_error.error_tc = error_tc;
1416 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1417 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1418 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1419 ec->last_error.error_bytes_len = error_tc->recognized_len;
1420 ec->last_error.readagain_len = error_tc->readagain_len;
1421 }
1422
1423 return res;
1424}
1425
1426static int output_replacement_character(rb_econv_t *ec);
1427
1428static int
1429output_hex_charref(rb_econv_t *ec)
1430{
1431 int ret;
1432 unsigned char utfbuf[1024];
1433 const unsigned char *utf;
1434 size_t utf_len, utf_bufsize;
1435 int utf_allocated = 0;
1436 char charef_buf[16];
1437 const unsigned char *p;
1438
1439 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1440 utf = ec->last_error.error_bytes_start;
1441 utf_len = ec->last_error.error_bytes_len;
1442 }
1443 else {
1444 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1445 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1446 utfbuf, sizeof(utfbuf),
1447 &utf_len, &utf_bufsize);
1448 if (!utf)
1449 return -1;
1450 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1451 utf_allocated = 1;
1452 }
1453
1454 if (utf_len % 4 != 0)
1455 goto fail;
1456
1457 p = utf;
1458 while (4 <= utf_len) {
1459 unsigned int u = 0;
1460 u += p[0] << 24;
1461 u += p[1] << 16;
1462 u += p[2] << 8;
1463 u += p[3];
1464 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1465
1466 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1467 if (ret == -1)
1468 goto fail;
1469
1470 p += 4;
1471 utf_len -= 4;
1472 }
1473
1474 if (utf_allocated)
1475 ruby_sized_xfree((void *)utf, utf_bufsize);
1476 return 0;
1477
1478 fail:
1479 if (utf_allocated)
1480 ruby_sized_xfree((void *)utf, utf_bufsize);
1481 return -1;
1482}
1483
1486 const unsigned char **input_ptr, const unsigned char *input_stop,
1487 unsigned char **output_ptr, unsigned char *output_stop,
1488 int flags)
1489{
1491
1492 unsigned char empty_buf;
1493 unsigned char *empty_ptr = &empty_buf;
1494
1495 ec->started = 1;
1496
1497 if (!input_ptr) {
1498 input_ptr = (const unsigned char **)&empty_ptr;
1499 input_stop = empty_ptr;
1500 }
1501
1502 if (!output_ptr) {
1503 output_ptr = &empty_ptr;
1504 output_stop = empty_ptr;
1505 }
1506
1507 resume:
1508 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1509
1510 if (ret == econv_invalid_byte_sequence ||
1511 ret == econv_incomplete_input) {
1512 /* deal with invalid byte sequence */
1513 /* todo: add more alternative behaviors */
1514 switch (ec->flags & ECONV_INVALID_MASK) {
1516 if (output_replacement_character(ec) == 0)
1517 goto resume;
1518 }
1519 }
1520
1521 if (ret == econv_undefined_conversion) {
1522 /* valid character in source encoding
1523 * but no related character(s) in destination encoding */
1524 /* todo: add more alternative behaviors */
1525 switch (ec->flags & ECONV_UNDEF_MASK) {
1527 if (output_replacement_character(ec) == 0)
1528 goto resume;
1529 break;
1530
1532 if (output_hex_charref(ec) == 0)
1533 goto resume;
1534 break;
1535 }
1536 }
1537
1538 return ret;
1539}
1540
1541const char *
1543{
1544 rb_transcoding *tc = ec->last_tc;
1545 const rb_transcoder *tr;
1546
1547 if (tc == NULL)
1548 return "";
1549
1550 tr = tc->transcoder;
1551
1552 if (tr->asciicompat_type == asciicompat_encoder)
1553 return tr->src_encoding;
1554 return tr->dst_encoding;
1555}
1556
1557static unsigned char *
1558allocate_converted_string(const char *sname, const char *dname,
1559 const unsigned char *str, size_t len,
1560 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1561 size_t *dst_len_ptr, size_t *dst_bufsize_ptr)
1562{
1563 unsigned char *dst_str;
1564 size_t dst_len;
1565 size_t dst_bufsize;
1566
1567 rb_econv_t *ec;
1569
1570 const unsigned char *sp;
1571 unsigned char *dp;
1572
1573 if (caller_dst_buf)
1574 dst_bufsize = caller_dst_bufsize;
1575 else if (len == 0)
1576 dst_bufsize = 1;
1577 else
1578 dst_bufsize = len;
1579
1580 ec = rb_econv_open(sname, dname, 0);
1581 if (ec == NULL)
1582 return NULL;
1583 if (caller_dst_buf)
1584 dst_str = caller_dst_buf;
1585 else
1586 dst_str = xmalloc(dst_bufsize);
1587 dst_len = 0;
1588 sp = str;
1589 dp = dst_str+dst_len;
1590 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1591 dst_len = dp - dst_str;
1592 while (res == econv_destination_buffer_full) {
1593 if (SIZE_MAX/2 < dst_bufsize) {
1594 goto fail;
1595 }
1596 dst_bufsize *= 2;
1597 if (dst_str == caller_dst_buf) {
1598 unsigned char *tmp;
1599 tmp = xmalloc(dst_bufsize);
1600 memcpy(tmp, dst_str, dst_bufsize/2);
1601 dst_str = tmp;
1602 }
1603 else {
1604 dst_str = ruby_sized_xrealloc(dst_str, dst_bufsize, dst_bufsize / 2);
1605 }
1606 dp = dst_str+dst_len;
1607 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1608 dst_len = dp - dst_str;
1609 }
1610 if (res != econv_finished) {
1611 goto fail;
1612 }
1613 rb_econv_close(ec);
1614 *dst_len_ptr = dst_len;
1615 *dst_bufsize_ptr = dst_bufsize;
1616 return dst_str;
1617
1618 fail:
1619 if (dst_str != caller_dst_buf)
1620 ruby_sized_xfree(dst_str, dst_bufsize);
1621 rb_econv_close(ec);
1622 return NULL;
1623}
1624
1625/* result: 0:success -1:failure */
1626int
1628 const unsigned char *str, size_t len, const char *str_encoding)
1629{
1630 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1631 unsigned char insert_buf[4096];
1632 const unsigned char *insert_str = NULL;
1633 size_t insert_len, insert_bufsize;
1634
1635 int last_trans_index;
1636 rb_transcoding *tc;
1637
1638 unsigned char **buf_start_p;
1639 unsigned char **data_start_p;
1640 unsigned char **data_end_p;
1641 unsigned char **buf_end_p;
1642
1643 size_t need;
1644
1645 ec->started = 1;
1646
1647 if (len == 0)
1648 return 0;
1649
1650 if (encoding_equal(insert_encoding, str_encoding)) {
1651 insert_str = str;
1652 insert_len = len;
1653 }
1654 else {
1655 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1656 str, len, insert_buf, sizeof(insert_buf), &insert_len, &insert_bufsize);
1657 if (insert_str == NULL)
1658 return -1;
1659 }
1660
1661 need = insert_len;
1662
1663 last_trans_index = ec->num_trans-1;
1664 if (ec->num_trans == 0) {
1665 tc = NULL;
1666 buf_start_p = &ec->in_buf_start;
1667 data_start_p = &ec->in_data_start;
1668 data_end_p = &ec->in_data_end;
1669 buf_end_p = &ec->in_buf_end;
1670 }
1671 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1672 tc = ec->elems[last_trans_index].tc;
1673 need += tc->readagain_len;
1674 if (need < insert_len)
1675 goto fail;
1676 if (last_trans_index == 0) {
1677 buf_start_p = &ec->in_buf_start;
1678 data_start_p = &ec->in_data_start;
1679 data_end_p = &ec->in_data_end;
1680 buf_end_p = &ec->in_buf_end;
1681 }
1682 else {
1683 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1684 buf_start_p = &ee->out_buf_start;
1685 data_start_p = &ee->out_data_start;
1686 data_end_p = &ee->out_data_end;
1687 buf_end_p = &ee->out_buf_end;
1688 }
1689 }
1690 else {
1691 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1692 buf_start_p = &ee->out_buf_start;
1693 data_start_p = &ee->out_data_start;
1694 data_end_p = &ee->out_data_end;
1695 buf_end_p = &ee->out_buf_end;
1696 tc = ec->elems[last_trans_index].tc;
1697 }
1698
1699 if (*buf_start_p == NULL) {
1700 unsigned char *buf = xmalloc(need);
1701 *buf_start_p = buf;
1702 *data_start_p = buf;
1703 *data_end_p = buf;
1704 *buf_end_p = buf+need;
1705 }
1706 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1707 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1708 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1709 *data_start_p = *buf_start_p;
1710 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1711 unsigned char *buf;
1712 size_t s = (*data_end_p - *buf_start_p) + need;
1713 if (s < need)
1714 goto fail;
1715 buf = ruby_sized_xrealloc(*buf_start_p, s, buf_end_p - buf_start_p);
1716 *data_start_p = buf;
1717 *data_end_p = buf + (*data_end_p - *buf_start_p);
1718 *buf_start_p = buf;
1719 *buf_end_p = buf + s;
1720 }
1721 }
1722
1723 memcpy(*data_end_p, insert_str, insert_len);
1724 *data_end_p += insert_len;
1725 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1726 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1727 *data_end_p += tc->readagain_len;
1728 tc->readagain_len = 0;
1729 }
1730
1731 if (insert_str != str && insert_str != insert_buf)
1732 ruby_sized_xfree((void *)insert_str, insert_bufsize);
1733 return 0;
1734
1735 fail:
1736 if (insert_str != str && insert_str != insert_buf)
1737 ruby_sized_xfree((void *)insert_str, insert_bufsize);
1738 return -1;
1739}
1740
1741void
1743{
1744 int i;
1745
1746 if (ec->replacement_allocated) {
1747 SIZED_FREE_N((char *)ec->replacement_str, ec->replacement_len);
1748 }
1749 for (i = 0; i < ec->num_trans; i++) {
1750 rb_transcoding_close(ec->elems[i].tc);
1751 ruby_sized_xfree(ec->elems[i].out_buf_start, ec->elems[i].out_buf_end - ec->elems[i].out_buf_start);
1752 }
1753 SIZED_FREE_N(ec->in_buf_start, ec->in_buf_end - ec->in_buf_start);
1754 SIZED_FREE_N(ec->elems, ec->num_allocated);
1755 SIZED_FREE(ec);
1756}
1757
1758size_t
1759rb_econv_memsize(rb_econv_t *ec)
1760{
1761 size_t size = sizeof(rb_econv_t);
1762 int i;
1763
1764 if (ec->replacement_allocated) {
1765 size += ec->replacement_len;
1766 }
1767 for (i = 0; i < ec->num_trans; i++) {
1768 size += rb_transcoding_memsize(ec->elems[i].tc);
1769
1770 if (ec->elems[i].out_buf_start) {
1771 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1772 }
1773 }
1774 size += ec->in_buf_end - ec->in_buf_start;
1775 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1776
1777 return size;
1778}
1779
1780int
1782{
1783 if (ec->num_trans == 0)
1784 return 0;
1785#if SIZEOF_SIZE_T > SIZEOF_INT
1786 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1787#endif
1788 return (int)ec->elems[0].tc->readagain_len;
1789}
1790
1791void
1792rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1793{
1794 rb_transcoding *tc;
1795 if (ec->num_trans == 0 || n == 0)
1796 return;
1797 tc = ec->elems[0].tc;
1798 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1799 tc->readagain_len -= n;
1800}
1801
/* Argument/result record passed through st_foreach() by
 * rb_econv_asciicompat_encoding(). */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;   /* result; NULL until a decoder is found */
    const char *ascii_incompat_name; /* encoding being looked up */
};
1806
1807static int
1808asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1809{
1810 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1811 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1812 const rb_transcoder *tr;
1813
1814 if (DECORATOR_P(entry->sname, entry->dname))
1815 return ST_CONTINUE;
1816 tr = load_transcoder_entry(entry);
1817 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1818 data->ascii_compat_name = tr->dst_encoding;
1819 return ST_STOP;
1820 }
1821 return ST_CONTINUE;
1822}
1823
1824const char *
1825rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1826{
1827 st_data_t v;
1828 st_table *table2;
1829 struct asciicompat_encoding_t data = {0};
1830
1831 unsigned int lev;
1832 RB_VM_LOCK_ENTER_LEV(&lev);
1833 {
1834 if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) {
1835 table2 = (st_table *)v;
1836 /*
1837 * Assumption:
1838 * There is at most one transcoder for
1839 * converting from ASCII incompatible encoding.
1840 *
1841 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1842 */
1843 if (table2->num_entries == 1) {
1844 data.ascii_incompat_name = ascii_incompat_name;
1845 data.ascii_compat_name = NULL;
1846 if (rb_multi_ractor_p()) {
1847 /*
1848 * We need to unlock in case `load_transcoder_entry` actually loads the encoding
1849 * and table2 could be inserted into when we unlock.
1850 */
1851 st_table *dup_table2 = st_copy(table2);
1852 RB_VM_LOCK_LEAVE_LEV(&lev);
1853 st_foreach(dup_table2, asciicompat_encoding_i, (st_data_t)&data);
1854 st_free_table(dup_table2);
1855 RB_VM_LOCK_ENTER_LEV(&lev);
1856 }
1857 else {
1858 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1859 }
1860 }
1861
1862 }
1863 }
1864 RB_VM_LOCK_LEAVE_LEV(&lev);
1865
1866 return data.ascii_compat_name; // can be NULL
1867}
1868
1869/*
1870 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1871 *
1872 * If the result of the conversion is not compatible with the encoding of
1873 * `dst`, `dst` may not be valid encoding.
1874 */
1875VALUE
1876rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1877{
1878 unsigned const char *sp, *se;
1879 unsigned char *ds, *dp, *de;
1881 int max_output;
1882 enum ruby_coderange_type coderange;
1883 rb_encoding *dst_enc = ec->destination_encoding;
1884
1885 if (NIL_P(dst)) {
1886 dst = rb_str_buf_new(len);
1887 if (dst_enc) {
1888 rb_enc_associate(dst, dst_enc);
1889 }
1890 coderange = ENC_CODERANGE_7BIT; // scan from the start
1891 }
1892 else {
1893 dst_enc = rb_enc_get(dst);
1894 coderange = rb_enc_str_coderange(dst);
1895 }
1896
1897 if (ec->last_tc)
1898 max_output = ec->last_tc->transcoder->max_output;
1899 else
1900 max_output = 1;
1901
1902 do {
1903 int cr;
1904 long dlen = RSTRING_LEN(dst);
1905 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1906 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1907 if (LONG_MAX < new_capa)
1908 rb_raise(rb_eArgError, "too long string");
1909 rb_str_modify_expand(dst, new_capa - dlen);
1910 }
1911 sp = (const unsigned char *)ss;
1912 se = sp + len;
1913 ds = (unsigned char *)RSTRING_PTR(dst);
1914 de = ds + rb_str_capacity(dst);
1915 dp = ds += dlen;
1916 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1917 switch (coderange) {
1918 case ENC_CODERANGE_7BIT:
1920 cr = (int)coderange;
1921 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1922 coderange = cr;
1923 ENC_CODERANGE_SET(dst, coderange);
1924 break;
1927 break;
1928 }
1929 len -= (const char *)sp - ss;
1930 ss = (const char *)sp;
1931 rb_str_set_len(dst, dlen + (dp - ds));
1933 } while (res == econv_destination_buffer_full);
1934
1935 return dst;
1936}
1937
1938VALUE
1939rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1940{
1941 src = rb_str_new_frozen(src);
1942 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1943 RB_GC_GUARD(src);
1944 return dst;
1945}
1946
1947VALUE
1949{
1950 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1951}
1952
1953VALUE
1954rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1955{
1956 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1957}
1958
1959VALUE
1961{
1962 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1963}
1964
1965static int
1966rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1967{
1968 transcoder_entry_t *entry;
1969 const rb_transcoder *tr = NULL;
1970
1971 if (ec->started != 0)
1972 return -1;
1973
1974 entry = get_transcoder_entry(sname, dname);
1975 if (entry) {
1976 tr = load_transcoder_entry(entry);
1977 }
1978
1979 return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1;
1980}
1981
1982static int
1983rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1984{
1985 return rb_econv_add_converter(ec, "", decorator_name, n);
1986}
1987
1988int
1989rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1990{
1991 const rb_transcoder *tr;
1992
1993 if (ec->num_trans == 0)
1994 return rb_econv_decorate_at(ec, decorator_name, 0);
1995
1996 tr = ec->elems[0].tc->transcoder;
1997
1998 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1999 tr->asciicompat_type == asciicompat_decoder)
2000 return rb_econv_decorate_at(ec, decorator_name, 1);
2001
2002 return rb_econv_decorate_at(ec, decorator_name, 0);
2003}
2004
2005int
2006rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
2007{
2008 const rb_transcoder *tr;
2009
2010 if (ec->num_trans == 0)
2011 return rb_econv_decorate_at(ec, decorator_name, 0);
2012
2013 tr = ec->elems[ec->num_trans-1].tc->transcoder;
2014
2015 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
2016 tr->asciicompat_type == asciicompat_encoder)
2017 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
2018
2019 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
2020}
2021
2022void
2024{
2025 const char *dname = 0;
2026
2027 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2029 dname = "universal_newline";
2030 break;
2032 dname = "crlf_newline";
2033 break;
2035 dname = "cr_newline";
2036 break;
2038 dname = "lf_newline";
2039 break;
2040 }
2041
2042 if (dname) {
2043 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2044 int num_trans = ec->num_trans;
2045 int i, j = 0;
2046
2047 for (i=0; i < num_trans; i++) {
2048 if (transcoder == ec->elems[i].tc->transcoder) {
2049 rb_transcoding_close(ec->elems[i].tc);
2050 ruby_sized_xfree(ec->elems[i].out_buf_start, ec->elems[i].out_buf_end - ec->elems[i].out_buf_start);
2051 ec->num_trans--;
2052 }
2053 else
2054 ec->elems[j++] = ec->elems[i];
2055 }
2056 }
2057
2058 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2059}
2060
2061static VALUE
2062econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2063{
2064 int has_description = 0;
2065
2066 if (NIL_P(mesg))
2067 mesg = rb_str_new(NULL, 0);
2068
2069 if (*sname != '\0' || *dname != '\0') {
2070 if (*sname == '\0')
2071 rb_str_cat2(mesg, dname);
2072 else if (*dname == '\0')
2073 rb_str_cat2(mesg, sname);
2074 else
2075 rb_str_catf(mesg, "%s to %s", sname, dname);
2076 has_description = 1;
2077 }
2078
2079 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2083 const char *pre = "";
2084 if (has_description)
2085 rb_str_cat2(mesg, " with ");
2086 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2087 rb_str_cat2(mesg, pre); pre = ",";
2088 rb_str_cat2(mesg, "universal_newline");
2089 }
2090 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2091 rb_str_cat2(mesg, pre); pre = ",";
2092 rb_str_cat2(mesg, "crlf_newline");
2093 }
2094 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2095 rb_str_cat2(mesg, pre); pre = ",";
2096 rb_str_cat2(mesg, "cr_newline");
2097 }
2098 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2099 rb_str_cat2(mesg, pre); pre = ",";
2100 rb_str_cat2(mesg, "lf_newline");
2101 }
2102 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2103 rb_str_cat2(mesg, pre); pre = ",";
2104 rb_str_cat2(mesg, "xml_text");
2105 }
2106 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2107 rb_str_cat2(mesg, pre); pre = ",";
2108 rb_str_cat2(mesg, "xml_attr_content");
2109 }
2110 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2111 rb_str_cat2(mesg, pre); pre = ",";
2112 rb_str_cat2(mesg, "xml_attr_quote");
2113 }
2114 has_description = 1;
2115 }
2116 if (!has_description) {
2117 rb_str_cat2(mesg, "no-conversion");
2118 }
2119
2120 return mesg;
2121}
2122
2123VALUE
2124rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2125{
2126 VALUE mesg, exc;
2127 mesg = rb_str_new_cstr("code converter not found (");
2128 econv_description(sname, dname, ecflags, mesg);
2129 rb_str_cat2(mesg, ")");
2130 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2131 return exc;
2132}
2133
2134static VALUE
2135make_econv_exception(rb_econv_t *ec)
2136{
2137 VALUE mesg, exc;
2138 if (ec->last_error.result == econv_invalid_byte_sequence ||
2139 ec->last_error.result == econv_incomplete_input) {
2140 const char *err = (const char *)ec->last_error.error_bytes_start;
2141 size_t error_len = ec->last_error.error_bytes_len;
2142 VALUE bytes = rb_str_new(err, error_len);
2143 VALUE dumped = rb_str_dump(bytes);
2144 size_t readagain_len = ec->last_error.readagain_len;
2145 VALUE bytes2 = Qnil;
2146 VALUE dumped2;
2147 if (ec->last_error.result == econv_incomplete_input) {
2148 mesg = rb_sprintf("incomplete %s on %s",
2149 StringValueCStr(dumped),
2150 ec->last_error.source_encoding);
2151 }
2152 else if (readagain_len) {
2153 bytes2 = rb_str_new(err+error_len, readagain_len);
2154 dumped2 = rb_str_dump(bytes2);
2155 mesg = rb_sprintf("%s followed by %s on %s",
2156 StringValueCStr(dumped),
2157 StringValueCStr(dumped2),
2158 ec->last_error.source_encoding);
2159 }
2160 else {
2161 mesg = rb_sprintf("%s on %s",
2162 StringValueCStr(dumped),
2163 ec->last_error.source_encoding);
2164 }
2165
2166 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2167 rb_ivar_set(exc, id_error_bytes, bytes);
2168 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2169 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2170 goto set_encs;
2171 }
2172 if (ec->last_error.result == econv_undefined_conversion) {
2173 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2174 ec->last_error.error_bytes_len);
2175 VALUE dumped = Qnil;
2176 int idx;
2177 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2178 rb_encoding *utf8 = rb_utf8_encoding();
2179 const char *start, *end;
2180 int n;
2181 start = (const char *)ec->last_error.error_bytes_start;
2182 end = start + ec->last_error.error_bytes_len;
2183 n = rb_enc_precise_mbclen(start, end, utf8);
2184 if (MBCLEN_CHARFOUND_P(n) &&
2185 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2186 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2187 dumped = rb_sprintf("U+%04X", cc);
2188 }
2189 }
2190 if (NIL_P(dumped))
2191 dumped = rb_str_dump(bytes);
2192 if (strcmp(ec->last_error.source_encoding,
2193 ec->source_encoding_name) == 0 &&
2194 strcmp(ec->last_error.destination_encoding,
2195 ec->destination_encoding_name) == 0) {
2196 mesg = rb_sprintf("%s from %s to %s",
2197 StringValueCStr(dumped),
2198 ec->last_error.source_encoding,
2199 ec->last_error.destination_encoding);
2200 }
2201 else {
2202 int i;
2203 mesg = rb_sprintf("%s to %s in conversion from %s",
2204 StringValueCStr(dumped),
2205 ec->last_error.destination_encoding,
2206 ec->source_encoding_name);
2207 for (i = 0; i < ec->num_trans; i++) {
2208 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2209 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2210 rb_str_catf(mesg, " to %s",
2211 ec->elems[i].tc->transcoder->dst_encoding);
2212 }
2213 }
2214 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2215 idx = rb_enc_find_index(ec->last_error.source_encoding);
2216 if (0 <= idx)
2217 rb_enc_associate_index(bytes, idx);
2218 rb_ivar_set(exc, id_error_char, bytes);
2219 goto set_encs;
2220 }
2221 return Qnil;
2222
2223 set_encs:
2224 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2225 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2226 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2227 if (0 <= idx)
2228 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2229 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2230 if (0 <= idx)
2231 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2232 return exc;
2233}
2234
2235static void
2236more_output_buffer(
2237 VALUE destination,
2238 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2239 int max_output,
2240 unsigned char **out_start_ptr,
2241 unsigned char **out_pos,
2242 unsigned char **out_stop_ptr)
2243{
2244 size_t len = (*out_pos - *out_start_ptr);
2245 size_t new_len = (len + max_output) * 2;
2246 *out_start_ptr = resize_destination(destination, len, new_len);
2247 *out_pos = *out_start_ptr + len;
2248 *out_stop_ptr = *out_start_ptr + new_len;
2249}
2250
2251static int
2252make_replacement(rb_econv_t *ec)
2253{
2254 rb_transcoding *tc;
2255 const rb_transcoder *tr;
2256 const unsigned char *replacement;
2257 const char *repl_enc;
2258 const char *ins_enc;
2259 size_t len;
2260
2261 if (ec->replacement_str)
2262 return 0;
2263
2265
2266 tc = ec->last_tc;
2267 if (*ins_enc) {
2268 tr = tc->transcoder;
2269 rb_enc_find(tr->dst_encoding);
2270 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2271 }
2272 else {
2273 replacement = (unsigned char *)"?";
2274 len = 1;
2275 repl_enc = "";
2276 }
2277
2278 ec->replacement_str = replacement;
2279 ec->replacement_len = len;
2280 ec->replacement_bufsize = len;
2281 ec->replacement_enc = repl_enc;
2282 ec->replacement_allocated = 0;
2283 return 0;
2284}
2285
2286int
2288 const unsigned char *str, size_t len, const char *encname)
2289{
2290 unsigned char *str2;
2291 size_t len2, buf_size2;
2292 const char *encname2;
2293
2295
2296 if (!*encname2 || encoding_equal(encname, encname2)) {
2297 str2 = xmalloc(len);
2298 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2299 buf_size2 = len2 = len;
2300 encname2 = encname;
2301 }
2302 else {
2303 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2, &buf_size2);
2304 if (!str2)
2305 return -1;
2306 }
2307
2308 if (ec->replacement_allocated) {
2309 SIZED_FREE_N((char *)ec->replacement_str, ec->replacement_bufsize);
2310 }
2311 ec->replacement_allocated = 1;
2312 ec->replacement_str = str2;
2313 ec->replacement_len = len2;
2314 ec->replacement_bufsize = buf_size2;
2315 ec->replacement_enc = encname2;
2316 return 0;
2317}
2318
2319static int
2320output_replacement_character(rb_econv_t *ec)
2321{
2322 int ret;
2323
2324 if (make_replacement(ec) == -1)
2325 return -1;
2326
2327 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2328 if (ret == -1)
2329 return -1;
2330
2331 return 0;
2332}
2333
2334#if 1
2335#define hash_fallback rb_hash_aref
2336
2337static VALUE
2338proc_fallback(VALUE fallback, VALUE c)
2339{
2340 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2341}
2342
2343static VALUE
2344method_fallback(VALUE fallback, VALUE c)
2345{
2346 return rb_method_call(1, &c, fallback);
2347}
2348
2349static VALUE
2350aref_fallback(VALUE fallback, VALUE c)
2351{
2352 return rb_funcallv_public(fallback, idAREF, 1, &c);
2353}
2354
2356 VALUE (*fallback_func)(VALUE, VALUE);
2357 VALUE fallback;
2358 VALUE rep;
2359};
2360
2361static VALUE
2362transcode_loop_fallback_try(VALUE a)
2363{
2365
2366 VALUE ret = args->fallback_func(args->fallback, args->rep);
2367
2368 if (!UNDEF_P(ret) && !NIL_P(ret)) {
2369 StringValue(ret);
2370 }
2371
2372 return ret;
2373}
2374
2375static void
2376transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2377 const unsigned char *in_stop, unsigned char *out_stop,
2378 VALUE destination,
2379 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2380 const char *src_encoding,
2381 const char *dst_encoding,
2382 int ecflags,
2383 VALUE ecopts)
2384{
2385 rb_econv_t *ec;
2386 rb_transcoding *last_tc;
2388 unsigned char *out_start = *out_pos;
2389 int max_output;
2390 VALUE exc;
2391 VALUE fallback = Qnil;
2392 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2393
2394 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2395 if (!ec)
2396 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2397
2398 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2399 fallback = rb_hash_aref(ecopts, sym_fallback);
2400 if (RB_TYPE_P(fallback, T_HASH)) {
2401 fallback_func = hash_fallback;
2402 }
2403 else if (rb_obj_is_proc(fallback)) {
2404 fallback_func = proc_fallback;
2405 }
2406 else if (rb_obj_is_method(fallback)) {
2407 fallback_func = method_fallback;
2408 }
2409 else {
2410 fallback_func = aref_fallback;
2411 }
2412 }
2413 last_tc = ec->last_tc;
2414 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2415
2416 resume:
2417 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2418
2419 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2420 VALUE rep = rb_enc_str_new(
2421 (const char *)ec->last_error.error_bytes_start,
2422 ec->last_error.error_bytes_len,
2423 rb_enc_find(ec->last_error.source_encoding));
2424
2425
2426 struct transcode_loop_fallback_args args = {
2427 .fallback_func = fallback_func,
2428 .fallback = fallback,
2429 .rep = rep,
2430 };
2431
2432 int state;
2433 rep = rb_protect(transcode_loop_fallback_try, (VALUE)&args, &state);
2434 if (state) {
2435 rb_econv_close(ec);
2436 rb_jump_tag(state);
2437 }
2438
2439 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2440 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2441 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2442 if ((int)ret == -1) {
2443 rb_econv_close(ec);
2444 rb_raise(rb_eArgError, "too big fallback string");
2445 }
2446 goto resume;
2447 }
2448 }
2449
2450 if (ret == econv_invalid_byte_sequence ||
2451 ret == econv_incomplete_input ||
2453 exc = make_econv_exception(ec);
2454 rb_econv_close(ec);
2455 rb_exc_raise(exc);
2456 }
2457
2458 if (ret == econv_destination_buffer_full) {
2459 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2460 goto resume;
2461 }
2462
2463 rb_econv_close(ec);
2464 return;
2465}
2466#else
2467/* sample transcode_loop implementation in byte-by-byte stream style */
2468static void
2469transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2470 const unsigned char *in_stop, unsigned char *out_stop,
2471 VALUE destination,
2472 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2473 const char *src_encoding,
2474 const char *dst_encoding,
2475 int ecflags,
2476 VALUE ecopts)
2477{
2478 rb_econv_t *ec;
2479 rb_transcoding *last_tc;
2481 unsigned char *out_start = *out_pos;
2482 const unsigned char *ptr;
2483 int max_output;
2484 VALUE exc;
2485
2486 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2487 if (!ec)
2488 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2489
2490 last_tc = ec->last_tc;
2491 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2492
2494 ptr = *in_pos;
2495 while (ret != econv_finished) {
2496 unsigned char input_byte;
2497 const unsigned char *p = &input_byte;
2498
2499 if (ret == econv_source_buffer_empty) {
2500 if (ptr < in_stop) {
2501 input_byte = *ptr;
2502 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2503 }
2504 else {
2505 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2506 }
2507 }
2508 else {
2509 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2510 }
2511 if (&input_byte != p)
2512 ptr += p - &input_byte;
2513 switch (ret) {
2517 exc = make_econv_exception(ec);
2518 rb_econv_close(ec);
2519 rb_exc_raise(exc);
2520 break;
2521
2523 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2524 break;
2525
2527 break;
2528
2529 case econv_finished:
2530 break;
2531 }
2532 }
2533 rb_econv_close(ec);
2534 *in_pos = in_stop;
2535 return;
2536}
2537#endif
2538
2539
2540/*
2541 * String-specific code
2542 */
2543
2544static unsigned char *
2545str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2546{
2547 rb_str_resize(destination, new_len);
2548 return (unsigned char *)RSTRING_PTR(destination);
2549}
2550
2551static int
2552econv_opts(VALUE opt, int ecflags)
2553{
2554 VALUE v;
2555 int newlineflag = 0;
2556
2557 v = rb_hash_aref(opt, sym_invalid);
2558 if (NIL_P(v)) {
2559 }
2560 else if (v==sym_replace) {
2561 ecflags |= ECONV_INVALID_REPLACE;
2562 }
2563 else {
2564 rb_raise(rb_eArgError, "unknown value for invalid character option");
2565 }
2566
2567 v = rb_hash_aref(opt, sym_undef);
2568 if (NIL_P(v)) {
2569 }
2570 else if (v==sym_replace) {
2571 ecflags |= ECONV_UNDEF_REPLACE;
2572 }
2573 else {
2574 rb_raise(rb_eArgError, "unknown value for undefined character option");
2575 }
2576
2577 v = rb_hash_aref(opt, sym_replace);
2578 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2579 ecflags |= ECONV_UNDEF_REPLACE;
2580 }
2581
2582 v = rb_hash_aref(opt, sym_xml);
2583 if (!NIL_P(v)) {
2584 if (v==sym_text) {
2586 }
2587 else if (v==sym_attr) {
2589 }
2590 else if (SYMBOL_P(v)) {
2591 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2592 }
2593 else {
2594 rb_raise(rb_eArgError, "unexpected value for xml option");
2595 }
2596 }
2597
2598#ifdef ENABLE_ECONV_NEWLINE_OPTION
2599 v = rb_hash_aref(opt, sym_newline);
2600 if (!NIL_P(v)) {
2601 newlineflag = 2;
2602 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2603 if (v == sym_universal) {
2605 }
2606 else if (v == sym_crlf) {
2608 }
2609 else if (v == sym_cr) {
2610 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2611 }
2612 else if (v == sym_lf) {
2613 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2614 }
2615 else if (SYMBOL_P(v)) {
2616 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2617 rb_sym2str(v));
2618 }
2619 else {
2620 rb_raise(rb_eArgError, "unexpected value for newline option");
2621 }
2622 }
2623#endif
2624 {
2625 int setflags = 0;
2626
2627 v = rb_hash_aref(opt, sym_universal_newline);
2628 if (RTEST(v))
2630 newlineflag |= !NIL_P(v);
2631
2632 v = rb_hash_aref(opt, sym_crlf_newline);
2633 if (RTEST(v))
2634 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2635 newlineflag |= !NIL_P(v);
2636
2637 v = rb_hash_aref(opt, sym_cr_newline);
2638 if (RTEST(v))
2639 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2640 newlineflag |= !NIL_P(v);
2641
2642 v = rb_hash_aref(opt, sym_lf_newline);
2643 if (RTEST(v))
2644 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2645 newlineflag |= !NIL_P(v);
2646
2647 switch (newlineflag) {
2648 case 1:
2649 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2650 ecflags |= setflags;
2651 break;
2652
2653 case 3:
2654 rb_warning(":newline option precedes other newline options");
2655 break;
2656 }
2657 }
2658
2659 return ecflags;
2660}
2661
2662int
2663rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2664{
2665 VALUE newhash = Qnil;
2666 VALUE v;
2667
2668 if (NIL_P(opthash)) {
2669 *opts = Qnil;
2670 return ecflags;
2671 }
2672 ecflags = econv_opts(opthash, ecflags);
2673
2674 v = rb_hash_aref(opthash, sym_replace);
2675 if (!NIL_P(v)) {
2676 StringValue(v);
2677 if (is_broken_string(v)) {
2678 VALUE dumped = rb_str_dump(v);
2679 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2680 StringValueCStr(dumped),
2681 rb_enc_name(rb_enc_get(v)));
2682 }
2683 v = rb_str_new_frozen(v);
2684 newhash = rb_hash_new();
2685 rb_hash_aset(newhash, sym_replace, v);
2686 }
2687
2688 v = rb_hash_aref(opthash, sym_fallback);
2689 if (!NIL_P(v)) {
2690 VALUE h = rb_check_hash_type(v);
2691 if (NIL_P(h)
2692 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2693 : (v = h, 1)) {
2694 if (NIL_P(newhash))
2695 newhash = rb_hash_new();
2696 rb_hash_aset(newhash, sym_fallback, v);
2697 }
2698 }
2699
2700 if (!NIL_P(newhash))
2701 rb_hash_freeze(newhash);
2702 *opts = newhash;
2703
2704 return ecflags;
2705}
2706
2707int
2709{
2710 return rb_econv_prepare_options(opthash, opts, 0);
2711}
2712
2713rb_econv_t *
2714rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2715{
2716 rb_econv_t *ec;
2717 VALUE replacement;
2718
2719 if (NIL_P(opthash)) {
2720 replacement = Qnil;
2721 }
2722 else {
2723 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2724 rb_bug("rb_econv_open_opts called with invalid opthash");
2725 replacement = rb_hash_aref(opthash, sym_replace);
2726 }
2727
2728 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2729 if (ec) {
2730 if (!NIL_P(replacement)) {
2731 int ret;
2732 rb_encoding *enc = rb_enc_get(replacement);
2733
2734 ret = rb_econv_set_replacement(ec,
2735 (const unsigned char *)RSTRING_PTR(replacement),
2736 RSTRING_LEN(replacement),
2737 rb_enc_name(enc));
2738 if (ret == -1) {
2739 rb_econv_close(ec);
2740 ec = NULL;
2741 }
2742 }
2743 }
2744 return ec; // can be NULL
2745}
2746
2747static int
2748enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2749{
2750 rb_encoding *enc;
2751 const char *n;
2752 int encidx;
2753 VALUE encval;
2754
2755 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2756 !(enc = rb_enc_from_index(encidx))) {
2757 enc = NULL;
2758 encidx = 0;
2759 n = StringValueCStr(*arg);
2760 }
2761 else {
2762 n = rb_enc_name(enc);
2763 }
2764
2765 *name_p = n;
2766 *enc_p = enc;
2767
2768 return encidx;
2769}
2770
2771static int
2772str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2773 const char **sname_p, rb_encoding **senc_p,
2774 const char **dname_p, rb_encoding **denc_p)
2775{
2776 rb_encoding *senc, *denc;
2777 const char *sname, *dname;
2778 int sencidx, dencidx;
2779
2780 dencidx = enc_arg(arg1, &dname, &denc);
2781
2782 if (NIL_P(*arg2)) {
2783 sencidx = rb_enc_get_index(str);
2784 senc = rb_enc_from_index(sencidx);
2785 sname = rb_enc_name(senc);
2786 }
2787 else {
2788 sencidx = enc_arg(arg2, &sname, &senc);
2789 }
2790
2791 *sname_p = sname;
2792 *senc_p = senc;
2793 *dname_p = dname;
2794 *denc_p = denc;
2795 return dencidx;
2796}
2797
2798static int
2799str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2800{
2801 VALUE dest;
2802 VALUE str = *self;
2803 VALUE arg1, arg2;
2804 long blen, slen;
2805 unsigned char *buf, *bp, *sp;
2806 const unsigned char *fromp;
2807 rb_encoding *senc, *denc;
2808 const char *sname, *dname;
2809 int dencidx;
2810 int explicitly_invalid_replace = TRUE;
2811
2812 rb_check_arity(argc, 0, 2);
2813
2814 if (argc == 0) {
2815 arg1 = rb_enc_default_internal();
2816 if (NIL_P(arg1)) {
2817 if (!ecflags) return -1;
2818 arg1 = rb_obj_encoding(str);
2819 }
2820 if (!(ecflags & ECONV_INVALID_MASK)) {
2821 explicitly_invalid_replace = FALSE;
2822 }
2824 }
2825 else {
2826 arg1 = argv[0];
2827 }
2828 arg2 = argc<=1 ? Qnil : argv[1];
2829 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2830
2831 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2835 if (senc && senc == denc) {
2836 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2837 VALUE rep = Qnil;
2838 if (!NIL_P(ecopts)) {
2839 rep = rb_hash_aref(ecopts, sym_replace);
2840 }
2841 dest = rb_enc_str_scrub(senc, str, rep);
2842 if (NIL_P(dest)) dest = str;
2843 *self = dest;
2844 return dencidx;
2845 }
2846 return NIL_P(arg2) ? -1 : dencidx;
2847 }
2848 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2849 if (is_ascii_string(str)) {
2850 return dencidx;
2851 }
2852 }
2853 if (encoding_equal(sname, dname)) {
2854 return NIL_P(arg2) ? -1 : dencidx;
2855 }
2856 }
2857 else {
2858 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2859 rb_encoding *utf8 = rb_utf8_encoding();
2860 str = rb_str_conv_enc(str, senc, utf8);
2861 senc = utf8;
2862 sname = "UTF-8";
2863 }
2864 if (encoding_equal(sname, dname)) {
2865 sname = "";
2866 dname = "";
2867 }
2868 }
2869
2870 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2871 slen = RSTRING_LEN(str);
2872 blen = slen + 30; /* len + margin */
2873 dest = rb_str_tmp_new(blen);
2874 bp = (unsigned char *)RSTRING_PTR(dest);
2875
2876 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2877 if (fromp != sp+slen) {
2878 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2879 }
2880 buf = (unsigned char *)RSTRING_PTR(dest);
2881 *bp = '\0';
2882 rb_str_set_len(dest, bp - buf);
2883
2884 /* set encoding */
2885 if (!denc) {
2886 dencidx = rb_define_dummy_encoding(dname);
2887 RB_GC_GUARD(arg1);
2888 RB_GC_GUARD(arg2);
2889 }
2890 *self = dest;
2891
2892 return dencidx;
2893}
2894
2895static int
2896str_transcode(int argc, VALUE *argv, VALUE *self)
2897{
2898 VALUE opt;
2899 int ecflags = 0;
2900 VALUE ecopts = Qnil;
2901
2902 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2903 if (!NIL_P(opt)) {
2904 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2905 }
2906 return str_transcode0(argc, argv, self, ecflags, ecopts);
2907}
2908
2909static inline VALUE
2910str_encode_associate(VALUE str, int encidx)
2911{
2912 int cr = 0;
2913
2914 rb_enc_associate_index(str, encidx);
2915
2916 /* transcoded string never be broken. */
2917 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2918 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2919 }
2920 else {
2922 }
2923 ENC_CODERANGE_SET(str, cr);
2924 return str;
2925}
2926
2927/*
2928 * call-seq:
2929 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2930 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2931 *
2932 * Like #encode, but applies encoding changes to +self+; returns +self+.
2933 *
2934 * Related: see {Modifying}[rdoc-ref:String@Modifying].
2935 */
2936
2937static VALUE
2938str_encode_bang(int argc, VALUE *argv, VALUE str)
2939{
2940 VALUE newstr;
2941 int encidx;
2942
2943 rb_check_frozen(str);
2944
2945 newstr = str;
2946 encidx = str_transcode(argc, argv, &newstr);
2947
2948 if (encidx < 0) return str;
2949 if (newstr == str) {
2950 rb_enc_associate_index(str, encidx);
2951 return str;
2952 }
2953 rb_str_shared_replace(str, newstr);
2954 return str_encode_associate(str, encidx);
2955}
2956
2957static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2958
2959/*
2960 * call-seq:
2961 * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2962 * encode(dst_encoding, src_encoding, **enc_opts) -> string
2963 *
2964 * :include: doc/string/encode.rdoc
2965 *
2966 */
2967
2968static VALUE
2969str_encode(int argc, VALUE *argv, VALUE str)
2970{
2971 VALUE newstr = str;
2972 int encidx = str_transcode(argc, argv, &newstr);
2973 return encoded_dup(newstr, str, encidx);
2974}
2975
2976VALUE
2977rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2978{
2979 int argc = 1;
2980 VALUE *argv = &to;
2981 VALUE newstr = str;
2982 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2983 return encoded_dup(newstr, str, encidx);
2984}
2985
2986static VALUE
2987encoded_dup(VALUE newstr, VALUE str, int encidx)
2988{
2989 if (encidx < 0) return rb_str_dup(str);
2990 if (newstr == str) {
2991 newstr = rb_str_dup(str);
2992 rb_enc_associate_index(newstr, encidx);
2993 return newstr;
2994 }
2995 else {
2996 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2997 }
2998 return str_encode_associate(newstr, encidx);
2999}
3000
3001/*
3002 * Document-class: Encoding::Converter
3003 *
3004 * Encoding conversion class.
3005 */
3006static void
3007econv_free(void *ptr)
3008{
3009 rb_econv_t *ec = ptr;
3010 rb_econv_close(ec);
3011}
3012
3013static size_t
3014econv_memsize(const void *ptr)
3015{
3016 return sizeof(rb_econv_t);
3017}
3018
3019static const rb_data_type_t econv_data_type = {
3020 "econv",
3021 {0, econv_free, econv_memsize,},
3023};
3024
3025static VALUE
3026econv_s_allocate(VALUE klass)
3027{
3028 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
3029}
3030
3031static rb_encoding *
3032make_dummy_encoding(const char *name)
3033{
3034 rb_encoding *enc;
3035 int idx;
3036 idx = rb_define_dummy_encoding(name);
3037 enc = rb_enc_from_index(idx);
3038 return enc;
3039}
3040
3041static rb_encoding *
3042make_encoding(const char *name)
3043{
3044 rb_encoding *enc;
3045 enc = rb_enc_find(name);
3046 if (!enc) {
3047 RB_VM_LOCKING() {
3048 if (rb_enc_registered(name)) {
3049 enc = NULL;
3050 }
3051 else {
3052 enc = make_dummy_encoding(name);
3053 }
3054 }
3055 }
3056 return enc;
3057}
3058
3059static VALUE
3060make_encobj(const char *name)
3061{
3062 return rb_enc_from_encoding(make_encoding(name));
3063}
3064
3065/*
3066 * call-seq:
3067 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3068 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3069 *
3070 * Returns the corresponding ASCII compatible encoding.
3071 *
3072 * Returns nil if the argument is an ASCII compatible encoding.
3073 *
3074 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
3076 * So, no conversion undefined error occurs when converting between the two encodings.
3077 *
3078 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3079 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3080 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3081 *
3082 */
3083static VALUE
3084econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3085{
3086 const char *arg_name, *result_name;
3087 rb_encoding *arg_enc, *result_enc;
3088 VALUE enc = Qnil;
3089
3090 enc_arg(&arg, &arg_name, &arg_enc);
3091 result_name = rb_econv_asciicompat_encoding(arg_name);
3092 if (result_name) {
3093 result_enc = make_encoding(result_name);
3094 enc = rb_enc_from_encoding(result_enc);
3095 }
3096 return enc;
3097}
3098
3099static void
3100econv_args(int argc, VALUE *argv,
3101 VALUE *snamev_p, VALUE *dnamev_p,
3102 const char **sname_p, const char **dname_p,
3103 rb_encoding **senc_p, rb_encoding **denc_p,
3104 int *ecflags_p,
3105 VALUE *ecopts_p)
3106{
3107 VALUE opt, flags_v, ecopts;
3108 int sidx, didx;
3109 const char *sname, *dname;
3110 rb_encoding *senc, *denc;
3111 int ecflags;
3112
3113 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3114
3115 if (!NIL_P(flags_v)) {
3116 if (!NIL_P(opt)) {
3117 rb_error_arity(argc + 1, 2, 3);
3118 }
3119 ecflags = NUM2INT(rb_to_int(flags_v));
3120 ecopts = Qnil;
3121 }
3122 else if (!NIL_P(opt)) {
3123 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3124 }
3125 else {
3126 ecflags = 0;
3127 ecopts = Qnil;
3128 }
3129
3130 senc = NULL;
3131 sidx = rb_to_encoding_index(*snamev_p);
3132 if (0 <= sidx) {
3133 senc = rb_enc_from_index(sidx);
3134 }
3135 else {
3136 StringValue(*snamev_p);
3137 }
3138
3139 denc = NULL;
3140 didx = rb_to_encoding_index(*dnamev_p);
3141 if (0 <= didx) {
3142 denc = rb_enc_from_index(didx);
3143 }
3144 else {
3145 StringValue(*dnamev_p);
3146 }
3147
3148 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3149 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3150
3151 *sname_p = sname;
3152 *dname_p = dname;
3153 *senc_p = senc;
3154 *denc_p = denc;
3155 *ecflags_p = ecflags;
3156 *ecopts_p = ecopts;
3157}
3158
3159static int
3160decorate_convpath(VALUE convpath, int ecflags)
3161{
3162 int num_decorators;
3163 const char *decorators[MAX_ECFLAGS_DECORATORS];
3164 int i;
3165 int n, len;
3166
3167 num_decorators = decorator_names(ecflags, decorators);
3168 if (num_decorators == -1)
3169 return -1;
3170
3171 len = n = RARRAY_LENINT(convpath);
3172 if (n != 0) {
3173 VALUE pair = RARRAY_AREF(convpath, n-1);
3174 if (RB_TYPE_P(pair, T_ARRAY)) {
3175 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3176 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3177 transcoder_entry_t *entry;
3178 const rb_transcoder *tr;
3179 entry = get_transcoder_entry(sname, dname);
3180 tr = load_transcoder_entry(entry);
3181 if (!tr)
3182 return -1;
3183 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3184 tr->asciicompat_type == asciicompat_encoder) {
3185 n--;
3186 rb_ary_store(convpath, len + num_decorators - 1, pair);
3187 }
3188 }
3189 else {
3190 rb_ary_store(convpath, len + num_decorators - 1, pair);
3191 }
3192 }
3193
3194 for (i = 0; i < num_decorators; i++)
3195 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3196
3197 return 0;
3198}
3199
3200static void
3201search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3202{
3203 VALUE *ary_p = arg;
3204 VALUE v;
3205
3206 if (NIL_P(*ary_p)) {
3207 *ary_p = rb_ary_new();
3208 }
3209
3210 if (DECORATOR_P(sname, dname)) {
3211 v = rb_str_new_cstr(dname);
3212 }
3213 else {
3214 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3215 }
3216 rb_ary_store(*ary_p, depth, v);
3217}
3218
3219/*
3220 * call-seq:
3221 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3222 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3223 *
3224 * Returns a conversion path.
3225 *
3226 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3227 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3228 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3229 *
3230 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3231 * or
3232 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3233 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3234 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3235 * # "universal_newline"]
3236 *
3237 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3238 * or
3239 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3240 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3241 * # "universal_newline",
3242 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3243 */
3244static VALUE
3245econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3246{
3247 VALUE snamev, dnamev;
3248 const char *sname, *dname;
3249 rb_encoding *senc, *denc;
3250 int ecflags;
3251 VALUE ecopts;
3252 VALUE convpath;
3253
3254 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3255
3256 convpath = Qnil;
3257 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3258
3259 if (NIL_P(convpath)) {
3260 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3261 RB_GC_GUARD(snamev);
3262 RB_GC_GUARD(dnamev);
3263 rb_exc_raise(exc);
3264 }
3265
3266 if (decorate_convpath(convpath, ecflags) == -1) {
3267 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3268 RB_GC_GUARD(snamev);
3269 RB_GC_GUARD(dnamev);
3270 rb_exc_raise(exc);
3271 }
3272
3273 return convpath;
3274}
3275
3276/*
3277 * Check the existence of a conversion path.
3278 * Returns the number of converters in the conversion path.
3279 * result: >=0:success -1:failure
3280 */
3281int
3282rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3283{
3284 VALUE convpath = Qnil;
3285 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3286 &convpath);
3287 return RTEST(convpath);
3288}
3289
3291 rb_econv_t *ec;
3292 int index;
3293 int ret;
3294};
3295
3296static void
3297rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3298{
3300 int ret;
3301
3302 if (a->ret == -1)
3303 return;
3304
3305 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3306
3307 a->ret = ret;
3308 return;
3309}
3310
3311static rb_econv_t *
3312rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3313 const char **sname_p, const char **dname_p,
3314 rb_encoding **senc_p, rb_encoding**denc_p)
3315{
3316 rb_econv_t *ec;
3317 long i;
3318 int ret, first=1;
3319 VALUE elt;
3320 rb_encoding *senc = 0, *denc = 0;
3321 const char *sname, *dname;
3322
3323 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3324 DATA_PTR(self) = ec;
3325
3326 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3327 VALUE snamev, dnamev;
3328 VALUE pair;
3329 elt = rb_ary_entry(convpath, i);
3330 if (!NIL_P(pair = rb_check_array_type(elt))) {
3331 if (RARRAY_LEN(pair) != 2)
3332 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3333 snamev = rb_ary_entry(pair, 0);
3334 enc_arg(&snamev, &sname, &senc);
3335 dnamev = rb_ary_entry(pair, 1);
3336 enc_arg(&dnamev, &dname, &denc);
3337 }
3338 else {
3339 sname = "";
3340 dname = StringValueCStr(elt);
3341 }
3342 if (DECORATOR_P(sname, dname)) {
3343 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3344 if (ret == -1) {
3345 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3346 RB_GC_GUARD(snamev);
3347 RB_GC_GUARD(dnamev);
3348 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3349 }
3350 }
3351 else {
3352 int j = ec->num_trans;
3353 struct rb_econv_init_by_convpath_t arg;
3354 arg.ec = ec;
3355 arg.index = ec->num_trans;
3356 arg.ret = 0;
3357 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3358 if (ret == -1 || arg.ret == -1) {
3359 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3360 RB_GC_GUARD(snamev);
3361 RB_GC_GUARD(dnamev);
3362 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3363 }
3364 if (first) {
3365 first = 0;
3366 *senc_p = senc;
3367 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3368 }
3369 *denc_p = denc;
3370 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3371 }
3372 }
3373
3374 if (first) {
3375 *senc_p = NULL;
3376 *denc_p = NULL;
3377 *sname_p = "";
3378 *dname_p = "";
3379 }
3380
3381 ec->source_encoding_name = *sname_p;
3382 ec->destination_encoding_name = *dname_p;
3383
3384 return ec;
3385}
3386
3387/*
3388 * call-seq:
3389 * Encoding::Converter.new(source_encoding, destination_encoding)
3390 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3391 * Encoding::Converter.new(convpath)
3392 *
3393 * possible options elements:
3394 * hash form:
3395 * :invalid => nil # raise error on invalid byte sequence (default)
3396 * :invalid => :replace # replace invalid byte sequence
3397 * :undef => nil # raise error on undefined conversion (default)
3398 * :undef => :replace # replace undefined conversion
3399 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3400 * :newline => :universal # decorator for converting CRLF and CR to LF
3401 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3402 * :newline => :crlf # decorator for converting LF to CRLF
3403 * :newline => :cr # decorator for converting LF to CR
3404 * :universal_newline => true # decorator for converting CRLF and CR to LF
3405 * :crlf_newline => true # decorator for converting LF to CRLF
3406 * :cr_newline => true # decorator for converting LF to CR
3407 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3408 * :xml => :text # escape as XML CharData.
3409 * :xml => :attr # escape as XML AttValue
3410 * integer form:
3411 * Encoding::Converter::INVALID_REPLACE
3412 * Encoding::Converter::UNDEF_REPLACE
3413 * Encoding::Converter::UNDEF_HEX_CHARREF
3414 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3415 * Encoding::Converter::LF_NEWLINE_DECORATOR
3416 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3417 * Encoding::Converter::CR_NEWLINE_DECORATOR
3418 * Encoding::Converter::XML_TEXT_DECORATOR
3419 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3420 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3421 *
3422 * Encoding::Converter.new creates an instance of Encoding::Converter.
3423 *
3424 * Source_encoding and destination_encoding should be a string or
3425 * Encoding object.
3426 *
3427 * opt should be nil, a hash or an integer.
3428 *
3429 * convpath should be an array.
3430 * convpath may contain
3431 * - two-element arrays which contain encodings or encoding names, or
3432 * - strings representing decorator names.
3433 *
3434 * Encoding::Converter.new optionally takes an option.
3435 * The option should be a hash or an integer.
3436 * The option hash can contain :invalid => nil, etc.
3437 * The option integer should be logical-or of constants such as
3438 * Encoding::Converter::INVALID_REPLACE, etc.
3439 *
3440 * [:invalid => nil]
3441 * Raise error on invalid byte sequence. This is a default behavior.
3442 * [:invalid => :replace]
3443 * Replace invalid byte sequence by replacement string.
3444 * [:undef => nil]
3445 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3446 * This is a default behavior.
3447 * [:undef => :replace]
3448 * Replace undefined character in destination_encoding with replacement string.
3449 * [:replace => string]
3450 * Specify the replacement string.
3451 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3452 * [:universal_newline => true]
3453 * Convert CRLF and CR to LF.
3454 * [:crlf_newline => true]
3455 * Convert LF to CRLF.
3456 * [:cr_newline => true]
3457 * Convert LF to CR.
3458 * [:lf_newline => true]
3459 * Convert CRLF and CR to LF (when writing).
3460 * [:xml => :text]
3461 * Escape as XML CharData.
3462 * This form can be used as an HTML 4.0 #PCDATA.
3463 * - '&' -> '&amp;'
3464 * - '<' -> '&lt;'
3465 * - '>' -> '&gt;'
3466 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3467 * [:xml => :attr]
3468 * Escape as XML AttValue.
3469 * The converted result is quoted as "...".
3470 * This form can be used as an HTML 4.0 attribute value.
3471 * - '&' -> '&amp;'
3472 * - '<' -> '&lt;'
3473 * - '>' -> '&gt;'
3474 * - '"' -> '&quot;'
3475 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3476 *
3477 * Examples:
3478 * # UTF-16BE to UTF-8
3479 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3480 *
3481 * # Usually, decorators such as newline conversion are inserted last.
3482 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3483 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3484 * # "universal_newline"]
3485 *
3486 * # But, if the last encoding is ASCII incompatible,
3487 * # decorators are inserted before the last conversion.
3488 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3489 * p ec.convpath #=> ["crlf_newline",
3490 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3491 *
3492 * # Conversion path can be specified directly.
3493 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3494 * p ec.convpath #=> ["universal_newline",
3495 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3496 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3497 */
3498static VALUE
3499econv_init(int argc, VALUE *argv, VALUE self)
3500{
3501 VALUE ecopts;
3502 VALUE snamev, dnamev;
3503 const char *sname, *dname;
3504 rb_encoding *senc, *denc;
3505 rb_econv_t *ec;
3506 int ecflags;
3507 VALUE convpath;
3508
3509 if (rb_check_typeddata(self, &econv_data_type)) {
3510 rb_raise(rb_eTypeError, "already initialized");
3511 }
3512
3513 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3514 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3515 ecflags = 0;
3516 ecopts = Qnil;
3517 }
3518 else {
3519 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3520 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3521 }
3522
3523 if (!ec) {
3524 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3525 RB_GC_GUARD(snamev);
3526 RB_GC_GUARD(dnamev);
3527 rb_exc_raise(exc);
3528 }
3529
3530 if (!DECORATOR_P(sname, dname)) {
3531 if (!senc)
3532 senc = make_dummy_encoding(sname);
3533 if (!denc)
3534 denc = make_dummy_encoding(dname);
3535 RB_GC_GUARD(snamev);
3536 RB_GC_GUARD(dnamev);
3537 }
3538
3539 ec->source_encoding = senc;
3540 ec->destination_encoding = denc;
3541
3542 DATA_PTR(self) = ec;
3543
3544 return self;
3545}
3546
3547/*
3548 * call-seq:
3549 * ec.inspect -> string
3550 *
3551 * Returns a printable version of <i>ec</i>
3552 *
3553 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3554 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3555 *
3556 */
3557static VALUE
3558econv_inspect(VALUE self)
3559{
3560 const char *cname = rb_obj_classname(self);
3561 rb_econv_t *ec;
3562
3563 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3564 if (!ec)
3565 return rb_sprintf("#<%s: uninitialized>", cname);
3566 else {
3567 const char *sname = ec->source_encoding_name;
3568 const char *dname = ec->destination_encoding_name;
3569 VALUE str;
3570 str = rb_sprintf("#<%s: ", cname);
3571 econv_description(sname, dname, ec->flags, str);
3572 rb_str_cat2(str, ">");
3573 return str;
3574 }
3575}
3576
3577static rb_econv_t *
3578check_econv(VALUE self)
3579{
3580 rb_econv_t *ec;
3581
3582 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3583 if (!ec) {
3584 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3585 }
3586 return ec;
3587}
3588
3589static VALUE
3590econv_get_encoding(rb_encoding *encoding)
3591{
3592 if (!encoding)
3593 return Qnil;
3594 return rb_enc_from_encoding(encoding);
3595}
3596
3597/*
3598 * call-seq:
3599 * ec.source_encoding -> encoding
3600 *
3601 * Returns the source encoding as an Encoding object.
3602 */
3603static VALUE
3604econv_source_encoding(VALUE self)
3605{
3606 rb_econv_t *ec = check_econv(self);
3607 return econv_get_encoding(ec->source_encoding);
3608}
3609
3610/*
3611 * call-seq:
3612 * ec.destination_encoding -> encoding
3613 *
3614 * Returns the destination encoding as an Encoding object.
3615 */
3616static VALUE
3617econv_destination_encoding(VALUE self)
3618{
3619 rb_econv_t *ec = check_econv(self);
3620 return econv_get_encoding(ec->destination_encoding);
3621}
3622
3623/*
3624 * call-seq:
3625 * ec.convpath -> ary
3626 *
3627 * Returns the conversion path of ec.
3628 *
3629 * The result is an array of conversions.
3630 *
3631 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3632 * p ec.convpath
3633 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3634 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3635 * # "crlf_newline"]
3636 *
3637 * Each element of the array is a pair of encodings or a string.
3638 * A pair means an encoding conversion.
3639 * A string means a decorator.
3640 *
3641 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3642 * a converter from ISO-8859-1 to UTF-8.
3643 * "crlf_newline" means newline converter from LF to CRLF.
3644 */
3645static VALUE
3646econv_convpath(VALUE self)
3647{
3648 rb_econv_t *ec = check_econv(self);
3649 VALUE result;
3650 int i;
3651
3652 result = rb_ary_new();
3653 for (i = 0; i < ec->num_trans; i++) {
3654 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3655 VALUE v;
3656 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3657 v = rb_str_new_cstr(tr->dst_encoding);
3658 else
3659 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3660 rb_ary_push(result, v);
3661 }
3662 return result;
3663}
3664
3665/*
3666 * call-seq:
3667 * ec == other -> true or false
3668 */
3669static VALUE
3670econv_equal(VALUE self, VALUE other)
3671{
3672 rb_econv_t *ec1 = check_econv(self);
3673 rb_econv_t *ec2;
3674 int i;
3675
3676 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3677 return Qnil;
3678 }
3679 ec2 = DATA_PTR(other);
3680 if (!ec2) return Qfalse;
3681 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3682 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3683 return Qfalse;
3684 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3685 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3686 return Qfalse;
3687 if (ec1->flags != ec2->flags) return Qfalse;
3688 if (ec1->replacement_enc != ec2->replacement_enc &&
3689 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3690 return Qfalse;
3691 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3692 if (ec1->replacement_str != ec2->replacement_str &&
3693 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3694 return Qfalse;
3695
3696 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3697 for (i = 0; i < ec1->num_trans; i++) {
3698 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3699 return Qfalse;
3700 }
3701 return Qtrue;
3702}
3703
3704static VALUE
3705econv_result_to_symbol(rb_econv_result_t res)
3706{
3707 switch (res) {
3708 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3709 case econv_incomplete_input: return sym_incomplete_input;
3710 case econv_undefined_conversion: return sym_undefined_conversion;
3711 case econv_destination_buffer_full: return sym_destination_buffer_full;
3712 case econv_source_buffer_empty: return sym_source_buffer_empty;
3713 case econv_finished: return sym_finished;
3714 case econv_after_output: return sym_after_output;
3715 default: return INT2NUM(res); /* should not be reached */
3716 }
3717}
3718
3719/*
3720 * call-seq:
3721 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3722 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3723 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3724 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3725 *
3726 * possible opt elements:
3727 * hash form:
3728 * :partial_input => true # source buffer may be part of larger source
3729 * :after_output => true # stop conversion after output before input
3730 * integer form:
3731 * Encoding::Converter::PARTIAL_INPUT
3732 * Encoding::Converter::AFTER_OUTPUT
3733 *
3734 * possible results:
3735 * :invalid_byte_sequence
3736 * :incomplete_input
3737 * :undefined_conversion
3738 * :after_output
3739 * :destination_buffer_full
3740 * :source_buffer_empty
3741 * :finished
3742 *
3743 * primitive_convert converts source_buffer into destination_buffer.
3744 *
3745 * source_buffer should be a string or nil.
3746 * nil means an empty string.
3747 *
3748 * destination_buffer should be a string.
3749 *
3750 * destination_byteoffset should be an integer or nil.
3751 * nil means the end of destination_buffer.
3752 * If it is omitted, nil is assumed.
3753 *
3754 * destination_bytesize should be an integer or nil.
3755 * nil means unlimited.
3756 * If it is omitted, nil is assumed.
3757 *
3758 * opt should be nil, a hash or an integer.
3759 * nil means no flags.
3760 * If it is omitted, nil is assumed.
3761 *
3762 * primitive_convert converts the content of source_buffer from the beginning
3763 * and stores the result into destination_buffer.
3764 *
3765 * destination_byteoffset and destination_bytesize specify the region in which
3766 * the converted result is stored.
3767 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3768 * If destination_byteoffset is nil,
3769 * destination_buffer.bytesize is used for appending the result.
3770 * destination_bytesize specifies maximum number of bytes.
3771 * If destination_bytesize is nil,
3772 * destination size is unlimited.
3773 * After conversion, destination_buffer is resized to
3774 * destination_byteoffset + actually produced number of bytes.
3775 * Also destination_buffer's encoding is set to destination_encoding.
3776 *
3777 * primitive_convert drops the converted part of source_buffer.
3778 * The dropped part is either converted into destination_buffer or
3779 * buffered in the Encoding::Converter object.
3780 *
3781 * primitive_convert stops conversion when one of the following conditions is met.
3782 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3783 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3784 * - unexpected end of source buffer (:incomplete_input)
3785 * this occurs only when :partial_input is not specified.
3786 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3787 * - character not representable in output encoding (:undefined_conversion)
3788 * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3789 * - after some output is generated, before input is done (:after_output)
3790 * this occurs only when :after_output is specified.
3791 * - destination buffer is full (:destination_buffer_full)
3792 * this occurs only when destination_bytesize is non-nil.
3793 * - source buffer is empty (:source_buffer_empty)
3794 * this occurs only when :partial_input is specified.
3795 * - conversion is finished (:finished)
3796 *
3797 * example:
3798 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3799 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3800 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3801 *
3802 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3803 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3804 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3805 * ret = ec.primitive_convert(src, dst="", nil, 1)
3806 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3807 * ret = ec.primitive_convert(src, dst="", nil, 1)
3808 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3809 * ret = ec.primitive_convert(src, dst="", nil, 1)
3810 * p [ret, src, dst] #=> [:finished, "", "i"]
3811 *
3812 */
3813static VALUE
3814econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3815{
3816 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3817 rb_econv_t *ec = check_econv(self);
3819 const unsigned char *ip, *is;
3820 unsigned char *op, *os;
3821 long output_byteoffset, output_bytesize;
3822 unsigned long output_byteend;
3823 int flags;
3824
3825 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3826
3827 if (NIL_P(output_byteoffset_v))
3828 output_byteoffset = 0; /* dummy */
3829 else
3830 output_byteoffset = NUM2LONG(output_byteoffset_v);
3831
3832 if (NIL_P(output_bytesize_v))
3833 output_bytesize = 0; /* dummy */
3834 else
3835 output_bytesize = NUM2LONG(output_bytesize_v);
3836
3837 if (!NIL_P(flags_v)) {
3838 if (!NIL_P(opt)) {
3839 rb_error_arity(argc + 1, 2, 5);
3840 }
3841 flags = NUM2INT(rb_to_int(flags_v));
3842 }
3843 else if (!NIL_P(opt)) {
3844 VALUE v;
3845 flags = 0;
3846 v = rb_hash_aref(opt, sym_partial_input);
3847 if (RTEST(v))
3848 flags |= ECONV_PARTIAL_INPUT;
3849 v = rb_hash_aref(opt, sym_after_output);
3850 if (RTEST(v))
3851 flags |= ECONV_AFTER_OUTPUT;
3852 }
3853 else {
3854 flags = 0;
3855 }
3856
3857 StringValue(output);
3858 if (!NIL_P(input))
3859 StringValue(input);
3860 rb_str_modify(output);
3861
3862 if (NIL_P(output_bytesize_v)) {
3863 output_bytesize = rb_str_capacity(output);
3864
3865 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3866 output_bytesize = RSTRING_LEN(input);
3867 }
3868
3869 retry:
3870
3871 if (NIL_P(output_byteoffset_v))
3872 output_byteoffset = RSTRING_LEN(output);
3873
3874 if (output_byteoffset < 0)
3875 rb_raise(rb_eArgError, "negative output_byteoffset");
3876
3877 if (RSTRING_LEN(output) < output_byteoffset)
3878 rb_raise(rb_eArgError, "output_byteoffset too big");
3879
3880 if (output_bytesize < 0)
3881 rb_raise(rb_eArgError, "negative output_bytesize");
3882
3883 output_byteend = (unsigned long)output_byteoffset +
3884 (unsigned long)output_bytesize;
3885
3886 if (output_byteend < (unsigned long)output_byteoffset ||
3887 LONG_MAX < output_byteend)
3888 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3889
3890 if (rb_str_capacity(output) < output_byteend)
3891 rb_str_resize(output, output_byteend);
3892
3893 if (NIL_P(input)) {
3894 ip = is = NULL;
3895 }
3896 else {
3897 ip = (const unsigned char *)RSTRING_PTR(input);
3898 is = ip + RSTRING_LEN(input);
3899 }
3900
3901 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3902 os = op + output_bytesize;
3903
3904 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3905 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3906 if (!NIL_P(input)) {
3907 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3908 }
3909
3910 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3911 if (LONG_MAX / 2 < output_bytesize)
3912 rb_raise(rb_eArgError, "too long conversion result");
3913 output_bytesize *= 2;
3914 output_byteoffset_v = Qnil;
3915 goto retry;
3916 }
3917
3918 if (ec->destination_encoding) {
3919 rb_enc_associate(output, ec->destination_encoding);
3920 }
3921
3922 return econv_result_to_symbol(res);
3923}
3924
3925/*
3926 * call-seq:
3927 * ec.convert(source_string) -> destination_string
3928 *
3929 * Convert source_string and return destination_string.
3930 *
3931 * source_string is assumed as a part of source.
3932 * i.e. :partial_input=>true is specified internally.
3933 * finish method should be used last.
3934 *
3935 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3936 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3937 * puts ec.finish.dump #=> ""
3938 *
3939 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3940 * puts ec.convert("\xA4").dump #=> ""
3941 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3942 * puts ec.finish.dump #=> ""
3943 *
3944 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3945 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3946 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3947 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3948 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3949 *
3950 * If a conversion error occurs,
3951 * Encoding::UndefinedConversionError or
3952 * Encoding::InvalidByteSequenceError is raised.
3953 * Encoding::Converter#convert doesn't supply methods to recover or restart
3954 * from these exceptions.
3955 * When you want to handle these conversion errors,
3956 * use Encoding::Converter#primitive_convert.
3957 *
3958 */
3959static VALUE
3960econv_convert(VALUE self, VALUE source_string)
3961{
3962 VALUE ret, dst;
3963 VALUE av[5];
3964 int ac;
3965 rb_econv_t *ec = check_econv(self);
3966
3967 StringValue(source_string);
3968
3969 dst = rb_str_new(NULL, 0);
3970
3971 av[0] = rb_str_dup(source_string);
3972 av[1] = dst;
3973 av[2] = Qnil;
3974 av[3] = Qnil;
3976 ac = 5;
3977
3978 ret = econv_primitive_convert(ac, av, self);
3979
3980 if (ret == sym_invalid_byte_sequence ||
3981 ret == sym_undefined_conversion ||
3982 ret == sym_incomplete_input) {
3983 VALUE exc = make_econv_exception(ec);
3984 rb_exc_raise(exc);
3985 }
3986
3987 if (ret == sym_finished) {
3988 rb_raise(rb_eArgError, "converter already finished");
3989 }
3990
3991 if (ret != sym_source_buffer_empty) {
3992 rb_bug("unexpected result of econv_primitive_convert");
3993 }
3994
3995 return dst;
3996}
3997
3998/*
3999 * call-seq:
4000 * ec.finish -> string
4001 *
4002 * Finishes the converter.
4003 * It returns the last part of the converted string.
4004 *
4005 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4006 * p ec.convert("\u3042") #=> "\e$B$\""
4007 * p ec.finish #=> "\e(B"
4008 */
4009static VALUE
4010econv_finish(VALUE self)
4011{
4012 VALUE ret, dst;
4013 VALUE av[5];
4014 int ac;
4015 rb_econv_t *ec = check_econv(self);
4016
4017 dst = rb_str_new(NULL, 0);
4018
4019 av[0] = Qnil;
4020 av[1] = dst;
4021 av[2] = Qnil;
4022 av[3] = Qnil;
4023 av[4] = INT2FIX(0);
4024 ac = 5;
4025
4026 ret = econv_primitive_convert(ac, av, self);
4027
4028 if (ret == sym_invalid_byte_sequence ||
4029 ret == sym_undefined_conversion ||
4030 ret == sym_incomplete_input) {
4031 VALUE exc = make_econv_exception(ec);
4032 rb_exc_raise(exc);
4033 }
4034
4035 if (ret != sym_finished) {
4036 rb_bug("unexpected result of econv_primitive_convert");
4037 }
4038
4039 return dst;
4040}
4041
4042/*
4043 * call-seq:
4044 * ec.primitive_errinfo -> array
4045 *
4046 * primitive_errinfo returns important information regarding the last error
4047 * as a 5-element array:
4048 *
4049 * [result, enc1, enc2, error_bytes, readagain_bytes]
4050 *
4051 * result is the last result of primitive_convert.
4052 *
4053 * Other elements are only meaningful when result is
4054 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
4055 *
4056 * enc1 and enc2 indicate a conversion step as a pair of strings.
4057 * For example, a converter from EUC-JP to ISO-8859-1 converts
4058 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
4059 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
4060 *
4061 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4062 * error_bytes is discarded portion.
4063 * readagain_bytes is buffered portion which is read again on next conversion.
4064 *
4065 * Example:
4066 *
4067 * # \xff is invalid as EUC-JP.
4068 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4069 * ec.primitive_convert(src="\xff", dst="", nil, 10)
4070 * p ec.primitive_errinfo
4071 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4072 *
4073 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
4074 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4075 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4076 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4077 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4078 * p ec.primitive_errinfo
4079 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4080 *
4081 * # partial character is invalid
4082 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4083 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4084 * p ec.primitive_errinfo
4085 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4086 *
4087 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4088 * # partial characters.
4089 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4090 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4091 * p ec.primitive_errinfo
4092 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4093 *
4094 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4095 * # no low surrogate after high surrogate (\xd8\x00).
4096 * # It is detected by 3rd byte (\00) which is part of next character.
4097 * # So the high surrogate (\xd8\x00) is discarded and
4098 * # the 3rd byte is read again later.
4099 * # Since the byte is buffered in ec, it is dropped from src.
4100 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4101 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4102 * p ec.primitive_errinfo
4103 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4104 * p src
4105 * #=> "@"
4106 *
4107 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4108 * # The problem is detected by 4th byte.
4109 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4110 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4111 * p ec.primitive_errinfo
4112 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4113 * p src
4114 * #=> ""
4115 *
4116 */
4117static VALUE
4118econv_primitive_errinfo(VALUE self)
4119{
4120 rb_econv_t *ec = check_econv(self);
4121
4122 VALUE ary;
4123
4124 ary = rb_ary_new2(5);
4125
4126 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4127 rb_ary_store(ary, 4, Qnil);
4128
4129 if (ec->last_error.source_encoding)
4130 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4131
4132 if (ec->last_error.destination_encoding)
4133 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4134
4135 if (ec->last_error.error_bytes_start) {
4136 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4137 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4138 }
4139
4140 return ary;
4141}
4142
4143/*
4144 * call-seq:
4145 * ec.insert_output(string) -> nil
4146 *
4147 * Inserts string into the encoding converter.
4148 * The string will be converted to the destination encoding and
4149 * output on later conversions.
4150 *
4151 * If the destination encoding is stateful,
4152 * string is converted according to the state and the state is updated.
4153 *
4154 * This method should be used only when a conversion error occurs.
4155 *
4156 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4157 * src = "HIRAGANA LETTER A is \u{3042}."
4158 * dst = ""
4159 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4160 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4161 * ec.insert_output("<err>")
4162 * p ec.primitive_convert(src, dst) #=> :finished
4163 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4164 *
4165 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4166 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4167 * dst = ""
4168 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4169 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4170 * ec.insert_output "?" # state change required to output "?".
4171 * p ec.primitive_convert(src, dst) #=> :finished
4172 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4173 *
4174 */
4175static VALUE
4176econv_insert_output(VALUE self, VALUE string)
4177{
4178 const char *insert_enc;
4179
4180 int ret;
4181
4182 rb_econv_t *ec = check_econv(self);
4183
4184 StringValue(string);
4185 insert_enc = rb_econv_encoding_to_insert_output(ec);
4186 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4187
4188 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4189 if (ret == -1) {
4190 rb_raise(rb_eArgError, "too big string");
4191 }
4192
4193 return Qnil;
4194}
4195
4196/*
4197 * call-seq:
4198 * ec.putback -> string
4199 * ec.putback(max_numbytes) -> string
4200 *
4201 * Put back the bytes which will be converted.
4202 *
4203 * Such bytes arise from an invalid_byte_sequence error.
4204 * When an invalid_byte_sequence error occurs, some bytes are discarded and
4205 * some bytes are buffered to be converted later.
4206 * The latter bytes can be put back.
4207 * It can be observed by
4208 * Encoding::InvalidByteSequenceError#readagain_bytes and
4209 * Encoding::Converter#primitive_errinfo.
4210 *
4211 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4212 * src = "\x00\xd8\x61\x00"
4213 * dst = ""
4214 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4215 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4216 * p ec.putback #=> "a\x00"
4217 * p ec.putback #=> "" # no more bytes to put back
4218 *
4219 */
4220static VALUE
4221econv_putback(int argc, VALUE *argv, VALUE self)
4222{
4223 rb_econv_t *ec = check_econv(self);
4224 int n;
4225 int putbackable;
4226 VALUE str, max;
4227
4228 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4229 n = rb_econv_putbackable(ec);
4230 }
4231 else {
4232 n = NUM2INT(max);
4233 putbackable = rb_econv_putbackable(ec);
4234 if (putbackable < n)
4235 n = putbackable;
4236 }
4237
4238 str = rb_str_new(NULL, n);
4239 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4240
4241 if (ec->source_encoding) {
4242 rb_enc_associate(str, ec->source_encoding);
4243 }
4244
4245 return str;
4246}
4247
4248/*
4249 * call-seq:
4250 * ec.last_error -> exception or nil
4251 *
4252 * Returns an exception object for the last conversion.
4253 * Returns nil if the last conversion did not produce an error.
4254 *
4255 * Here "error" means
4256 * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
4257 * Encoding::Converter#convert, and
4258 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
4259 * Encoding::Converter#primitive_convert.
4260 *
4261 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4262 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4263 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4264 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4265 * p ec.last_error #=> nil
4266 *
4267 */
4268static VALUE
4269econv_last_error(VALUE self)
4270{
4271 rb_econv_t *ec = check_econv(self);
4272 VALUE exc;
4273
4274 exc = make_econv_exception(ec);
4275 if (NIL_P(exc))
4276 return Qnil;
4277 return exc;
4278}
4279
4280/*
4281 * call-seq:
4282 * ec.replacement -> string
4283 *
4284 * Returns the replacement string.
4285 *
4286 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4287 * p ec.replacement #=> "?"
4288 *
4289 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4290 * p ec.replacement #=> "\uFFFD"
4291 */
4292static VALUE
4293econv_get_replacement(VALUE self)
4294{
4295 rb_econv_t *ec = check_econv(self);
4296 int ret;
4297 rb_encoding *enc;
4298
4299 ret = make_replacement(ec);
4300 if (ret == -1) {
4301 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4302 }
4303
4304 enc = rb_enc_find(ec->replacement_enc);
4305 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4306}
4307
4308/*
4309 * call-seq:
4310 * ec.replacement = string
4311 *
4312 * Sets the replacement string.
4313 *
4314 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4315 * ec.replacement = "<undef>"
4316 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4317 */
4318static VALUE
4319econv_set_replacement(VALUE self, VALUE arg)
4320{
4321 rb_econv_t *ec = check_econv(self);
4322 VALUE string = arg;
4323 int ret;
4324 rb_encoding *enc;
4325
4326 StringValue(string);
4327 enc = rb_enc_get(string);
4328
4329 ret = rb_econv_set_replacement(ec,
4330 (const unsigned char *)RSTRING_PTR(string),
4331 RSTRING_LEN(string),
4332 rb_enc_name(enc));
4333
4334 if (ret == -1) {
4335 /* xxx: rb_eInvalidByteSequenceError? */
4336 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4337 }
4338
4339 return arg;
4340}
4341
4342VALUE
4344{
4345 return make_econv_exception(ec);
4346}
4347
4348void
4350{
4351 VALUE exc;
4352
4353 exc = make_econv_exception(ec);
4354 if (NIL_P(exc))
4355 return;
4356 rb_exc_raise(exc);
4357}
4358
4359/*
4360 * call-seq:
4361 * ecerr.source_encoding_name -> string
4362 *
4363 * Returns the source encoding name as a string.
4364 */
4365static VALUE
4366ecerr_source_encoding_name(VALUE self)
4367{
4368 return rb_attr_get(self, id_source_encoding_name);
4369}
4370
4371/*
4372 * call-seq:
4373 * ecerr.source_encoding -> encoding
4374 *
4375 * Returns the source encoding as an encoding object.
4376 *
4377 * Note that the result may not be equal to the source encoding of
4378 * the encoding converter if the conversion has multiple steps.
4379 *
4380 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4381 * begin
4382 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4383 * rescue Encoding::UndefinedConversionError
4384 * p $!.source_encoding #=> #<Encoding:UTF-8>
4385 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4386 * p $!.source_encoding_name #=> "UTF-8"
4387 * p $!.destination_encoding_name #=> "EUC-JP"
4388 * end
4389 *
4390 */
4391static VALUE
4392ecerr_source_encoding(VALUE self)
4393{
4394 return rb_attr_get(self, id_source_encoding);
4395}
4396
4397/*
4398 * call-seq:
4399 * ecerr.destination_encoding_name -> string
4400 *
4401 * Returns the destination encoding name as a string.
4402 */
4403static VALUE
4404ecerr_destination_encoding_name(VALUE self)
4405{
4406 return rb_attr_get(self, id_destination_encoding_name);
4407}
4408
4409/*
4410 * call-seq:
4411 * ecerr.destination_encoding -> encoding
4412 *
4413 * Returns the destination encoding as an encoding object.
4414 */
4415static VALUE
4416ecerr_destination_encoding(VALUE self)
4417{
4418 return rb_attr_get(self, id_destination_encoding);
4419}
4420
4421/*
4422 * call-seq:
4423 * ecerr.error_char -> string
4424 *
4425 * Returns the one-character string which caused Encoding::UndefinedConversionError.
4426 *
4427 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4428 * begin
4429 * ec.convert("\xa0")
4430 * rescue Encoding::UndefinedConversionError
4431 * puts $!.error_char.dump #=> "\xC2\xA0"
4432 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4433 * end
4434 *
4435 */
4436static VALUE
4437ecerr_error_char(VALUE self)
4438{
4439 return rb_attr_get(self, id_error_char);
4440}
4441
4442/*
4443 * call-seq:
4444 * ecerr.error_bytes -> string
4445 *
4446 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4447 *
4448 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4449 * begin
4450 * ec.convert("abc\xA1\xFFdef")
4451 * rescue Encoding::InvalidByteSequenceError
4452 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4453 * puts $!.error_bytes.dump #=> "\xA1"
4454 * puts $!.readagain_bytes.dump #=> "\xFF"
4455 * end
4456 */
4457static VALUE
4458ecerr_error_bytes(VALUE self)
4459{
4460 return rb_attr_get(self, id_error_bytes);
4461}
4462
4463/*
4464 * call-seq:
4465 * ecerr.readagain_bytes -> string
4466 *
4467 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4468 */
4469static VALUE
4470ecerr_readagain_bytes(VALUE self)
4471{
4472 return rb_attr_get(self, id_readagain_bytes);
4473}
4474
4475/*
4476 * call-seq:
4477 * ecerr.incomplete_input? -> true or false
4478 *
4479 * Returns true if the invalid byte sequence error is caused by
4480 * premature end of string.
4481 *
4482 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4483 *
4484 * begin
4485 * ec.convert("abc\xA1z")
4486 * rescue Encoding::InvalidByteSequenceError
4487 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4488 * p $!.incomplete_input? #=> false
4489 * end
4490 *
4491 * begin
4492 * ec.convert("abc\xA1")
4493 * ec.finish
4494 * rescue Encoding::InvalidByteSequenceError
4495 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4496 * p $!.incomplete_input? #=> true
4497 * end
4498 */
4499static VALUE
4500ecerr_incomplete_input(VALUE self)
4501{
4502 return rb_attr_get(self, id_incomplete_input);
4503}
4504
4505/*
4506 * Document-class: Encoding::UndefinedConversionError
4507 *
4508 * Raised by Encoding and String methods when a transcoding operation
4509 * fails.
4510 */
4511
4512/*
4513 * Document-class: Encoding::InvalidByteSequenceError
4514 *
4515 * Raised by Encoding and String methods when the string being
4516 * transcoded contains a byte invalid for either the source or
4517 * target encoding.
4518 */
4519
4520/*
4521 * Document-class: Encoding::ConverterNotFoundError
4522 *
4523 * Raised by transcoding methods when a named encoding does not
4524 * correspond with a known converter.
4525 */
4526
4527void
4528Init_transcode(void)
4529{
4530 transcoder_table = st_init_strcasetable();
4531
4532 id_destination_encoding = rb_intern_const("destination_encoding");
4533 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4534 id_error_bytes = rb_intern_const("error_bytes");
4535 id_error_char = rb_intern_const("error_char");
4536 id_incomplete_input = rb_intern_const("incomplete_input");
4537 id_readagain_bytes = rb_intern_const("readagain_bytes");
4538 id_source_encoding = rb_intern_const("source_encoding");
4539 id_source_encoding_name = rb_intern_const("source_encoding_name");
4540
4541 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4542 sym_undef = ID2SYM(rb_intern_const("undef"));
4543 sym_replace = ID2SYM(rb_intern_const("replace"));
4544 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4545 sym_xml = ID2SYM(rb_intern_const("xml"));
4546 sym_text = ID2SYM(rb_intern_const("text"));
4547 sym_attr = ID2SYM(rb_intern_const("attr"));
4548
4549 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4550 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4551 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4552 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4553 sym_finished = ID2SYM(rb_intern_const("finished"));
4554 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4555 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4556 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4557 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4558 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4559 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4560 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4561
4562#ifdef ENABLE_ECONV_NEWLINE_OPTION
4563 sym_newline = ID2SYM(rb_intern_const("newline"));
4564 sym_universal = ID2SYM(rb_intern_const("universal"));
4565 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4566 sym_cr = ID2SYM(rb_intern_const("cr"));
4567 sym_lf = ID2SYM(rb_intern_const("lf"));
4568#endif
4569
4570 InitVM(transcode);
4571}
4572
4573void
4574InitVM_transcode(void)
4575{
4576 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4577 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4578 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4579
4580 rb_define_method(rb_cString, "encode", str_encode, -1);
4581 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4582
4583 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4584 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4585 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4586 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4587 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4588 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4589 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4590 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4591 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4592 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4593 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4594 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4595 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4596 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4597 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4598 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4599 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4600 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4601 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4602
4603 /*
4604 *Mask for invalid byte sequences
4605 */
4606 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4607
4608 /*
4609 * Replace invalid byte sequences
4610 */
4611 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4612
4613 /*
4614 * Mask for a valid character in the source encoding but no related
4615 * character(s) in destination encoding.
4616 */
4617 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4618
4619 /*
4620 * Replace byte sequences that are undefined in the destination encoding.
4621 */
4622 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4623
4624 /*
4625 * Replace byte sequences that are undefined in the destination encoding
4626 * with an XML hexadecimal character reference. This is valid for XML
4627 * conversion.
4628 */
4629 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4630
4631 /*
4632 * Indicates the source may be part of a larger string. See
4633 * primitive_convert for an example.
4634 */
4635 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4636
4637 /*
4638 * Stop converting after some output is complete but before all of the
4639 * input was consumed. See primitive_convert for an example.
4640 */
4641 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4642
4643 /*
4644 * Decorator for converting CRLF and CR to LF
4645 */
4646 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4647
4648 /*
4649 * Decorator for converting CRLF and CR to LF when writing
4650 */
4651 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4652
4653 /*
4654 * Decorator for converting LF to CRLF
4655 */
4656 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4657
4658 /*
4659 * Decorator for converting LF to CR
4660 */
4661 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4662
4663 /*
4664 * Escape as XML CharData
4665 */
4666 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4667
4668 /*
4669 * Escape as XML AttValue
4670 */
4671 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4672
4673 /*
4674 * Escape as XML AttValue
4675 */
4676 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4677
4678 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4679 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4680 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4681 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4682 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4683
4684 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4685 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4686 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4687 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4688 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4689 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4690 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4691
4692 Init_newline();
4693}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:1627
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3255
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1676
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:400
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:133
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1684
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:131
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:660
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1418
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1416
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1404
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby's string instead of C's.
Definition error.c:1469
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1424
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:497
VALUE rb_cObject
Object class.
Definition object.c:61
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:264
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:60
VALUE rb_cString
String class.
Definition string.c:84
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3365
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1325
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:930
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:814
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2663
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2124
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1542
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2708
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1485
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1781
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3282
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1106
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1948
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1939
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1825
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1627
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition transcode.c:1960
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2714
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:2006
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:2023
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1989
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2977
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4343
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4349
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1954
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1742
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1876
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1792
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2287
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1168
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1145
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1815
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2695
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:122
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1729
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1499
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1771
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:984
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1501
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1979
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3403
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2727
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7361
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1701
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1515
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5763
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2024
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3457
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:285
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:1031
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:384
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_AREF(a, i)
Definition rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:409
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
Definition rtypeddata.h:119
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:736
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:514
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:515
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:231
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:211
Definition st.h:79
Definition string.c:8248
Definition transcode.c:179
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376