Ruby 3.5.0dev (2025-07-12 revision 22b81b5bf56d7c5053008697d9e6b2a9c4eb79f4)
transcode.c (22b81b5bf56d7c5053008697d9e6b2a9c4eb79f4)
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "ruby/encoding.h"
23#include "vm_sync.h"
24
25#include "transcode_data.h"
26#include "id.h"
27
28#define ENABLE_ECONV_NEWLINE_OPTION 1
29
30/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
31static VALUE rb_eUndefinedConversionError;
32static VALUE rb_eInvalidByteSequenceError;
33static VALUE rb_eConverterNotFoundError;
34
35VALUE rb_cEncodingConverter;
36
37static ID id_destination_encoding;
38static ID id_destination_encoding_name;
39static ID id_error_bytes;
40static ID id_error_char;
41static ID id_incomplete_input;
42static ID id_readagain_bytes;
43static ID id_source_encoding;
44static ID id_source_encoding_name;
45
46static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
47static VALUE sym_xml, sym_text, sym_attr;
48static VALUE sym_universal_newline;
49static VALUE sym_crlf_newline;
50static VALUE sym_cr_newline;
51static VALUE sym_lf_newline;
52#ifdef ENABLE_ECONV_NEWLINE_OPTION
53static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
54#endif
55static VALUE sym_partial_input;
56
57static VALUE sym_invalid_byte_sequence;
58static VALUE sym_undefined_conversion;
59static VALUE sym_destination_buffer_full;
60static VALUE sym_source_buffer_empty;
61static VALUE sym_finished;
62static VALUE sym_after_output;
63static VALUE sym_incomplete_input;
64
65static unsigned char *
66allocate_converted_string(const char *sname, const char *dname,
67 const unsigned char *str, size_t len,
68 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
69 size_t *dst_len_ptr);
70
71/* dynamic structure, one per conversion (similar to iconv_t) */
72/* may carry conversion state (e.g. for iso-2022-jp) */
73typedef struct rb_transcoding {
74 const rb_transcoder *transcoder;
75
76 int flags;
77
78 int resume_position;
79 unsigned int next_table;
80 VALUE next_info;
81 unsigned char next_byte;
82 unsigned int output_index;
83
84 ssize_t recognized_len; /* already interpreted */
85 ssize_t readagain_len; /* not yet interpreted */
86 union {
87 unsigned char ary[8]; /* max_input <= sizeof(ary) */
88 unsigned char *ptr; /* length: max_input */
89 } readbuf; /* recognized_len + readagain_len used */
90
91 ssize_t writebuf_off;
92 ssize_t writebuf_len;
93 union {
94 unsigned char ary[8]; /* max_output <= sizeof(ary) */
95 unsigned char *ptr; /* length: max_output */
96 } writebuf;
97
98 union rb_transcoding_state_t { /* opaque data for stateful encoding */
99 void *ptr;
100 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
101 double dummy_for_alignment;
102 } state;
104#define TRANSCODING_READBUF(tc) \
105 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
106 (tc)->readbuf.ary : \
107 (tc)->readbuf.ptr)
108#define TRANSCODING_WRITEBUF(tc) \
109 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
110 (tc)->writebuf.ary : \
111 (tc)->writebuf.ptr)
112#define TRANSCODING_WRITEBUF_SIZE(tc) \
113 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
114 sizeof((tc)->writebuf.ary) : \
115 (size_t)(tc)->transcoder->max_output)
116#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
117#define TRANSCODING_STATE(tc) \
118 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
119 (tc)->state.ary : \
120 (tc)->state.ptr)
121
122typedef struct {
123 struct rb_transcoding *tc;
124 unsigned char *out_buf_start;
125 unsigned char *out_data_start;
126 unsigned char *out_data_end;
127 unsigned char *out_buf_end;
128 rb_econv_result_t last_result;
130
132 int flags;
133 int started; /* bool */
134
135 const char *source_encoding_name;
136 const char *destination_encoding_name;
137
138 const unsigned char *replacement_str;
139 size_t replacement_len;
140 const char *replacement_enc;
141
142 unsigned char *in_buf_start;
143 unsigned char *in_data_start;
144 unsigned char *in_data_end;
145 unsigned char *in_buf_end;
146 rb_econv_elem_t *elems;
147 int replacement_allocated; /* bool */
148 int num_allocated;
149 int num_trans;
150 int num_finished;
151 struct rb_transcoding *last_tc;
152
153 /* last error */
154 struct {
155 rb_econv_result_t result;
156 struct rb_transcoding *error_tc;
157 const char *source_encoding;
158 const char *destination_encoding;
159 const unsigned char *error_bytes_start;
160 size_t error_bytes_len;
161 size_t readagain_len;
162 } last_error;
163
164 /* The following fields are only for Encoding::Converter.
165 * rb_econv_open set them NULL. */
166 rb_encoding *source_encoding;
167 rb_encoding *destination_encoding;
168};
169
170/*
171 * Dispatch data and logic
172 */
173
174#define DECORATOR_P(sname, dname) (*(sname) == '\0')
175
176typedef struct {
177 const char *sname;
178 const char *dname;
179 const char *lib; /* null means no need to load a library */
180 const rb_transcoder *transcoder;
182
183static st_table *transcoder_table;
184
185static int
186free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
187{
188 xfree((void *)val);
189 return ST_DELETE;
190}
191
192static int
193free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
194{
195 st_foreach((void *)val, free_inner_transcode_i, 0);
196 st_free_table((void *)val);
197 return ST_DELETE;
198}
199
200void
201rb_free_transcoder_table(void)
202{
203 st_foreach(transcoder_table, free_transcode_i, 0);
204 st_free_table(transcoder_table);
205}
206
207static transcoder_entry_t *
208make_transcoder_entry(const char *sname, const char *dname)
209{
210 st_data_t val;
211 st_table *table2;
212
213 RB_VM_LOCKING() {
214 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
215 val = (st_data_t)st_init_strcasetable();
216 st_add_direct(transcoder_table, (st_data_t)sname, val);
217 }
218 table2 = (st_table *)val;
219 if (!st_lookup(table2, (st_data_t)dname, &val)) {
221 entry->sname = sname;
222 entry->dname = dname;
223 entry->lib = NULL;
224 entry->transcoder = NULL;
225 val = (st_data_t)entry;
226 st_add_direct(table2, (st_data_t)dname, val);
227 }
228 }
229 return (transcoder_entry_t *)val;
230}
231
232static transcoder_entry_t *
233get_transcoder_entry(const char *sname, const char *dname)
234{
235 st_data_t val = 0;
236 st_table *table2;
237 RB_VM_LOCKING() {
238 if (st_lookup(transcoder_table, (st_data_t)sname, &val)) {
239 table2 = (st_table *)val;
240 if (!st_lookup(table2, (st_data_t)dname, &val)) {
241 val = 0;
242 }
243 }
244 }
245 return (transcoder_entry_t *)val;
246}
247
248void
249rb_register_transcoder(const rb_transcoder *tr)
250{
251 const char *const sname = tr->src_encoding;
252 const char *const dname = tr->dst_encoding;
253
254 transcoder_entry_t *entry;
255
256 RB_VM_LOCKING() {
257 entry = make_transcoder_entry(sname, dname);
258 if (entry->transcoder) {
259 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
260 sname, dname);
261 }
262 entry->transcoder = tr;
263 }
264}
265
266static void
267declare_transcoder(const char *sname, const char *dname, const char *lib)
268{
269 transcoder_entry_t *entry;
270
271 entry = make_transcoder_entry(sname, dname);
272 entry->lib = lib;
273}
274
275static const char transcoder_lib_prefix[] = "enc/trans/";
276
277void
278rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
279{
280 if (!lib) {
281 rb_raise(rb_eArgError, "invalid library name - (null)");
282 }
283 declare_transcoder(enc1, enc2, lib);
284}
285
286#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
287
288typedef struct search_path_queue_tag {
289 struct search_path_queue_tag *next;
290 const char *enc;
292
293typedef struct {
294 st_table *visited;
295 search_path_queue_t *queue;
296 search_path_queue_t **queue_last_ptr;
297 const char *base_enc;
299
300static int
301transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
302{
303 const char *dname = (const char *)key;
306
307 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
308 return ST_CONTINUE;
309 }
310
312 q->enc = dname;
313 q->next = NULL;
314 *bfs->queue_last_ptr = q;
315 bfs->queue_last_ptr = &q->next;
316
317 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
318 return ST_CONTINUE;
319}
320
321static int
322transcode_search_path(const char *sname, const char *dname,
323 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
324 void *arg)
325{
328 st_data_t val;
329 st_table *table2;
330 int pathlen = -1;
331 bool found = false;
332 bool lookup_res;
333
334 if (encoding_equal(sname, dname))
335 return -1;
336
338 q->enc = sname;
339 q->next = NULL;
340 bfs.queue_last_ptr = &q->next;
341 bfs.queue = q;
342
343 bfs.visited = st_init_strcasetable();
344 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
345
346 RB_VM_LOCKING() {
347 while (bfs.queue) {
348 q = bfs.queue;
349 bfs.queue = q->next;
350 if (!bfs.queue) {
351 bfs.queue_last_ptr = &bfs.queue;
352 }
353
354 lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val);
355 if (!lookup_res) {
356 xfree(q);
357 continue;
358 }
359 table2 = (st_table *)val;
360
361 if (st_lookup(table2, (st_data_t)dname, &val)) {
362 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
363 xfree(q);
364 found = true;
365 break;
366 }
367
368 bfs.base_enc = q->enc;
369 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
370
371 bfs.base_enc = NULL;
372 xfree(q);
373 }
374 }
375
376 while (bfs.queue) {
377 q = bfs.queue;
378 bfs.queue = q->next;
379 xfree(q);
380 }
381
382 if (found) {
383 const char *enc = dname;
384 int depth;
385 pathlen = 0;
386 while (1) {
387 st_lookup(bfs.visited, (st_data_t)enc, &val);
388 if (!val)
389 break;
390 pathlen++;
391 enc = (const char *)val;
392 }
393 depth = pathlen;
394 enc = dname;
395 while (1) {
396 st_lookup(bfs.visited, (st_data_t)enc, &val);
397 if (!val)
398 break;
399 callback((const char *)val, enc, --depth, arg);
400 enc = (const char *)val;
401 }
402 }
403
404 st_free_table(bfs.visited);
405
406 return pathlen; /* is -1 if not found */
407}
408
409int rb_require_internal_silent(VALUE fname);
410
411static const rb_transcoder *
412load_transcoder_entry(transcoder_entry_t *entry)
413{
414 // changes result of entry->transcoder depending on if it's required or not, so needs lock
415 ASSERT_vm_locking();
416 if (entry->transcoder)
417 return entry->transcoder;
418
419 if (entry->lib) {
420 const char *const lib = entry->lib;
421 const size_t len = strlen(lib);
422 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
423 const VALUE fn = rb_str_new(0, total_len);
424 char *const path = RSTRING_PTR(fn);
425
426 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
427 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
428 rb_str_set_len(fn, total_len);
429 OBJ_FREEZE(fn);
430 rb_require_internal_silent(fn);
431 }
432
433 if (entry->transcoder)
434 return entry->transcoder;
435
436 return NULL;
437}
438
/*
 * Pick the default replacement text for destination encoding `encname`.
 * For "UTF-8" (compared case-insensitively) this is the 3-byte sequence
 * EF BF BD in UTF-8; for everything else it is "?" in US-ASCII.
 * The byte length goes to *len_ret and the replacement's own encoding name
 * to *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
453
454/*
455 * Transcoding engine logic
456 */
457
458static const unsigned char *
459transcode_char_start(rb_transcoding *tc,
460 const unsigned char *in_start,
461 const unsigned char *inchar_start,
462 const unsigned char *in_p,
463 size_t *char_len_ptr)
464{
465 const unsigned char *ptr;
466 if (inchar_start - in_start < tc->recognized_len) {
467 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
468 inchar_start, unsigned char, in_p - inchar_start);
469 ptr = TRANSCODING_READBUF(tc);
470 }
471 else {
472 ptr = inchar_start - tc->recognized_len;
473 }
474 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
475 return ptr;
476}
477
479transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
480 const unsigned char *in_stop, unsigned char *out_stop,
481 rb_transcoding *tc,
482 const int opt)
483{
484 const rb_transcoder *tr = tc->transcoder;
485 int unitlen = tr->input_unit_length;
486 ssize_t readagain_len = 0;
487
488 const unsigned char *inchar_start;
489 const unsigned char *in_p;
490
491 unsigned char *out_p;
492
493 in_p = inchar_start = *in_pos;
494
495 out_p = *out_pos;
496
497#define SUSPEND(ret, num) \
498 do { \
499 tc->resume_position = (num); \
500 if (0 < in_p - inchar_start) \
501 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
502 inchar_start, unsigned char, in_p - inchar_start); \
503 *in_pos = in_p; \
504 *out_pos = out_p; \
505 tc->recognized_len += in_p - inchar_start; \
506 if (readagain_len) { \
507 tc->recognized_len -= readagain_len; \
508 tc->readagain_len = readagain_len; \
509 } \
510 return (ret); \
511 resume_label ## num:; \
512 } while (0)
513#define SUSPEND_OBUF(num) \
514 do { \
515 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
516 } while (0)
517
518#define SUSPEND_AFTER_OUTPUT(num) \
519 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
520 SUSPEND(econv_after_output, num); \
521 }
522
523#define next_table (tc->next_table)
524#define next_info (tc->next_info)
525#define next_byte (tc->next_byte)
526#define writebuf_len (tc->writebuf_len)
527#define writebuf_off (tc->writebuf_off)
528
529 switch (tc->resume_position) {
530 case 0: break;
531 case 1: goto resume_label1;
532 case 2: goto resume_label2;
533 case 3: goto resume_label3;
534 case 4: goto resume_label4;
535 case 5: goto resume_label5;
536 case 6: goto resume_label6;
537 case 7: goto resume_label7;
538 case 8: goto resume_label8;
539 case 9: goto resume_label9;
540 case 10: goto resume_label10;
541 case 11: goto resume_label11;
542 case 12: goto resume_label12;
543 case 13: goto resume_label13;
544 case 14: goto resume_label14;
545 case 15: goto resume_label15;
546 case 16: goto resume_label16;
547 case 17: goto resume_label17;
548 case 18: goto resume_label18;
549 case 19: goto resume_label19;
550 case 20: goto resume_label20;
551 case 21: goto resume_label21;
552 case 22: goto resume_label22;
553 case 23: goto resume_label23;
554 case 24: goto resume_label24;
555 case 25: goto resume_label25;
556 case 26: goto resume_label26;
557 case 27: goto resume_label27;
558 case 28: goto resume_label28;
559 case 29: goto resume_label29;
560 case 30: goto resume_label30;
561 case 31: goto resume_label31;
562 case 32: goto resume_label32;
563 case 33: goto resume_label33;
564 case 34: goto resume_label34;
565 }
566
567 while (1) {
568 inchar_start = in_p;
569 tc->recognized_len = 0;
570 next_table = tr->conv_tree_start;
571
572 SUSPEND_AFTER_OUTPUT(24);
573
574 if (in_stop <= in_p) {
575 if (!(opt & ECONV_PARTIAL_INPUT))
576 break;
577 SUSPEND(econv_source_buffer_empty, 7);
578 continue;
579 }
580
581#define BYTE_ADDR(index) (tr->byte_array + (index))
582#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
583#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
584#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
585#define BL_MIN_BYTE (BL_BASE[0])
586#define BL_MAX_BYTE (BL_BASE[1])
587#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
588#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
589
590 next_byte = (unsigned char)*in_p++;
591 follow_byte:
592 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
593 next_info = INVALID;
594 else {
595 next_info = (VALUE)BL_ACTION(next_byte);
596 }
597 follow_info:
598 switch (next_info & 0x1F) {
599 case NOMAP:
600 {
601 const unsigned char *p = inchar_start;
602 writebuf_off = 0;
603 while (p < in_p) {
604 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
605 }
606 writebuf_len = writebuf_off;
607 writebuf_off = 0;
608 while (writebuf_off < writebuf_len) {
609 SUSPEND_OBUF(3);
610 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
611 }
612 }
613 continue;
614 case 0x00: case 0x04: case 0x08: case 0x0C:
615 case 0x10: case 0x14: case 0x18: case 0x1C:
616 SUSPEND_AFTER_OUTPUT(25);
617 while (in_p >= in_stop) {
618 if (!(opt & ECONV_PARTIAL_INPUT))
619 goto incomplete;
620 SUSPEND(econv_source_buffer_empty, 5);
621 }
622 next_byte = (unsigned char)*in_p++;
623 next_table = (unsigned int)next_info;
624 goto follow_byte;
625 case ZERObt: /* drop input */
626 continue;
627 case ONEbt:
628 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
629 continue;
630 case TWObt:
631 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
632 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
633 continue;
634 case THREEbt:
635 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
636 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
637 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
638 continue;
639 case FOURbt:
640 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
641 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
642 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
643 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
644 continue;
645 case GB4bt:
646 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
647 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
648 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
649 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
650 continue;
651 case STR1:
652 tc->output_index = 0;
653 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
654 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
655 tc->output_index++;
656 }
657 continue;
658 case FUNii:
659 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
660 goto follow_info;
661 case FUNsi:
662 {
663 const unsigned char *char_start;
664 size_t char_len;
665 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
666 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
667 goto follow_info;
668 }
669 case FUNio:
670 SUSPEND_OBUF(13);
671 if (tr->max_output <= out_stop - out_p)
672 out_p += tr->func_io(TRANSCODING_STATE(tc),
673 next_info, out_p, out_stop - out_p);
674 else {
675 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
676 next_info,
677 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
678 writebuf_off = 0;
679 while (writebuf_off < writebuf_len) {
680 SUSPEND_OBUF(20);
681 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
682 }
683 }
684 break;
685 case FUNso:
686 {
687 const unsigned char *char_start;
688 size_t char_len;
689 SUSPEND_OBUF(14);
690 if (tr->max_output <= out_stop - out_p) {
691 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
692 out_p += tr->func_so(TRANSCODING_STATE(tc),
693 char_start, (size_t)char_len,
694 out_p, out_stop - out_p);
695 }
696 else {
697 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
698 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
699 char_start, (size_t)char_len,
700 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
701 writebuf_off = 0;
702 while (writebuf_off < writebuf_len) {
703 SUSPEND_OBUF(22);
704 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
705 }
706 }
707 break;
708 }
709 case FUNsio:
710 {
711 const unsigned char *char_start;
712 size_t char_len;
713 SUSPEND_OBUF(33);
714 if (tr->max_output <= out_stop - out_p) {
715 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
716 out_p += tr->func_sio(TRANSCODING_STATE(tc),
717 char_start, (size_t)char_len, next_info,
718 out_p, out_stop - out_p);
719 }
720 else {
721 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
722 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
723 char_start, (size_t)char_len, next_info,
724 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
725 writebuf_off = 0;
726 while (writebuf_off < writebuf_len) {
727 SUSPEND_OBUF(34);
728 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
729 }
730 }
731 break;
732 }
733 case INVALID:
734 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
735 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
736 SUSPEND_AFTER_OUTPUT(26);
737 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
738 in_p = in_stop;
739 SUSPEND(econv_source_buffer_empty, 8);
740 }
741 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
742 in_p = in_stop;
743 }
744 else {
745 in_p = inchar_start + (unitlen - tc->recognized_len);
746 }
747 }
748 else {
749 ssize_t invalid_len; /* including the last byte which causes invalid */
750 ssize_t discard_len;
751 invalid_len = tc->recognized_len + (in_p - inchar_start);
752 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
753 readagain_len = invalid_len - discard_len;
754 }
755 goto invalid;
756 case UNDEF:
757 goto undef;
758 default:
759 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
760 }
761 continue;
762
763 invalid:
764 SUSPEND(econv_invalid_byte_sequence, 1);
765 continue;
766
767 incomplete:
768 SUSPEND(econv_incomplete_input, 27);
769 continue;
770
771 undef:
772 SUSPEND(econv_undefined_conversion, 2);
773 continue;
774 }
775
776 /* cleanup */
777 if (tr->finish_func) {
778 SUSPEND_OBUF(4);
779 if (tr->max_output <= out_stop - out_p) {
780 out_p += tr->finish_func(TRANSCODING_STATE(tc),
781 out_p, out_stop - out_p);
782 }
783 else {
784 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
785 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
786 writebuf_off = 0;
787 while (writebuf_off < writebuf_len) {
788 SUSPEND_OBUF(23);
789 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
790 }
791 }
792 }
793 while (1)
794 SUSPEND(econv_finished, 6);
795#undef SUSPEND
796#undef next_table
797#undef next_info
798#undef next_byte
799#undef writebuf_len
800#undef writebuf_off
801}
802
804transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
805 const unsigned char *in_stop, unsigned char *out_stop,
806 rb_transcoding *tc,
807 const int opt)
808{
809 if (tc->readagain_len) {
810 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
811 const unsigned char *readagain_pos = readagain_buf;
812 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
814
815 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
816 unsigned char, tc->readagain_len);
817 tc->readagain_len = 0;
818 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
819 if (res != econv_source_buffer_empty) {
820 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
821 readagain_pos, unsigned char, readagain_stop - readagain_pos);
822 tc->readagain_len += readagain_stop - readagain_pos;
823 return res;
824 }
825 }
826 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
827}
828
829static rb_transcoding *
830rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
831{
832 rb_transcoding *tc;
833
834 tc = ALLOC(rb_transcoding);
835 tc->transcoder = tr;
836 tc->flags = flags;
837 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
838 tc->state.ptr = xmalloc(tr->state_size);
839 if (tr->state_init_func) {
840 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
841 }
842 tc->resume_position = 0;
843 tc->recognized_len = 0;
844 tc->readagain_len = 0;
845 tc->writebuf_len = 0;
846 tc->writebuf_off = 0;
847 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
848 tc->readbuf.ptr = xmalloc(tr->max_input);
849 }
850 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
851 tc->writebuf.ptr = xmalloc(tr->max_output);
852 }
853 return tc;
854}
855
857rb_transcoding_convert(rb_transcoding *tc,
858 const unsigned char **input_ptr, const unsigned char *input_stop,
859 unsigned char **output_ptr, unsigned char *output_stop,
860 int flags)
861{
862 return transcode_restartable(
863 input_ptr, output_ptr,
864 input_stop, output_stop,
865 tc, flags);
866}
867
868static void
869rb_transcoding_close(rb_transcoding *tc)
870{
871 const rb_transcoder *tr = tc->transcoder;
872 if (tr->state_fini_func) {
873 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
874 }
875 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
876 xfree(tc->state.ptr);
877 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
878 xfree(tc->readbuf.ptr);
879 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
880 xfree(tc->writebuf.ptr);
881 xfree(tc);
882}
883
884static size_t
885rb_transcoding_memsize(rb_transcoding *tc)
886{
887 size_t size = sizeof(rb_transcoding);
888 const rb_transcoder *tr = tc->transcoder;
889
890 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
891 size += tr->state_size;
892 }
893 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
894 size += tr->max_input;
895 }
896 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
897 size += tr->max_output;
898 }
899 return size;
900}
901
902static rb_econv_t *
903rb_econv_alloc(int n_hint)
904{
905 rb_econv_t *ec;
906
907 if (n_hint <= 0)
908 n_hint = 1;
909
910 ec = ALLOC(rb_econv_t);
911 ec->flags = 0;
912 ec->source_encoding_name = NULL;
913 ec->destination_encoding_name = NULL;
914 ec->started = 0;
915 ec->replacement_str = NULL;
916 ec->replacement_len = 0;
917 ec->replacement_enc = NULL;
918 ec->replacement_allocated = 0;
919 ec->in_buf_start = NULL;
920 ec->in_data_start = NULL;
921 ec->in_data_end = NULL;
922 ec->in_buf_end = NULL;
923 ec->num_allocated = n_hint;
924 ec->num_trans = 0;
925 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
926 ec->num_finished = 0;
927 ec->last_tc = NULL;
928 ec->last_error.result = econv_source_buffer_empty;
929 ec->last_error.error_tc = NULL;
930 ec->last_error.source_encoding = NULL;
931 ec->last_error.destination_encoding = NULL;
932 ec->last_error.error_bytes_start = NULL;
933 ec->last_error.error_bytes_len = 0;
934 ec->last_error.readagain_len = 0;
935 ec->source_encoding = NULL;
936 ec->destination_encoding = NULL;
937 return ec;
938}
939
940static int
941rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
942{
943 int n, j;
944 int bufsize = 4096;
945 unsigned char *p;
946
947 if (ec->num_trans == ec->num_allocated) {
948 n = ec->num_allocated * 2;
949 REALLOC_N(ec->elems, rb_econv_elem_t, n);
950 ec->num_allocated = n;
951 }
952
953 p = xmalloc(bufsize);
954
955 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
956
957 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
958 ec->elems[i].out_buf_start = p;
959 ec->elems[i].out_buf_end = p + bufsize;
960 ec->elems[i].out_data_start = p;
961 ec->elems[i].out_data_end = p;
962 ec->elems[i].last_result = econv_source_buffer_empty;
963
964 ec->num_trans++;
965
966 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
967 for (j = ec->num_trans-1; i <= j; j--) {
968 rb_transcoding *tc = ec->elems[j].tc;
969 const rb_transcoder *tr2 = tc->transcoder;
970 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
971 ec->last_tc = tc;
972 break;
973 }
974 }
975
976 return 0;
977}
978
979static rb_econv_t *
980rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
981{
982 rb_econv_t *ec;
983 int i, ret;
984 ASSERT_vm_locking();
985
986 for (i = 0; i < n; i++) {
987 const rb_transcoder *tr;
988 tr = load_transcoder_entry(entries[i]);
989 if (!tr)
990 return NULL;
991 }
992
993 ec = rb_econv_alloc(n);
994
995 for (i = 0; i < n; i++) {
996 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
997 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
998 if (ret == -1) {
999 rb_econv_close(ec);
1000 return NULL;
1001 }
1002 }
1003
1004 return ec;
1005}
1006
1008 transcoder_entry_t **entries;
1009 int num_additional;
1010};
1011
1012static void
1013trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1014{
1015 struct trans_open_t *toarg = arg;
1016
1017 if (!toarg->entries) {
1018 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
1019 }
1020 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1021}
1022
1023static rb_econv_t *
1024rb_econv_open0(const char *sname, const char *dname, int ecflags)
1025{
1026 transcoder_entry_t **entries = NULL;
1027 int num_trans;
1028 rb_econv_t *ec;
1029 ASSERT_vm_locking();
1030
1031 /* Just check if sname and dname are defined */
1032 /* (This check is needed?) */
1033 if (*sname) rb_enc_find_index(sname);
1034 if (*dname) rb_enc_find_index(dname);
1035
1036 if (*sname == '\0' && *dname == '\0') {
1037 num_trans = 0;
1038 entries = NULL;
1039 sname = dname = "";
1040 }
1041 else {
1042 struct trans_open_t toarg;
1043 toarg.entries = NULL;
1044 toarg.num_additional = 0;
1045 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1046 entries = toarg.entries;
1047 if (num_trans < 0) {
1048 xfree(entries);
1049 return NULL;
1050 }
1051 }
1052
1053 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1054 xfree(entries);
1055 if (!ec)
1056 return NULL;
1057
1058 ec->flags = ecflags;
1059 ec->source_encoding_name = sname;
1060 ec->destination_encoding_name = dname;
1061
1062 return ec;
1063}
1064
1065#define MAX_ECFLAGS_DECORATORS 32
1066
1067static int
1068decorator_names(int ecflags, const char **decorators_ret)
1069{
1070 int num_decorators;
1071
1072 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1077 case 0:
1078 break;
1079 default:
1080 return -1;
1081 }
1082
1083 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1085 return -1;
1086
1087 num_decorators = 0;
1088
1089 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1090 decorators_ret[num_decorators++] = "xml_text_escape";
1092 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1093 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1094 decorators_ret[num_decorators++] = "xml_attr_quote";
1095
1096 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1097 decorators_ret[num_decorators++] = "crlf_newline";
1098 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1099 decorators_ret[num_decorators++] = "cr_newline";
1100 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1101 decorators_ret[num_decorators++] = "lf_newline";
1103 decorators_ret[num_decorators++] = "universal_newline";
1104
1105 return num_decorators;
1106}
1107
1108rb_econv_t *
1109rb_econv_open(const char *sname, const char *dname, int ecflags)
1110{
1111 rb_econv_t *ec;
1112 int num_decorators;
1113 const char *decorators[MAX_ECFLAGS_DECORATORS];
1114 int i;
1115
1116 num_decorators = decorator_names(ecflags, decorators);
1117 if (num_decorators == -1)
1118 return NULL;
1119
1120 RB_VM_LOCKING() {
1121 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1122 if (ec) {
1123 for (i = 0; i < num_decorators; i++) {
1124 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1125 rb_econv_close(ec);
1126 ec = NULL;
1127 break;
1128 }
1129 }
1130 }
1131 }
1132
1133 if (ec) {
1134 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1135 }
1136 return ec; // can be NULL
1137}
1138
1139static int
1140trans_sweep(rb_econv_t *ec,
1141 const unsigned char **input_ptr, const unsigned char *input_stop,
1142 unsigned char **output_ptr, unsigned char *output_stop,
1143 int flags,
1144 int start)
1145{
1146 int try;
1147 int i, f;
1148
1149 const unsigned char **ipp, *is, *iold;
1150 unsigned char **opp, *os, *oold;
1152
1153 try = 1;
1154 while (try) {
1155 try = 0;
1156 for (i = start; i < ec->num_trans; i++) {
1157 rb_econv_elem_t *te = &ec->elems[i];
1158
1159 if (i == 0) {
1160 ipp = input_ptr;
1161 is = input_stop;
1162 }
1163 else {
1164 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1165 ipp = (const unsigned char **)&prev_te->out_data_start;
1166 is = prev_te->out_data_end;
1167 }
1168
1169 if (i == ec->num_trans-1) {
1170 opp = output_ptr;
1171 os = output_stop;
1172 }
1173 else {
1174 if (te->out_buf_start != te->out_data_start) {
1175 ssize_t len = te->out_data_end - te->out_data_start;
1176 ssize_t off = te->out_data_start - te->out_buf_start;
1177 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1178 te->out_data_start = te->out_buf_start;
1179 te->out_data_end -= off;
1180 }
1181 opp = &te->out_data_end;
1182 os = te->out_buf_end;
1183 }
1184
1185 f = flags;
1186 if (ec->num_finished != i)
1188 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1189 start = 1;
1190 flags &= ~ECONV_AFTER_OUTPUT;
1191 }
1192 if (i != 0)
1193 f &= ~ECONV_AFTER_OUTPUT;
1194 iold = *ipp;
1195 oold = *opp;
1196 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1197 if (iold != *ipp || oold != *opp)
1198 try = 1;
1199
1200 switch (res) {
1204 case econv_after_output:
1205 return i;
1206
1209 break;
1210
1211 case econv_finished:
1212 ec->num_finished = i+1;
1213 break;
1214 }
1215 }
1216 }
1217 return -1;
1218}
1219
1220static rb_econv_result_t
1221rb_trans_conv(rb_econv_t *ec,
1222 const unsigned char **input_ptr, const unsigned char *input_stop,
1223 unsigned char **output_ptr, unsigned char *output_stop,
1224 int flags,
1225 int *result_position_ptr)
1226{
1227 int i;
1228 int needreport_index;
1229 int sweep_start;
1230
1231 unsigned char empty_buf;
1232 unsigned char *empty_ptr = &empty_buf;
1233
1234 if (!input_ptr) {
1235 input_ptr = (const unsigned char **)&empty_ptr;
1236 input_stop = empty_ptr;
1237 }
1238
1239 if (!output_ptr) {
1240 output_ptr = &empty_ptr;
1241 output_stop = empty_ptr;
1242 }
1243
1244 if (ec->elems[0].last_result == econv_after_output)
1245 ec->elems[0].last_result = econv_source_buffer_empty;
1246
1247 for (i = ec->num_trans-1; 0 <= i; i--) {
1248 switch (ec->elems[i].last_result) {
1252 case econv_after_output:
1253 case econv_finished:
1254 sweep_start = i+1;
1255 goto found_needreport;
1256
1259 break;
1260
1261 default:
1262 rb_bug("unexpected transcode last result");
1263 }
1264 }
1265
1266 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1267
1268 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1269 (flags & ECONV_AFTER_OUTPUT)) {
1271
1272 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1274 result_position_ptr);
1275
1276 if (res == econv_source_buffer_empty)
1277 return econv_after_output;
1278 return res;
1279 }
1280
1281 sweep_start = 0;
1282
1283 found_needreport:
1284
1285 do {
1286 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1287 sweep_start = needreport_index + 1;
1288 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1289
1290 for (i = ec->num_trans-1; 0 <= i; i--) {
1291 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1292 rb_econv_result_t res = ec->elems[i].last_result;
1293 if (res == econv_invalid_byte_sequence ||
1294 res == econv_incomplete_input ||
1296 res == econv_after_output) {
1297 ec->elems[i].last_result = econv_source_buffer_empty;
1298 }
1299 if (result_position_ptr)
1300 *result_position_ptr = i;
1301 return res;
1302 }
1303 }
1304 if (result_position_ptr)
1305 *result_position_ptr = -1;
1307}
1308
1309static rb_econv_result_t
1310rb_econv_convert0(rb_econv_t *ec,
1311 const unsigned char **input_ptr, const unsigned char *input_stop,
1312 unsigned char **output_ptr, unsigned char *output_stop,
1313 int flags)
1314{
1316 int result_position;
1317 int has_output = 0;
1318
1319 memset(&ec->last_error, 0, sizeof(ec->last_error));
1320
1321 if (ec->num_trans == 0) {
1322 size_t len;
1323 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1324 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1325 len = output_stop - *output_ptr;
1326 memcpy(*output_ptr, ec->in_data_start, len);
1327 *output_ptr = output_stop;
1328 ec->in_data_start += len;
1330 goto gotresult;
1331 }
1332 len = ec->in_data_end - ec->in_data_start;
1333 memcpy(*output_ptr, ec->in_data_start, len);
1334 *output_ptr += len;
1335 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1336 if (flags & ECONV_AFTER_OUTPUT) {
1337 res = econv_after_output;
1338 goto gotresult;
1339 }
1340 }
1341 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1342 len = output_stop - *output_ptr;
1343 }
1344 else {
1345 len = input_stop - *input_ptr;
1346 }
1347 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1348 *(*output_ptr)++ = *(*input_ptr)++;
1349 res = econv_after_output;
1350 goto gotresult;
1351 }
1352 memcpy(*output_ptr, *input_ptr, len);
1353 *output_ptr += len;
1354 *input_ptr += len;
1355 if (*input_ptr != input_stop)
1357 else if (flags & ECONV_PARTIAL_INPUT)
1359 else
1360 res = econv_finished;
1361 goto gotresult;
1362 }
1363
1364 if (ec->elems[ec->num_trans-1].out_data_start) {
1365 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1366 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1367 if (data_start != data_end) {
1368 size_t len;
1369 if (output_stop - *output_ptr < data_end - data_start) {
1370 len = output_stop - *output_ptr;
1371 memcpy(*output_ptr, data_start, len);
1372 *output_ptr = output_stop;
1373 ec->elems[ec->num_trans-1].out_data_start += len;
1375 goto gotresult;
1376 }
1377 len = data_end - data_start;
1378 memcpy(*output_ptr, data_start, len);
1379 *output_ptr += len;
1380 ec->elems[ec->num_trans-1].out_data_start =
1381 ec->elems[ec->num_trans-1].out_data_end =
1382 ec->elems[ec->num_trans-1].out_buf_start;
1383 has_output = 1;
1384 }
1385 }
1386
1387 if (ec->in_buf_start &&
1388 ec->in_data_start != ec->in_data_end) {
1389 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1390 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1391 if (res != econv_source_buffer_empty)
1392 goto gotresult;
1393 }
1394
1395 if (has_output &&
1396 (flags & ECONV_AFTER_OUTPUT) &&
1397 *input_ptr != input_stop) {
1398 input_stop = *input_ptr;
1399 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1400 if (res == econv_source_buffer_empty)
1401 res = econv_after_output;
1402 }
1403 else if ((flags & ECONV_AFTER_OUTPUT) ||
1404 ec->num_trans == 1) {
1405 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1406 }
1407 else {
1408 flags |= ECONV_AFTER_OUTPUT;
1409 do {
1410 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1411 } while (res == econv_after_output);
1412 }
1413
1414 gotresult:
1415 ec->last_error.result = res;
1416 if (res == econv_invalid_byte_sequence ||
1417 res == econv_incomplete_input ||
1419 rb_transcoding *error_tc = ec->elems[result_position].tc;
1420 ec->last_error.error_tc = error_tc;
1421 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1422 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1423 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1424 ec->last_error.error_bytes_len = error_tc->recognized_len;
1425 ec->last_error.readagain_len = error_tc->readagain_len;
1426 }
1427
1428 return res;
1429}
1430
1431static int output_replacement_character(rb_econv_t *ec);
1432
1433static int
1434output_hex_charref(rb_econv_t *ec)
1435{
1436 int ret;
1437 unsigned char utfbuf[1024];
1438 const unsigned char *utf;
1439 size_t utf_len;
1440 int utf_allocated = 0;
1441 char charef_buf[16];
1442 const unsigned char *p;
1443
1444 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1445 utf = ec->last_error.error_bytes_start;
1446 utf_len = ec->last_error.error_bytes_len;
1447 }
1448 else {
1449 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1450 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1451 utfbuf, sizeof(utfbuf),
1452 &utf_len);
1453 if (!utf)
1454 return -1;
1455 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1456 utf_allocated = 1;
1457 }
1458
1459 if (utf_len % 4 != 0)
1460 goto fail;
1461
1462 p = utf;
1463 while (4 <= utf_len) {
1464 unsigned int u = 0;
1465 u += p[0] << 24;
1466 u += p[1] << 16;
1467 u += p[2] << 8;
1468 u += p[3];
1469 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1470
1471 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1472 if (ret == -1)
1473 goto fail;
1474
1475 p += 4;
1476 utf_len -= 4;
1477 }
1478
1479 if (utf_allocated)
1480 xfree((void *)utf);
1481 return 0;
1482
1483 fail:
1484 if (utf_allocated)
1485 xfree((void *)utf);
1486 return -1;
1487}
1488
1491 const unsigned char **input_ptr, const unsigned char *input_stop,
1492 unsigned char **output_ptr, unsigned char *output_stop,
1493 int flags)
1494{
1496
1497 unsigned char empty_buf;
1498 unsigned char *empty_ptr = &empty_buf;
1499
1500 ec->started = 1;
1501
1502 if (!input_ptr) {
1503 input_ptr = (const unsigned char **)&empty_ptr;
1504 input_stop = empty_ptr;
1505 }
1506
1507 if (!output_ptr) {
1508 output_ptr = &empty_ptr;
1509 output_stop = empty_ptr;
1510 }
1511
1512 resume:
1513 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1514
1515 if (ret == econv_invalid_byte_sequence ||
1516 ret == econv_incomplete_input) {
1517 /* deal with invalid byte sequence */
1518 /* todo: add more alternative behaviors */
1519 switch (ec->flags & ECONV_INVALID_MASK) {
1521 if (output_replacement_character(ec) == 0)
1522 goto resume;
1523 }
1524 }
1525
1526 if (ret == econv_undefined_conversion) {
1527 /* valid character in source encoding
1528 * but no related character(s) in destination encoding */
1529 /* todo: add more alternative behaviors */
1530 switch (ec->flags & ECONV_UNDEF_MASK) {
1532 if (output_replacement_character(ec) == 0)
1533 goto resume;
1534 break;
1535
1537 if (output_hex_charref(ec) == 0)
1538 goto resume;
1539 break;
1540 }
1541 }
1542
1543 return ret;
1544}
1545
1546const char *
1548{
1549 rb_transcoding *tc = ec->last_tc;
1550 const rb_transcoder *tr;
1551
1552 if (tc == NULL)
1553 return "";
1554
1555 tr = tc->transcoder;
1556
1557 if (tr->asciicompat_type == asciicompat_encoder)
1558 return tr->src_encoding;
1559 return tr->dst_encoding;
1560}
1561
1562static unsigned char *
1563allocate_converted_string(const char *sname, const char *dname,
1564 const unsigned char *str, size_t len,
1565 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1566 size_t *dst_len_ptr)
1567{
1568 unsigned char *dst_str;
1569 size_t dst_len;
1570 size_t dst_bufsize;
1571
1572 rb_econv_t *ec;
1574
1575 const unsigned char *sp;
1576 unsigned char *dp;
1577
1578 if (caller_dst_buf)
1579 dst_bufsize = caller_dst_bufsize;
1580 else if (len == 0)
1581 dst_bufsize = 1;
1582 else
1583 dst_bufsize = len;
1584
1585 ec = rb_econv_open(sname, dname, 0);
1586 if (ec == NULL)
1587 return NULL;
1588 if (caller_dst_buf)
1589 dst_str = caller_dst_buf;
1590 else
1591 dst_str = xmalloc(dst_bufsize);
1592 dst_len = 0;
1593 sp = str;
1594 dp = dst_str+dst_len;
1595 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1596 dst_len = dp - dst_str;
1597 while (res == econv_destination_buffer_full) {
1598 if (SIZE_MAX/2 < dst_bufsize) {
1599 goto fail;
1600 }
1601 dst_bufsize *= 2;
1602 if (dst_str == caller_dst_buf) {
1603 unsigned char *tmp;
1604 tmp = xmalloc(dst_bufsize);
1605 memcpy(tmp, dst_str, dst_bufsize/2);
1606 dst_str = tmp;
1607 }
1608 else {
1609 dst_str = xrealloc(dst_str, dst_bufsize);
1610 }
1611 dp = dst_str+dst_len;
1612 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1613 dst_len = dp - dst_str;
1614 }
1615 if (res != econv_finished) {
1616 goto fail;
1617 }
1618 rb_econv_close(ec);
1619 *dst_len_ptr = dst_len;
1620 return dst_str;
1621
1622 fail:
1623 if (dst_str != caller_dst_buf)
1624 xfree(dst_str);
1625 rb_econv_close(ec);
1626 return NULL;
1627}
1628
1629/* result: 0:success -1:failure */
1630int
1632 const unsigned char *str, size_t len, const char *str_encoding)
1633{
1634 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1635 unsigned char insert_buf[4096];
1636 const unsigned char *insert_str = NULL;
1637 size_t insert_len;
1638
1639 int last_trans_index;
1640 rb_transcoding *tc;
1641
1642 unsigned char **buf_start_p;
1643 unsigned char **data_start_p;
1644 unsigned char **data_end_p;
1645 unsigned char **buf_end_p;
1646
1647 size_t need;
1648
1649 ec->started = 1;
1650
1651 if (len == 0)
1652 return 0;
1653
1654 if (encoding_equal(insert_encoding, str_encoding)) {
1655 insert_str = str;
1656 insert_len = len;
1657 }
1658 else {
1659 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1660 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1661 if (insert_str == NULL)
1662 return -1;
1663 }
1664
1665 need = insert_len;
1666
1667 last_trans_index = ec->num_trans-1;
1668 if (ec->num_trans == 0) {
1669 tc = NULL;
1670 buf_start_p = &ec->in_buf_start;
1671 data_start_p = &ec->in_data_start;
1672 data_end_p = &ec->in_data_end;
1673 buf_end_p = &ec->in_buf_end;
1674 }
1675 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1676 tc = ec->elems[last_trans_index].tc;
1677 need += tc->readagain_len;
1678 if (need < insert_len)
1679 goto fail;
1680 if (last_trans_index == 0) {
1681 buf_start_p = &ec->in_buf_start;
1682 data_start_p = &ec->in_data_start;
1683 data_end_p = &ec->in_data_end;
1684 buf_end_p = &ec->in_buf_end;
1685 }
1686 else {
1687 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1688 buf_start_p = &ee->out_buf_start;
1689 data_start_p = &ee->out_data_start;
1690 data_end_p = &ee->out_data_end;
1691 buf_end_p = &ee->out_buf_end;
1692 }
1693 }
1694 else {
1695 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1696 buf_start_p = &ee->out_buf_start;
1697 data_start_p = &ee->out_data_start;
1698 data_end_p = &ee->out_data_end;
1699 buf_end_p = &ee->out_buf_end;
1700 tc = ec->elems[last_trans_index].tc;
1701 }
1702
1703 if (*buf_start_p == NULL) {
1704 unsigned char *buf = xmalloc(need);
1705 *buf_start_p = buf;
1706 *data_start_p = buf;
1707 *data_end_p = buf;
1708 *buf_end_p = buf+need;
1709 }
1710 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1711 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1712 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1713 *data_start_p = *buf_start_p;
1714 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1715 unsigned char *buf;
1716 size_t s = (*data_end_p - *buf_start_p) + need;
1717 if (s < need)
1718 goto fail;
1719 buf = xrealloc(*buf_start_p, s);
1720 *data_start_p = buf;
1721 *data_end_p = buf + (*data_end_p - *buf_start_p);
1722 *buf_start_p = buf;
1723 *buf_end_p = buf + s;
1724 }
1725 }
1726
1727 memcpy(*data_end_p, insert_str, insert_len);
1728 *data_end_p += insert_len;
1729 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1730 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1731 *data_end_p += tc->readagain_len;
1732 tc->readagain_len = 0;
1733 }
1734
1735 if (insert_str != str && insert_str != insert_buf)
1736 xfree((void*)insert_str);
1737 return 0;
1738
1739 fail:
1740 if (insert_str != str && insert_str != insert_buf)
1741 xfree((void*)insert_str);
1742 return -1;
1743}
1744
1745void
1747{
1748 int i;
1749
1750 if (ec->replacement_allocated) {
1751 xfree((void *)ec->replacement_str);
1752 }
1753 for (i = 0; i < ec->num_trans; i++) {
1754 rb_transcoding_close(ec->elems[i].tc);
1755 xfree(ec->elems[i].out_buf_start);
1756 }
1757 xfree(ec->in_buf_start);
1758 xfree(ec->elems);
1759 xfree(ec);
1760}
1761
1762size_t
1763rb_econv_memsize(rb_econv_t *ec)
1764{
1765 size_t size = sizeof(rb_econv_t);
1766 int i;
1767
1768 if (ec->replacement_allocated) {
1769 size += ec->replacement_len;
1770 }
1771 for (i = 0; i < ec->num_trans; i++) {
1772 size += rb_transcoding_memsize(ec->elems[i].tc);
1773
1774 if (ec->elems[i].out_buf_start) {
1775 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1776 }
1777 }
1778 size += ec->in_buf_end - ec->in_buf_start;
1779 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1780
1781 return size;
1782}
1783
1784int
1786{
1787 if (ec->num_trans == 0)
1788 return 0;
1789#if SIZEOF_SIZE_T > SIZEOF_INT
1790 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1791#endif
1792 return (int)ec->elems[0].tc->readagain_len;
1793}
1794
1795void
1796rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1797{
1798 rb_transcoding *tc;
1799 if (ec->num_trans == 0 || n == 0)
1800 return;
1801 tc = ec->elems[0].tc;
1802 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1803 tc->readagain_len -= n;
1804}
1805
/* Lookup state shared between rb_econv_asciicompat_encoding() and its
 * st_foreach callback. */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;   /* result; stays NULL when none is found */
    const char *ascii_incompat_name; /* encoding being looked up */
};
1810
1811static int
1812asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1813{
1814 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1815 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1816 const rb_transcoder *tr;
1817
1818 if (DECORATOR_P(entry->sname, entry->dname))
1819 return ST_CONTINUE;
1820 tr = load_transcoder_entry(entry);
1821 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1822 data->ascii_compat_name = tr->dst_encoding;
1823 return ST_STOP;
1824 }
1825 return ST_CONTINUE;
1826}
1827
1828const char *
1829rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1830{
1831 st_data_t v;
1832 st_table *table2;
1833 struct asciicompat_encoding_t data = {0};
1834
1835 RB_VM_LOCKING() {
1836 if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) {
1837 table2 = (st_table *)v;
1838 /*
1839 * Assumption:
1840 * There is at most one transcoder for
1841 * converting from ASCII incompatible encoding.
1842 *
1843 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1844 */
1845 if (table2->num_entries == 1) {
1846 data.ascii_incompat_name = ascii_incompat_name;
1847 data.ascii_compat_name = NULL;
1848 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1849 }
1850
1851 }
1852
1853 }
1854
1855 return data.ascii_compat_name; // can be NULL
1856}
1857
1858/*
1859 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1860 *
1861 * If the result of the conversion is not compatible with the encoding of
1862 * `dst`, `dst` may not be valid encoding.
1863 */
1864VALUE
1865rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1866{
1867 unsigned const char *sp, *se;
1868 unsigned char *ds, *dp, *de;
1870 int max_output;
1871 enum ruby_coderange_type coderange;
1872 rb_encoding *dst_enc = ec->destination_encoding;
1873
1874 if (NIL_P(dst)) {
1875 dst = rb_str_buf_new(len);
1876 if (dst_enc) {
1877 rb_enc_associate(dst, dst_enc);
1878 }
1879 coderange = ENC_CODERANGE_7BIT; // scan from the start
1880 }
1881 else {
1882 dst_enc = rb_enc_get(dst);
1883 coderange = rb_enc_str_coderange(dst);
1884 }
1885
1886 if (ec->last_tc)
1887 max_output = ec->last_tc->transcoder->max_output;
1888 else
1889 max_output = 1;
1890
1891 do {
1892 int cr;
1893 long dlen = RSTRING_LEN(dst);
1894 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1895 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1896 if (LONG_MAX < new_capa)
1897 rb_raise(rb_eArgError, "too long string");
1898 rb_str_modify_expand(dst, new_capa - dlen);
1899 }
1900 sp = (const unsigned char *)ss;
1901 se = sp + len;
1902 ds = (unsigned char *)RSTRING_PTR(dst);
1903 de = ds + rb_str_capacity(dst);
1904 dp = ds += dlen;
1905 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1906 switch (coderange) {
1907 case ENC_CODERANGE_7BIT:
1909 cr = (int)coderange;
1910 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1911 coderange = cr;
1912 ENC_CODERANGE_SET(dst, coderange);
1913 break;
1916 break;
1917 }
1918 len -= (const char *)sp - ss;
1919 ss = (const char *)sp;
1920 rb_str_set_len(dst, dlen + (dp - ds));
1922 } while (res == econv_destination_buffer_full);
1923
1924 return dst;
1925}
1926
1927VALUE
1928rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1929{
1930 src = rb_str_new_frozen(src);
1931 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1932 RB_GC_GUARD(src);
1933 return dst;
1934}
1935
1936VALUE
1938{
1939 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1940}
1941
1942VALUE
1943rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1944{
1945 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1946}
1947
1948VALUE
1950{
1951 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1952}
1953
1954static int
1955rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1956{
1957 transcoder_entry_t *entry;
1958 const rb_transcoder *tr = NULL;
1959
1960 if (ec->started != 0)
1961 return -1;
1962
1963 RB_VM_LOCKING() {
1964 entry = get_transcoder_entry(sname, dname);
1965 if (entry) {
1966 tr = load_transcoder_entry(entry);
1967 }
1968
1969 }
1970
1971 return tr ? rb_econv_add_transcoder_at(ec, tr, n) : -1;
1972}
1973
1974static int
1975rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1976{
1977 return rb_econv_add_converter(ec, "", decorator_name, n);
1978}
1979
1980int
1981rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1982{
1983 const rb_transcoder *tr;
1984
1985 if (ec->num_trans == 0)
1986 return rb_econv_decorate_at(ec, decorator_name, 0);
1987
1988 tr = ec->elems[0].tc->transcoder;
1989
1990 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1991 tr->asciicompat_type == asciicompat_decoder)
1992 return rb_econv_decorate_at(ec, decorator_name, 1);
1993
1994 return rb_econv_decorate_at(ec, decorator_name, 0);
1995}
1996
1997int
1998rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1999{
2000 const rb_transcoder *tr;
2001
2002 if (ec->num_trans == 0)
2003 return rb_econv_decorate_at(ec, decorator_name, 0);
2004
2005 tr = ec->elems[ec->num_trans-1].tc->transcoder;
2006
2007 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
2008 tr->asciicompat_type == asciicompat_encoder)
2009 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
2010
2011 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
2012}
2013
2014void
2016{
2017 const char *dname = 0;
2018
2019 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2021 dname = "universal_newline";
2022 break;
2024 dname = "crlf_newline";
2025 break;
2027 dname = "cr_newline";
2028 break;
2030 dname = "lf_newline";
2031 break;
2032 }
2033
2034 if (dname) {
2035 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2036 int num_trans = ec->num_trans;
2037 int i, j = 0;
2038
2039 for (i=0; i < num_trans; i++) {
2040 if (transcoder == ec->elems[i].tc->transcoder) {
2041 rb_transcoding_close(ec->elems[i].tc);
2042 xfree(ec->elems[i].out_buf_start);
2043 ec->num_trans--;
2044 }
2045 else
2046 ec->elems[j++] = ec->elems[i];
2047 }
2048 }
2049
2050 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2051}
2052
2053static VALUE
2054econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2055{
2056 int has_description = 0;
2057
2058 if (NIL_P(mesg))
2059 mesg = rb_str_new(NULL, 0);
2060
2061 if (*sname != '\0' || *dname != '\0') {
2062 if (*sname == '\0')
2063 rb_str_cat2(mesg, dname);
2064 else if (*dname == '\0')
2065 rb_str_cat2(mesg, sname);
2066 else
2067 rb_str_catf(mesg, "%s to %s", sname, dname);
2068 has_description = 1;
2069 }
2070
2071 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2075 const char *pre = "";
2076 if (has_description)
2077 rb_str_cat2(mesg, " with ");
2078 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2079 rb_str_cat2(mesg, pre); pre = ",";
2080 rb_str_cat2(mesg, "universal_newline");
2081 }
2082 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2083 rb_str_cat2(mesg, pre); pre = ",";
2084 rb_str_cat2(mesg, "crlf_newline");
2085 }
2086 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2087 rb_str_cat2(mesg, pre); pre = ",";
2088 rb_str_cat2(mesg, "cr_newline");
2089 }
2090 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2091 rb_str_cat2(mesg, pre); pre = ",";
2092 rb_str_cat2(mesg, "lf_newline");
2093 }
2094 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2095 rb_str_cat2(mesg, pre); pre = ",";
2096 rb_str_cat2(mesg, "xml_text");
2097 }
2098 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2099 rb_str_cat2(mesg, pre); pre = ",";
2100 rb_str_cat2(mesg, "xml_attr_content");
2101 }
2102 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2103 rb_str_cat2(mesg, pre); pre = ",";
2104 rb_str_cat2(mesg, "xml_attr_quote");
2105 }
2106 has_description = 1;
2107 }
2108 if (!has_description) {
2109 rb_str_cat2(mesg, "no-conversion");
2110 }
2111
2112 return mesg;
2113}
2114
2115VALUE
2116rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2117{
2118 VALUE mesg, exc;
2119 mesg = rb_str_new_cstr("code converter not found (");
2120 econv_description(sname, dname, ecflags, mesg);
2121 rb_str_cat2(mesg, ")");
2122 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2123 return exc;
2124}
2125
2126static VALUE
2127make_econv_exception(rb_econv_t *ec)
2128{
2129 VALUE mesg, exc;
2130 if (ec->last_error.result == econv_invalid_byte_sequence ||
2131 ec->last_error.result == econv_incomplete_input) {
2132 const char *err = (const char *)ec->last_error.error_bytes_start;
2133 size_t error_len = ec->last_error.error_bytes_len;
2134 VALUE bytes = rb_str_new(err, error_len);
2135 VALUE dumped = rb_str_dump(bytes);
2136 size_t readagain_len = ec->last_error.readagain_len;
2137 VALUE bytes2 = Qnil;
2138 VALUE dumped2;
2139 if (ec->last_error.result == econv_incomplete_input) {
2140 mesg = rb_sprintf("incomplete %s on %s",
2141 StringValueCStr(dumped),
2142 ec->last_error.source_encoding);
2143 }
2144 else if (readagain_len) {
2145 bytes2 = rb_str_new(err+error_len, readagain_len);
2146 dumped2 = rb_str_dump(bytes2);
2147 mesg = rb_sprintf("%s followed by %s on %s",
2148 StringValueCStr(dumped),
2149 StringValueCStr(dumped2),
2150 ec->last_error.source_encoding);
2151 }
2152 else {
2153 mesg = rb_sprintf("%s on %s",
2154 StringValueCStr(dumped),
2155 ec->last_error.source_encoding);
2156 }
2157
2158 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2159 rb_ivar_set(exc, id_error_bytes, bytes);
2160 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2161 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2162 goto set_encs;
2163 }
2164 if (ec->last_error.result == econv_undefined_conversion) {
2165 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2166 ec->last_error.error_bytes_len);
2167 VALUE dumped = Qnil;
2168 int idx;
2169 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2170 rb_encoding *utf8 = rb_utf8_encoding();
2171 const char *start, *end;
2172 int n;
2173 start = (const char *)ec->last_error.error_bytes_start;
2174 end = start + ec->last_error.error_bytes_len;
2175 n = rb_enc_precise_mbclen(start, end, utf8);
2176 if (MBCLEN_CHARFOUND_P(n) &&
2177 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2178 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2179 dumped = rb_sprintf("U+%04X", cc);
2180 }
2181 }
2182 if (NIL_P(dumped))
2183 dumped = rb_str_dump(bytes);
2184 if (strcmp(ec->last_error.source_encoding,
2185 ec->source_encoding_name) == 0 &&
2186 strcmp(ec->last_error.destination_encoding,
2187 ec->destination_encoding_name) == 0) {
2188 mesg = rb_sprintf("%s from %s to %s",
2189 StringValueCStr(dumped),
2190 ec->last_error.source_encoding,
2191 ec->last_error.destination_encoding);
2192 }
2193 else {
2194 int i;
2195 mesg = rb_sprintf("%s to %s in conversion from %s",
2196 StringValueCStr(dumped),
2197 ec->last_error.destination_encoding,
2198 ec->source_encoding_name);
2199 for (i = 0; i < ec->num_trans; i++) {
2200 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2201 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2202 rb_str_catf(mesg, " to %s",
2203 ec->elems[i].tc->transcoder->dst_encoding);
2204 }
2205 }
2206 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2207 idx = rb_enc_find_index(ec->last_error.source_encoding);
2208 if (0 <= idx)
2209 rb_enc_associate_index(bytes, idx);
2210 rb_ivar_set(exc, id_error_char, bytes);
2211 goto set_encs;
2212 }
2213 return Qnil;
2214
2215 set_encs:
2216 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2217 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2218 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2219 if (0 <= idx)
2220 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2221 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2222 if (0 <= idx)
2223 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2224 return exc;
2225}
2226
2227static void
2228more_output_buffer(
2229 VALUE destination,
2230 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2231 int max_output,
2232 unsigned char **out_start_ptr,
2233 unsigned char **out_pos,
2234 unsigned char **out_stop_ptr)
2235{
2236 size_t len = (*out_pos - *out_start_ptr);
2237 size_t new_len = (len + max_output) * 2;
2238 *out_start_ptr = resize_destination(destination, len, new_len);
2239 *out_pos = *out_start_ptr + len;
2240 *out_stop_ptr = *out_start_ptr + new_len;
2241}
2242
2243static int
2244make_replacement(rb_econv_t *ec)
2245{
2246 rb_transcoding *tc;
2247 const rb_transcoder *tr;
2248 const unsigned char *replacement;
2249 const char *repl_enc;
2250 const char *ins_enc;
2251 size_t len;
2252
2253 if (ec->replacement_str)
2254 return 0;
2255
2257
2258 tc = ec->last_tc;
2259 if (*ins_enc) {
2260 tr = tc->transcoder;
2261 rb_enc_find(tr->dst_encoding);
2262 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2263 }
2264 else {
2265 replacement = (unsigned char *)"?";
2266 len = 1;
2267 repl_enc = "";
2268 }
2269
2270 ec->replacement_str = replacement;
2271 ec->replacement_len = len;
2272 ec->replacement_enc = repl_enc;
2273 ec->replacement_allocated = 0;
2274 return 0;
2275}
2276
2277int
2279 const unsigned char *str, size_t len, const char *encname)
2280{
2281 unsigned char *str2;
2282 size_t len2;
2283 const char *encname2;
2284
2286
2287 if (!*encname2 || encoding_equal(encname, encname2)) {
2288 str2 = xmalloc(len);
2289 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2290 len2 = len;
2291 encname2 = encname;
2292 }
2293 else {
2294 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2295 if (!str2)
2296 return -1;
2297 }
2298
2299 if (ec->replacement_allocated) {
2300 xfree((void *)ec->replacement_str);
2301 }
2302 ec->replacement_allocated = 1;
2303 ec->replacement_str = str2;
2304 ec->replacement_len = len2;
2305 ec->replacement_enc = encname2;
2306 return 0;
2307}
2308
2309static int
2310output_replacement_character(rb_econv_t *ec)
2311{
2312 int ret;
2313
2314 if (make_replacement(ec) == -1)
2315 return -1;
2316
2317 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2318 if (ret == -1)
2319 return -1;
2320
2321 return 0;
2322}
2323
2324#if 1
2325#define hash_fallback rb_hash_aref
2326
2327static VALUE
2328proc_fallback(VALUE fallback, VALUE c)
2329{
2330 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2331}
2332
2333static VALUE
2334method_fallback(VALUE fallback, VALUE c)
2335{
2336 return rb_method_call(1, &c, fallback);
2337}
2338
2339static VALUE
2340aref_fallback(VALUE fallback, VALUE c)
2341{
2342 return rb_funcallv_public(fallback, idAREF, 1, &c);
2343}
2344
/*
 * Convert the bytes in [*in_pos, in_stop) into the output buffer
 * [*out_pos, out_stop) with a converter opened for
 * src_encoding -> dst_encoding.
 *
 * destination and resize_destination are handed to
 * more_output_buffer() whenever the converter reports
 * econv_destination_buffer_full.  When ecopts is a hash, its :fallback
 * entry is consulted on econv_undefined_conversion and a usable result
 * is inserted into the output.
 *
 * Raises the exception from rb_econv_open_exc() if no converter can be
 * opened, the one from make_econv_exception() on conversion errors,
 * and ArgumentError when inserting a fallback string fails.
 *
 * NOTE(review): the declaration of `ret` and the last operand of the
 * error condition below are not visible in this chunk -- verify
 * against the upstream file.
 */
2345static void
2346transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2347 const unsigned char *in_stop, unsigned char *out_stop,
2348 VALUE destination,
2349 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2350 const char *src_encoding,
2351 const char *dst_encoding,
2352 int ecflags,
2353 VALUE ecopts)
2354{
2355 rb_econv_t *ec;
2356 rb_transcoding *last_tc;
2358 unsigned char *out_start = *out_pos;
2359 int max_output;
2360 VALUE exc;
2361 VALUE fallback = Qnil;
2362 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2363
2364 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2365 if (!ec)
2366 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2367
/* choose how the :fallback object is invoked, based on its kind */
2368 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2369 fallback = rb_hash_aref(ecopts, sym_fallback);
2370 if (RB_TYPE_P(fallback, T_HASH)) {
2371 fallback_func = hash_fallback;
2372 }
2373 else if (rb_obj_is_proc(fallback)) {
2374 fallback_func = proc_fallback;
2375 }
2376 else if (rb_obj_is_method(fallback)) {
2377 fallback_func = method_fallback;
2378 }
2379 else {
2380 fallback_func = aref_fallback;
2381 }
2382 }
2383 last_tc = ec->last_tc;
/* growth hint passed to more_output_buffer(); 1 when there is no last transcoding */
2384 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2385
2386 resume:
2387 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2388
2389 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
/* hand the offending bytes, tagged with their source encoding, to the fallback */
2390 VALUE rep = rb_enc_str_new(
2391 (const char *)ec->last_error.error_bytes_start,
2392 ec->last_error.error_bytes_len,
2393 rb_enc_find(ec->last_error.source_encoding));
2394 rep = (*fallback_func)(fallback, rep);
/* an undef or nil result falls through to the error handling below */
2395 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2396 StringValue(rep);
2397 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2398 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2399 if ((int)ret == -1) {
/* NOTE(review): ec is not closed before this raise, unlike the error path below -- possible leak, confirm */
2400 rb_raise(rb_eArgError, "too big fallback string");
2401 }
2402 goto resume;
2403 }
2404 }
2405
2406 if (ret == econv_invalid_byte_sequence ||
2407 ret == econv_incomplete_input ||
/* build the exception before closing ec: make_econv_exception() takes ec */
2409 exc = make_econv_exception(ec);
2410 rb_econv_close(ec);
2411 rb_exc_raise(exc);
2412 }
2413
2414 if (ret == econv_destination_buffer_full) {
/* enlarge the destination, then retry the conversion */
2415 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2416 goto resume;
2417 }
2418
2419 rb_econv_close(ec);
2420 return;
2421}
2422#else
2423/* sample transcode_loop implementation in byte-by-byte stream style */
/*
 * Alternative transcode_loop that feeds the converter one input byte
 * per rb_econv_convert() call.  It sits in the #else branch of the
 * preceding `#if 1`, so it is not compiled.
 *
 * Same parameters as the active transcode_loop.  On normal completion
 * *in_pos is set to in_stop.
 *
 * NOTE(review): the declaration and initial value of `ret`, and
 * several `case` labels of the switch below, are not visible in this
 * chunk -- verify against the upstream file.
 */
2424static void
2425transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2426 const unsigned char *in_stop, unsigned char *out_stop,
2427 VALUE destination,
2428 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2429 const char *src_encoding,
2430 const char *dst_encoding,
2431 int ecflags,
2432 VALUE ecopts)
2433{
2434 rb_econv_t *ec;
2435 rb_transcoding *last_tc;
2437 unsigned char *out_start = *out_pos;
2438 const unsigned char *ptr;
2439 int max_output;
2440 VALUE exc;
2441
2442 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2443 if (!ec)
2444 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2445
2446 last_tc = ec->last_tc;
2447 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2448
2450 ptr = *in_pos;
2451 while (ret != econv_finished) {
/* one-byte staging buffer; p tells afterwards whether the byte was consumed */
2452 unsigned char input_byte;
2453 const unsigned char *p = &input_byte;
2454
2455 if (ret == econv_source_buffer_empty) {
2456 if (ptr < in_stop) {
2457 input_byte = *ptr;
2458 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2459 }
2460 else {
/* input exhausted: call without ECONV_PARTIAL_INPUT */
2461 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2462 }
2463 }
2464 else {
2465 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2466 }
/* advance the real input pointer by however far p moved */
2467 if (&input_byte != p)
2468 ptr += p - &input_byte;
2469 switch (ret) {
2473 exc = make_econv_exception(ec);
2474 rb_econv_close(ec);
2475 rb_exc_raise(exc);
2476 break;
2477
2479 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2480 break;
2481
2483 break;
2484
2485 case econv_finished:
2486 break;
2487 }
2488 }
2489 rb_econv_close(ec);
2490 *in_pos = in_stop;
2491 return;
2492}
2493#endif
2494
2495
2496/*
2497 * String-specific code
2498 */
2499
2500static unsigned char *
2501str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2502{
2503 rb_str_resize(destination, new_len);
2504 return (unsigned char *)RSTRING_PTR(destination);
2505}
2506
/*
 * Fold the option hash opt into the converter flag bits ecflags and
 * return the updated flags.
 *
 * Keys read: :invalid, :undef, :replace, :xml, :newline (only when
 * ENABLE_ECONV_NEWLINE_OPTION is defined), :universal_newline,
 * :crlf_newline, :cr_newline and :lf_newline.
 * Raises ArgumentError for unrecognized values of :invalid, :undef,
 * :xml and :newline.
 *
 * NOTE(review): the bodies of the :text / :attr / :universal / :crlf
 * branches and the statement guarded by the first `if (RTEST(v))` in
 * the last section are not visible in this chunk -- verify against the
 * upstream file.
 */
2507static int
2508econv_opts(VALUE opt, int ecflags)
2509{
2510 VALUE v;
/* value 2: :newline was given; low bit: one of the *_newline keys is non-nil */
2511 int newlineflag = 0;
2512
2513 v = rb_hash_aref(opt, sym_invalid);
2514 if (NIL_P(v)) {
2515 }
2516 else if (v==sym_replace) {
2517 ecflags |= ECONV_INVALID_REPLACE;
2518 }
2519 else {
2520 rb_raise(rb_eArgError, "unknown value for invalid character option");
2521 }
2522
2523 v = rb_hash_aref(opt, sym_undef);
2524 if (NIL_P(v)) {
2525 }
2526 else if (v==sym_replace) {
2527 ecflags |= ECONV_UNDEF_REPLACE;
2528 }
2529 else {
2530 rb_raise(rb_eArgError, "unknown value for undefined character option");
2531 }
2532
/* a :replace string without invalid-replacement turns on undef-replacement */
2533 v = rb_hash_aref(opt, sym_replace);
2534 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2535 ecflags |= ECONV_UNDEF_REPLACE;
2536 }
2537
2538 v = rb_hash_aref(opt, sym_xml);
2539 if (!NIL_P(v)) {
2540 if (v==sym_text) {
2542 }
2543 else if (v==sym_attr) {
2545 }
2546 else if (SYMBOL_P(v)) {
2547 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2548 }
2549 else {
2550 rb_raise(rb_eArgError, "unexpected value for xml option");
2551 }
2552 }
2553
2554#ifdef ENABLE_ECONV_NEWLINE_OPTION
2555 v = rb_hash_aref(opt, sym_newline);
2556 if (!NIL_P(v)) {
2557 newlineflag = 2;
/* :newline replaces whatever newline decorator bits were passed in */
2558 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2559 if (v == sym_universal) {
2561 }
2562 else if (v == sym_crlf) {
2564 }
2565 else if (v == sym_cr) {
2566 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2567 }
2568 else if (v == sym_lf) {
2569 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2570 }
2571 else if (SYMBOL_P(v)) {
2572 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2573 rb_sym2str(v));
2574 }
2575 else {
2576 rb_raise(rb_eArgError, "unexpected value for newline option");
2577 }
2578 }
2579#endif
2580 {
/* decorator bits requested through the individual *_newline keys */
2581 int setflags = 0;
2582
2583 v = rb_hash_aref(opt, sym_universal_newline);
2584 if (RTEST(v))
2586 newlineflag |= !NIL_P(v);
2587
2588 v = rb_hash_aref(opt, sym_crlf_newline);
2589 if (RTEST(v))
2590 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2591 newlineflag |= !NIL_P(v);
2592
2593 v = rb_hash_aref(opt, sym_cr_newline);
2594 if (RTEST(v))
2595 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2596 newlineflag |= !NIL_P(v);
2597
2598 v = rb_hash_aref(opt, sym_lf_newline);
2599 if (RTEST(v))
2600 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2601 newlineflag |= !NIL_P(v);
2602
2603 switch (newlineflag) {
/* only *_newline keys were given: they define the newline decorators */
2604 case 1:
2605 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2606 ecflags |= setflags;
2607 break;
2608
/* both forms were given: keep the :newline result and warn */
2609 case 3:
2610 rb_warning(":newline option precedes other newline options");
2611 break;
2612 }
2613 }
2614
2615 return ecflags;
2616}
2617
2618int
2619rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2620{
2621 VALUE newhash = Qnil;
2622 VALUE v;
2623
2624 if (NIL_P(opthash)) {
2625 *opts = Qnil;
2626 return ecflags;
2627 }
2628 ecflags = econv_opts(opthash, ecflags);
2629
2630 v = rb_hash_aref(opthash, sym_replace);
2631 if (!NIL_P(v)) {
2632 StringValue(v);
2633 if (is_broken_string(v)) {
2634 VALUE dumped = rb_str_dump(v);
2635 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2636 StringValueCStr(dumped),
2637 rb_enc_name(rb_enc_get(v)));
2638 }
2639 v = rb_str_new_frozen(v);
2640 newhash = rb_hash_new();
2641 rb_hash_aset(newhash, sym_replace, v);
2642 }
2643
2644 v = rb_hash_aref(opthash, sym_fallback);
2645 if (!NIL_P(v)) {
2646 VALUE h = rb_check_hash_type(v);
2647 if (NIL_P(h)
2648 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2649 : (v = h, 1)) {
2650 if (NIL_P(newhash))
2651 newhash = rb_hash_new();
2652 rb_hash_aset(newhash, sym_fallback, v);
2653 }
2654 }
2655
2656 if (!NIL_P(newhash))
2657 rb_hash_freeze(newhash);
2658 *opts = newhash;
2659
2660 return ecflags;
2661}
2662
/*
 * Convenience wrapper: rb_econv_prepare_options() with initial flag
 * bits of 0.
 *
 * NOTE(review): the signature line is not visible in this chunk;
 * callers in this file invoke it as rb_econv_prepare_opts(opthash, &opts)
 * -- verify against the upstream file.
 */
2663int
2665{
2666 return rb_econv_prepare_options(opthash, opts, 0);
2667}
2668
2669rb_econv_t *
2670rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2671{
2672 rb_econv_t *ec;
2673 VALUE replacement;
2674
2675 if (NIL_P(opthash)) {
2676 replacement = Qnil;
2677 }
2678 else {
2679 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2680 rb_bug("rb_econv_open_opts called with invalid opthash");
2681 replacement = rb_hash_aref(opthash, sym_replace);
2682 }
2683
2684 RB_VM_LOCKING() {
2685 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2686 if (ec) {
2687 if (!NIL_P(replacement)) {
2688 int ret;
2689 rb_encoding *enc = rb_enc_get(replacement);
2690
2691 ret = rb_econv_set_replacement(ec,
2692 (const unsigned char *)RSTRING_PTR(replacement),
2693 RSTRING_LEN(replacement),
2694 rb_enc_name(enc));
2695 if (ret == -1) {
2696 rb_econv_close(ec);
2697 ec = NULL;
2698 }
2699 }
2700 }
2701 }
2702 return ec; // can be NULL
2703}
2704
2705static int
2706enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2707{
2708 rb_encoding *enc;
2709 const char *n;
2710 int encidx;
2711 VALUE encval;
2712
2713 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2714 !(enc = rb_enc_from_index(encidx))) {
2715 enc = NULL;
2716 encidx = 0;
2717 n = StringValueCStr(*arg);
2718 }
2719 else {
2720 n = rb_enc_name(enc);
2721 }
2722
2723 *name_p = n;
2724 *enc_p = enc;
2725
2726 return encidx;
2727}
2728
2729static int
2730str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2731 const char **sname_p, rb_encoding **senc_p,
2732 const char **dname_p, rb_encoding **denc_p)
2733{
2734 rb_encoding *senc, *denc;
2735 const char *sname, *dname;
2736 int sencidx, dencidx;
2737
2738 dencidx = enc_arg(arg1, &dname, &denc);
2739
2740 if (NIL_P(*arg2)) {
2741 sencidx = rb_enc_get_index(str);
2742 senc = rb_enc_from_index(sencidx);
2743 sname = rb_enc_name(senc);
2744 }
2745 else {
2746 sencidx = enc_arg(arg2, &sname, &senc);
2747 }
2748
2749 *sname_p = sname;
2750 *senc_p = senc;
2751 *dname_p = dname;
2752 *denc_p = denc;
2753 return dencidx;
2754}
2755
/*
 * Core of the String transcoding entry points in this file.
 *
 * argc/argv: zero to two encoding arguments (destination, then
 *            source); with none, the default internal encoding is the
 *            destination.
 * self:      in: the string to convert; out: the converted (or
 *            scrubbed) string on the paths that assign *self.
 * ecflags/ecopts: converter flags and prepared option hash.
 *
 * Returns the destination encoding index, or -1 on the paths that
 * decide no conversion applies; callers in this file treat a negative
 * result as "leave the string as is".
 * Raises ArgumentError if the input was not fully consumed.
 *
 * NOTE(review): one line inside the argc == 0 branch and the rest of
 * the condition that opens the large if/else below are not visible in
 * this chunk -- verify against the upstream file.
 */
2756static int
2757str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2758{
2759 VALUE dest;
2760 VALUE str = *self;
2761 VALUE arg1, arg2;
2762 long blen, slen;
2763 unsigned char *buf, *bp, *sp;
2764 const unsigned char *fromp;
2765 rb_encoding *senc, *denc;
2766 const char *sname, *dname;
2767 int dencidx;
2768 int explicitly_invalid_replace = TRUE;
2769
2770 rb_check_arity(argc, 0, 2);
2771
2772 if (argc == 0) {
2773 arg1 = rb_enc_default_internal();
2774 if (NIL_P(arg1)) {
/* no default internal encoding and no flags: nothing to do */
2775 if (!ecflags) return -1;
2776 arg1 = rb_obj_encoding(str);
2777 }
2778 if (!(ecflags & ECONV_INVALID_MASK)) {
2779 explicitly_invalid_replace = FALSE;
2780 }
2782 }
2783 else {
2784 arg1 = argv[0];
2785 }
2786 arg2 = argc<=1 ? Qnil : argv[1];
2787 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2788
2789 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2793 if (senc && senc == denc) {
/* same source and destination encoding: scrub instead of converting when invalid-replacement was requested */
2794 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2795 VALUE rep = Qnil;
2796 if (!NIL_P(ecopts)) {
2797 rep = rb_hash_aref(ecopts, sym_replace);
2798 }
2799 dest = rb_enc_str_scrub(senc, str, rep);
2800 if (NIL_P(dest)) dest = str;
2801 *self = dest;
2802 return dencidx;
2803 }
2804 return NIL_P(arg2) ? -1 : dencidx;
2805 }
2806 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
/* ASCII-only content between ASCII-compatible encodings: *self is left untouched */
2807 if (is_ascii_string(str)) {
2808 return dencidx;
2809 }
2810 }
2811 if (encoding_equal(sname, dname)) {
2812 return NIL_P(arg2) ? -1 : dencidx;
2813 }
2814 }
2815 else {
2816 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
/* both encodings are ASCII-incompatible: convert the input to UTF-8 first */
2817 rb_encoding *utf8 = rb_utf8_encoding();
2818 str = rb_str_conv_enc(str, senc, utf8);
2819 senc = utf8;
2820 sname = "UTF-8";
2821 }
2822 if (encoding_equal(sname, dname)) {
2823 sname = "";
2824 dname = "";
2825 }
2826 }
2827
2828 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2829 slen = RSTRING_LEN(str);
2830 blen = slen + 30; /* len + margin */
2831 dest = rb_str_tmp_new(blen);
2832 bp = (unsigned char *)RSTRING_PTR(dest);
2833
2834 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2835 if (fromp != sp+slen) {
2836 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2837 }
/* re-read the data pointer after transcode_loop; bp marks the end of the written output */
2838 buf = (unsigned char *)RSTRING_PTR(dest);
2839 *bp = '\0';
2840 rb_str_set_len(dest, bp - buf);
2841
2842 /* set encoding */
2843 if (!denc) {
/* destination name did not resolve to an encoding: define a dummy one under that name */
2844 dencidx = rb_define_dummy_encoding(dname);
2845 RB_GC_GUARD(arg1);
2846 RB_GC_GUARD(arg2);
2847 }
2848 *self = dest;
2849
2850 return dencidx;
2851}
2852
2853static int
2854str_transcode(int argc, VALUE *argv, VALUE *self)
2855{
2856 VALUE opt;
2857 int ecflags = 0;
2858 VALUE ecopts = Qnil;
2859
2860 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2861 if (!NIL_P(opt)) {
2862 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2863 }
2864 return str_transcode0(argc, argv, self, ecflags, ecopts);
2865}
2866
/*
 * Associate str with the encoding at index encidx, set its code
 * range, and return str.
 *
 * NOTE(review): the body of the else branch (ASCII-incompatible
 * encodings) is not visible in this chunk -- verify against the
 * upstream file.
 */
2867static inline VALUE
2868str_encode_associate(VALUE str, int encidx)
2869{
2870 int cr = 0;
2871
2872 rb_enc_associate_index(str, encidx);
2873
2874 /* a transcoded string is never broken. */
2875 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
/* scan the whole string; the result is written to cr */
2876 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2877 }
2878 else {
2880 }
2881 ENC_CODERANGE_SET(str, cr);
2882 return str;
2883}
2884
2885/*
2886 * call-seq:
2887 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2888 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2889 *
2890 * Like #encode, but applies encoding changes to +self+; returns +self+.
2891 *
2892 */
2893
2894static VALUE
2895str_encode_bang(int argc, VALUE *argv, VALUE str)
2896{
2897 VALUE newstr;
2898 int encidx;
2899
2900 rb_check_frozen(str);
2901
2902 newstr = str;
2903 encidx = str_transcode(argc, argv, &newstr);
2904
2905 if (encidx < 0) return str;
2906 if (newstr == str) {
2907 rb_enc_associate_index(str, encidx);
2908 return str;
2909 }
2910 rb_str_shared_replace(str, newstr);
2911 return str_encode_associate(str, encidx);
2912}
2913
2914static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2915
2916/*
2917 * call-seq:
2918 * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2919 * encode(dst_encoding, src_encoding, **enc_opts) -> string
2920 *
2921 * :include: doc/string/encode.rdoc
2922 *
2923 */
2924
2925static VALUE
2926str_encode(int argc, VALUE *argv, VALUE str)
2927{
2928 VALUE newstr = str;
2929 int encidx = str_transcode(argc, argv, &newstr);
2930 return encoded_dup(newstr, str, encidx);
2931}
2932
2933VALUE
2934rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2935{
2936 int argc = 1;
2937 VALUE *argv = &to;
2938 VALUE newstr = str;
2939 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2940 return encoded_dup(newstr, str, encidx);
2941}
2942
2943static VALUE
2944encoded_dup(VALUE newstr, VALUE str, int encidx)
2945{
2946 if (encidx < 0) return rb_str_dup(str);
2947 if (newstr == str) {
2948 newstr = rb_str_dup(str);
2949 rb_enc_associate_index(newstr, encidx);
2950 return newstr;
2951 }
2952 else {
2953 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2954 }
2955 return str_encode_associate(newstr, encidx);
2956}
2957
2958/*
2959 * Document-class: Encoding::Converter
2960 *
2961 * Encoding conversion class.
2962 */
2963static void
2964econv_free(void *ptr)
2965{
2966 rb_econv_t *ec = ptr;
2967 rb_econv_close(ec);
2968}
2969
2970static size_t
2971econv_memsize(const void *ptr)
2972{
2973 return sizeof(rb_econv_t);
2974}
2975
/*
 * Typed-data descriptor for converter objects.  It is the type passed
 * to TypedData_Wrap_Struct(), rb_check_typeddata() and
 * TypedData_Get_Struct() throughout this file, and it registers
 * econv_free and econv_memsize as callbacks.
 * NOTE(review): the initializer is positional; the meaning of the 0
 * entries depends on the rb_data_type_t layout -- confirm against the
 * Ruby headers.
 */
2976static const rb_data_type_t econv_data_type = {
2977 "econv",
2978 {0, econv_free, econv_memsize,},
2979 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2980};
2981
2982static VALUE
2983econv_s_allocate(VALUE klass)
2984{
2985 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2986}
2987
2988static rb_encoding *
2989make_dummy_encoding(const char *name)
2990{
2991 rb_encoding *enc;
2992 int idx;
2993 idx = rb_define_dummy_encoding(name);
2994 enc = rb_enc_from_index(idx);
2995 return enc;
2996}
2997
2998static rb_encoding *
2999make_encoding(const char *name)
3000{
3001 rb_encoding *enc;
3002 RB_VM_LOCKING() {
3003 enc = rb_enc_find(name);
3004 if (!enc)
3005 enc = make_dummy_encoding(name);
3006 }
3007 return enc;
3008}
3009
3010static VALUE
3011make_encobj(const char *name)
3012{
3013 return rb_enc_from_encoding(make_encoding(name));
3014}
3015
3016/*
3017 * call-seq:
3018 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
3019 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
3020 *
3021 * Returns the corresponding ASCII compatible encoding.
3022 *
3023 * Returns nil if the argument is an ASCII compatible encoding.
3024 *
3025 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
3026 * can represent exactly the same characters as the given ASCII incompatible encoding.
3027 * So, no conversion undefined error occurs when converting between the two encodings.
3028 *
3029 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3030 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3031 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3032 *
3033 */
3034static VALUE
3035econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3036{
3037 const char *arg_name, *result_name;
3038 rb_encoding *arg_enc, *result_enc;
3039 VALUE enc = Qnil;
3040
3041 enc_arg(&arg, &arg_name, &arg_enc);
3042
3043 RB_VM_LOCKING() {
3044 result_name = rb_econv_asciicompat_encoding(arg_name);
3045
3046 if (result_name) {
3047 result_enc = make_encoding(result_name);
3048 enc = rb_enc_from_encoding(result_enc);
3049 }
3050 }
3051 return enc;
3052}
3053
3054static void
3055econv_args(int argc, VALUE *argv,
3056 VALUE *snamev_p, VALUE *dnamev_p,
3057 const char **sname_p, const char **dname_p,
3058 rb_encoding **senc_p, rb_encoding **denc_p,
3059 int *ecflags_p,
3060 VALUE *ecopts_p)
3061{
3062 VALUE opt, flags_v, ecopts;
3063 int sidx, didx;
3064 const char *sname, *dname;
3065 rb_encoding *senc, *denc;
3066 int ecflags;
3067
3068 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3069
3070 if (!NIL_P(flags_v)) {
3071 if (!NIL_P(opt)) {
3072 rb_error_arity(argc + 1, 2, 3);
3073 }
3074 ecflags = NUM2INT(rb_to_int(flags_v));
3075 ecopts = Qnil;
3076 }
3077 else if (!NIL_P(opt)) {
3078 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3079 }
3080 else {
3081 ecflags = 0;
3082 ecopts = Qnil;
3083 }
3084
3085 senc = NULL;
3086 sidx = rb_to_encoding_index(*snamev_p);
3087 if (0 <= sidx) {
3088 senc = rb_enc_from_index(sidx);
3089 }
3090 else {
3091 StringValue(*snamev_p);
3092 }
3093
3094 denc = NULL;
3095 didx = rb_to_encoding_index(*dnamev_p);
3096 if (0 <= didx) {
3097 denc = rb_enc_from_index(didx);
3098 }
3099 else {
3100 StringValue(*dnamev_p);
3101 }
3102
3103 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3104 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3105
3106 *sname_p = sname;
3107 *dname_p = dname;
3108 *senc_p = senc;
3109 *denc_p = denc;
3110 *ecflags_p = ecflags;
3111 *ecopts_p = ecopts;
3112}
3113
3114static int
3115decorate_convpath(VALUE convpath, int ecflags)
3116{
3117 int num_decorators;
3118 const char *decorators[MAX_ECFLAGS_DECORATORS];
3119 int i;
3120 int n, len;
3121
3122 num_decorators = decorator_names(ecflags, decorators);
3123 if (num_decorators == -1)
3124 return -1;
3125
3126 len = n = RARRAY_LENINT(convpath);
3127 if (n != 0) {
3128 VALUE pair = RARRAY_AREF(convpath, n-1);
3129 if (RB_TYPE_P(pair, T_ARRAY)) {
3130 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3131 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3132 transcoder_entry_t *entry;
3133 const rb_transcoder *tr;
3134 RB_VM_LOCKING() {
3135 entry = get_transcoder_entry(sname, dname);
3136 tr = load_transcoder_entry(entry);
3137 }
3138 if (!tr)
3139 return -1;
3140 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3141 tr->asciicompat_type == asciicompat_encoder) {
3142 n--;
3143 rb_ary_store(convpath, len + num_decorators - 1, pair);
3144 }
3145 }
3146 else {
3147 rb_ary_store(convpath, len + num_decorators - 1, pair);
3148 }
3149 }
3150
3151 for (i = 0; i < num_decorators; i++)
3152 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3153
3154 return 0;
3155}
3156
3157static void
3158search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3159{
3160 VALUE *ary_p = arg;
3161 VALUE v;
3162
3163 if (NIL_P(*ary_p)) {
3164 *ary_p = rb_ary_new();
3165 }
3166
3167 if (DECORATOR_P(sname, dname)) {
3168 v = rb_str_new_cstr(dname);
3169 }
3170 else {
3171 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3172 }
3173 rb_ary_store(*ary_p, depth, v);
3174}
3175
3176/*
3177 * call-seq:
3178 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3179 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3180 *
3181 * Returns a conversion path.
3182 *
3183 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3184 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3185 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3186 *
3187 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3188 * or
3189 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3190 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3191 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3192 * # "universal_newline"]
3193 *
3194 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3195 * or
3196 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3197 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3198 * # "universal_newline",
3199 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3200 */
3201static VALUE
3202econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3203{
3204 VALUE snamev, dnamev;
3205 const char *sname, *dname;
3206 rb_encoding *senc, *denc;
3207 int ecflags;
3208 VALUE ecopts;
3209 VALUE convpath;
3210
3211 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3212
3213 convpath = Qnil;
3214 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3215
3216 if (NIL_P(convpath)) {
3217 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3218 RB_GC_GUARD(snamev);
3219 RB_GC_GUARD(dnamev);
3220 rb_exc_raise(exc);
3221 }
3222
3223 if (decorate_convpath(convpath, ecflags) == -1) {
3224 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3225 RB_GC_GUARD(snamev);
3226 RB_GC_GUARD(dnamev);
3227 rb_exc_raise(exc);
3228 }
3229
3230 return convpath;
3231}
3232
3233/*
3234 * Check the existence of a conversion path.
3235 * Returns the number of converters in the conversion path.
3236 * result: >=0:success -1:failure
3237 */
3238int
3239rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3240{
3241 VALUE convpath = Qnil;
3242 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3243 &convpath);
3244 return RTEST(convpath);
3245}
3246
/*
 * NOTE(review): the opening line of this struct is not visible in this
 * chunk; it is used below as `struct rb_econv_init_by_convpath_t`, the
 * argument block handed to rb_econv_init_by_convpath_i().
 */
/* converter that receives the conversion steps */
3248 rb_econv_t *ec;
/* position passed to rb_econv_add_converter() for every step */
3249 int index;
/* result of the latest rb_econv_add_converter() call; -1 stops further additions */
3250 int ret;
3251};
3252
/*
 * transcode_search_path() callback used by rb_econv_init_by_convpath():
 * adds the step sname -> dname to a->ec at position a->index and
 * records the result in a->ret.  Once a->ret is -1, later calls do
 * nothing.  depth is not used.
 *
 * NOTE(review): the declaration of `a` is not visible in this chunk;
 * presumably it is `arg` viewed as struct rb_econv_init_by_convpath_t *
 * -- verify against the upstream file.
 */
3253static void
3254rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3255{
3257 int ret;
3258
/* an earlier step already failed: skip the remaining ones */
3259 if (a->ret == -1)
3260 return;
3261
3262 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3263
3264 a->ret = ret;
3265 return;
3266}
3267
3268static rb_econv_t *
3269rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3270 const char **sname_p, const char **dname_p,
3271 rb_encoding **senc_p, rb_encoding**denc_p)
3272{
3273 rb_econv_t *ec;
3274 long i;
3275 int ret, first=1;
3276 VALUE elt;
3277 rb_encoding *senc = 0, *denc = 0;
3278 const char *sname, *dname;
3279
3280 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3281 DATA_PTR(self) = ec;
3282
3283 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3284 VALUE snamev, dnamev;
3285 VALUE pair;
3286 elt = rb_ary_entry(convpath, i);
3287 if (!NIL_P(pair = rb_check_array_type(elt))) {
3288 if (RARRAY_LEN(pair) != 2)
3289 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3290 snamev = rb_ary_entry(pair, 0);
3291 enc_arg(&snamev, &sname, &senc);
3292 dnamev = rb_ary_entry(pair, 1);
3293 enc_arg(&dnamev, &dname, &denc);
3294 }
3295 else {
3296 sname = "";
3297 dname = StringValueCStr(elt);
3298 }
3299 if (DECORATOR_P(sname, dname)) {
3300 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3301 if (ret == -1) {
3302 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3303 RB_GC_GUARD(snamev);
3304 RB_GC_GUARD(dnamev);
3305 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3306 }
3307 }
3308 else {
3309 int j = ec->num_trans;
3310 struct rb_econv_init_by_convpath_t arg;
3311 arg.ec = ec;
3312 arg.index = ec->num_trans;
3313 arg.ret = 0;
3314 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3315 if (ret == -1 || arg.ret == -1) {
3316 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3317 RB_GC_GUARD(snamev);
3318 RB_GC_GUARD(dnamev);
3319 rb_exc_raise(rb_exc_new_str(rb_eArgError, msg));
3320 }
3321 if (first) {
3322 first = 0;
3323 *senc_p = senc;
3324 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3325 }
3326 *denc_p = denc;
3327 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3328 }
3329 }
3330
3331 if (first) {
3332 *senc_p = NULL;
3333 *denc_p = NULL;
3334 *sname_p = "";
3335 *dname_p = "";
3336 }
3337
3338 ec->source_encoding_name = *sname_p;
3339 ec->destination_encoding_name = *dname_p;
3340
3341 return ec;
3342}
3343
3344/*
3345 * call-seq:
3346 * Encoding::Converter.new(source_encoding, destination_encoding)
3347 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3348 * Encoding::Converter.new(convpath)
3349 *
3350 * possible options elements:
3351 * hash form:
3352 * :invalid => nil # raise error on invalid byte sequence (default)
3353 * :invalid => :replace # replace invalid byte sequence
3354 * :undef => nil # raise error on undefined conversion (default)
3355 * :undef => :replace # replace undefined conversion
3356 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3357 * :newline => :universal # decorator for converting CRLF and CR to LF
3358 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3359 * :newline => :crlf # decorator for converting LF to CRLF
3360 * :newline => :cr # decorator for converting LF to CR
3361 * :universal_newline => true # decorator for converting CRLF and CR to LF
3362 * :crlf_newline => true # decorator for converting LF to CRLF
3363 * :cr_newline => true # decorator for converting LF to CR
3364 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3365 * :xml => :text # escape as XML CharData.
3366 * :xml => :attr # escape as XML AttValue
3367 * integer form:
3368 * Encoding::Converter::INVALID_REPLACE
3369 * Encoding::Converter::UNDEF_REPLACE
3370 * Encoding::Converter::UNDEF_HEX_CHARREF
3371 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3372 * Encoding::Converter::LF_NEWLINE_DECORATOR
3373 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3374 * Encoding::Converter::CR_NEWLINE_DECORATOR
3375 * Encoding::Converter::XML_TEXT_DECORATOR
3376 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3377 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3378 *
3379 * Encoding::Converter.new creates an instance of Encoding::Converter.
3380 *
3381 * Source_encoding and destination_encoding should be a string or
3382 * Encoding object.
3383 *
3384 * opt should be nil, a hash or an integer.
3385 *
3386 * convpath should be an array.
3387 * convpath may contain
3388 * - two-element arrays which contain encodings or encoding names, or
3389 * - strings representing decorator names.
3390 *
3391 * Encoding::Converter.new optionally takes an option.
3392 * The option should be a hash or an integer.
3393 * The option hash can contain :invalid => nil, etc.
3394 * The option integer should be logical-or of constants such as
3395 * Encoding::Converter::INVALID_REPLACE, etc.
3396 *
3397 * [:invalid => nil]
3398 * Raise error on invalid byte sequence. This is a default behavior.
3399 * [:invalid => :replace]
3400 * Replace invalid byte sequence by replacement string.
3401 * [:undef => nil]
3402 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3403 * This is a default behavior.
3404 * [:undef => :replace]
3405 * Replace undefined character in destination_encoding with replacement string.
3406 * [:replace => string]
3407 * Specify the replacement string.
3408 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3409 * [:universal_newline => true]
3410 * Convert CRLF and CR to LF.
3411 * [:crlf_newline => true]
3412 * Convert LF to CRLF.
3413 * [:cr_newline => true]
3414 * Convert LF to CR.
3415 * [:lf_newline => true]
3416 * Convert CRLF and CR to LF (when writing).
3417 * [:xml => :text]
3418 * Escape as XML CharData.
3419 * This form can be used as an HTML 4.0 #PCDATA.
3420 * - '&' -> '&amp;'
3421 * - '<' -> '&lt;'
3422 * - '>' -> '&gt;'
3423 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3424 * [:xml => :attr]
3425 * Escape as XML AttValue.
3426 * The converted result is quoted as "...".
3427 * This form can be used as an HTML 4.0 attribute value.
3428 * - '&' -> '&amp;'
3429 * - '<' -> '&lt;'
3430 * - '>' -> '&gt;'
3431 * - '"' -> '&quot;'
3432 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3433 *
3434 * Examples:
3435 * # UTF-16BE to UTF-8
3436 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3437 *
3438 * # Usually, decorators such as newline conversion are inserted last.
3439 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3440 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3441 * # "universal_newline"]
3442 *
3443 * # But, if the last encoding is ASCII incompatible,
3444 * # decorators are inserted before the last conversion.
3445 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3446 * p ec.convpath #=> ["crlf_newline",
3447 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3448 *
3449 * # Conversion path can be specified directly.
3450 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3451 * p ec.convpath #=> ["universal_newline",
3452 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3453 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3454 */
3455static VALUE
3456econv_init(int argc, VALUE *argv, VALUE self)
3457{
3458 VALUE ecopts;
3459 VALUE snamev, dnamev;
3460 const char *sname, *dname;
3461 rb_encoding *senc, *denc;
3462 rb_econv_t *ec;
3463 int ecflags;
3464 VALUE convpath;
3465
3466 if (rb_check_typeddata(self, &econv_data_type)) {
3467 rb_raise(rb_eTypeError, "already initialized");
3468 }
3469
3470 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3471 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3472 ecflags = 0;
3473 ecopts = Qnil;
3474 }
3475 else {
3476 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3477 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3478 }
3479
3480 if (!ec) {
3481 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3482 RB_GC_GUARD(snamev);
3483 RB_GC_GUARD(dnamev);
3484 rb_exc_raise(exc);
3485 }
3486
3487 if (!DECORATOR_P(sname, dname)) {
3488 if (!senc)
3489 senc = make_dummy_encoding(sname);
3490 if (!denc)
3491 denc = make_dummy_encoding(dname);
3492 RB_GC_GUARD(snamev);
3493 RB_GC_GUARD(dnamev);
3494 }
3495
3496 ec->source_encoding = senc;
3497 ec->destination_encoding = denc;
3498
3499 DATA_PTR(self) = ec;
3500
3501 return self;
3502}
3503
3504/*
3505 * call-seq:
3506 * ec.inspect -> string
3507 *
3508 * Returns a printable version of <i>ec</i>
3509 *
3510 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3511 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3512 *
3513 */
3514static VALUE
3515econv_inspect(VALUE self)
3516{
3517 const char *cname = rb_obj_classname(self);
3518 rb_econv_t *ec;
3519
3520 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3521 if (!ec)
3522 return rb_sprintf("#<%s: uninitialized>", cname);
3523 else {
3524 const char *sname = ec->source_encoding_name;
3525 const char *dname = ec->destination_encoding_name;
3526 VALUE str;
3527 str = rb_sprintf("#<%s: ", cname);
3528 econv_description(sname, dname, ec->flags, str);
3529 rb_str_cat2(str, ">");
3530 return str;
3531 }
3532}
3533
3534static rb_econv_t *
3535check_econv(VALUE self)
3536{
3537 rb_econv_t *ec;
3538
3539 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3540 if (!ec) {
3541 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3542 }
3543 return ec;
3544}
3545
3546static VALUE
3547econv_get_encoding(rb_encoding *encoding)
3548{
3549 if (!encoding)
3550 return Qnil;
3551 return rb_enc_from_encoding(encoding);
3552}
3553
3554/*
3555 * call-seq:
3556 * ec.source_encoding -> encoding
3557 *
3558 * Returns the source encoding as an Encoding object.
3559 */
3560static VALUE
3561econv_source_encoding(VALUE self)
3562{
3563 rb_econv_t *ec = check_econv(self);
3564 return econv_get_encoding(ec->source_encoding);
3565}
3566
3567/*
3568 * call-seq:
3569 * ec.destination_encoding -> encoding
3570 *
3571 * Returns the destination encoding as an Encoding object.
3572 */
3573static VALUE
3574econv_destination_encoding(VALUE self)
3575{
3576 rb_econv_t *ec = check_econv(self);
3577 return econv_get_encoding(ec->destination_encoding);
3578}
3579
3580/*
3581 * call-seq:
3582 * ec.convpath -> ary
3583 *
3584 * Returns the conversion path of ec.
3585 *
3586 * The result is an array of conversions.
3587 *
3588 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3589 * p ec.convpath
3590 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3591 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3592 * # "crlf_newline"]
3593 *
3594 * Each element of the array is a pair of encodings or a string.
3595 * A pair means an encoding conversion.
3596 * A string means a decorator.
3597 *
3598 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3599 * a converter from ISO-8859-1 to UTF-8.
3600 * "crlf_newline" means newline converter from LF to CRLF.
3601 */
3602static VALUE
3603econv_convpath(VALUE self)
3604{
3605 rb_econv_t *ec = check_econv(self);
3606 VALUE result;
3607 int i;
3608
3609 result = rb_ary_new();
3610 for (i = 0; i < ec->num_trans; i++) {
3611 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3612 VALUE v;
3613 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3614 v = rb_str_new_cstr(tr->dst_encoding);
3615 else
3616 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3617 rb_ary_push(result, v);
3618 }
3619 return result;
3620}
3621
3622/*
3623 * call-seq:
3624 * ec == other -> true or false
3625 */
3626static VALUE
3627econv_equal(VALUE self, VALUE other)
3628{
3629 rb_econv_t *ec1 = check_econv(self);
3630 rb_econv_t *ec2;
3631 int i;
3632
3633 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3634 return Qnil;
3635 }
3636 ec2 = DATA_PTR(other);
3637 if (!ec2) return Qfalse;
3638 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3639 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3640 return Qfalse;
3641 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3642 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3643 return Qfalse;
3644 if (ec1->flags != ec2->flags) return Qfalse;
3645 if (ec1->replacement_enc != ec2->replacement_enc &&
3646 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3647 return Qfalse;
3648 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3649 if (ec1->replacement_str != ec2->replacement_str &&
3650 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3651 return Qfalse;
3652
3653 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3654 for (i = 0; i < ec1->num_trans; i++) {
3655 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3656 return Qfalse;
3657 }
3658 return Qtrue;
3659}
3660
3661static VALUE
3662econv_result_to_symbol(rb_econv_result_t res)
3663{
3664 switch (res) {
3665 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3666 case econv_incomplete_input: return sym_incomplete_input;
3667 case econv_undefined_conversion: return sym_undefined_conversion;
3668 case econv_destination_buffer_full: return sym_destination_buffer_full;
3669 case econv_source_buffer_empty: return sym_source_buffer_empty;
3670 case econv_finished: return sym_finished;
3671 case econv_after_output: return sym_after_output;
3672 default: return INT2NUM(res); /* should not be reached */
3673 }
3674}
3675
3676/*
3677 * call-seq:
3678 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3679 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3680 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3681 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3682 *
3683 * possible opt elements:
3684 * hash form:
3685 * :partial_input => true # source buffer may be part of larger source
3686 * :after_output => true # stop conversion after output before input
3687 * integer form:
3688 * Encoding::Converter::PARTIAL_INPUT
3689 * Encoding::Converter::AFTER_OUTPUT
3690 *
3691 * possible results:
3692 * :invalid_byte_sequence
3693 * :incomplete_input
3694 * :undefined_conversion
3695 * :after_output
3696 * :destination_buffer_full
3697 * :source_buffer_empty
3698 * :finished
3699 *
3700 * primitive_convert converts source_buffer into destination_buffer.
3701 *
3702 * source_buffer should be a string or nil.
3703 * nil means an empty string.
3704 *
3705 * destination_buffer should be a string.
3706 *
3707 * destination_byteoffset should be an integer or nil.
3708 * nil means the end of destination_buffer.
3709 * If it is omitted, nil is assumed.
3710 *
3711 * destination_bytesize should be an integer or nil.
3712 * nil means unlimited.
3713 * If it is omitted, nil is assumed.
3714 *
3715 * opt should be nil, a hash or an integer.
3716 * nil means no flags.
3717 * If it is omitted, nil is assumed.
3718 *
 * primitive_convert converts the content of source_buffer from the beginning
 * and stores the result into destination_buffer.
3721 *
3722 * destination_byteoffset and destination_bytesize specify the region which
3723 * the converted result is stored.
3724 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3725 * If destination_byteoffset is nil,
3726 * destination_buffer.bytesize is used for appending the result.
3727 * destination_bytesize specifies maximum number of bytes.
3728 * If destination_bytesize is nil,
3729 * destination size is unlimited.
3730 * After conversion, destination_buffer is resized to
3731 * destination_byteoffset + actually produced number of bytes.
3732 * Also destination_buffer's encoding is set to destination_encoding.
3733 *
3734 * primitive_convert drops the converted part of source_buffer.
3735 * the dropped part is converted in destination_buffer or
3736 * buffered in Encoding::Converter object.
3737 *
 * primitive_convert stops conversion when one of the following conditions is met.
 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - unexpected end of source buffer (:incomplete_input)
 *   this occurs only when :partial_input is not specified.
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - character not representable in output encoding (:undefined_conversion)
 *   +primitive_errinfo+ and +last_error+ methods return the details of the error.
 * - after some output is generated, before input is done (:after_output)
 *   this occurs only when :after_output is specified.
 * - destination buffer is full (:destination_buffer_full)
 *   this occurs only when destination_bytesize is non-nil.
 * - source buffer is empty (:source_buffer_empty)
 *   this occurs only when :partial_input is specified.
3752 * - conversion is finished (:finished)
3753 *
3754 * example:
3755 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3756 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3757 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3758 *
3759 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3760 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3761 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3762 * ret = ec.primitive_convert(src, dst="", nil, 1)
3763 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3764 * ret = ec.primitive_convert(src, dst="", nil, 1)
3765 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3766 * ret = ec.primitive_convert(src, dst="", nil, 1)
3767 * p [ret, src, dst] #=> [:finished, "", "i"]
3768 *
3769 */
3770static VALUE
3771econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3772{
3773 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3774 rb_econv_t *ec = check_econv(self);
3776 const unsigned char *ip, *is;
3777 unsigned char *op, *os;
3778 long output_byteoffset, output_bytesize;
3779 unsigned long output_byteend;
3780 int flags;
3781
3782 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3783
3784 if (NIL_P(output_byteoffset_v))
3785 output_byteoffset = 0; /* dummy */
3786 else
3787 output_byteoffset = NUM2LONG(output_byteoffset_v);
3788
3789 if (NIL_P(output_bytesize_v))
3790 output_bytesize = 0; /* dummy */
3791 else
3792 output_bytesize = NUM2LONG(output_bytesize_v);
3793
3794 if (!NIL_P(flags_v)) {
3795 if (!NIL_P(opt)) {
3796 rb_error_arity(argc + 1, 2, 5);
3797 }
3798 flags = NUM2INT(rb_to_int(flags_v));
3799 }
3800 else if (!NIL_P(opt)) {
3801 VALUE v;
3802 flags = 0;
3803 v = rb_hash_aref(opt, sym_partial_input);
3804 if (RTEST(v))
3805 flags |= ECONV_PARTIAL_INPUT;
3806 v = rb_hash_aref(opt, sym_after_output);
3807 if (RTEST(v))
3808 flags |= ECONV_AFTER_OUTPUT;
3809 }
3810 else {
3811 flags = 0;
3812 }
3813
3814 StringValue(output);
3815 if (!NIL_P(input))
3816 StringValue(input);
3817 rb_str_modify(output);
3818
3819 if (NIL_P(output_bytesize_v)) {
3820 output_bytesize = rb_str_capacity(output);
3821
3822 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3823 output_bytesize = RSTRING_LEN(input);
3824 }
3825
3826 retry:
3827
3828 if (NIL_P(output_byteoffset_v))
3829 output_byteoffset = RSTRING_LEN(output);
3830
3831 if (output_byteoffset < 0)
3832 rb_raise(rb_eArgError, "negative output_byteoffset");
3833
3834 if (RSTRING_LEN(output) < output_byteoffset)
3835 rb_raise(rb_eArgError, "output_byteoffset too big");
3836
3837 if (output_bytesize < 0)
3838 rb_raise(rb_eArgError, "negative output_bytesize");
3839
3840 output_byteend = (unsigned long)output_byteoffset +
3841 (unsigned long)output_bytesize;
3842
3843 if (output_byteend < (unsigned long)output_byteoffset ||
3844 LONG_MAX < output_byteend)
3845 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3846
3847 if (rb_str_capacity(output) < output_byteend)
3848 rb_str_resize(output, output_byteend);
3849
3850 if (NIL_P(input)) {
3851 ip = is = NULL;
3852 }
3853 else {
3854 ip = (const unsigned char *)RSTRING_PTR(input);
3855 is = ip + RSTRING_LEN(input);
3856 }
3857
3858 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3859 os = op + output_bytesize;
3860
3861 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3862 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3863 if (!NIL_P(input)) {
3864 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3865 }
3866
3867 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3868 if (LONG_MAX / 2 < output_bytesize)
3869 rb_raise(rb_eArgError, "too long conversion result");
3870 output_bytesize *= 2;
3871 output_byteoffset_v = Qnil;
3872 goto retry;
3873 }
3874
3875 if (ec->destination_encoding) {
3876 rb_enc_associate(output, ec->destination_encoding);
3877 }
3878
3879 return econv_result_to_symbol(res);
3880}
3881
3882/*
3883 * call-seq:
3884 * ec.convert(source_string) -> destination_string
3885 *
3886 * Convert source_string and return destination_string.
3887 *
3888 * source_string is assumed as a part of source.
3889 * i.e. :partial_input=>true is specified internally.
3890 * finish method should be used last.
3891 *
3892 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3893 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3894 * puts ec.finish.dump #=> ""
3895 *
3896 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3897 * puts ec.convert("\xA4").dump #=> ""
3898 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3899 * puts ec.finish.dump #=> ""
3900 *
3901 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3902 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3903 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3904 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3905 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3906 *
 * If a conversion error occurs,
3908 * Encoding::UndefinedConversionError or
3909 * Encoding::InvalidByteSequenceError is raised.
3910 * Encoding::Converter#convert doesn't supply methods to recover or restart
3911 * from these exceptions.
3912 * When you want to handle these conversion errors,
3913 * use Encoding::Converter#primitive_convert.
3914 *
3915 */
3916static VALUE
3917econv_convert(VALUE self, VALUE source_string)
3918{
3919 VALUE ret, dst;
3920 VALUE av[5];
3921 int ac;
3922 rb_econv_t *ec = check_econv(self);
3923
3924 StringValue(source_string);
3925
3926 dst = rb_str_new(NULL, 0);
3927
3928 av[0] = rb_str_dup(source_string);
3929 av[1] = dst;
3930 av[2] = Qnil;
3931 av[3] = Qnil;
3933 ac = 5;
3934
3935 ret = econv_primitive_convert(ac, av, self);
3936
3937 if (ret == sym_invalid_byte_sequence ||
3938 ret == sym_undefined_conversion ||
3939 ret == sym_incomplete_input) {
3940 VALUE exc = make_econv_exception(ec);
3941 rb_exc_raise(exc);
3942 }
3943
3944 if (ret == sym_finished) {
3945 rb_raise(rb_eArgError, "converter already finished");
3946 }
3947
3948 if (ret != sym_source_buffer_empty) {
3949 rb_bug("unexpected result of econv_primitive_convert");
3950 }
3951
3952 return dst;
3953}
3954
3955/*
3956 * call-seq:
3957 * ec.finish -> string
3958 *
3959 * Finishes the converter.
3960 * It returns the last part of the converted string.
3961 *
3962 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3963 * p ec.convert("\u3042") #=> "\e$B$\""
3964 * p ec.finish #=> "\e(B"
3965 */
3966static VALUE
3967econv_finish(VALUE self)
3968{
3969 VALUE ret, dst;
3970 VALUE av[5];
3971 int ac;
3972 rb_econv_t *ec = check_econv(self);
3973
3974 dst = rb_str_new(NULL, 0);
3975
3976 av[0] = Qnil;
3977 av[1] = dst;
3978 av[2] = Qnil;
3979 av[3] = Qnil;
3980 av[4] = INT2FIX(0);
3981 ac = 5;
3982
3983 ret = econv_primitive_convert(ac, av, self);
3984
3985 if (ret == sym_invalid_byte_sequence ||
3986 ret == sym_undefined_conversion ||
3987 ret == sym_incomplete_input) {
3988 VALUE exc = make_econv_exception(ec);
3989 rb_exc_raise(exc);
3990 }
3991
3992 if (ret != sym_finished) {
3993 rb_bug("unexpected result of econv_primitive_convert");
3994 }
3995
3996 return dst;
3997}
3998
3999/*
4000 * call-seq:
4001 * ec.primitive_errinfo -> array
4002 *
4003 * primitive_errinfo returns important information regarding the last error
4004 * as a 5-element array:
4005 *
4006 * [result, enc1, enc2, error_bytes, readagain_bytes]
4007 *
4008 * result is the last result of primitive_convert.
4009 *
4010 * Other elements are only meaningful when result is
4011 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
4012 *
4013 * enc1 and enc2 indicate a conversion step as a pair of strings.
4014 * For example, a converter from EUC-JP to ISO-8859-1 converts
4015 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
4016 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
4017 *
4018 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
4019 * error_bytes is discarded portion.
4020 * readagain_bytes is buffered portion which is read again on next conversion.
4021 *
4022 * Example:
4023 *
4024 * # \xff is invalid as EUC-JP.
4025 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
4026 * ec.primitive_convert(src="\xff", dst="", nil, 10)
4027 * p ec.primitive_errinfo
4028 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4029 *
4030 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
 * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
4032 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4033 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4034 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4035 * p ec.primitive_errinfo
4036 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4037 *
4038 * # partial character is invalid
4039 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4040 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4041 * p ec.primitive_errinfo
4042 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4043 *
4044 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4045 * # partial characters.
4046 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4047 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4048 * p ec.primitive_errinfo
4049 * #=> [:source_buffer_empty, nil, nil, nil, nil]
4050 *
4051 * # \xd8\x00\x00@ is invalid as UTF-16BE because
4052 * # no low surrogate after high surrogate (\xd8\x00).
4053 * # It is detected by 3rd byte (\00) which is part of next character.
4054 * # So the high surrogate (\xd8\x00) is discarded and
4055 * # the 3rd byte is read again later.
4056 * # Since the byte is buffered in ec, it is dropped from src.
4057 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4058 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4059 * p ec.primitive_errinfo
4060 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4061 * p src
4062 * #=> "@"
4063 *
4064 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4065 * # The problem is detected by 4th byte.
4066 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4067 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4068 * p ec.primitive_errinfo
4069 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4070 * p src
4071 * #=> ""
4072 *
4073 */
4074static VALUE
4075econv_primitive_errinfo(VALUE self)
4076{
4077 rb_econv_t *ec = check_econv(self);
4078
4079 VALUE ary;
4080
4081 ary = rb_ary_new2(5);
4082
4083 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4084 rb_ary_store(ary, 4, Qnil);
4085
4086 if (ec->last_error.source_encoding)
4087 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4088
4089 if (ec->last_error.destination_encoding)
4090 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4091
4092 if (ec->last_error.error_bytes_start) {
4093 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4094 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4095 }
4096
4097 return ary;
4098}
4099
4100/*
4101 * call-seq:
4102 * ec.insert_output(string) -> nil
4103 *
4104 * Inserts string into the encoding converter.
4105 * The string will be converted to the destination encoding and
4106 * output on later conversions.
4107 *
4108 * If the destination encoding is stateful,
4109 * string is converted according to the state and the state is updated.
4110 *
4111 * This method should be used only when a conversion error occurs.
4112 *
4113 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4114 * src = "HIRAGANA LETTER A is \u{3042}."
4115 * dst = ""
4116 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4117 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4118 * ec.insert_output("<err>")
4119 * p ec.primitive_convert(src, dst) #=> :finished
4120 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4121 *
4122 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4123 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4124 * dst = ""
4125 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4126 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4127 * ec.insert_output "?" # state change required to output "?".
4128 * p ec.primitive_convert(src, dst) #=> :finished
4129 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4130 *
4131 */
4132static VALUE
4133econv_insert_output(VALUE self, VALUE string)
4134{
4135 const char *insert_enc;
4136
4137 int ret;
4138
4139 rb_econv_t *ec = check_econv(self);
4140
4141 StringValue(string);
4142 insert_enc = rb_econv_encoding_to_insert_output(ec);
4143 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4144
4145 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4146 if (ret == -1) {
4147 rb_raise(rb_eArgError, "too big string");
4148 }
4149
4150 return Qnil;
4151}
4152
4153/*
4154 * call-seq:
4155 * ec.putback -> string
4156 * ec.putback(max_numbytes) -> string
4157 *
4158 * Put back the bytes which will be converted.
4159 *
4160 * The bytes are caused by invalid_byte_sequence error.
4161 * When invalid_byte_sequence error, some bytes are discarded and
4162 * some bytes are buffered to be converted later.
4163 * The latter bytes can be put back.
4164 * It can be observed by
4165 * Encoding::InvalidByteSequenceError#readagain_bytes and
4166 * Encoding::Converter#primitive_errinfo.
4167 *
4168 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4169 * src = "\x00\xd8\x61\x00"
4170 * dst = ""
4171 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4172 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4173 * p ec.putback #=> "a\x00"
4174 * p ec.putback #=> "" # no more bytes to put back
4175 *
4176 */
4177static VALUE
4178econv_putback(int argc, VALUE *argv, VALUE self)
4179{
4180 rb_econv_t *ec = check_econv(self);
4181 int n;
4182 int putbackable;
4183 VALUE str, max;
4184
4185 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4186 n = rb_econv_putbackable(ec);
4187 }
4188 else {
4189 n = NUM2INT(max);
4190 putbackable = rb_econv_putbackable(ec);
4191 if (putbackable < n)
4192 n = putbackable;
4193 }
4194
4195 str = rb_str_new(NULL, n);
4196 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4197
4198 if (ec->source_encoding) {
4199 rb_enc_associate(str, ec->source_encoding);
4200 }
4201
4202 return str;
4203}
4204
4205/*
4206 * call-seq:
4207 * ec.last_error -> exception or nil
4208 *
4209 * Returns an exception object for the last conversion.
4210 * Returns nil if the last conversion did not produce an error.
4211 *
4212 * "error" means that
4213 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4214 * Encoding::Converter#convert and
4215 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4216 * Encoding::Converter#primitive_convert.
4217 *
4218 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4219 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4220 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4221 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4222 * p ec.last_error #=> nil
4223 *
4224 */
4225static VALUE
4226econv_last_error(VALUE self)
4227{
4228 rb_econv_t *ec = check_econv(self);
4229 VALUE exc;
4230
4231 exc = make_econv_exception(ec);
4232 if (NIL_P(exc))
4233 return Qnil;
4234 return exc;
4235}
4236
4237/*
4238 * call-seq:
4239 * ec.replacement -> string
4240 *
4241 * Returns the replacement string.
4242 *
4243 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4244 * p ec.replacement #=> "?"
4245 *
4246 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4247 * p ec.replacement #=> "\uFFFD"
4248 */
4249static VALUE
4250econv_get_replacement(VALUE self)
4251{
4252 rb_econv_t *ec = check_econv(self);
4253 int ret;
4254 rb_encoding *enc;
4255
4256 ret = make_replacement(ec);
4257 if (ret == -1) {
4258 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4259 }
4260
4261 enc = rb_enc_find(ec->replacement_enc);
4262 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4263}
4264
4265/*
4266 * call-seq:
4267 * ec.replacement = string
4268 *
4269 * Sets the replacement string.
4270 *
4271 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4272 * ec.replacement = "<undef>"
4273 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4274 */
4275static VALUE
4276econv_set_replacement(VALUE self, VALUE arg)
4277{
4278 rb_econv_t *ec = check_econv(self);
4279 VALUE string = arg;
4280 int ret;
4281 rb_encoding *enc;
4282
4283 StringValue(string);
4284 enc = rb_enc_get(string);
4285
4286 ret = rb_econv_set_replacement(ec,
4287 (const unsigned char *)RSTRING_PTR(string),
4288 RSTRING_LEN(string),
4289 rb_enc_name(enc));
4290
4291 if (ret == -1) {
4292 /* xxx: rb_eInvalidByteSequenceError? */
4293 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4294 }
4295
4296 return arg;
4297}
4298
4299VALUE
4301{
4302 return make_econv_exception(ec);
4303}
4304
4305void
4307{
4308 VALUE exc;
4309
4310 exc = make_econv_exception(ec);
4311 if (NIL_P(exc))
4312 return;
4313 rb_exc_raise(exc);
4314}
4315
4316/*
4317 * call-seq:
4318 * ecerr.source_encoding_name -> string
4319 *
4320 * Returns the source encoding name as a string.
4321 */
4322static VALUE
4323ecerr_source_encoding_name(VALUE self)
4324{
4325 return rb_attr_get(self, id_source_encoding_name);
4326}
4327
4328/*
4329 * call-seq:
4330 * ecerr.source_encoding -> encoding
4331 *
4332 * Returns the source encoding as an encoding object.
4333 *
4334 * Note that the result may not be equal to the source encoding of
4335 * the encoding converter if the conversion has multiple steps.
4336 *
4337 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4338 * begin
4339 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4340 * rescue Encoding::UndefinedConversionError
4341 * p $!.source_encoding #=> #<Encoding:UTF-8>
4342 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4343 * p $!.source_encoding_name #=> "UTF-8"
4344 * p $!.destination_encoding_name #=> "EUC-JP"
4345 * end
4346 *
4347 */
4348static VALUE
4349ecerr_source_encoding(VALUE self)
4350{
4351 return rb_attr_get(self, id_source_encoding);
4352}
4353
4354/*
4355 * call-seq:
4356 * ecerr.destination_encoding_name -> string
4357 *
4358 * Returns the destination encoding name as a string.
4359 */
4360static VALUE
4361ecerr_destination_encoding_name(VALUE self)
4362{
4363 return rb_attr_get(self, id_destination_encoding_name);
4364}
4365
4366/*
4367 * call-seq:
4368 * ecerr.destination_encoding -> string
4369 *
4370 * Returns the destination encoding as an encoding object.
4371 */
4372static VALUE
4373ecerr_destination_encoding(VALUE self)
4374{
4375 return rb_attr_get(self, id_destination_encoding);
4376}
4377
4378/*
4379 * call-seq:
4380 * ecerr.error_char -> string
4381 *
 * Returns the one-character string that caused Encoding::UndefinedConversionError.
4383 *
4384 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4385 * begin
4386 * ec.convert("\xa0")
4387 * rescue Encoding::UndefinedConversionError
4388 * puts $!.error_char.dump #=> "\xC2\xA0"
4389 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4390 * end
4391 *
4392 */
4393static VALUE
4394ecerr_error_char(VALUE self)
4395{
4396 return rb_attr_get(self, id_error_char);
4397}
4398
4399/*
4400 * call-seq:
4401 * ecerr.error_bytes -> string
4402 *
4403 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4404 *
4405 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4406 * begin
4407 * ec.convert("abc\xA1\xFFdef")
4408 * rescue Encoding::InvalidByteSequenceError
4409 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4410 * puts $!.error_bytes.dump #=> "\xA1"
4411 * puts $!.readagain_bytes.dump #=> "\xFF"
4412 * end
4413 */
4414static VALUE
4415ecerr_error_bytes(VALUE self)
4416{
4417 return rb_attr_get(self, id_error_bytes);
4418}
4419
4420/*
4421 * call-seq:
4422 * ecerr.readagain_bytes -> string
4423 *
4424 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4425 */
4426static VALUE
4427ecerr_readagain_bytes(VALUE self)
4428{
4429 return rb_attr_get(self, id_readagain_bytes);
4430}
4431
4432/*
4433 * call-seq:
4434 * ecerr.incomplete_input? -> true or false
4435 *
4436 * Returns true if the invalid byte sequence error is caused by
4437 * premature end of string.
4438 *
4439 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4440 *
4441 * begin
4442 * ec.convert("abc\xA1z")
4443 * rescue Encoding::InvalidByteSequenceError
4444 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4445 * p $!.incomplete_input? #=> false
4446 * end
4447 *
4448 * begin
4449 * ec.convert("abc\xA1")
4450 * ec.finish
4451 * rescue Encoding::InvalidByteSequenceError
4452 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4453 * p $!.incomplete_input? #=> true
4454 * end
4455 */
4456static VALUE
4457ecerr_incomplete_input(VALUE self)
4458{
4459 return rb_attr_get(self, id_incomplete_input);
4460}
4461
4462/*
4463 * Document-class: Encoding::UndefinedConversionError
4464 *
4465 * Raised by Encoding and String methods when a transcoding operation
4466 * fails.
4467 */
4468
4469/*
4470 * Document-class: Encoding::InvalidByteSequenceError
4471 *
 * Raised by Encoding and String methods when the string being
 * transcoded contains a byte invalid for either the source or
 * target encoding.
4475 */
4476
/*
 *  Document-class: Encoding::ConverterNotFoundError
 *
 *  Raised by transcoding methods when a named encoding does not
 *  correspond with a known converter.
 */
4483
4484void
4485Init_transcode(void)
4486{
4487 transcoder_table = st_init_strcasetable();
4488
4489 id_destination_encoding = rb_intern_const("destination_encoding");
4490 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4491 id_error_bytes = rb_intern_const("error_bytes");
4492 id_error_char = rb_intern_const("error_char");
4493 id_incomplete_input = rb_intern_const("incomplete_input");
4494 id_readagain_bytes = rb_intern_const("readagain_bytes");
4495 id_source_encoding = rb_intern_const("source_encoding");
4496 id_source_encoding_name = rb_intern_const("source_encoding_name");
4497
4498 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4499 sym_undef = ID2SYM(rb_intern_const("undef"));
4500 sym_replace = ID2SYM(rb_intern_const("replace"));
4501 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4502 sym_xml = ID2SYM(rb_intern_const("xml"));
4503 sym_text = ID2SYM(rb_intern_const("text"));
4504 sym_attr = ID2SYM(rb_intern_const("attr"));
4505
4506 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4507 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4508 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4509 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4510 sym_finished = ID2SYM(rb_intern_const("finished"));
4511 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4512 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4513 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4514 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4515 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4516 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4517 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4518
4519#ifdef ENABLE_ECONV_NEWLINE_OPTION
4520 sym_newline = ID2SYM(rb_intern_const("newline"));
4521 sym_universal = ID2SYM(rb_intern_const("universal"));
4522 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4523 sym_cr = ID2SYM(rb_intern_const("cr"));
4524 sym_lf = ID2SYM(rb_intern_const("lf"));
4525#endif
4526
4527 InitVM(transcode);
4528}
4529
4530void
4531InitVM_transcode(void)
4532{
4533 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4534 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4535 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4536
4537 rb_define_method(rb_cString, "encode", str_encode, -1);
4538 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4539
4540 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4541 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4542 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4543 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4544 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4545 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4546 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4547 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4548 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4549 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4550 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4551 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4552 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4553 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4554 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4555 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4556 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4557 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4558 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4559
4560 /*
4561 *Mask for invalid byte sequences
4562 */
4563 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4564
4565 /*
4566 * Replace invalid byte sequences
4567 */
4568 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4569
4570 /*
4571 * Mask for a valid character in the source encoding but no related
4572 * character(s) in destination encoding.
4573 */
4574 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4575
4576 /*
4577 * Replace byte sequences that are undefined in the destination encoding.
4578 */
4579 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4580
4581 /*
4582 * Replace byte sequences that are undefined in the destination encoding
4583 * with an XML hexadecimal character reference. This is valid for XML
4584 * conversion.
4585 */
4586 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4587
4588 /*
4589 * Indicates the source may be part of a larger string. See
4590 * primitive_convert for an example.
4591 */
4592 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4593
4594 /*
4595 * Stop converting after some output is complete but before all of the
4596 * input was consumed. See primitive_convert for an example.
4597 */
4598 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4599
4600 /*
4601 * Decorator for converting CRLF and CR to LF
4602 */
4603 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4604
4605 /*
4606 * Decorator for converting CRLF and CR to LF when writing
4607 */
4608 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4609
4610 /*
4611 * Decorator for converting LF to CRLF
4612 */
4613 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4614
4615 /*
4616 * Decorator for converting LF to CR
4617 */
4618 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4619
4620 /*
4621 * Escape as XML CharData
4622 */
4623 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4624
4625 /*
4626 * Escape as XML AttValue
4627 */
4628 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4629
4630 /*
4631 * Escape as XML AttValue
4632 */
4633 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4634
4635 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4636 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4637 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4638 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4639 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4640
4641 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4642 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4643 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4644 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4645 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4646 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4647 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4648
4649 Init_newline();
4650}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:1515
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:3133
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1675
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition memory.h:403
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:400
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:136
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:134
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:399
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:682
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition error.c:1380
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1430
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1428
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1397
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby string instead of a C string.
Definition error.c:1481
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1436
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:497
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:243
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:58
VALUE rb_cString
String class.
Definition string.c:83
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3223
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1316
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:931
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:815
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2619
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2116
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1547
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2664
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1490
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1785
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3239
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1109
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1937
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1928
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1829
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1631
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes a Ruby string instead of a C pointer.
Definition transcode.c:1949
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2670
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:1998
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:2015
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1981
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2934
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4300
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4306
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1943
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1746
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1865
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1796
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2278
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1168
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:284
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1005
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1675
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2568
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:120
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1710
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1752
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:985
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1492
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1955
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3345
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2702
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:7429
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1682
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5831
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:2094
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:3095
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:284
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition symbol.c:999
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:372
#define ALLOCA_N(type, n)
Definition memory.h:292
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:167
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:384
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_AREF(a, i)
Definition rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:442
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:515
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:450
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:512
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:231
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:203
Definition st.h:79
Definition string.c:8388
Definition transcode.c:176
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:376