12#include "ruby/internal/config.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/gc.h"
20#include "internal/object.h"
21#include "internal/string.h"
22#include "internal/transcode.h"
23#include "internal/encoding.h"
27#include "transcode_data.h"
30#define ENABLE_ECONV_NEWLINE_OPTION 1
33static VALUE rb_eUndefinedConversionError;
34static VALUE rb_eInvalidByteSequenceError;
35static VALUE rb_eConverterNotFoundError;
37VALUE rb_cEncodingConverter;
39static ID id_destination_encoding;
40static ID id_destination_encoding_name;
41static ID id_error_bytes;
42static ID id_error_char;
43static ID id_incomplete_input;
44static ID id_readagain_bytes;
45static ID id_source_encoding;
46static ID id_source_encoding_name;
48static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
49static VALUE sym_xml, sym_text, sym_attr;
50static VALUE sym_universal_newline;
51static VALUE sym_crlf_newline;
52static VALUE sym_cr_newline;
53static VALUE sym_lf_newline;
54#ifdef ENABLE_ECONV_NEWLINE_OPTION
55static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
57static VALUE sym_partial_input;
59static VALUE sym_invalid_byte_sequence;
60static VALUE sym_undefined_conversion;
61static VALUE sym_destination_buffer_full;
62static VALUE sym_source_buffer_empty;
63static VALUE sym_finished;
64static VALUE sym_after_output;
65static VALUE sym_incomplete_input;
68allocate_converted_string(
const char *sname,
const char *dname,
69 const unsigned char *str,
size_t len,
70 unsigned char *caller_dst_buf,
size_t caller_dst_bufsize,
71 size_t *dst_len_ptr,
size_t *dst_bufsize_ptr);
81 unsigned int next_table;
83 unsigned char next_byte;
84 unsigned int output_index;
86 ssize_t recognized_len;
87 ssize_t readagain_len;
102 char ary[
sizeof(double) >
sizeof(
void*) ?
sizeof(double) :
sizeof(
void*)];
103 double dummy_for_alignment;
106#define TRANSCODING_READBUF(tc) \
107 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
108 (tc)->readbuf.ary : \
110#define TRANSCODING_WRITEBUF(tc) \
111 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
112 (tc)->writebuf.ary : \
114#define TRANSCODING_WRITEBUF_SIZE(tc) \
115 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
116 sizeof((tc)->writebuf.ary) : \
117 (size_t)(tc)->transcoder->max_output)
118#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
119#define TRANSCODING_STATE(tc) \
120 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
126 unsigned char *out_buf_start;
127 unsigned char *out_data_start;
128 unsigned char *out_data_end;
129 unsigned char *out_buf_end;
137 const char *source_encoding_name;
138 const char *destination_encoding_name;
140 const unsigned char *replacement_str;
141 size_t replacement_len;
142 size_t replacement_bufsize;
143 const char *replacement_enc;
145 unsigned char *in_buf_start;
146 unsigned char *in_data_start;
147 unsigned char *in_data_end;
148 unsigned char *in_buf_end;
150 int replacement_allocated;
160 const char *source_encoding;
161 const char *destination_encoding;
162 const unsigned char *error_bytes_start;
163 size_t error_bytes_len;
164 size_t readagain_len;
177#define DECORATOR_P(sname, dname) (*(sname) == '\0')
189free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
196free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
198 st_foreach((
void *)val, free_inner_transcode_i, 0);
199 st_free_table((
void *)val);
204rb_free_transcoder_table(
void)
206 st_foreach(transcoder_table, free_transcode_i, 0);
207 st_free_table(transcoder_table);
/*
 * Find or create the registry entry for the (sname, dname) pair.
 *
 * The registry is two-level: transcoder_table is keyed by the source name
 * and yields an inner table, which is keyed by the destination name and
 * yields the entry itself.  Missing levels are created on demand.
 * NOTE(review): the assignment of table2 from val is not visible in this
 * excerpt; it is presumably the inner table fetched/created just above.
 */
211make_transcoder_entry(
const char *sname,
const char *dname)
/* First level: create a case-insensitive inner table for an unseen sname. */
217 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
218 val = (st_data_t)st_init_strcasetable();
219 st_add_direct(transcoder_table, (st_data_t)sname, val);
/* Second level: create the entry for an unseen dname. */
222 if (!st_lookup(table2, (st_data_t)dname, &val)) {
224 entry->sname = sname;
225 entry->dname = dname;
/* A fresh entry has no transcoder attached yet. */
227 entry->transcoder = NULL;
228 val = (st_data_t)entry;
229 st_add_direct(table2, (st_data_t)dname, val);
236get_transcoder_entry(
const char *sname,
const char *dname)
241 if (st_lookup(transcoder_table, (st_data_t)sname, &val)) {
243 if (!st_lookup(table2, (st_data_t)dname, &val)) {
254 const char *
const sname =
tr->src_encoding;
255 const char *
const dname =
tr->dst_encoding;
260 entry = make_transcoder_entry(sname, dname);
261 if (entry->transcoder) {
262 rb_raise(rb_eArgError,
"transcoder from %s to %s has been already registered",
265 entry->transcoder =
tr;
270declare_transcoder(
const char *sname,
const char *dname,
const char *lib)
274 entry = make_transcoder_entry(sname, dname);
278static const char transcoder_lib_prefix[] =
"enc/trans/";
281rb_declare_transcoder(
const char *enc1,
const char *enc2,
const char *lib)
284 rb_raise(rb_eArgError,
"invalid library name - (null)");
286 declare_transcoder(enc1, enc2, lib);
289#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
300 const char *base_enc;
304transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
306 const char *dname = (
const char *)key;
310 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
317 *bfs->queue_last_ptr = q;
318 bfs->queue_last_ptr = &q->next;
320 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
/*
 * Search the transcoder registry for a chain of conversions leading from
 * sname to dname and report each hop of the chain through callback.
 *
 * callback receives (source of the hop, destination of the hop, depth, arg).
 * The search is driven by a queue of encoding names (state kept in `bfs`);
 * bfs.visited maps every reached encoding name to the name it was reached
 * from, which is what allows the path to be walked backwards afterwards.
 */
325transcode_search_path(
const char *sname,
const char *dname,
326 void (*callback)(
const char *sname,
const char *dname,
int depth,
void *arg),
/* Identical source and destination are handled specially (result not visible here). */
337 if (encoding_equal(sname, dname))
343 bfs.queue_last_ptr = &q->next;
/* The start encoding is recorded as visited with no predecessor. */
346 bfs.visited = st_init_strcasetable();
347 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
/* Point the tail pointer back at the queue head slot. */
354 bfs.queue_last_ptr = &bfs.queue;
/* Fetch the table of conversions that start at the dequeued encoding. */
357 lookup_res = st_lookup(transcoder_table, (st_data_t)q->enc, &val);
/* A direct hop to dname exists: record its predecessor. */
364 if (st_lookup(table2, (st_data_t)dname, &val)) {
365 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
/* Otherwise enqueue every not-yet-visited destination reachable from q->enc. */
371 bfs.base_enc = q->enc;
372 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
/* Walk backwards from dname through the predecessor links in visited. */
386 const char *enc = dname;
390 st_lookup(bfs.visited, (st_data_t)enc, &val);
394 enc = (
const char *)val;
/* Second backward walk: report each hop as (predecessor, enc) with a decreasing depth. */
399 st_lookup(bfs.visited, (st_data_t)enc, &val);
402 callback((
const char *)val, enc, --depth, arg);
403 enc = (
const char *)val;
407 st_free_table(bfs.visited);
412int rb_require_internal_silent(
VALUE fname);
/*
 * Return the transcoder attached to `entry`, loading the library that
 * provides it when it is not attached yet.
 */
417 ASSERT_vm_unlocking();
/* Fast path: the transcoder is already attached to the entry. */
418 if (entry->transcoder)
419 return entry->transcoder;
/*
 * Build the feature path "<transcoder_lib_prefix><lib>".
 * sizeof(transcoder_lib_prefix) - 1 is the prefix length without its
 * terminating NUL, since transcoder_lib_prefix is a char array.
 */
422 const char *
const lib = entry->lib;
423 const size_t len = strlen(lib);
424 const size_t total_len =
sizeof(transcoder_lib_prefix) - 1 +
len;
/* NOTE(review): fn is created outside the visible lines; presumably a string of total_len bytes. */
426 char *
const path = RSTRING_PTR(fn);
428 memcpy(path, transcoder_lib_prefix,
sizeof(transcoder_lib_prefix) - 1);
429 memcpy(path +
sizeof(transcoder_lib_prefix) - 1, lib,
len);
/* The return value of the require call is not inspected here. */
432 rb_require_internal_silent(fn);
/* NOTE(review): presumably the loaded library registers itself and fills entry->transcoder. */
435 if (entry->transcoder)
436 return entry->transcoder;
/*
 * Pick the replacement character to emit for the encoding named encname.
 *
 * The encoding in which the returned bytes are expressed is stored in
 * *repl_encname_ptr.  len_ret is an output parameter as well; its
 * assignment is not visible in this excerpt.
 */
442get_replacement_character(
const char *encname,
size_t *len_ret,
const char **repl_encname_ptr)
/* UTF-8 target: the bytes EF BF BD are the UTF-8 encoding of U+FFFD. */
444 if (encoding_equal(encname,
"UTF-8")) {
446 *repl_encname_ptr =
"UTF-8";
447 return "\xEF\xBF\xBD";
/* Any other target: the replacement is expressed in US-ASCII. */
451 *repl_encname_ptr =
"US-ASCII";
/*
 * Return a pointer to the first byte of the character currently being
 * examined and store its length in *char_len_ptr.
 *
 * in_start     - start of the caller's input for this call
 * inchar_start - first not-yet-recognized byte of the character
 * in_p         - one past the last byte read so far
 *
 * tc->recognized_len bytes of the character were recognized earlier.
 */
460static const unsigned char *
462 const unsigned char *in_start,
463 const unsigned char *inchar_start,
464 const unsigned char *in_p,
465 size_t *char_len_ptr)
467 const unsigned char *ptr;
/*
 * Fewer bytes precede inchar_start in this input than were already
 * recognized, so the recognized prefix cannot lie entirely inside the
 * caller's buffer: append the new bytes to the read buffer, right after
 * the recognized ones, and use the read buffer as the character start.
 */
468 if (inchar_start - in_start < tc->recognized_len) {
469 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
470 inchar_start,
unsigned char, in_p - inchar_start);
471 ptr = TRANSCODING_READBUF(tc);
/* Otherwise the character starts recognized_len bytes before inchar_start. */
474 ptr = inchar_start - tc->recognized_len;
/* Total length = previously recognized bytes + bytes read since inchar_start. */
476 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
481transcode_restartable0(
const unsigned char **in_pos,
unsigned char **out_pos,
482 const unsigned char *in_stop,
unsigned char *out_stop,
487 int unitlen =
tr->input_unit_length;
488 ssize_t readagain_len = 0;
490 const unsigned char *inchar_start;
491 const unsigned char *in_p;
493 unsigned char *out_p;
495 in_p = inchar_start = *in_pos;
499#define SUSPEND(ret, num) \
501 tc->resume_position = (num); \
502 if (0 < in_p - inchar_start) \
503 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
504 inchar_start, unsigned char, in_p - inchar_start); \
507 tc->recognized_len += in_p - inchar_start; \
508 if (readagain_len) { \
509 tc->recognized_len -= readagain_len; \
510 tc->readagain_len = readagain_len; \
513 resume_label ## num:; \
515#define SUSPEND_OBUF(num) \
517 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
520#define SUSPEND_AFTER_OUTPUT(num) \
521 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
522 SUSPEND(econv_after_output, num); \
525#define next_table (tc->next_table)
526#define next_info (tc->next_info)
527#define next_byte (tc->next_byte)
528#define writebuf_len (tc->writebuf_len)
529#define writebuf_off (tc->writebuf_off)
531 switch (tc->resume_position) {
533 case 1:
goto resume_label1;
534 case 2:
goto resume_label2;
535 case 3:
goto resume_label3;
536 case 4:
goto resume_label4;
537 case 5:
goto resume_label5;
538 case 6:
goto resume_label6;
539 case 7:
goto resume_label7;
540 case 8:
goto resume_label8;
541 case 9:
goto resume_label9;
542 case 10:
goto resume_label10;
543 case 11:
goto resume_label11;
544 case 12:
goto resume_label12;
545 case 13:
goto resume_label13;
546 case 14:
goto resume_label14;
547 case 15:
goto resume_label15;
548 case 16:
goto resume_label16;
549 case 17:
goto resume_label17;
550 case 18:
goto resume_label18;
551 case 19:
goto resume_label19;
552 case 20:
goto resume_label20;
553 case 21:
goto resume_label21;
554 case 22:
goto resume_label22;
555 case 23:
goto resume_label23;
556 case 24:
goto resume_label24;
557 case 25:
goto resume_label25;
558 case 26:
goto resume_label26;
559 case 27:
goto resume_label27;
560 case 28:
goto resume_label28;
561 case 29:
goto resume_label29;
562 case 30:
goto resume_label30;
563 case 31:
goto resume_label31;
564 case 32:
goto resume_label32;
565 case 33:
goto resume_label33;
566 case 34:
goto resume_label34;
571 tc->recognized_len = 0;
572 next_table =
tr->conv_tree_start;
574 SUSPEND_AFTER_OUTPUT(24);
576 if (in_stop <= in_p) {
583#define BYTE_ADDR(index) (tr->byte_array + (index))
584#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
585#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
586#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
587#define BL_MIN_BYTE (BL_BASE[0])
588#define BL_MAX_BYTE (BL_BASE[1])
589#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
590#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
592 next_byte = (
unsigned char)*in_p++;
594 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
597 next_info = (
VALUE)BL_ACTION(next_byte);
600 switch (next_info & 0x1F) {
603 const unsigned char *p = inchar_start;
606 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (
unsigned char)*p++;
608 writebuf_len = writebuf_off;
610 while (writebuf_off < writebuf_len) {
612 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
616 case 0x00:
case 0x04:
case 0x08:
case 0x0C:
617 case 0x10:
case 0x14:
case 0x18:
case 0x1C:
618 SUSPEND_AFTER_OUTPUT(25);
619 while (in_p >= in_stop) {
624 next_byte = (
unsigned char)*in_p++;
625 next_table = (
unsigned int)next_info;
630 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
633 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
634 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
637 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
638 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
639 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
642 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
643 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
644 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
645 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
648 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
649 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
650 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
651 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
654 tc->output_index = 0;
655 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
656 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
661 next_info = (
VALUE)(*
tr->func_ii)(TRANSCODING_STATE(tc), next_info);
665 const unsigned char *char_start;
667 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
668 next_info = (
VALUE)(*
tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
673 if (
tr->max_output <= out_stop - out_p)
674 out_p +=
tr->func_io(TRANSCODING_STATE(tc),
675 next_info, out_p, out_stop - out_p);
677 writebuf_len =
tr->func_io(TRANSCODING_STATE(tc),
679 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
681 while (writebuf_off < writebuf_len) {
683 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
689 const unsigned char *char_start;
692 if (
tr->max_output <= out_stop - out_p) {
693 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
694 out_p +=
tr->func_so(TRANSCODING_STATE(tc),
695 char_start, (
size_t)char_len,
696 out_p, out_stop - out_p);
699 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
700 writebuf_len =
tr->func_so(TRANSCODING_STATE(tc),
701 char_start, (
size_t)char_len,
702 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
704 while (writebuf_off < writebuf_len) {
706 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
713 const unsigned char *char_start;
716 if (
tr->max_output <= out_stop - out_p) {
717 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
718 out_p +=
tr->func_sio(TRANSCODING_STATE(tc),
719 char_start, (
size_t)char_len, next_info,
720 out_p, out_stop - out_p);
723 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
724 writebuf_len =
tr->func_sio(TRANSCODING_STATE(tc),
725 char_start, (
size_t)char_len, next_info,
726 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
728 while (writebuf_off < writebuf_len) {
730 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
736 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
737 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
738 SUSPEND_AFTER_OUTPUT(26);
739 while ((opt &
ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
743 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
747 in_p = inchar_start + (unitlen - tc->recognized_len);
753 invalid_len = tc->recognized_len + (in_p - inchar_start);
754 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
755 readagain_len = invalid_len - discard_len;
779 if (
tr->finish_func) {
781 if (
tr->max_output <= out_stop - out_p) {
782 out_p +=
tr->finish_func(TRANSCODING_STATE(tc),
783 out_p, out_stop - out_p);
786 writebuf_len =
tr->finish_func(TRANSCODING_STATE(tc),
787 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
789 while (writebuf_off < writebuf_len) {
791 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
/*
 * Run transcode_restartable0, first re-feeding any bytes that an earlier
 * call marked as "read again" (tc->readagain_len bytes stored in the read
 * buffer right after the recognized bytes).
 */
806transcode_restartable(
const unsigned char **in_pos,
unsigned char **out_pos,
807 const unsigned char *in_stop,
unsigned char *out_stop,
811 if (tc->readagain_len) {
/* Copy the pending bytes to a stack buffer so they can be consumed as ordinary input. */
812 unsigned char *readagain_buf =
ALLOCA_N(
unsigned char, tc->readagain_len);
813 const unsigned char *readagain_pos = readagain_buf;
814 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
817 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
818 unsigned char, tc->readagain_len);
819 tc->readagain_len = 0;
/* ECONV_PARTIAL_INPUT is forced on for this pass over the pending bytes. */
820 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|
ECONV_PARTIAL_INPUT);
/*
 * Whatever part of the stack buffer was not consumed is stored back in
 * the read buffer, after the recognized bytes and any read-again bytes
 * recorded by the call above.
 * NOTE(review): the condition guarding this store-back is not visible here.
 */
822 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
823 readagain_pos,
unsigned char, readagain_stop - readagain_pos);
824 tc->readagain_len += readagain_stop - readagain_pos;
/* Continue with the caller's real input. */
828 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
839 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
841 if (
tr->state_init_func) {
842 (
tr->state_init_func)(TRANSCODING_STATE(tc));
844 tc->resume_position = 0;
845 tc->recognized_len = 0;
846 tc->readagain_len = 0;
847 tc->writebuf_len = 0;
848 tc->writebuf_off = 0;
849 if ((
int)
sizeof(tc->readbuf.ary) <
tr->max_input) {
850 tc->readbuf.ptr =
xmalloc(
tr->max_input);
852 if ((
int)
sizeof(tc->writebuf.ary) <
tr->max_output) {
853 tc->writebuf.ptr =
xmalloc(
tr->max_output);
860 const unsigned char **input_ptr,
const unsigned char *input_stop,
861 unsigned char **output_ptr,
unsigned char *output_stop,
864 return transcode_restartable(
865 input_ptr, output_ptr,
866 input_stop, output_stop,
874 if (
tr->state_fini_func) {
875 (
tr->state_fini_func)(TRANSCODING_STATE(tc));
877 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
878 ruby_sized_xfree(tc->state.ptr,
tr->state_size);
879 if ((
int)
sizeof(tc->readbuf.ary) <
tr->max_input)
880 ruby_sized_xfree(tc->readbuf.ptr,
tr->max_input);
881 if ((
int)
sizeof(tc->writebuf.ary) <
tr->max_output)
882 ruby_sized_xfree(tc->writebuf.ptr,
tr->max_output);
892 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
893 size +=
tr->state_size;
895 if ((
int)
sizeof(tc->readbuf.ary) <
tr->max_input) {
896 size +=
tr->max_input;
898 if ((
int)
sizeof(tc->writebuf.ary) <
tr->max_output) {
899 size +=
tr->max_output;
905rb_econv_alloc(
int n_hint)
914 ec->source_encoding_name = NULL;
915 ec->destination_encoding_name = NULL;
917 ec->replacement_str = NULL;
918 ec->replacement_len = 0;
919 ec->replacement_bufsize = 0;
920 ec->replacement_enc = NULL;
921 ec->replacement_allocated = 0;
922 ec->in_buf_start = NULL;
923 ec->in_data_start = NULL;
924 ec->in_data_end = NULL;
925 ec->in_buf_end = NULL;
926 ec->num_allocated = n_hint;
929 ec->num_finished = 0;
932 ec->last_error.error_tc = NULL;
933 ec->last_error.source_encoding = NULL;
934 ec->last_error.destination_encoding = NULL;
935 ec->last_error.error_bytes_start = NULL;
936 ec->last_error.error_bytes_len = 0;
937 ec->last_error.readagain_len = 0;
938 ec->source_encoding = NULL;
939 ec->destination_encoding = NULL;
950 if (ec->num_trans == ec->num_allocated) {
951 n = ec->num_allocated * 2;
953 ec->num_allocated = n;
960 ec->elems[i].tc = rb_transcoding_open_by_transcoder(
tr, 0);
961 ec->elems[i].out_buf_start = p;
962 ec->elems[i].out_buf_end = p + bufsize;
963 ec->elems[i].out_data_start = p;
964 ec->elems[i].out_data_end = p;
969 if (!DECORATOR_P(
tr->src_encoding,
tr->dst_encoding))
970 for (j = ec->num_trans-1; i <= j; j--) {
973 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
988 for (i = 0; i < n; i++) {
990 tr = load_transcoder_entry(entries[i]);
995 ec = rb_econv_alloc(n);
997 for (i = 0; i < n; i++) {
999 ret = rb_econv_add_transcoder_at(ec,
tr, ec->num_trans);
1014trans_open_i(
const char *sname,
const char *dname,
int depth,
void *arg)
1018 if (!toarg->entries) {
1021 toarg->entries[depth] = get_transcoder_entry(sname, dname);
1025rb_econv_open0(
const char *sname,
const char *dname,
int ecflags)
1032 if (*sname) rb_enc_find_index(sname);
1033 if (*dname) rb_enc_find_index(dname);
1035 if (*sname ==
'\0' && *dname ==
'\0') {
1042 num_trans = transcode_search_path(sname, dname, trans_open_i, (
void *)&toarg);
1043 entries = toarg.entries;
1044 if (num_trans < 0) {
1045 SIZED_FREE_N(entries, num_trans);
1050 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1051 SIZED_FREE_N(entries, num_trans);
1055 ec->flags = ecflags;
1056 ec->source_encoding_name = sname;
1057 ec->destination_encoding_name = dname;
1062#define MAX_ECFLAGS_DECORATORS 32
1065decorator_names(
int ecflags,
const char **decorators_ret)
1087 decorators_ret[num_decorators++] =
"xml_text_escape";
1089 decorators_ret[num_decorators++] =
"xml_attr_content_escape";
1091 decorators_ret[num_decorators++] =
"xml_attr_quote";
1094 decorators_ret[num_decorators++] =
"crlf_newline";
1096 decorators_ret[num_decorators++] =
"cr_newline";
1098 decorators_ret[num_decorators++] =
"lf_newline";
1100 decorators_ret[num_decorators++] =
"universal_newline";
1102 return num_decorators;
1110 const char *decorators[MAX_ECFLAGS_DECORATORS];
1113 num_decorators = decorator_names(ecflags, decorators);
1114 if (num_decorators == -1)
1119 for (i = 0; i < num_decorators; i++) {
1129 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1136 const unsigned char **input_ptr,
const unsigned char *input_stop,
1137 unsigned char **output_ptr,
unsigned char *output_stop,
1144 const unsigned char **ipp, *is, *iold;
1145 unsigned char **opp, *os, *oold;
1151 for (i = start; i < ec->num_trans; i++) {
1160 ipp = (
const unsigned char **)&prev_te->out_data_start;
1161 is = prev_te->out_data_end;
1164 if (i == ec->num_trans-1) {
1169 if (te->out_buf_start != te->out_data_start) {
1170 ssize_t
len = te->out_data_end - te->out_data_start;
1171 ssize_t
off = te->out_data_start - te->out_buf_start;
1172 MEMMOVE(te->out_buf_start, te->out_data_start,
unsigned char,
len);
1173 te->out_data_start = te->out_buf_start;
1174 te->out_data_end -=
off;
1176 opp = &te->out_data_end;
1177 os = te->out_buf_end;
1181 if (ec->num_finished != i)
1185 flags &= ~ECONV_AFTER_OUTPUT;
1188 f &= ~ECONV_AFTER_OUTPUT;
1191 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1192 if (iold != *ipp || oold != *opp)
1207 ec->num_finished = i+1;
1217 const unsigned char **input_ptr,
const unsigned char *input_stop,
1218 unsigned char **output_ptr,
unsigned char *output_stop,
1220 int *result_position_ptr)
1223 int needreport_index;
1226 unsigned char empty_buf;
1227 unsigned char *empty_ptr = &empty_buf;
1230 input_ptr = (
const unsigned char **)&empty_ptr;
1231 input_stop = empty_ptr;
1235 output_ptr = &empty_ptr;
1236 output_stop = empty_ptr;
1242 for (i = ec->num_trans-1; 0 <= i; i--) {
1243 switch (ec->elems[i].last_result) {
1250 goto found_needreport;
1257 rb_bug(
"unexpected transcode last result");
1267 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1269 result_position_ptr);
1281 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1282 sweep_start = needreport_index + 1;
1283 }
while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1285 for (i = ec->num_trans-1; 0 <= i; i--) {
1294 if (result_position_ptr)
1295 *result_position_ptr = i;
1299 if (result_position_ptr)
1300 *result_position_ptr = -1;
1306 const unsigned char **input_ptr,
const unsigned char *input_stop,
1307 unsigned char **output_ptr,
unsigned char *output_stop,
1311 int result_position;
1314 memset(&ec->last_error, 0,
sizeof(ec->last_error));
1316 if (ec->num_trans == 0) {
1318 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1319 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1320 len = output_stop - *output_ptr;
1321 memcpy(*output_ptr, ec->in_data_start,
len);
1322 *output_ptr = output_stop;
1323 ec->in_data_start +=
len;
1327 len = ec->in_data_end - ec->in_data_start;
1328 memcpy(*output_ptr, ec->in_data_start,
len);
1330 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1336 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1337 len = output_stop - *output_ptr;
1340 len = input_stop - *input_ptr;
1343 *(*output_ptr)++ = *(*input_ptr)++;
1347 memcpy(*output_ptr, *input_ptr,
len);
1350 if (*input_ptr != input_stop)
1359 if (ec->elems[ec->num_trans-1].out_data_start) {
1360 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1361 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1362 if (data_start != data_end) {
1364 if (output_stop - *output_ptr < data_end - data_start) {
1365 len = output_stop - *output_ptr;
1366 memcpy(*output_ptr, data_start,
len);
1367 *output_ptr = output_stop;
1368 ec->elems[ec->num_trans-1].out_data_start +=
len;
1372 len = data_end - data_start;
1373 memcpy(*output_ptr, data_start,
len);
1375 ec->elems[ec->num_trans-1].out_data_start =
1376 ec->elems[ec->num_trans-1].out_data_end =
1377 ec->elems[ec->num_trans-1].out_buf_start;
1382 if (ec->in_buf_start &&
1383 ec->in_data_start != ec->in_data_end) {
1384 res = rb_trans_conv(ec, (
const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1392 *input_ptr != input_stop) {
1393 input_stop = *input_ptr;
1394 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1399 ec->num_trans == 1) {
1400 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1405 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1410 ec->last_error.result = res;
1415 ec->last_error.error_tc = error_tc;
1416 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1417 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1418 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1419 ec->last_error.error_bytes_len = error_tc->recognized_len;
1420 ec->last_error.readagain_len = error_tc->readagain_len;
1426static int output_replacement_character(
rb_econv_t *ec);
1432 unsigned char utfbuf[1024];
1433 const unsigned char *utf;
1434 size_t utf_len, utf_bufsize;
1435 int utf_allocated = 0;
1436 char charef_buf[16];
1437 const unsigned char *p;
1439 if (encoding_equal(ec->last_error.source_encoding,
"UTF-32BE")) {
1440 utf = ec->last_error.error_bytes_start;
1441 utf_len = ec->last_error.error_bytes_len;
1444 utf = allocate_converted_string(ec->last_error.source_encoding,
"UTF-32BE",
1445 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1446 utfbuf,
sizeof(utfbuf),
1447 &utf_len, &utf_bufsize);
1450 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1454 if (utf_len % 4 != 0)
1458 while (4 <= utf_len) {
1464 snprintf(charef_buf,
sizeof(charef_buf),
"&#x%X;", u);
1475 ruby_sized_xfree((
void *)utf, utf_bufsize);
1480 ruby_sized_xfree((
void *)utf, utf_bufsize);
1486 const unsigned char **input_ptr,
const unsigned char *input_stop,
1487 unsigned char **output_ptr,
unsigned char *output_stop,
1492 unsigned char empty_buf;
1493 unsigned char *empty_ptr = &empty_buf;
1498 input_ptr = (
const unsigned char **)&empty_ptr;
1499 input_stop = empty_ptr;
1503 output_ptr = &empty_ptr;
1504 output_stop = empty_ptr;
1508 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1516 if (output_replacement_character(ec) == 0)
1527 if (output_replacement_character(ec) == 0)
1532 if (output_hex_charref(ec) == 0)
1550 tr = tc->transcoder;
1552 if (
tr->asciicompat_type == asciicompat_encoder)
1553 return tr->src_encoding;
1554 return tr->dst_encoding;
/*
 * Convert len bytes at str from encoding sname to encoding dname.
 *
 * The result is written to caller_dst_buf (caller_dst_bufsize bytes) when
 * that is usable, and to a heap buffer otherwise.  On success the number
 * of converted bytes is stored in *dst_len_ptr and the capacity of the
 * result buffer in *dst_bufsize_ptr.  A result that is not caller_dst_buf
 * is heap-allocated and is sized by *dst_bufsize_ptr.
 */
1557static unsigned char *
1558allocate_converted_string(
const char *sname,
const char *dname,
1559 const unsigned char *str,
size_t len,
1560 unsigned char *caller_dst_buf,
size_t caller_dst_bufsize,
1561 size_t *dst_len_ptr,
size_t *dst_bufsize_ptr)
1563 unsigned char *dst_str;
1570 const unsigned char *sp;
/* Start with the caller's buffer size. */
1574 dst_bufsize = caller_dst_bufsize;
/* Use the caller's buffer, or allocate one (the selecting condition is not visible here). */
1584 dst_str = caller_dst_buf;
1586 dst_str =
xmalloc(dst_bufsize);
/* dp is the write cursor; dst_len is recomputed from it after the conversion step. */
1589 dp = dst_str+dst_len;
1591 dst_len = dp - dst_str;
/* Guard against size_t overflow before the buffer size is doubled. */
1593 if (SIZE_MAX/2 < dst_bufsize) {
/* The caller's buffer cannot be reallocated: move its contents to a fresh heap buffer. */
1597 if (dst_str == caller_dst_buf) {
/* dst_bufsize/2 is the size before doubling, i.e. the bytes that hold data. */
1600 memcpy(tmp, dst_str, dst_bufsize/2);
/* Heap buffer: grow in place, passing the new size and then the old size. */
1604 dst_str = ruby_sized_xrealloc(dst_str, dst_bufsize, dst_bufsize / 2);
1606 dp = dst_str+dst_len;
1608 dst_len = dp - dst_str;
/* Report the result size and the buffer capacity to the caller. */
1614 *dst_len_ptr = dst_len;
1615 *dst_bufsize_ptr = dst_bufsize;
/* Cleanup: only a heap buffer is released, never the caller's buffer. */
1619 if (dst_str != caller_dst_buf)
1620 ruby_sized_xfree(dst_str, dst_bufsize);
/*
 * Insert len bytes of str (expressed in str_encoding) into the converter's
 * data stream.
 *
 * The string is converted to insert_encoding first when the two encodings
 * differ, and is then appended to the buffer selected below.  The four
 * *_p variables point at the pointers that describe that buffer
 * (buffer start, data start, data end, buffer end), so that the buffer
 * can be created or grown in place.
 */
1628 const unsigned char *str,
size_t len,
const char *str_encoding)
/* Stack scratch space for the converted string; a heap buffer is used when it is too small. */
1631 unsigned char insert_buf[4096];
1632 const unsigned char *insert_str = NULL;
1633 size_t insert_len, insert_bufsize;
1635 int last_trans_index;
1638 unsigned char **buf_start_p;
1639 unsigned char **data_start_p;
1640 unsigned char **data_end_p;
1641 unsigned char **buf_end_p;
/* No conversion is needed when the string is already in the insertion encoding. */
1650 if (encoding_equal(insert_encoding, str_encoding)) {
1655 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1656 str,
len, insert_buf,
sizeof(insert_buf), &insert_len, &insert_bufsize);
/* A NULL result is checked for; its handling is not visible in this excerpt. */
1657 if (insert_str == NULL)
1663 last_trans_index = ec->num_trans-1;
/* No transcoders at all: the converter's own input buffer receives the bytes. */
1664 if (ec->num_trans == 0) {
1666 buf_start_p = &ec->in_buf_start;
1667 data_start_p = &ec->in_data_start;
1668 data_end_p = &ec->in_data_end;
1669 buf_end_p = &ec->in_buf_end;
/*
 * The last transcoder is an ASCII-compatible encoder: the bytes go in
 * front of it, i.e. into its input side, and its pending read-again
 * bytes must follow the inserted string, so room is reserved for them.
 */
1671 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1672 tc = ec->elems[last_trans_index].tc;
1673 need += tc->readagain_len;
/* Wrap-around check on the addition above (assuming need started at insert_len - TODO confirm). */
1674 if (need < insert_len)
/* The input side of the first transcoder is the converter's input buffer. */
1676 if (last_trans_index == 0) {
1677 buf_start_p = &ec->in_buf_start;
1678 data_start_p = &ec->in_data_start;
1679 data_end_p = &ec->in_data_end;
1680 buf_end_p = &ec->in_buf_end;
/* Otherwise an element's output buffer is used (the choice of ee is not visible here). */
1684 buf_start_p = &ee->out_buf_start;
1685 data_start_p = &ee->out_data_start;
1686 data_end_p = &ee->out_data_end;
1687 buf_end_p = &ee->out_buf_end;
/* Remaining case: an element's output buffer receives the bytes directly. */
1692 buf_start_p = &ee->out_buf_start;
1693 data_start_p = &ee->out_data_start;
1694 data_end_p = &ee->out_data_end;
1695 buf_end_p = &ee->out_buf_end;
1696 tc = ec->elems[last_trans_index].tc;
/* The buffer does not exist yet: allocate exactly `need` bytes. */
1699 if (*buf_start_p == NULL) {
1700 unsigned char *buf =
xmalloc(need);
1702 *data_start_p = buf;
1704 *buf_end_p = buf+need;
/* Not enough free space after the data: first slide the data to the buffer start. */
1706 else if ((
size_t)(*buf_end_p - *data_end_p) < need) {
1707 MEMMOVE(*buf_start_p, *data_start_p,
unsigned char, *data_end_p - *data_start_p);
1708 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1709 *data_start_p = *buf_start_p;
/* Still not enough: grow the buffer to hold the existing data plus `need` bytes. */
1710 if ((
size_t)(*buf_end_p - *data_end_p) < need) {
1712 size_t s = (*data_end_p - *buf_start_p) + need;
/*
 * The old size handed to the allocator is the current capacity in bytes,
 * *buf_end_p - *buf_start_p.  The pointers must be dereferenced:
 * subtracting buf_end_p and buf_start_p themselves would measure the
 * distance between two pointer variables, not the size of the buffer.
 */
1715 buf = ruby_sized_xrealloc(*buf_start_p, s, *buf_end_p - *buf_start_p);
/* Re-base the data pointers onto the (possibly moved) buffer. */
1716 *data_start_p = buf;
1717 *data_end_p = buf + (*data_end_p - *buf_start_p);
1719 *buf_end_p = buf + s;
/* Append the string to insert. */
1723 memcpy(*data_end_p, insert_str, insert_len);
1724 *data_end_p += insert_len;
/* Re-queue the encoder's read-again bytes behind the inserted string and clear them. */
1725 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1726 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1727 *data_end_p += tc->readagain_len;
1728 tc->readagain_len = 0;
/* Release the converted string only when it is a heap buffer (neither str nor insert_buf). */
1731 if (insert_str != str && insert_str != insert_buf)
1732 ruby_sized_xfree((
void *)insert_str, insert_bufsize);
/* Same release on the second exit path. */
1736 if (insert_str != str && insert_str != insert_buf)
1737 ruby_sized_xfree((
void *)insert_str, insert_bufsize);
1746 if (ec->replacement_allocated) {
1747 SIZED_FREE_N((
char *)ec->replacement_str, ec->replacement_len);
1749 for (i = 0; i < ec->num_trans; i++) {
1750 rb_transcoding_close(ec->elems[i].tc);
1751 ruby_sized_xfree(ec->elems[i].out_buf_start, ec->elems[i].out_buf_end - ec->elems[i].out_buf_start);
1753 SIZED_FREE_N(ec->in_buf_start, ec->in_buf_end - ec->in_buf_start);
1754 SIZED_FREE_N(ec->elems, ec->num_allocated);
1764 if (ec->replacement_allocated) {
1765 size += ec->replacement_len;
1767 for (i = 0; i < ec->num_trans; i++) {
1768 size += rb_transcoding_memsize(ec->elems[i].tc);
1770 if (ec->elems[i].out_buf_start) {
1771 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1774 size += ec->in_buf_end - ec->in_buf_start;
1783 if (ec->num_trans == 0)
1785#if SIZEOF_SIZE_T > SIZEOF_INT
1786 if (ec->elems[0].tc->readagain_len > INT_MAX)
return INT_MAX;
1788 return (
int)ec->elems[0].tc->readagain_len;
1795 if (ec->num_trans == 0 || n == 0)
1797 tc = ec->elems[0].tc;
1798 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1799 tc->readagain_len -= n;
1803 const char *ascii_compat_name;
1804 const char *ascii_incompat_name;
1808asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1814 if (DECORATOR_P(entry->sname, entry->dname))
1816 tr = load_transcoder_entry(entry);
1817 if (
tr &&
tr->asciicompat_type == asciicompat_decoder) {
1818 data->ascii_compat_name =
tr->dst_encoding;
1832 RB_VM_LOCK_ENTER_LEV(&lev);
1834 if (st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v)) {
1843 if (table2->num_entries == 1) {
1844 data.ascii_incompat_name = ascii_incompat_name;
1845 data.ascii_compat_name = NULL;
1846 if (rb_multi_ractor_p()) {
1851 st_table *dup_table2 = st_copy(table2);
1852 RB_VM_LOCK_LEAVE_LEV(&lev);
1853 st_foreach(dup_table2, asciicompat_encoding_i, (st_data_t)&data);
1854 st_free_table(dup_table2);
1855 RB_VM_LOCK_ENTER_LEV(&lev);
1858 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1864 RB_VM_LOCK_LEAVE_LEV(&lev);
1866 return data.ascii_compat_name;
1878 unsigned const char *sp, *se;
1879 unsigned char *ds, *dp, *de;
1888 rb_enc_associate(dst, dst_enc);
1893 dst_enc = rb_enc_get(dst);
1898 max_output = ec->last_tc->transcoder->max_output;
1904 long dlen = RSTRING_LEN(dst);
1906 unsigned long new_capa = (
unsigned long)dlen +
len + max_output;
1907 if (LONG_MAX < new_capa)
1908 rb_raise(rb_eArgError,
"too long string");
1911 sp = (
const unsigned char *)ss;
1913 ds = (
unsigned char *)RSTRING_PTR(dst);
1917 switch (coderange) {
1920 cr = (int)coderange;
1929 len -= (
const char *)sp - ss;
1930 ss = (
const char *)sp;
1966rb_econv_add_converter(
rb_econv_t *ec,
const char *sname,
const char *dname,
int n)
1971 if (ec->started != 0)
1974 entry = get_transcoder_entry(sname, dname);
1976 tr = load_transcoder_entry(entry);
1979 return tr ? rb_econv_add_transcoder_at(ec,
tr, n) : -1;
1983rb_econv_decorate_at(
rb_econv_t *ec,
const char *decorator_name,
int n)
1985 return rb_econv_add_converter(ec,
"", decorator_name, n);
1993 if (ec->num_trans == 0)
1994 return rb_econv_decorate_at(ec, decorator_name, 0);
1996 tr = ec->elems[0].tc->transcoder;
1998 if (!DECORATOR_P(
tr->src_encoding,
tr->dst_encoding) &&
1999 tr->asciicompat_type == asciicompat_decoder)
2000 return rb_econv_decorate_at(ec, decorator_name, 1);
2002 return rb_econv_decorate_at(ec, decorator_name, 0);
2010 if (ec->num_trans == 0)
2011 return rb_econv_decorate_at(ec, decorator_name, 0);
2013 tr = ec->elems[ec->num_trans-1].tc->transcoder;
2015 if (!DECORATOR_P(
tr->src_encoding,
tr->dst_encoding) &&
2016 tr->asciicompat_type == asciicompat_encoder)
2017 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
2019 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
2025 const char *dname = 0;
2029 dname =
"universal_newline";
2032 dname =
"crlf_newline";
2035 dname =
"cr_newline";
2038 dname =
"lf_newline";
2043 const rb_transcoder *transcoder = get_transcoder_entry(
"", dname)->transcoder;
2044 int num_trans = ec->num_trans;
2047 for (i=0; i < num_trans; i++) {
2048 if (transcoder == ec->elems[i].tc->transcoder) {
2049 rb_transcoding_close(ec->elems[i].tc);
2050 ruby_sized_xfree(ec->elems[i].out_buf_start, ec->elems[i].out_buf_end - ec->elems[i].out_buf_start);
2054 ec->elems[j++] = ec->elems[i];
2058 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2062econv_description(
const char *sname,
const char *dname,
int ecflags,
VALUE mesg)
2064 int has_description = 0;
2069 if (*sname !=
'\0' || *dname !=
'\0') {
2072 else if (*dname ==
'\0')
2075 rb_str_catf(mesg,
"%s to %s", sname, dname);
2076 has_description = 1;
2083 const char *pre =
"";
2084 if (has_description)
2114 has_description = 1;
2116 if (!has_description) {
2128 econv_description(sname, dname, ecflags, mesg);
2130 exc =
rb_exc_new3(rb_eConverterNotFoundError, mesg);
2140 const char *err = (
const char *)ec->last_error.error_bytes_start;
2141 size_t error_len = ec->last_error.error_bytes_len;
2144 size_t readagain_len = ec->last_error.readagain_len;
2148 mesg = rb_sprintf(
"incomplete %s on %s",
2150 ec->last_error.source_encoding);
2152 else if (readagain_len) {
2153 bytes2 =
rb_str_new(err+error_len, readagain_len);
2155 mesg = rb_sprintf(
"%s followed by %s on %s",
2158 ec->last_error.source_encoding);
2161 mesg = rb_sprintf(
"%s on %s",
2163 ec->last_error.source_encoding);
2166 exc =
rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2173 VALUE bytes =
rb_str_new((
const char *)ec->last_error.error_bytes_start,
2174 ec->last_error.error_bytes_len);
2177 if (strcmp(ec->last_error.source_encoding,
"UTF-8") == 0) {
2179 const char *start, *end;
2181 start = (
const char *)ec->last_error.error_bytes_start;
2182 end = start + ec->last_error.error_bytes_len;
2183 n = rb_enc_precise_mbclen(start, end, utf8);
2186 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2187 dumped = rb_sprintf(
"U+%04X", cc);
2192 if (strcmp(ec->last_error.source_encoding,
2193 ec->source_encoding_name) == 0 &&
2194 strcmp(ec->last_error.destination_encoding,
2195 ec->destination_encoding_name) == 0) {
2196 mesg = rb_sprintf(
"%s from %s to %s",
2198 ec->last_error.source_encoding,
2199 ec->last_error.destination_encoding);
2203 mesg = rb_sprintf(
"%s to %s in conversion from %s",
2205 ec->last_error.destination_encoding,
2206 ec->source_encoding_name);
2207 for (i = 0; i < ec->num_trans; i++) {
2209 if (!DECORATOR_P(
tr->src_encoding,
tr->dst_encoding))
2210 rb_str_catf(mesg,
" to %s",
2211 ec->elems[i].tc->transcoder->dst_encoding);
2214 exc =
rb_exc_new3(rb_eUndefinedConversionError, mesg);
2215 idx = rb_enc_find_index(ec->last_error.source_encoding);
2217 rb_enc_associate_index(bytes, idx);
2226 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2228 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2229 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2231 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2238 unsigned char *(*resize_destination)(
VALUE,
size_t,
size_t),
2240 unsigned char **out_start_ptr,
2241 unsigned char **out_pos,
2242 unsigned char **out_stop_ptr)
2244 size_t len = (*out_pos - *out_start_ptr);
2245 size_t new_len = (
len + max_output) * 2;
2246 *out_start_ptr = resize_destination(destination,
len, new_len);
2247 *out_pos = *out_start_ptr +
len;
2248 *out_stop_ptr = *out_start_ptr + new_len;
2256 const unsigned char *replacement;
2257 const char *repl_enc;
2258 const char *ins_enc;
2261 if (ec->replacement_str)
2268 tr = tc->transcoder;
2269 rb_enc_find(
tr->dst_encoding);
2270 replacement = (
const unsigned char *)get_replacement_character(ins_enc, &
len, &repl_enc);
2273 replacement = (
unsigned char *)
"?";
2278 ec->replacement_str = replacement;
2279 ec->replacement_len =
len;
2280 ec->replacement_bufsize =
len;
2281 ec->replacement_enc = repl_enc;
2282 ec->replacement_allocated = 0;
2288 const unsigned char *str,
size_t len,
const char *encname)
2290 unsigned char *str2;
2291 size_t len2, buf_size2;
2292 const char *encname2;
2296 if (!*encname2 || encoding_equal(encname, encname2)) {
2299 buf_size2 = len2 =
len;
2303 str2 = allocate_converted_string(encname, encname2, str,
len, NULL, 0, &len2, &buf_size2);
2308 if (ec->replacement_allocated) {
2309 SIZED_FREE_N((
char *)ec->replacement_str, ec->replacement_bufsize);
2311 ec->replacement_allocated = 1;
2312 ec->replacement_str = str2;
2313 ec->replacement_len = len2;
2314 ec->replacement_bufsize = buf_size2;
2315 ec->replacement_enc = encname2;
2324 if (make_replacement(ec) == -1)
2335#define hash_fallback rb_hash_aref
2362transcode_loop_fallback_try(
VALUE a)
2366 VALUE ret = args->fallback_func(args->fallback, args->rep);
2368 if (!UNDEF_P(ret) && !
NIL_P(ret)) {
2376transcode_loop(
const unsigned char **in_pos,
unsigned char **out_pos,
2377 const unsigned char *in_stop,
unsigned char *out_stop,
2379 unsigned char *(*resize_destination)(
VALUE,
size_t,
size_t),
2380 const char *src_encoding,
2381 const char *dst_encoding,
2388 unsigned char *out_start = *out_pos;
2399 fallback = rb_hash_aref(ecopts, sym_fallback);
2401 fallback_func = hash_fallback;
2404 fallback_func = proc_fallback;
2407 fallback_func = method_fallback;
2410 fallback_func = aref_fallback;
2413 last_tc = ec->last_tc;
2414 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2420 VALUE rep = rb_enc_str_new(
2421 (
const char *)ec->last_error.error_bytes_start,
2422 ec->last_error.error_bytes_len,
2423 rb_enc_find(ec->last_error.source_encoding));
2427 .fallback_func = fallback_func,
2428 .fallback = fallback,
2433 rep = rb_protect(transcode_loop_fallback_try, (
VALUE)&args, &state);
2439 if (!UNDEF_P(rep) && !
NIL_P(rep)) {
2441 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2442 if ((
int)ret == -1) {
2444 rb_raise(rb_eArgError,
"too big fallback string");
2453 exc = make_econv_exception(ec);
2459 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2469transcode_loop(
const unsigned char **in_pos,
unsigned char **out_pos,
2470 const unsigned char *in_stop,
unsigned char *out_stop,
2472 unsigned char *(*resize_destination)(
VALUE,
size_t,
size_t),
2473 const char *src_encoding,
2474 const char *dst_encoding,
2481 unsigned char *out_start = *out_pos;
2482 const unsigned char *ptr;
2490 last_tc = ec->last_tc;
2491 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2496 unsigned char input_byte;
2497 const unsigned char *p = &input_byte;
2500 if (ptr < in_stop) {
2511 if (&input_byte != p)
2512 ptr += p - &input_byte;
2517 exc = make_econv_exception(ec);
2523 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2544static unsigned char *
2545str_transcoding_resize(
VALUE destination,
size_t len,
size_t new_len)
2547 rb_str_resize(destination, new_len);
2548 return (
unsigned char *)RSTRING_PTR(destination);
2552econv_opts(
VALUE opt,
int ecflags)
2555 int newlineflag = 0;
2557 v = rb_hash_aref(opt, sym_invalid);
2560 else if (v==sym_replace) {
2564 rb_raise(rb_eArgError,
"unknown value for invalid character option");
2567 v = rb_hash_aref(opt, sym_undef);
2570 else if (v==sym_replace) {
2574 rb_raise(rb_eArgError,
"unknown value for undefined character option");
2577 v = rb_hash_aref(opt, sym_replace);
2582 v = rb_hash_aref(opt, sym_xml);
2587 else if (v==sym_attr) {
2591 rb_raise(rb_eArgError,
"unexpected value for xml option: %"PRIsVALUE,
rb_sym2str(v));
2594 rb_raise(rb_eArgError,
"unexpected value for xml option");
2598#ifdef ENABLE_ECONV_NEWLINE_OPTION
2599 v = rb_hash_aref(opt, sym_newline);
2602 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2603 if (v == sym_universal) {
2606 else if (v == sym_crlf) {
2609 else if (v == sym_cr) {
2612 else if (v == sym_lf) {
2616 rb_raise(rb_eArgError,
"unexpected value for newline option: %"PRIsVALUE,
2620 rb_raise(rb_eArgError,
"unexpected value for newline option");
2627 v = rb_hash_aref(opt, sym_universal_newline);
2630 newlineflag |= !
NIL_P(v);
2632 v = rb_hash_aref(opt, sym_crlf_newline);
2635 newlineflag |= !
NIL_P(v);
2637 v = rb_hash_aref(opt, sym_cr_newline);
2640 newlineflag |= !
NIL_P(v);
2642 v = rb_hash_aref(opt, sym_lf_newline);
2645 newlineflag |= !
NIL_P(v);
2647 switch (newlineflag) {
2649 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2650 ecflags |= setflags;
2654 rb_warning(
":newline option precedes other newline options");
2668 if (
NIL_P(opthash)) {
2672 ecflags = econv_opts(opthash, ecflags);
2674 v = rb_hash_aref(opthash, sym_replace);
2677 if (is_broken_string(v)) {
2679 rb_raise(rb_eArgError,
"replacement string is broken: %s as %s",
2681 rb_enc_name(rb_enc_get(v)));
2684 newhash = rb_hash_new();
2685 rb_hash_aset(newhash, sym_replace, v);
2688 v = rb_hash_aref(opthash, sym_fallback);
2690 VALUE h = rb_check_hash_type(v);
2695 newhash = rb_hash_new();
2696 rb_hash_aset(newhash, sym_fallback, v);
2700 if (!
NIL_P(newhash))
2701 rb_hash_freeze(newhash);
2719 if (
NIL_P(opthash)) {
2724 rb_bug(
"rb_econv_open_opts called with invalid opthash");
2725 replacement = rb_hash_aref(opthash, sym_replace);
2728 ec =
rb_econv_open(source_encoding, destination_encoding, ecflags);
2730 if (!
NIL_P(replacement)) {
2735 (
const unsigned char *)RSTRING_PTR(replacement),
2736 RSTRING_LEN(replacement),
2755 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2756 !(enc = rb_enc_from_index(encidx))) {
2762 n = rb_enc_name(enc);
2777 const char *sname, *dname;
2778 int sencidx, dencidx;
2780 dencidx = enc_arg(arg1, &dname, &denc);
2783 sencidx = rb_enc_get_index(str);
2784 senc = rb_enc_from_index(sencidx);
2785 sname = rb_enc_name(senc);
2788 sencidx = enc_arg(arg2, &sname, &senc);
2799str_transcode0(
int argc,
VALUE *argv,
VALUE *self,
int ecflags,
VALUE ecopts)
2805 unsigned char *buf, *bp, *sp;
2806 const unsigned char *fromp;
2808 const char *sname, *dname;
2810 int explicitly_invalid_replace = TRUE;
2815 arg1 = rb_enc_default_internal();
2817 if (!ecflags)
return -1;
2818 arg1 = rb_obj_encoding(str);
2821 explicitly_invalid_replace = FALSE;
2828 arg2 = argc<=1 ?
Qnil : argv[1];
2829 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2835 if (senc && senc == denc) {
2838 if (!
NIL_P(ecopts)) {
2839 rep = rb_hash_aref(ecopts, sym_replace);
2841 dest = rb_enc_str_scrub(senc, str, rep);
2842 if (
NIL_P(dest)) dest = str;
2846 return NIL_P(arg2) ? -1 : dencidx;
2848 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2849 if (is_ascii_string(str)) {
2853 if (encoding_equal(sname, dname)) {
2854 return NIL_P(arg2) ? -1 : dencidx;
2858 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2864 if (encoding_equal(sname, dname)) {
2870 fromp = sp = (
unsigned char *)RSTRING_PTR(str);
2871 slen = RSTRING_LEN(str);
2874 bp = (
unsigned char *)RSTRING_PTR(dest);
2876 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2877 if (fromp != sp+slen) {
2878 rb_raise(rb_eArgError,
"not fully converted, %"PRIdPTRDIFF
" bytes left", sp+slen-fromp);
2880 buf = (
unsigned char *)RSTRING_PTR(dest);
2886 dencidx = rb_define_dummy_encoding(dname);
2896str_transcode(
int argc,
VALUE *argv,
VALUE *self)
2902 argc =
rb_scan_args(argc, argv,
"02:", NULL, NULL, &opt);
2906 return str_transcode0(argc, argv, self, ecflags, ecopts);
2910str_encode_associate(
VALUE str,
int encidx)
2914 rb_enc_associate_index(str, encidx);
2917 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2938str_encode_bang(
int argc,
VALUE *argv,
VALUE str)
2943 rb_check_frozen(str);
2946 encidx = str_transcode(argc, argv, &newstr);
2948 if (encidx < 0)
return str;
2949 if (newstr == str) {
2950 rb_enc_associate_index(str, encidx);
2954 return str_encode_associate(str, encidx);
2972 int encidx = str_transcode(argc, argv, &newstr);
2973 return encoded_dup(newstr, str, encidx);
2982 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2983 return encoded_dup(newstr, str, encidx);
2987encoded_dup(
VALUE newstr,
VALUE str,
int encidx)
2990 if (newstr == str) {
2992 rb_enc_associate_index(newstr, encidx);
2998 return str_encode_associate(newstr, encidx);
3007econv_free(
void *ptr)
3014econv_memsize(
const void *ptr)
3021 {0, econv_free, econv_memsize,},
3026econv_s_allocate(
VALUE klass)
3032make_dummy_encoding(
const char *name)
3036 idx = rb_define_dummy_encoding(name);
3037 enc = rb_enc_from_index(idx);
3042make_encoding(
const char *name)
3045 enc = rb_enc_find(name);
3048 if (rb_enc_registered(name)) {
3052 enc = make_dummy_encoding(name);
3060make_encobj(
const char *name)
3062 return rb_enc_from_encoding(make_encoding(name));
3084econv_s_asciicompat_encoding(
VALUE klass,
VALUE arg)
3086 const char *arg_name, *result_name;
3090 enc_arg(&arg, &arg_name, &arg_enc);
3093 result_enc = make_encoding(result_name);
3094 enc = rb_enc_from_encoding(result_enc);
3100econv_args(
int argc,
VALUE *argv,
3102 const char **sname_p,
const char **dname_p,
3107 VALUE opt, flags_v, ecopts;
3109 const char *sname, *dname;
3113 argc =
rb_scan_args(argc, argv,
"21:", snamev_p, dnamev_p, &flags_v, &opt);
3115 if (!
NIL_P(flags_v)) {
3117 rb_error_arity(argc + 1, 2, 3);
3122 else if (!
NIL_P(opt)) {
3131 sidx = rb_to_encoding_index(*snamev_p);
3133 senc = rb_enc_from_index(sidx);
3140 didx = rb_to_encoding_index(*dnamev_p);
3142 denc = rb_enc_from_index(didx);
3155 *ecflags_p = ecflags;
3160decorate_convpath(
VALUE convpath,
int ecflags)
3163 const char *decorators[MAX_ECFLAGS_DECORATORS];
3167 num_decorators = decorator_names(ecflags, decorators);
3168 if (num_decorators == -1)
3175 const char *sname = rb_enc_name(rb_to_encoding(
RARRAY_AREF(pair, 0)));
3176 const char *dname = rb_enc_name(rb_to_encoding(
RARRAY_AREF(pair, 1)));
3179 entry = get_transcoder_entry(sname, dname);
3180 tr = load_transcoder_entry(entry);
3183 if (!DECORATOR_P(
tr->src_encoding,
tr->dst_encoding) &&
3184 tr->asciicompat_type == asciicompat_encoder) {
3194 for (i = 0; i < num_decorators; i++)
3201search_convpath_i(
const char *sname,
const char *dname,
int depth,
void *arg)
3206 if (
NIL_P(*ary_p)) {
3210 if (DECORATOR_P(sname, dname)) {
3214 v =
rb_assoc_new(make_encobj(sname), make_encobj(dname));
3245econv_s_search_convpath(
int argc,
VALUE *argv,
VALUE klass)
3247 VALUE snamev, dnamev;
3248 const char *sname, *dname;
3254 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3257 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3259 if (
NIL_P(convpath)) {
3266 if (decorate_convpath(convpath, ecflags) == -1) {
3285 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3287 return RTEST(convpath);
3297rb_econv_init_by_convpath_i(
const char *sname,
const char *dname,
int depth,
void *arg)
3305 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3312rb_econv_init_by_convpath(
VALUE self,
VALUE convpath,
3313 const char **sname_p,
const char **dname_p,
3321 const char *sname, *dname;
3327 VALUE snamev, dnamev;
3332 rb_raise(rb_eArgError,
"not a 2-element array in convpath");
3334 enc_arg(&snamev, &sname, &senc);
3336 enc_arg(&dnamev, &dname, &denc);
3342 if (DECORATOR_P(sname, dname)) {
3343 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3345 VALUE msg = rb_sprintf(
"decoration failed: %s", dname);
3352 int j = ec->num_trans;
3355 arg.index = ec->num_trans;
3357 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3358 if (ret == -1 || arg.ret == -1) {
3359 VALUE msg = rb_sprintf(
"adding conversion failed: %s to %s", sname, dname);
3367 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3370 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3381 ec->source_encoding_name = *sname_p;
3382 ec->destination_encoding_name = *dname_p;
3502 VALUE snamev, dnamev;
3503 const char *sname, *dname;
3514 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3519 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3530 if (!DECORATOR_P(sname, dname)) {
3532 senc = make_dummy_encoding(sname);
3534 denc = make_dummy_encoding(dname);
3539 ec->source_encoding = senc;
3540 ec->destination_encoding = denc;
3558econv_inspect(
VALUE self)
3565 return rb_sprintf(
"#<%s: uninitialized>", cname);
3567 const char *sname = ec->source_encoding_name;
3568 const char *dname = ec->destination_encoding_name;
3570 str = rb_sprintf(
"#<%s: ", cname);
3571 econv_description(sname, dname, ec->flags, str);
3578check_econv(
VALUE self)
3584 rb_raise(
rb_eTypeError,
"uninitialized encoding converter");
3594 return rb_enc_from_encoding(encoding);
3604econv_source_encoding(
VALUE self)
3607 return econv_get_encoding(ec->source_encoding);
3617econv_destination_encoding(
VALUE self)
3620 return econv_get_encoding(ec->destination_encoding);
3646econv_convpath(
VALUE self)
3653 for (i = 0; i < ec->num_trans; i++) {
3656 if (DECORATOR_P(
tr->src_encoding,
tr->dst_encoding))
3659 v =
rb_assoc_new(make_encobj(
tr->src_encoding), make_encobj(
tr->dst_encoding));
3676 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3681 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3682 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3684 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3685 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3687 if (ec1->flags != ec2->flags)
return Qfalse;
3688 if (ec1->replacement_enc != ec2->replacement_enc &&
3689 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3691 if (ec1->replacement_len != ec2->replacement_len)
return Qfalse;
3692 if (ec1->replacement_str != ec2->replacement_str &&
3693 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3696 if (ec1->num_trans != ec2->num_trans)
return Qfalse;
3697 for (i = 0; i < ec1->num_trans; i++) {
3698 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3814econv_primitive_convert(
int argc,
VALUE *argv,
VALUE self)
3816 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3819 const unsigned char *ip, *is;
3820 unsigned char *op, *os;
3821 long output_byteoffset, output_bytesize;
3822 unsigned long output_byteend;
3825 argc =
rb_scan_args(argc, argv,
"23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3827 if (
NIL_P(output_byteoffset_v))
3828 output_byteoffset = 0;
3830 output_byteoffset =
NUM2LONG(output_byteoffset_v);
3832 if (
NIL_P(output_bytesize_v))
3833 output_bytesize = 0;
3835 output_bytesize =
NUM2LONG(output_bytesize_v);
3837 if (!
NIL_P(flags_v)) {
3839 rb_error_arity(argc + 1, 2, 5);
3843 else if (!
NIL_P(opt)) {
3846 v = rb_hash_aref(opt, sym_partial_input);
3849 v = rb_hash_aref(opt, sym_after_output);
3860 rb_str_modify(output);
3862 if (
NIL_P(output_bytesize_v)) {
3865 if (!
NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3866 output_bytesize = RSTRING_LEN(input);
3871 if (
NIL_P(output_byteoffset_v))
3872 output_byteoffset = RSTRING_LEN(output);
3874 if (output_byteoffset < 0)
3875 rb_raise(rb_eArgError,
"negative output_byteoffset");
3877 if (RSTRING_LEN(output) < output_byteoffset)
3878 rb_raise(rb_eArgError,
"output_byteoffset too big");
3880 if (output_bytesize < 0)
3881 rb_raise(rb_eArgError,
"negative output_bytesize");
3883 output_byteend = (
unsigned long)output_byteoffset +
3884 (
unsigned long)output_bytesize;
3886 if (output_byteend < (
unsigned long)output_byteoffset ||
3887 LONG_MAX < output_byteend)
3888 rb_raise(rb_eArgError,
"output_byteoffset+output_bytesize too big");
3891 rb_str_resize(output, output_byteend);
3897 ip = (
const unsigned char *)RSTRING_PTR(input);
3898 is = ip + RSTRING_LEN(input);
3901 op = (
unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3902 os = op + output_bytesize;
3906 if (!
NIL_P(input)) {
3911 if (LONG_MAX / 2 < output_bytesize)
3912 rb_raise(rb_eArgError,
"too long conversion result");
3913 output_bytesize *= 2;
3914 output_byteoffset_v =
Qnil;
3918 if (ec->destination_encoding) {
3919 rb_enc_associate(output, ec->destination_encoding);
3922 return econv_result_to_symbol(res);
3960econv_convert(
VALUE self,
VALUE source_string)
3978 ret = econv_primitive_convert(ac, av, self);
3980 if (ret == sym_invalid_byte_sequence ||
3981 ret == sym_undefined_conversion ||
3982 ret == sym_incomplete_input) {
3983 VALUE exc = make_econv_exception(ec);
3987 if (ret == sym_finished) {
3988 rb_raise(rb_eArgError,
"converter already finished");
3991 if (ret != sym_source_buffer_empty) {
3992 rb_bug(
"unexpected result of econv_primitive_convert");
4010econv_finish(
VALUE self)
4026 ret = econv_primitive_convert(ac, av, self);
4028 if (ret == sym_invalid_byte_sequence ||
4029 ret == sym_undefined_conversion ||
4030 ret == sym_incomplete_input) {
4031 VALUE exc = make_econv_exception(ec);
4035 if (ret != sym_finished) {
4036 rb_bug(
"unexpected result of econv_primitive_convert");
4118econv_primitive_errinfo(
VALUE self)
4126 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4129 if (ec->last_error.source_encoding)
4132 if (ec->last_error.destination_encoding)
4135 if (ec->last_error.error_bytes_start) {
4136 rb_ary_store(ary, 3,
rb_str_new((
const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4137 rb_ary_store(ary, 4,
rb_str_new((
const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4178 const char *insert_enc;
4186 string =
rb_str_encode(
string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0,
Qnil);
4188 ret =
rb_econv_insert_output(ec, (
const unsigned char *)RSTRING_PTR(
string), RSTRING_LEN(
string), insert_enc);
4190 rb_raise(rb_eArgError,
"too big string");
4221econv_putback(
int argc,
VALUE *argv,
VALUE self)
4234 if (putbackable < n)
4241 if (ec->source_encoding) {
4242 rb_enc_associate(str, ec->source_encoding);
4269econv_last_error(
VALUE self)
4274 exc = make_econv_exception(ec);
4293econv_get_replacement(
VALUE self)
4299 ret = make_replacement(ec);
4301 rb_raise(rb_eUndefinedConversionError,
"replacement character setup failed");
4304 enc = rb_enc_find(ec->replacement_enc);
4305 return rb_enc_str_new((
const char *)ec->replacement_str, (
long)ec->replacement_len, enc);
4327 enc = rb_enc_get(
string);
4330 (
const unsigned char *)RSTRING_PTR(
string),
4331 RSTRING_LEN(
string),
4336 rb_raise(rb_eUndefinedConversionError,
"replacement character setup failed");
4345 return make_econv_exception(ec);
4353 exc = make_econv_exception(ec);
4366ecerr_source_encoding_name(
VALUE self)
4368 return rb_attr_get(self, id_source_encoding_name);
4392ecerr_source_encoding(
VALUE self)
4394 return rb_attr_get(self, id_source_encoding);
4404ecerr_destination_encoding_name(
VALUE self)
4406 return rb_attr_get(self, id_destination_encoding_name);
4416ecerr_destination_encoding(
VALUE self)
4418 return rb_attr_get(self, id_destination_encoding);
4437ecerr_error_char(
VALUE self)
4439 return rb_attr_get(self, id_error_char);
4458ecerr_error_bytes(
VALUE self)
4460 return rb_attr_get(self, id_error_bytes);
4470ecerr_readagain_bytes(
VALUE self)
4472 return rb_attr_get(self, id_readagain_bytes);
4500ecerr_incomplete_input(
VALUE self)
4502 return rb_attr_get(self, id_incomplete_input);
4530 transcoder_table = st_init_strcasetable();
4533 id_destination_encoding_name =
rb_intern_const(
"destination_encoding_name");
4559 sym_lf_newline =
ID2SYM(rb_intern(
"lf_newline"));
4562#ifdef ENABLE_ECONV_NEWLINE_OPTION
4574InitVM_transcode(
void)
4590 rb_define_method(rb_cEncodingConverter,
"source_encoding", econv_source_encoding, 0);
4591 rb_define_method(rb_cEncodingConverter,
"destination_encoding", econv_destination_encoding, 0);
4592 rb_define_method(rb_cEncodingConverter,
"primitive_convert", econv_primitive_convert, -1);
4595 rb_define_method(rb_cEncodingConverter,
"primitive_errinfo", econv_primitive_errinfo, 0);
4596 rb_define_method(rb_cEncodingConverter,
"insert_output", econv_insert_output, 1);
4598 rb_define_method(rb_cEncodingConverter,
"last_error", econv_last_error, 0);
4599 rb_define_method(rb_cEncodingConverter,
"replacement", econv_get_replacement, 0);
4600 rb_define_method(rb_cEncodingConverter,
"replacement=", econv_set_replacement, 1);
4678 rb_define_method(rb_eUndefinedConversionError,
"source_encoding_name", ecerr_source_encoding_name, 0);
4679 rb_define_method(rb_eUndefinedConversionError,
"destination_encoding_name", ecerr_destination_encoding_name, 0);
4680 rb_define_method(rb_eUndefinedConversionError,
"source_encoding", ecerr_source_encoding, 0);
4681 rb_define_method(rb_eUndefinedConversionError,
"destination_encoding", ecerr_destination_encoding, 0);
4682 rb_define_method(rb_eUndefinedConversionError,
"error_char", ecerr_error_char, 0);
4684 rb_define_method(rb_eInvalidByteSequenceError,
"source_encoding_name", ecerr_source_encoding_name, 0);
4685 rb_define_method(rb_eInvalidByteSequenceError,
"destination_encoding_name", ecerr_destination_encoding_name, 0);
4686 rb_define_method(rb_eInvalidByteSequenceError,
"source_encoding", ecerr_source_encoding, 0);
4687 rb_define_method(rb_eInvalidByteSequenceError,
"destination_encoding", ecerr_destination_encoding, 0);
4688 rb_define_method(rb_eInvalidByteSequenceError,
"error_bytes", ecerr_error_bytes, 0);
4689 rb_define_method(rb_eInvalidByteSequenceError,
"readagain_bytes", ecerr_readagain_bytes, 0);
4690 rb_define_method(rb_eInvalidByteSequenceError,
"incomplete_input?", ecerr_incomplete_input, 0);
ruby_coderange_type
What rb_enc_str_coderange() returns.
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
#define rb_str_new2
Old name of rb_str_new_cstr.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
#define ALLOC
Old name of RB_ALLOC.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
#define ID2SYM
Old name of RB_ID2SYM.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
#define rb_ary_new4
Old name of rb_ary_new_from_values.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
#define xmalloc
Old name of ruby_xmalloc.
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
#define T_HASH
Old name of RUBY_T_HASH.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define rb_exc_new3
Old name of rb_exc_new_str.
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
#define NUM2INT
Old name of RB_NUM2INT.
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
#define INT2NUM
Old name of RB_INT2NUM.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
#define T_ARRAY
Old name of RUBY_T_ARRAY.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
VALUE rb_eTypeError
TypeError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby string instead of a C string.
VALUE rb_eEncodingError
EncodingError exception.
void rb_warning(const char *fmt,...)
Issues a warning.
VALUE rb_cObject
Object class.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_cEncoding
Encoding class.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries the encoding name best suited for rb_econv_insert_output()'s last parameter.
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_incomplete_input
The conversion stopped in the middle of reading a character, possibly due to a partial read of a socket e...
@ econv_finished
The conversion stopped after converting everything.
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be represented in the ou...
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
@ econv_source_buffer_empty
The conversion stopped because there is no input.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the two passed encodings.
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes a Ruby string instead of a C pointer.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C pointer according to the passed converter, then appends the conversion result ...
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
VALUE rb_ary_new(void)
Allocates a new, empty array.
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that adds only one element.
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
int off
Offset inside of ptr.
int len
Length of the buffer.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
#define RARRAY_LEN
Just another name of rb_array_len.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in the return type.
#define RARRAY_AREF(a, i)
#define DATA_PTR(obj)
Convenient getter macro.
#define StringValue(v)
Ensures that the parameter object is a String.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C stri...
#define RUBY_TYPED_FREE_IMMEDIATELY
Macros to see if each corresponding flag is defined.
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
#define InitVM(ext)
This macro is for internal use.
#define RTEST
This is an old name of RB_TEST.
This is the struct that holds necessary info for a struct.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.