Ruby  3.4.0dev (2024-12-06 revision 892c46283a5ea4179500d951c9d4866c0051f27b)
transcode.c (892c46283a5ea4179500d951c9d4866c0051f27b)
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author$
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "ruby/internal/config.h"
13 
14 #include <ctype.h>
15 
16 #include "internal.h"
17 #include "internal/array.h"
18 #include "internal/inits.h"
19 #include "internal/object.h"
20 #include "internal/string.h"
21 #include "internal/transcode.h"
22 #include "ruby/encoding.h"
23 
24 #include "transcode_data.h"
25 #include "id.h"
26 
27 #define ENABLE_ECONV_NEWLINE_OPTION 1
28 
29 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
30 static VALUE rb_eUndefinedConversionError;
31 static VALUE rb_eInvalidByteSequenceError;
32 static VALUE rb_eConverterNotFoundError;
33 
34 VALUE rb_cEncodingConverter;
35 
36 static ID id_destination_encoding;
37 static ID id_destination_encoding_name;
38 static ID id_error_bytes;
39 static ID id_error_char;
40 static ID id_incomplete_input;
41 static ID id_readagain_bytes;
42 static ID id_source_encoding;
43 static ID id_source_encoding_name;
44 
45 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
46 static VALUE sym_xml, sym_text, sym_attr;
47 static VALUE sym_universal_newline;
48 static VALUE sym_crlf_newline;
49 static VALUE sym_cr_newline;
50 static VALUE sym_lf_newline;
51 #ifdef ENABLE_ECONV_NEWLINE_OPTION
52 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
53 #endif
54 static VALUE sym_partial_input;
55 
56 static VALUE sym_invalid_byte_sequence;
57 static VALUE sym_undefined_conversion;
58 static VALUE sym_destination_buffer_full;
59 static VALUE sym_source_buffer_empty;
60 static VALUE sym_finished;
61 static VALUE sym_after_output;
62 static VALUE sym_incomplete_input;
63 
64 static unsigned char *
65 allocate_converted_string(const char *sname, const char *dname,
66  const unsigned char *str, size_t len,
67  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
68  size_t *dst_len_ptr);
69 
70 /* dynamic structure, one per conversion (similar to iconv_t) */
71 /* may carry conversion state (e.g. for iso-2022-jp) */
72 typedef struct rb_transcoding {
73  const rb_transcoder *transcoder;
74 
75  int flags;
76 
77  int resume_position;
78  unsigned int next_table;
79  VALUE next_info;
80  unsigned char next_byte;
81  unsigned int output_index;
82 
83  ssize_t recognized_len; /* already interpreted */
84  ssize_t readagain_len; /* not yet interpreted */
85  union {
86  unsigned char ary[8]; /* max_input <= sizeof(ary) */
87  unsigned char *ptr; /* length: max_input */
88  } readbuf; /* recognized_len + readagain_len used */
89 
90  ssize_t writebuf_off;
91  ssize_t writebuf_len;
92  union {
93  unsigned char ary[8]; /* max_output <= sizeof(ary) */
94  unsigned char *ptr; /* length: max_output */
95  } writebuf;
96 
97  union rb_transcoding_state_t { /* opaque data for stateful encoding */
98  void *ptr;
99  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
100  double dummy_for_alignment;
101  } state;
103 #define TRANSCODING_READBUF(tc) \
104  ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
105  (tc)->readbuf.ary : \
106  (tc)->readbuf.ptr)
107 #define TRANSCODING_WRITEBUF(tc) \
108  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
109  (tc)->writebuf.ary : \
110  (tc)->writebuf.ptr)
111 #define TRANSCODING_WRITEBUF_SIZE(tc) \
112  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
113  sizeof((tc)->writebuf.ary) : \
114  (size_t)(tc)->transcoder->max_output)
115 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
116 #define TRANSCODING_STATE(tc) \
117  ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
118  (tc)->state.ary : \
119  (tc)->state.ptr)
120 
121 typedef struct {
122  struct rb_transcoding *tc;
123  unsigned char *out_buf_start;
124  unsigned char *out_data_start;
125  unsigned char *out_data_end;
126  unsigned char *out_buf_end;
127  rb_econv_result_t last_result;
129 
/*
 * A converter: an ordered array of transcoding stages (elems) together with
 * its option flags, replacement string, input buffer pointers and the
 * details of the most recent error.
 */
130 struct rb_econv_t {
131  int flags;
132  int started; /* bool */
133 
134  const char *source_encoding_name;
135  const char *destination_encoding_name;
136 
/* replacement bytes, their length and the name of their encoding;
 * replacement_allocated (below) is a bool companion flag for replacement_str
 * -- NOTE(review): presumably marks it as heap-owned, confirm at the free site */
137  const unsigned char *replacement_str;
138  size_t replacement_len;
139  const char *replacement_enc;
140 
/* input buffer: allocation bounds (in_buf_*) and pending data (in_data_*) */
141  unsigned char *in_buf_start;
142  unsigned char *in_data_start;
143  unsigned char *in_data_end;
144  unsigned char *in_buf_end;
/* elems has room for num_allocated stages, of which num_trans are in use */
145  rb_econv_elem_t *elems;
146  int replacement_allocated; /* bool */
147  int num_allocated;
148  int num_trans;
149  int num_finished;
/* last stage whose transcoder is not a decorator (set when a stage is added) */
150  struct rb_transcoding *last_tc;
151 
152  /* last error */
153  struct {
154  rb_econv_result_t result;
155  struct rb_transcoding *error_tc;
156  const char *source_encoding;
157  const char *destination_encoding;
158  const unsigned char *error_bytes_start;
159  size_t error_bytes_len;
160  size_t readagain_len;
161  } last_error;
162 
163  /* The following fields are only for Encoding::Converter.
164  * rb_econv_open set them NULL. */
165  rb_encoding *source_encoding;
166  rb_encoding *destination_encoding;
167 };
168 
169 /*
170  * Dispatch data and logic
171  */
172 
173 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
174 
175 typedef struct {
176  const char *sname;
177  const char *dname;
178  const char *lib; /* null means no need to load a library */
179  const rb_transcoder *transcoder;
181 
182 static st_table *transcoder_table;
183 
184 static int
185 free_inner_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
186 {
187  xfree((void *)val);
188  return ST_DELETE;
189 }
190 
191 static int
192 free_transcode_i(st_data_t key, st_data_t val, st_data_t arg)
193 {
194  st_foreach((void *)val, free_inner_transcode_i, 0);
195  st_free_table((void *)val);
196  return ST_DELETE;
197 }
198 
199 void
200 rb_free_transcoder_table(void)
201 {
202  st_foreach(transcoder_table, free_transcode_i, 0);
203  st_free_table(transcoder_table);
204 }
205 
206 static transcoder_entry_t *
207 make_transcoder_entry(const char *sname, const char *dname)
208 {
209  st_data_t val;
210  st_table *table2;
211 
212  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
213  val = (st_data_t)st_init_strcasetable();
214  st_add_direct(transcoder_table, (st_data_t)sname, val);
215  }
216  table2 = (st_table *)val;
217  if (!st_lookup(table2, (st_data_t)dname, &val)) {
219  entry->sname = sname;
220  entry->dname = dname;
221  entry->lib = NULL;
222  entry->transcoder = NULL;
223  val = (st_data_t)entry;
224  st_add_direct(table2, (st_data_t)dname, val);
225  }
226  return (transcoder_entry_t *)val;
227 }
228 
229 static transcoder_entry_t *
230 get_transcoder_entry(const char *sname, const char *dname)
231 {
232  st_data_t val;
233  st_table *table2;
234 
235  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
236  return NULL;
237  }
238  table2 = (st_table *)val;
239  if (!st_lookup(table2, (st_data_t)dname, &val)) {
240  return NULL;
241  }
242  return (transcoder_entry_t *)val;
243 }
244 
245 void
246 rb_register_transcoder(const rb_transcoder *tr)
247 {
248  const char *const sname = tr->src_encoding;
249  const char *const dname = tr->dst_encoding;
250 
251  transcoder_entry_t *entry;
252 
253  entry = make_transcoder_entry(sname, dname);
254  if (entry->transcoder) {
255  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
256  sname, dname);
257  }
258 
259  entry->transcoder = tr;
260 }
261 
262 static void
263 declare_transcoder(const char *sname, const char *dname, const char *lib)
264 {
265  transcoder_entry_t *entry;
266 
267  entry = make_transcoder_entry(sname, dname);
268  entry->lib = lib;
269 }
270 
271 static const char transcoder_lib_prefix[] = "enc/trans/";
272 
273 void
274 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
275 {
276  if (!lib) {
277  rb_raise(rb_eArgError, "invalid library name - (null)");
278  }
279  declare_transcoder(enc1, enc2, lib);
280 }
281 
282 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
283 
/* Singly linked FIFO node used by the conversion-path search: one encoding
 * name that still has to be expanded. */
typedef struct search_path_queue_tag {
    struct search_path_queue_tag *next;
    const char *enc;
} search_path_queue_t;
288 
289 typedef struct {
290  st_table *visited;
291  search_path_queue_t *queue;
292  search_path_queue_t **queue_last_ptr;
293  const char *base_enc;
295 
296 static int
297 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
298 {
299  const char *dname = (const char *)key;
300  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
302 
303  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
304  return ST_CONTINUE;
305  }
306 
308  q->enc = dname;
309  q->next = NULL;
310  *bfs->queue_last_ptr = q;
311  bfs->queue_last_ptr = &q->next;
312 
313  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
314  return ST_CONTINUE;
315 }
316 
317 static int
318 transcode_search_path(const char *sname, const char *dname,
319  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
320  void *arg)
321 {
322  search_path_bfs_t bfs;
324  st_data_t val;
325  st_table *table2;
326  int found;
327  int pathlen = -1;
328 
329  if (encoding_equal(sname, dname))
330  return -1;
331 
333  q->enc = sname;
334  q->next = NULL;
335  bfs.queue_last_ptr = &q->next;
336  bfs.queue = q;
337 
338  bfs.visited = st_init_strcasetable();
339  st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
340 
341  while (bfs.queue) {
342  q = bfs.queue;
343  bfs.queue = q->next;
344  if (!bfs.queue)
345  bfs.queue_last_ptr = &bfs.queue;
346 
347  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
348  xfree(q);
349  continue;
350  }
351  table2 = (st_table *)val;
352 
353  if (st_lookup(table2, (st_data_t)dname, &val)) {
354  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
355  xfree(q);
356  found = 1;
357  goto cleanup;
358  }
359 
360  bfs.base_enc = q->enc;
361  st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
362  bfs.base_enc = NULL;
363 
364  xfree(q);
365  }
366  found = 0;
367 
368  cleanup:
369  while (bfs.queue) {
370  q = bfs.queue;
371  bfs.queue = q->next;
372  xfree(q);
373  }
374 
375  if (found) {
376  const char *enc = dname;
377  int depth;
378  pathlen = 0;
379  while (1) {
380  st_lookup(bfs.visited, (st_data_t)enc, &val);
381  if (!val)
382  break;
383  pathlen++;
384  enc = (const char *)val;
385  }
386  depth = pathlen;
387  enc = dname;
388  while (1) {
389  st_lookup(bfs.visited, (st_data_t)enc, &val);
390  if (!val)
391  break;
392  callback((const char *)val, enc, --depth, arg);
393  enc = (const char *)val;
394  }
395  }
396 
397  st_free_table(bfs.visited);
398 
399  return pathlen; /* is -1 if not found */
400 }
401 
402 int rb_require_internal_silent(VALUE fname);
403 
404 static const rb_transcoder *
405 load_transcoder_entry(transcoder_entry_t *entry)
406 {
407  if (entry->transcoder)
408  return entry->transcoder;
409 
410  if (entry->lib) {
411  const char *const lib = entry->lib;
412  const size_t len = strlen(lib);
413  const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
414  const VALUE fn = rb_str_new(0, total_len);
415  char *const path = RSTRING_PTR(fn);
416 
417  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
418  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
419  rb_str_set_len(fn, total_len);
420  OBJ_FREEZE(fn);
421  rb_require_internal_silent(fn);
422  }
423 
424  if (entry->transcoder)
425  return entry->transcoder;
426 
427  return NULL;
428 }
429 
/*
 * Default replacement for unconvertible input.  For a UTF-8 destination it
 * is the three bytes EF BF BD in UTF-8; for anything else it is "?" in
 * US-ASCII.  The byte length and the encoding name of the replacement are
 * stored through len_ret and repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
444 
445 /*
446  * Transcoding engine logic
447  */
448 
449 static const unsigned char *
450 transcode_char_start(rb_transcoding *tc,
451  const unsigned char *in_start,
452  const unsigned char *inchar_start,
453  const unsigned char *in_p,
454  size_t *char_len_ptr)
455 {
456  const unsigned char *ptr;
457  if (inchar_start - in_start < tc->recognized_len) {
458  MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
459  inchar_start, unsigned char, in_p - inchar_start);
460  ptr = TRANSCODING_READBUF(tc);
461  }
462  else {
463  ptr = inchar_start - tc->recognized_len;
464  }
465  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
466  return ptr;
467 }
468 
/*
 * Core conversion loop, written so that it can stop and later continue.
 *
 * Reads bytes from *in_pos up to in_stop, walks the transcoder's lookup
 * tables and writes the result from *out_pos up to out_stop.  Whenever it
 * has to stop (output full, input exhausted, error, finished, ...) it
 * stores a resume number in tc->resume_position, updates *in_pos and
 * *out_pos and returns the reason.  The next call jumps straight back to
 * the matching resume_label via the switch near the top, so all values
 * that must survive a suspension live in *tc rather than in locals.
 */
469 static rb_econv_result_t
470 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
471  const unsigned char *in_stop, unsigned char *out_stop,
472  rb_transcoding *tc,
473  const int opt)
474 {
475  const rb_transcoder *tr = tc->transcoder;
476  int unitlen = tr->input_unit_length;
477  ssize_t readagain_len = 0;
478 
479  const unsigned char *inchar_start;
480  const unsigned char *in_p;
481 
482  unsigned char *out_p;
483 
484  in_p = inchar_start = *in_pos;
485 
486  out_p = *out_pos;
487 
/* SUSPEND(ret, num): remember resume point `num`, append the bytes consumed
 * for the current character to the read buffer, publish the input/output
 * positions, account for any read-again bytes, and return `ret`.  The label
 * resume_label<num> defined at the end is where the next call continues. */
488 #define SUSPEND(ret, num) \
489  do { \
490  tc->resume_position = (num); \
491  if (0 < in_p - inchar_start) \
492  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
493  inchar_start, unsigned char, in_p - inchar_start); \
494  *in_pos = in_p; \
495  *out_pos = out_p; \
496  tc->recognized_len += in_p - inchar_start; \
497  if (readagain_len) { \
498  tc->recognized_len -= readagain_len; \
499  tc->readagain_len = readagain_len; \
500  } \
501  return (ret); \
502  resume_label ## num:; \
503  } while (0)
/* SUSPEND_OBUF(num): suspend with econv_destination_buffer_full for as long
 * as there is no room for at least one output byte. */
504 #define SUSPEND_OBUF(num) \
505  do { \
506  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
507  } while (0)
508 
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and this
 * call has produced output, suspend with econv_after_output. */
509 #define SUSPEND_AFTER_OUTPUT(num) \
510  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
511  SUSPEND(econv_after_output, num); \
512  }
513 
/* shorthands for the state fields that persist across suspensions */
514 #define next_table (tc->next_table)
515 #define next_info (tc->next_info)
516 #define next_byte (tc->next_byte)
517 #define writebuf_len (tc->writebuf_len)
518 #define writebuf_off (tc->writebuf_off)
519 
/* re-enter at the point where the previous call suspended (0 = fresh start) */
520  switch (tc->resume_position) {
521  case 0: break;
522  case 1: goto resume_label1;
523  case 2: goto resume_label2;
524  case 3: goto resume_label3;
525  case 4: goto resume_label4;
526  case 5: goto resume_label5;
527  case 6: goto resume_label6;
528  case 7: goto resume_label7;
529  case 8: goto resume_label8;
530  case 9: goto resume_label9;
531  case 10: goto resume_label10;
532  case 11: goto resume_label11;
533  case 12: goto resume_label12;
534  case 13: goto resume_label13;
535  case 14: goto resume_label14;
536  case 15: goto resume_label15;
537  case 16: goto resume_label16;
538  case 17: goto resume_label17;
539  case 18: goto resume_label18;
540  case 19: goto resume_label19;
541  case 20: goto resume_label20;
542  case 21: goto resume_label21;
543  case 22: goto resume_label22;
544  case 23: goto resume_label23;
545  case 24: goto resume_label24;
546  case 25: goto resume_label25;
547  case 26: goto resume_label26;
548  case 27: goto resume_label27;
549  case 28: goto resume_label28;
550  case 29: goto resume_label29;
551  case 30: goto resume_label30;
552  case 31: goto resume_label31;
553  case 32: goto resume_label32;
554  case 33: goto resume_label33;
555  case 34: goto resume_label34;
556  }
557 
/* one iteration per input character: restart at the root lookup table */
558  while (1) {
559  inchar_start = in_p;
560  tc->recognized_len = 0;
561  next_table = tr->conv_tree_start;
562 
563  SUSPEND_AFTER_OUTPUT(24);
564 
/* input exhausted: leave the loop unless more input may follow */
565  if (in_stop <= in_p) {
566  if (!(opt & ECONV_PARTIAL_INPUT))
567  break;
568  SUSPEND(econv_source_buffer_empty, 7);
569  continue;
570  }
571 
/* accessors into the transcoder's byte/word arrays for the table selected
 * by next_table: BL_BASE starts with the min and max accepted byte followed
 * by per-byte offsets, and BL_INFO holds the action for each offset */
572 #define BYTE_ADDR(index) (tr->byte_array + (index))
573 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
574 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
575 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
576 #define BL_MIN_BYTE (BL_BASE[0])
577 #define BL_MAX_BYTE (BL_BASE[1])
578 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
579 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
580 
581  next_byte = (unsigned char)*in_p++;
582  follow_byte:
/* a byte outside the table's [min, max] range is invalid */
583  if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
584  next_info = INVALID;
585  else {
586  next_info = (VALUE)BL_ACTION(next_byte);
587  }
588  follow_info:
/* the low five bits of next_info select the action */
589  switch (next_info & 0x1F) {
590  case NOMAP:
/* copy the input character to the output unchanged, via the write buffer */
591  {
592  const unsigned char *p = inchar_start;
593  writebuf_off = 0;
594  while (p < in_p) {
595  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
596  }
597  writebuf_len = writebuf_off;
598  writebuf_off = 0;
599  while (writebuf_off < writebuf_len) {
600  SUSPEND_OBUF(3);
601  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
602  }
603  }
604  continue;
/* next_info names a further lookup table: fetch one more byte and descend */
605  case 0x00: case 0x04: case 0x08: case 0x0C:
606  case 0x10: case 0x14: case 0x18: case 0x1C:
607  SUSPEND_AFTER_OUTPUT(25);
608  while (in_p >= in_stop) {
609  if (!(opt & ECONV_PARTIAL_INPUT))
610  goto incomplete;
611  SUSPEND(econv_source_buffer_empty, 5);
612  }
613  next_byte = (unsigned char)*in_p++;
614  next_table = (unsigned int)next_info;
615  goto follow_byte;
616  case ZERObt: /* drop input */
617  continue;
/* fixed-length outputs: the bytes are extracted from next_info itself, with
 * a separate resume point before every byte */
618  case ONEbt:
619  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
620  continue;
621  case TWObt:
622  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
623  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
624  continue;
625  case THREEbt:
626  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
627  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
628  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
629  continue;
630  case FOURbt:
631  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
632  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
633  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
634  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
635  continue;
636  case GB4bt:
637  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
638  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
639  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
640  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
641  continue;
/* output a length-prefixed string stored in the byte array; output_index
 * lives in tc so the copy can resume mid-string */
642  case STR1:
643  tc->output_index = 0;
644  while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
645  SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
646  tc->output_index++;
647  }
648  continue;
/* FUNii / FUNsi: a transcoder callback computes a new info value (from the
 * old info or from the input character), which is then dispatched again */
649  case FUNii:
650  next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
651  goto follow_info;
652  case FUNsi:
653  {
654  const unsigned char *char_start;
655  size_t char_len;
656  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
657  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
658  goto follow_info;
659  }
/* FUNio / FUNso / FUNsio: a transcoder callback writes the output.  It
 * writes straight into the caller's buffer when max_output bytes fit there;
 * otherwise it fills the write buffer, which is then drained byte by byte. */
660  case FUNio:
661  SUSPEND_OBUF(13);
662  if (tr->max_output <= out_stop - out_p)
663  out_p += tr->func_io(TRANSCODING_STATE(tc),
664  next_info, out_p, out_stop - out_p);
665  else {
666  writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
667  next_info,
668  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
669  writebuf_off = 0;
670  while (writebuf_off < writebuf_len) {
671  SUSPEND_OBUF(20);
672  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
673  }
674  }
675  break;
676  case FUNso:
677  {
678  const unsigned char *char_start;
679  size_t char_len;
680  SUSPEND_OBUF(14);
681  if (tr->max_output <= out_stop - out_p) {
682  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
683  out_p += tr->func_so(TRANSCODING_STATE(tc),
684  char_start, (size_t)char_len,
685  out_p, out_stop - out_p);
686  }
687  else {
688  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
689  writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
690  char_start, (size_t)char_len,
691  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
692  writebuf_off = 0;
693  while (writebuf_off < writebuf_len) {
694  SUSPEND_OBUF(22);
695  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
696  }
697  }
698  break;
699  }
700  case FUNsio:
701  {
702  const unsigned char *char_start;
703  size_t char_len;
704  SUSPEND_OBUF(33);
705  if (tr->max_output <= out_stop - out_p) {
706  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
707  out_p += tr->func_sio(TRANSCODING_STATE(tc),
708  char_start, (size_t)char_len, next_info,
709  out_p, out_stop - out_p);
710  }
711  else {
712  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
713  writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
714  char_start, (size_t)char_len, next_info,
715  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
716  writebuf_off = 0;
717  while (writebuf_off < writebuf_len) {
718  SUSPEND_OBUF(34);
719  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
720  }
721  }
722  break;
723  }
/* invalid sequence.  If at most one input unit has been seen, in_p is
 * moved to the end of that unit (or to in_stop when less input is
 * available), waiting for more input first when partial input is allowed.
 * Otherwise the whole units before the offending byte are kept as the
 * error and the remaining bytes are marked to be read again. */
724  case INVALID:
725  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
726  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
727  SUSPEND_AFTER_OUTPUT(26);
728  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
729  in_p = in_stop;
730  SUSPEND(econv_source_buffer_empty, 8);
731  }
732  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
733  in_p = in_stop;
734  }
735  else {
736  in_p = inchar_start + (unitlen - tc->recognized_len);
737  }
738  }
739  else {
740  ssize_t invalid_len; /* including the last byte which causes invalid */
741  ssize_t discard_len;
742  invalid_len = tc->recognized_len + (in_p - inchar_start);
743  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
744  readagain_len = invalid_len - discard_len;
745  }
746  goto invalid;
747  case UNDEF:
748  goto undef;
749  default:
750  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
751  }
752  continue;
753 
/* error exits: report to the caller; a later call resumes with the next
 * character */
754  invalid:
755  SUSPEND(econv_invalid_byte_sequence, 1);
756  continue;
757 
758  incomplete:
759  SUSPEND(econv_incomplete_input, 27);
760  continue;
761 
762  undef:
763  SUSPEND(econv_undefined_conversion, 2);
764  continue;
765  }
766 
767  /* cleanup */
/* let the transcoder emit trailing output, directly or through the write
 * buffer, using the same rule as the FUN*o cases above */
768  if (tr->finish_func) {
769  SUSPEND_OBUF(4);
770  if (tr->max_output <= out_stop - out_p) {
771  out_p += tr->finish_func(TRANSCODING_STATE(tc),
772  out_p, out_stop - out_p);
773  }
774  else {
775  writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
776  TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
777  writebuf_off = 0;
778  while (writebuf_off < writebuf_len) {
779  SUSPEND_OBUF(23);
780  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
781  }
782  }
783  }
/* once finished, every further call reports econv_finished again */
784  while (1)
785  SUSPEND(econv_finished, 6);
786 #undef SUSPEND
787 #undef next_table
788 #undef next_info
789 #undef next_byte
790 #undef writebuf_len
791 #undef writebuf_off
792 }
793 
794 static rb_econv_result_t
795 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
796  const unsigned char *in_stop, unsigned char *out_stop,
797  rb_transcoding *tc,
798  const int opt)
799 {
800  if (tc->readagain_len) {
801  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
802  const unsigned char *readagain_pos = readagain_buf;
803  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
804  rb_econv_result_t res;
805 
806  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
807  unsigned char, tc->readagain_len);
808  tc->readagain_len = 0;
809  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
810  if (res != econv_source_buffer_empty) {
811  MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
812  readagain_pos, unsigned char, readagain_stop - readagain_pos);
813  tc->readagain_len += readagain_stop - readagain_pos;
814  return res;
815  }
816  }
817  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
818 }
819 
820 static rb_transcoding *
821 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
822 {
823  rb_transcoding *tc;
824 
825  tc = ALLOC(rb_transcoding);
826  tc->transcoder = tr;
827  tc->flags = flags;
828  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
829  tc->state.ptr = xmalloc(tr->state_size);
830  if (tr->state_init_func) {
831  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
832  }
833  tc->resume_position = 0;
834  tc->recognized_len = 0;
835  tc->readagain_len = 0;
836  tc->writebuf_len = 0;
837  tc->writebuf_off = 0;
838  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
839  tc->readbuf.ptr = xmalloc(tr->max_input);
840  }
841  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
842  tc->writebuf.ptr = xmalloc(tr->max_output);
843  }
844  return tc;
845 }
846 
847 static rb_econv_result_t
848 rb_transcoding_convert(rb_transcoding *tc,
849  const unsigned char **input_ptr, const unsigned char *input_stop,
850  unsigned char **output_ptr, unsigned char *output_stop,
851  int flags)
852 {
853  return transcode_restartable(
854  input_ptr, output_ptr,
855  input_stop, output_stop,
856  tc, flags);
857 }
858 
859 static void
860 rb_transcoding_close(rb_transcoding *tc)
861 {
862  const rb_transcoder *tr = tc->transcoder;
863  if (tr->state_fini_func) {
864  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
865  }
866  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
867  xfree(tc->state.ptr);
868  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
869  xfree(tc->readbuf.ptr);
870  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
871  xfree(tc->writebuf.ptr);
872  xfree(tc);
873 }
874 
875 static size_t
876 rb_transcoding_memsize(rb_transcoding *tc)
877 {
878  size_t size = sizeof(rb_transcoding);
879  const rb_transcoder *tr = tc->transcoder;
880 
881  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
882  size += tr->state_size;
883  }
884  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
885  size += tr->max_input;
886  }
887  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
888  size += tr->max_output;
889  }
890  return size;
891 }
892 
893 static rb_econv_t *
894 rb_econv_alloc(int n_hint)
895 {
896  rb_econv_t *ec;
897 
898  if (n_hint <= 0)
899  n_hint = 1;
900 
901  ec = ALLOC(rb_econv_t);
902  ec->flags = 0;
903  ec->source_encoding_name = NULL;
904  ec->destination_encoding_name = NULL;
905  ec->started = 0;
906  ec->replacement_str = NULL;
907  ec->replacement_len = 0;
908  ec->replacement_enc = NULL;
909  ec->replacement_allocated = 0;
910  ec->in_buf_start = NULL;
911  ec->in_data_start = NULL;
912  ec->in_data_end = NULL;
913  ec->in_buf_end = NULL;
914  ec->num_allocated = n_hint;
915  ec->num_trans = 0;
916  ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
917  ec->num_finished = 0;
918  ec->last_tc = NULL;
919  ec->last_error.result = econv_source_buffer_empty;
920  ec->last_error.error_tc = NULL;
921  ec->last_error.source_encoding = NULL;
922  ec->last_error.destination_encoding = NULL;
923  ec->last_error.error_bytes_start = NULL;
924  ec->last_error.error_bytes_len = 0;
925  ec->last_error.readagain_len = 0;
926  ec->source_encoding = NULL;
927  ec->destination_encoding = NULL;
928  return ec;
929 }
930 
931 static int
932 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
933 {
934  int n, j;
935  int bufsize = 4096;
936  unsigned char *p;
937 
938  if (ec->num_trans == ec->num_allocated) {
939  n = ec->num_allocated * 2;
940  REALLOC_N(ec->elems, rb_econv_elem_t, n);
941  ec->num_allocated = n;
942  }
943 
944  p = xmalloc(bufsize);
945 
946  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
947 
948  ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
949  ec->elems[i].out_buf_start = p;
950  ec->elems[i].out_buf_end = p + bufsize;
951  ec->elems[i].out_data_start = p;
952  ec->elems[i].out_data_end = p;
953  ec->elems[i].last_result = econv_source_buffer_empty;
954 
955  ec->num_trans++;
956 
957  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
958  for (j = ec->num_trans-1; i <= j; j--) {
959  rb_transcoding *tc = ec->elems[j].tc;
960  const rb_transcoder *tr2 = tc->transcoder;
961  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
962  ec->last_tc = tc;
963  break;
964  }
965  }
966 
967  return 0;
968 }
969 
970 static rb_econv_t *
971 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
972 {
973  rb_econv_t *ec;
974  int i, ret;
975 
976  for (i = 0; i < n; i++) {
977  const rb_transcoder *tr;
978  tr = load_transcoder_entry(entries[i]);
979  if (!tr)
980  return NULL;
981  }
982 
983  ec = rb_econv_alloc(n);
984 
985  for (i = 0; i < n; i++) {
986  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
987  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
988  if (ret == -1) {
989  rb_econv_close(ec);
990  return NULL;
991  }
992  }
993 
994  return ec;
995 }
996 
997 struct trans_open_t {
998  transcoder_entry_t **entries;
999  int num_additional;
1000 };
1001 
1002 static void
1003 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
1004 {
1005  struct trans_open_t *toarg = arg;
1006 
1007  if (!toarg->entries) {
1008  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
1009  }
1010  toarg->entries[depth] = get_transcoder_entry(sname, dname);
1011 }
1012 
1013 static rb_econv_t *
1014 rb_econv_open0(const char *sname, const char *dname, int ecflags)
1015 {
1016  transcoder_entry_t **entries = NULL;
1017  int num_trans;
1018  rb_econv_t *ec;
1019 
1020  /* Just check if sname and dname are defined */
1021  /* (This check is needed?) */
1022  if (*sname) rb_enc_find_index(sname);
1023  if (*dname) rb_enc_find_index(dname);
1024 
1025  if (*sname == '\0' && *dname == '\0') {
1026  num_trans = 0;
1027  entries = NULL;
1028  sname = dname = "";
1029  }
1030  else {
1031  struct trans_open_t toarg;
1032  toarg.entries = NULL;
1033  toarg.num_additional = 0;
1034  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1035  entries = toarg.entries;
1036  if (num_trans < 0) {
1037  xfree(entries);
1038  return NULL;
1039  }
1040  }
1041 
1042  ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1043  xfree(entries);
1044  if (!ec)
1045  return NULL;
1046 
1047  ec->flags = ecflags;
1048  ec->source_encoding_name = sname;
1049  ec->destination_encoding_name = dname;
1050 
1051  return ec;
1052 }
1053 
1054 #define MAX_ECFLAGS_DECORATORS 32
1055 
1056 static int
1057 decorator_names(int ecflags, const char **decorators_ret)
1058 {
1059  int num_decorators;
1060 
1061  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1066  case 0:
1067  break;
1068  default:
1069  return -1;
1070  }
1071 
1072  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1074  return -1;
1075 
1076  num_decorators = 0;
1077 
1078  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1079  decorators_ret[num_decorators++] = "xml_text_escape";
1080  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1081  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1082  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1083  decorators_ret[num_decorators++] = "xml_attr_quote";
1084 
1085  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1086  decorators_ret[num_decorators++] = "crlf_newline";
1087  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1088  decorators_ret[num_decorators++] = "cr_newline";
1089  if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1090  decorators_ret[num_decorators++] = "lf_newline";
1091  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1092  decorators_ret[num_decorators++] = "universal_newline";
1093 
1094  return num_decorators;
1095 }
1096 
1097 rb_econv_t *
1098 rb_econv_open(const char *sname, const char *dname, int ecflags)
1099 {
1100  rb_econv_t *ec;
1101  int num_decorators;
1102  const char *decorators[MAX_ECFLAGS_DECORATORS];
1103  int i;
1104 
1105  num_decorators = decorator_names(ecflags, decorators);
1106  if (num_decorators == -1)
1107  return NULL;
1108 
1109  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1110  if (!ec)
1111  return NULL;
1112 
1113  for (i = 0; i < num_decorators; i++)
1114  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1115  rb_econv_close(ec);
1116  return NULL;
1117  }
1118 
1119  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1120 
1121  return ec;
1122 }
1123 
/*
 * Run the transcoder chain of `ec` from element `start` to the last element,
 * repeating the pass for as long as any element consumed input or produced
 * output.
 *
 * Element 0 reads from the caller's input; every later element reads from
 * the output buffer of its predecessor.  The last element writes to the
 * caller's output; every other element writes to its own buffer.
 *
 * Returns the index of the element whose result must be reported to the
 * caller, or -1 when a whole pass made no progress.
 *
 * NOTE(review): listing lines 1186-1188 and 1192-1193 are absent from this
 * copy, so only part of the result switch is visible.
 */
1124 static int
1125 trans_sweep(rb_econv_t *ec,
1126  const unsigned char **input_ptr, const unsigned char *input_stop,
1127  unsigned char **output_ptr, unsigned char *output_stop,
1128  int flags,
1129  int start)
1130 {
1131  int try;
1132  int i, f;
1133 
1134  const unsigned char **ipp, *is, *iold;
1135  unsigned char **opp, *os, *oold;
1136  rb_econv_result_t res;
1137 
1138  try = 1;
1139  while (try) {
1140  try = 0;
1141  for (i = start; i < ec->num_trans; i++) {
1142  rb_econv_elem_t *te = &ec->elems[i];
1143 
/* Select the input side for this element. */
1144  if (i == 0) {
1145  ipp = input_ptr;
1146  is = input_stop;
1147  }
1148  else {
1149  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1150  ipp = (const unsigned char **)&prev_te->out_data_start;
1151  is = prev_te->out_data_end;
1152  }
1153 
/* Select the output side for this element. */
1154  if (i == ec->num_trans-1) {
1155  opp = output_ptr;
1156  os = output_stop;
1157  }
1158  else {
/* Slide pending data to the front of the intermediate buffer so the free
 * space at its end is as large as possible. */
1159  if (te->out_buf_start != te->out_data_start) {
1160  ssize_t len = te->out_data_end - te->out_data_start;
1161  ssize_t off = te->out_data_start - te->out_buf_start;
1162  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1163  te->out_data_start = te->out_buf_start;
1164  te->out_data_end -= off;
1165  }
1166  opp = &te->out_data_end;
1167  os = te->out_buf_end;
1168  }
1169 
/* Per-element flags: an element is told its input is partial unless all of
 * its predecessors have finished (num_finished == i). */
1170  f = flags;
1171  if (ec->num_finished != i)
1172  f |= ECONV_PARTIAL_INPUT;
/* ECONV_AFTER_OUTPUT is honoured once, and only by element 0; subsequent
 * passes skip element 0 and run without the flag. */
1173  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1174  start = 1;
1175  flags &= ~ECONV_AFTER_OUTPUT;
1176  }
1177  if (i != 0)
1178  f &= ~ECONV_AFTER_OUTPUT;
1179  iold = *ipp;
1180  oold = *opp;
1181  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
/* Any movement of either cursor counts as progress and schedules another
 * pass. */
1182  if (iold != *ipp || oold != *opp)
1183  try = 1;
1184 
1185  switch (res) {
1189  case econv_after_output:
1190  return i;
1191 
1194  break;
1195 
1196  case econv_finished:
1197  ec->num_finished = i+1;
1198  break;
1199  }
1200  }
1201  }
1202  return -1;
1203 }
1204 
/*
 * Drive the whole transcoder chain once and report the most downstream
 * pending result.
 *
 * A NULL `input_ptr` or `output_ptr` is replaced by a zero-length local
 * buffer.  When `result_position_ptr` is non-NULL it receives the index of
 * the element whose result is returned, or -1 when every element reported
 * econv_source_buffer_empty.
 *
 * NOTE(review): listing lines 1234-1236, 1242-1243, 1258 and 1291 are absent
 * from this copy (some case labels, the flags argument of the recursive
 * call, and the final return statement).
 */
1205 static rb_econv_result_t
1206 rb_trans_conv(rb_econv_t *ec,
1207  const unsigned char **input_ptr, const unsigned char *input_stop,
1208  unsigned char **output_ptr, unsigned char *output_stop,
1209  int flags,
1210  int *result_position_ptr)
1211 {
1212  int i;
1213  int needreport_index;
1214  int sweep_start;
1215 
1216  unsigned char empty_buf;
1217  unsigned char *empty_ptr = &empty_buf;
1218 
1219  if (!input_ptr) {
1220  input_ptr = (const unsigned char **)&empty_ptr;
1221  input_stop = empty_ptr;
1222  }
1223 
1224  if (!output_ptr) {
1225  output_ptr = &empty_ptr;
1226  output_stop = empty_ptr;
1227  }
1228 
/* A previous after_output stop on the first element is treated as "wants
 * more input" from here on. */
1229  if (ec->elems[0].last_result == econv_after_output)
1230  ec->elems[0].last_result = econv_source_buffer_empty;
1231 
/* Look, from the last element backwards, for an element whose previous
 * result decides where the next sweep has to start. */
1232  for (i = ec->num_trans-1; 0 <= i; i--) {
1233  switch (ec->elems[i].last_result) {
1237  case econv_after_output:
1238  case econv_finished:
1239  sweep_start = i+1;
1240  goto found_needreport;
1241 
1244  break;
1245 
1246  default:
1247  rb_bug("unexpected transcode last result");
1248  }
1249  }
1250 
1251  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1252 
/* The last element is still blocked on output and the caller asked for
 * after-output semantics: first drain with no new input, and report
 * after_output if that drain ends up wanting input. */
1253  if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1254  (flags & ECONV_AFTER_OUTPUT)) {
1255  rb_econv_result_t res;
1256 
1257  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1259  result_position_ptr);
1260 
1261  if (res == econv_source_buffer_empty)
1262  return econv_after_output;
1263  return res;
1264  }
1265 
1266  sweep_start = 0;
1267 
1268  found_needreport:
1269 
/* Sweep until nothing needs reporting or the last element itself stopped. */
1270  do {
1271  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1272  sweep_start = needreport_index + 1;
1273  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1274 
/* Report the most downstream element that did not simply run out of input.
 * Error and after_output results are consumed by resetting the element to
 * source_buffer_empty. */
1275  for (i = ec->num_trans-1; 0 <= i; i--) {
1276  if (ec->elems[i].last_result != econv_source_buffer_empty) {
1277  rb_econv_result_t res = ec->elems[i].last_result;
1278  if (res == econv_invalid_byte_sequence ||
1279  res == econv_incomplete_input ||
1280  res == econv_undefined_conversion ||
1281  res == econv_after_output) {
1282  ec->elems[i].last_result = econv_source_buffer_empty;
1283  }
1284  if (result_position_ptr)
1285  *result_position_ptr = i;
1286  return res;
1287  }
1288  }
1289  if (result_position_ptr)
1290  *result_position_ptr = -1;
1292 }
1293 
/*
 * One conversion step without error-recovery handling.
 *
 * Clears ec->last_error, converts as much as possible from the input range
 * to the output range, stores the result in ec->last_error.result and, for
 * the three error results, records the failing transcoder and its error
 * bytes in ec->last_error.
 *
 * NOTE(review): listing lines 1314, 1341, 1343 and 1359 are absent from this
 * copy; they presumably assign `res` on the paths that jump to `gotresult`
 * without a visible assignment -- confirm against the full source.
 */
1294 static rb_econv_result_t
1295 rb_econv_convert0(rb_econv_t *ec,
1296  const unsigned char **input_ptr, const unsigned char *input_stop,
1297  unsigned char **output_ptr, unsigned char *output_stop,
1298  int flags)
1299 {
1300  rb_econv_result_t res;
1301  int result_position;
1302  int has_output = 0;
1303 
1304  memset(&ec->last_error, 0, sizeof(ec->last_error));
1305 
/* No transcoders: plain byte copy.  Bytes parked in the converter's own
 * input buffer are emitted before any caller input. */
1306  if (ec->num_trans == 0) {
1307  size_t len;
1308  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1309  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1310  len = output_stop - *output_ptr;
1311  memcpy(*output_ptr, ec->in_data_start, len);
1312  *output_ptr = output_stop;
1313  ec->in_data_start += len;
1315  goto gotresult;
1316  }
1317  len = ec->in_data_end - ec->in_data_start;
1318  memcpy(*output_ptr, ec->in_data_start, len);
1319  *output_ptr += len;
1320  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1321  if (flags & ECONV_AFTER_OUTPUT) {
1322  res = econv_after_output;
1323  goto gotresult;
1324  }
1325  }
/* Copy min(output room, remaining input) bytes. */
1326  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1327  len = output_stop - *output_ptr;
1328  }
1329  else {
1330  len = input_stop - *input_ptr;
1331  }
/* With ECONV_AFTER_OUTPUT only a single byte is moved per call. */
1332  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1333  *(*output_ptr)++ = *(*input_ptr)++;
1334  res = econv_after_output;
1335  goto gotresult;
1336  }
1337  memcpy(*output_ptr, *input_ptr, len);
1338  *output_ptr += len;
1339  *input_ptr += len;
1340  if (*input_ptr != input_stop)
1342  else if (flags & ECONV_PARTIAL_INPUT)
1344  else
1345  res = econv_finished;
1346  goto gotresult;
1347  }
1348 
/* Flush data still parked in the last element's output buffer. */
1349  if (ec->elems[ec->num_trans-1].out_data_start) {
1350  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1351  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1352  if (data_start != data_end) {
1353  size_t len;
1354  if (output_stop - *output_ptr < data_end - data_start) {
1355  len = output_stop - *output_ptr;
1356  memcpy(*output_ptr, data_start, len);
1357  *output_ptr = output_stop;
1358  ec->elems[ec->num_trans-1].out_data_start += len;
1360  goto gotresult;
1361  }
1362  len = data_end - data_start;
1363  memcpy(*output_ptr, data_start, len);
1364  *output_ptr += len;
1365  ec->elems[ec->num_trans-1].out_data_start =
1366  ec->elems[ec->num_trans-1].out_data_end =
1367  ec->elems[ec->num_trans-1].out_buf_start;
1368  has_output = 1;
1369  }
1370  }
1371 
/* Bytes parked in the converter's input buffer are converted before the
 * caller's input, always as partial input and never with after-output. */
1372  if (ec->in_buf_start &&
1373  ec->in_data_start != ec->in_data_end) {
1374  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1375  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1376  if (res != econv_source_buffer_empty)
1377  goto gotresult;
1378  }
1379 
/* Output was already produced above and the caller wants to stop after
 * output: run the chain with an empty input range so no new input is
 * consumed. */
1380  if (has_output &&
1381  (flags & ECONV_AFTER_OUTPUT) &&
1382  *input_ptr != input_stop) {
1383  input_stop = *input_ptr;
1384  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1385  if (res == econv_source_buffer_empty)
1386  res = econv_after_output;
1387  }
1388  else if ((flags & ECONV_AFTER_OUTPUT) ||
1389  ec->num_trans == 1) {
1390  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1391  }
1392  else {
/* Multi-element chain without a caller-requested after-output stop: step
 * internally with ECONV_AFTER_OUTPUT and loop until another result shows
 * up. */
1393  flags |= ECONV_AFTER_OUTPUT;
1394  do {
1395  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1396  } while (res == econv_after_output);
1397  }
1398 
1399  gotresult:
1400  ec->last_error.result = res;
/* For error results, remember which transcoder failed and where its error
 * bytes live. */
1401  if (res == econv_invalid_byte_sequence ||
1402  res == econv_incomplete_input ||
1403  res == econv_undefined_conversion) {
1404  rb_transcoding *error_tc = ec->elems[result_position].tc;
1405  ec->last_error.error_tc = error_tc;
1406  ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1407  ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1408  ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1409  ec->last_error.error_bytes_len = error_tc->recognized_len;
1410  ec->last_error.readagain_len = error_tc->readagain_len;
1411  }
1412 
1413  return res;
1414 }
1415 
1416 static int output_replacement_character(rb_econv_t *ec);
1417 
1418 static int
1419 output_hex_charref(rb_econv_t *ec)
1420 {
1421  int ret;
1422  unsigned char utfbuf[1024];
1423  const unsigned char *utf;
1424  size_t utf_len;
1425  int utf_allocated = 0;
1426  char charef_buf[16];
1427  const unsigned char *p;
1428 
1429  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1430  utf = ec->last_error.error_bytes_start;
1431  utf_len = ec->last_error.error_bytes_len;
1432  }
1433  else {
1434  utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1435  ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1436  utfbuf, sizeof(utfbuf),
1437  &utf_len);
1438  if (!utf)
1439  return -1;
1440  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1441  utf_allocated = 1;
1442  }
1443 
1444  if (utf_len % 4 != 0)
1445  goto fail;
1446 
1447  p = utf;
1448  while (4 <= utf_len) {
1449  unsigned int u = 0;
1450  u += p[0] << 24;
1451  u += p[1] << 16;
1452  u += p[2] << 8;
1453  u += p[3];
1454  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1455 
1456  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1457  if (ret == -1)
1458  goto fail;
1459 
1460  p += 4;
1461  utf_len -= 4;
1462  }
1463 
1464  if (utf_allocated)
1465  xfree((void *)utf);
1466  return 0;
1467 
1468  fail:
1469  if (utf_allocated)
1470  xfree((void *)utf);
1471  return -1;
1472 }
1473 
/*
 * NOTE(review): the return-type and name lines of this definition (listing
 * lines 1474-1475) are absent from this copy; judging by the call sites in
 * allocate_converted_string() this is presumably rb_econv_convert -- confirm.
 *
 * Converts input to output through rb_econv_convert0() and applies the
 * converter's configured error recovery: when the result is an error and
 * the matching replacement action succeeds (returns 0), conversion is
 * resumed; otherwise the error result is returned to the caller.
 *
 * A NULL `input_ptr` or `output_ptr` is replaced by a zero-length local
 * buffer.  Calling this marks the converter as started.
 */
1476  const unsigned char **input_ptr, const unsigned char *input_stop,
1477  unsigned char **output_ptr, unsigned char *output_stop,
1478  int flags)
1479 {
1480  rb_econv_result_t ret;
1481 
1482  unsigned char empty_buf;
1483  unsigned char *empty_ptr = &empty_buf;
1484 
1485  ec->started = 1;
1486 
1487  if (!input_ptr) {
1488  input_ptr = (const unsigned char **)&empty_ptr;
1489  input_stop = empty_ptr;
1490  }
1491 
1492  if (!output_ptr) {
1493  output_ptr = &empty_ptr;
1494  output_stop = empty_ptr;
1495  }
1496 
1497  resume:
1498  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1499 
1500  if (ret == econv_invalid_byte_sequence ||
1501  ret == econv_incomplete_input) {
1502  /* deal with invalid byte sequence */
1503  /* todo: add more alternative behaviors */
1504  switch (ec->flags & ECONV_INVALID_MASK) {
1505  case ECONV_INVALID_REPLACE:
1506  if (output_replacement_character(ec) == 0)
1507  goto resume;
1508  }
1509  }
1510 
1511  if (ret == econv_undefined_conversion) {
1512  /* valid character in source encoding
1513  * but no related character(s) in destination encoding */
1514  /* todo: add more alternative behaviors */
1515  switch (ec->flags & ECONV_UNDEF_MASK) {
1516  case ECONV_UNDEF_REPLACE:
1517  if (output_replacement_character(ec) == 0)
1518  goto resume;
1519  break;
1520 
/* NOTE(review): the case label for this branch (listing line 1521) is
 * absent from this copy. */
1522  if (output_hex_charref(ec) == 0)
1523  goto resume;
1524  break;
1525  }
1526  }
1527 
1528  return ret;
1529 }
1530 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1532) is absent from this copy; the body matches the use of
 * rb_econv_encoding_to_insert_output(ec) elsewhere in this file -- confirm.
 *
 * Returns the encoding name associated with ec->last_tc for inserted
 * output: the transcoder's source encoding when it is an
 * asciicompat_encoder, otherwise its destination encoding.  Returns ""
 * when ec->last_tc is NULL.
 */
1531 const char *
1533 {
1534  rb_transcoding *tc = ec->last_tc;
1535  const rb_transcoder *tr;
1536 
1537  if (tc == NULL)
1538  return "";
1539 
1540  tr = tc->transcoder;
1541 
1542  if (tr->asciicompat_type == asciicompat_encoder)
1543  return tr->src_encoding;
1544  return tr->dst_encoding;
1545 }
1546 
1547 static unsigned char *
1548 allocate_converted_string(const char *sname, const char *dname,
1549  const unsigned char *str, size_t len,
1550  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1551  size_t *dst_len_ptr)
1552 {
1553  unsigned char *dst_str;
1554  size_t dst_len;
1555  size_t dst_bufsize;
1556 
1557  rb_econv_t *ec;
1558  rb_econv_result_t res;
1559 
1560  const unsigned char *sp;
1561  unsigned char *dp;
1562 
1563  if (caller_dst_buf)
1564  dst_bufsize = caller_dst_bufsize;
1565  else if (len == 0)
1566  dst_bufsize = 1;
1567  else
1568  dst_bufsize = len;
1569 
1570  ec = rb_econv_open(sname, dname, 0);
1571  if (ec == NULL)
1572  return NULL;
1573  if (caller_dst_buf)
1574  dst_str = caller_dst_buf;
1575  else
1576  dst_str = xmalloc(dst_bufsize);
1577  dst_len = 0;
1578  sp = str;
1579  dp = dst_str+dst_len;
1580  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1581  dst_len = dp - dst_str;
1582  while (res == econv_destination_buffer_full) {
1583  if (SIZE_MAX/2 < dst_bufsize) {
1584  goto fail;
1585  }
1586  dst_bufsize *= 2;
1587  if (dst_str == caller_dst_buf) {
1588  unsigned char *tmp;
1589  tmp = xmalloc(dst_bufsize);
1590  memcpy(tmp, dst_str, dst_bufsize/2);
1591  dst_str = tmp;
1592  }
1593  else {
1594  dst_str = xrealloc(dst_str, dst_bufsize);
1595  }
1596  dp = dst_str+dst_len;
1597  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1598  dst_len = dp - dst_str;
1599  }
1600  if (res != econv_finished) {
1601  goto fail;
1602  }
1603  rb_econv_close(ec);
1604  *dst_len_ptr = dst_len;
1605  return dst_str;
1606 
1607  fail:
1608  if (dst_str != caller_dst_buf)
1609  xfree(dst_str);
1610  rb_econv_close(ec);
1611  return NULL;
1612 }
1613 
/*
 * NOTE(review): the line carrying the function name and first parameter
 * (listing line 1616) is absent from this copy; the body matches the calls
 * to rb_econv_insert_output(ec, ...) elsewhere in this file -- confirm.
 *
 * Queue `len` bytes of `str` (encoded in `str_encoding`) so that they appear
 * in the converter's output.  The string is first converted to the encoding
 * reported by rb_econv_encoding_to_insert_output() when the two differ.
 * Marks the converter as started.
 */
1614 /* result: 0:success -1:failure */
1615 int
1617  const unsigned char *str, size_t len, const char *str_encoding)
1618 {
1619  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1620  unsigned char insert_buf[4096];
1621  const unsigned char *insert_str = NULL;
1622  size_t insert_len;
1623 
1624  int last_trans_index;
1625  rb_transcoding *tc;
1626 
1627  unsigned char **buf_start_p;
1628  unsigned char **data_start_p;
1629  unsigned char **data_end_p;
1630  unsigned char **buf_end_p;
1631 
1632  size_t need;
1633 
1634  ec->started = 1;
1635 
1636  if (len == 0)
1637  return 0;
1638 
/* Obtain the bytes to insert in the insertion encoding; insert_str may end
 * up pointing at `str`, at insert_buf, or at a heap buffer. */
1639  if (encoding_equal(insert_encoding, str_encoding)) {
1640  insert_str = str;
1641  insert_len = len;
1642  }
1643  else {
1644  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1645  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1646  if (insert_str == NULL)
1647  return -1;
1648  }
1649 
1650  need = insert_len;
1651 
/* Choose the buffer that receives the bytes:
 *  - no transcoders: the converter's input buffer;
 *  - last transcoder is an asciicompat_encoder: the buffer feeding that
 *    transcoder (the converter's input buffer or the previous element's
 *    output buffer), with room for its read-again bytes as well;
 *  - otherwise: the last element's output buffer. */
1652  last_trans_index = ec->num_trans-1;
1653  if (ec->num_trans == 0) {
1654  tc = NULL;
1655  buf_start_p = &ec->in_buf_start;
1656  data_start_p = &ec->in_data_start;
1657  data_end_p = &ec->in_data_end;
1658  buf_end_p = &ec->in_buf_end;
1659  }
1660  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1661  tc = ec->elems[last_trans_index].tc;
1662  need += tc->readagain_len;
/* Wrap-around check for the addition above. */
1663  if (need < insert_len)
1664  goto fail;
1665  if (last_trans_index == 0) {
1666  buf_start_p = &ec->in_buf_start;
1667  data_start_p = &ec->in_data_start;
1668  data_end_p = &ec->in_data_end;
1669  buf_end_p = &ec->in_buf_end;
1670  }
1671  else {
1672  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1673  buf_start_p = &ee->out_buf_start;
1674  data_start_p = &ee->out_data_start;
1675  data_end_p = &ee->out_data_end;
1676  buf_end_p = &ee->out_buf_end;
1677  }
1678  }
1679  else {
1680  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1681  buf_start_p = &ee->out_buf_start;
1682  data_start_p = &ee->out_data_start;
1683  data_end_p = &ee->out_data_end;
1684  buf_end_p = &ee->out_buf_end;
1685  tc = ec->elems[last_trans_index].tc;
1686  }
1687 
/* Make sure `need` bytes fit after the pending data: allocate a fresh
 * buffer, or compact the pending data to the front and, if that is still
 * not enough, grow the buffer. */
1688  if (*buf_start_p == NULL) {
1689  unsigned char *buf = xmalloc(need);
1690  *buf_start_p = buf;
1691  *data_start_p = buf;
1692  *data_end_p = buf;
1693  *buf_end_p = buf+need;
1694  }
1695  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1696  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1697  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1698  *data_start_p = *buf_start_p;
1699  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1700  unsigned char *buf;
1701  size_t s = (*data_end_p - *buf_start_p) + need;
1702  if (s < need)
1703  goto fail;
1704  buf = xrealloc(*buf_start_p, s);
1705  *data_start_p = buf;
1706  *data_end_p = buf + (*data_end_p - *buf_start_p);
1707  *buf_start_p = buf;
1708  *buf_end_p = buf + s;
1709  }
1710  }
1711 
1712  memcpy(*data_end_p, insert_str, insert_len);
1713  *data_end_p += insert_len;
/* For an asciicompat_encoder the transcoder's read-again bytes are moved
 * behind the inserted bytes and cleared from the transcoder. */
1714  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1715  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1716  *data_end_p += tc->readagain_len;
1717  tc->readagain_len = 0;
1718  }
1719 
/* Free insert_str only when it is a heap buffer from the conversion. */
1720  if (insert_str != str && insert_str != insert_buf)
1721  xfree((void*)insert_str);
1722  return 0;
1723 
1724  fail:
1725  if (insert_str != str && insert_str != insert_buf)
1726  xfree((void*)insert_str);
1727  return -1;
1728 }
1729 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1731) is absent from this copy; the body matches the calls
 * to rb_econv_close(ec) elsewhere in this file -- confirm.
 *
 * Release a converter: the replacement string (only when the converter
 * allocated it), every transcoding together with its output buffer, the
 * converter's input buffer, the element array and the converter itself.
 */
1730 void
1732 {
1733  int i;
1734 
1735  if (ec->replacement_allocated) {
1736  xfree((void *)ec->replacement_str);
1737  }
1738  for (i = 0; i < ec->num_trans; i++) {
1739  rb_transcoding_close(ec->elems[i].tc);
1740  xfree(ec->elems[i].out_buf_start);
1741  }
1742  xfree(ec->in_buf_start);
1743  xfree(ec->elems);
1744  xfree(ec);
1745 }
1746 
1747 size_t
1748 rb_econv_memsize(rb_econv_t *ec)
1749 {
1750  size_t size = sizeof(rb_econv_t);
1751  int i;
1752 
1753  if (ec->replacement_allocated) {
1754  size += ec->replacement_len;
1755  }
1756  for (i = 0; i < ec->num_trans; i++) {
1757  size += rb_transcoding_memsize(ec->elems[i].tc);
1758 
1759  if (ec->elems[i].out_buf_start) {
1760  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1761  }
1762  }
1763  size += ec->in_buf_end - ec->in_buf_start;
1764  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1765 
1766  return size;
1767 }
1768 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1770) is absent from this copy; presumably this is
 * rb_econv_putbackable(rb_econv_t *ec) -- confirm.
 *
 * Returns the read-again length of the first transcoder as an int, or 0
 * when the converter has no transcoders.  Where size_t is wider than int
 * the value is clamped to INT_MAX.
 */
1769 int
1771 {
1772  if (ec->num_trans == 0)
1773  return 0;
1774 #if SIZEOF_SIZE_T > SIZEOF_INT
1775  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1776 #endif
1777  return (int)ec->elems[0].tc->readagain_len;
1778 }
1779 
1780 void
1781 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1782 {
1783  rb_transcoding *tc;
1784  if (ec->num_trans == 0 || n == 0)
1785  return;
1786  tc = ec->elems[0].tc;
1787  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1788  tc->readagain_len -= n;
1789 }
1790 
/* NOTE(review): the struct header (listing line 1791) is absent from this
 * copy; these members match `struct asciicompat_encoding_t` as used by
 * asciicompat_encoding_i() and rb_econv_asciicompat_encoding(). */
/* Result slot: written by asciicompat_encoding_i() when a match is found. */
1792  const char *ascii_compat_name;
/* Lookup key supplied by rb_econv_asciicompat_encoding(). */
1793  const char *ascii_incompat_name;
1794 };
1795 
1796 static int
1797 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1798 {
1799  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1800  transcoder_entry_t *entry = (transcoder_entry_t *)val;
1801  const rb_transcoder *tr;
1802 
1803  if (DECORATOR_P(entry->sname, entry->dname))
1804  return ST_CONTINUE;
1805  tr = load_transcoder_entry(entry);
1806  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1807  data->ascii_compat_name = tr->dst_encoding;
1808  return ST_STOP;
1809  }
1810  return ST_CONTINUE;
1811 }
1812 
1813 const char *
1814 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1815 {
1816  st_data_t v;
1817  st_table *table2;
1818  struct asciicompat_encoding_t data;
1819 
1820  if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1821  return NULL;
1822  table2 = (st_table *)v;
1823 
1824  /*
1825  * Assumption:
1826  * There is at most one transcoder for
1827  * converting from ASCII incompatible encoding.
1828  *
1829  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1830  */
1831  if (table2->num_entries != 1)
1832  return NULL;
1833 
1834  data.ascii_incompat_name = ascii_incompat_name;
1835  data.ascii_compat_name = NULL;
1836  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1837  return data.ascii_compat_name;
1838 }
1839 
1840 /*
1841  * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1842  *
1843  * If the result of the conversion is not compatible with the encoding of
1844  * `dst`, `dst` may not be valid encoding.
1845  */
/*
 * Parameters: `ss`/`len` describe the source bytes; `dst` is the string to
 * append to, or nil to have a new string created; `flags` is passed to
 * rb_econv_convert().  Returns `dst` (or the newly created string).
 *
 * NOTE(review): listing line 1903 (just before the loop condition) is
 * absent from this copy.
 */
1846 VALUE
1847 rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1848 {
1849  unsigned const char *sp, *se;
1850  unsigned char *ds, *dp, *de;
1851  rb_econv_result_t res;
1852  int max_output;
1853  enum ruby_coderange_type coderange;
1854  rb_encoding *dst_enc = ec->destination_encoding;
1855 
/* A nil `dst` gets a fresh buffer tagged with the converter's destination
 * encoding (when it has one); an existing `dst` keeps its own encoding and
 * current code range. */
1856  if (NIL_P(dst)) {
1857  dst = rb_str_buf_new(len);
1858  if (dst_enc) {
1859  rb_enc_associate(dst, dst_enc);
1860  }
1861  coderange = ENC_CODERANGE_7BIT; // scan from the start
1862  }
1863  else {
1864  dst_enc = rb_enc_get(dst);
1865  coderange = rb_enc_str_coderange(dst);
1866  }
1867 
1868  if (ec->last_tc)
1869  max_output = ec->last_tc->transcoder->max_output;
1870  else
1871  max_output = 1;
1872 
/* Convert, enlarging `dst` and retrying for as long as the converter
 * reports a full destination buffer. */
1873  do {
1874  int cr;
1875  long dlen = RSTRING_LEN(dst);
/* Ensure room for the remaining input plus one maximal output unit. */
1876  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1877  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1878  if (LONG_MAX < new_capa)
1879  rb_raise(rb_eArgError, "too long string");
1880  rb_str_modify_expand(dst, new_capa - dlen);
1881  }
1882  sp = (const unsigned char *)ss;
1883  se = sp + len;
1884  ds = (unsigned char *)RSTRING_PTR(dst);
1885  de = ds + rb_str_capacity(dst);
/* From here on `ds` marks the start of the newly appended region. */
1886  dp = ds += dlen;
1887  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
/* Extend the code-range scan over the new bytes only while the range is
 * still 7bit or valid. */
1888  switch (coderange) {
1889  case ENC_CODERANGE_7BIT:
1890  case ENC_CODERANGE_VALID:
1891  cr = (int)coderange;
1892  rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1893  coderange = cr;
1894  ENC_CODERANGE_SET(dst, coderange);
1895  break;
1896  case ENC_CODERANGE_UNKNOWN:
1897  case ENC_CODERANGE_BROKEN:
1898  break;
1899  }
/* Advance past the consumed input and commit the produced output. */
1900  len -= (const char *)sp - ss;
1901  ss = (const char *)sp;
1902  rb_str_set_len(dst, dlen + (dp - ds));
1904  } while (res == econv_destination_buffer_full);
1905 
1906  return dst;
1907 }
1908 
1909 VALUE
1910 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1911 {
1912  src = rb_str_new_frozen(src);
1913  dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1914  RB_GC_GUARD(src);
1915  return dst;
1916 }
1917 
1918 VALUE
1919 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1920 {
1921  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1922 }
1923 
1924 VALUE
1925 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1926 {
1927  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1928 }
1929 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1931) is absent from this copy; presumably this is
 * rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags) -- confirm.
 *
 * Converts the whole of `src`, passing nil as the destination to
 * rb_econv_substr_append().
 */
1930 VALUE
1932 {
1933  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1934 }
1935 
1936 static int
1937 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1938 {
1939  transcoder_entry_t *entry;
1940  const rb_transcoder *tr;
1941 
1942  if (ec->started != 0)
1943  return -1;
1944 
1945  entry = get_transcoder_entry(sname, dname);
1946  if (!entry)
1947  return -1;
1948 
1949  tr = load_transcoder_entry(entry);
1950  if (!tr) return -1;
1951 
1952  return rb_econv_add_transcoder_at(ec, tr, n);
1953 }
1954 
1955 static int
1956 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1957 {
1958  return rb_econv_add_converter(ec, "", decorator_name, n);
1959 }
1960 
1961 int
1962 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1963 {
1964  const rb_transcoder *tr;
1965 
1966  if (ec->num_trans == 0)
1967  return rb_econv_decorate_at(ec, decorator_name, 0);
1968 
1969  tr = ec->elems[0].tc->transcoder;
1970 
1971  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1972  tr->asciicompat_type == asciicompat_decoder)
1973  return rb_econv_decorate_at(ec, decorator_name, 1);
1974 
1975  return rb_econv_decorate_at(ec, decorator_name, 0);
1976 }
1977 
1978 int
1979 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1980 {
1981  const rb_transcoder *tr;
1982 
1983  if (ec->num_trans == 0)
1984  return rb_econv_decorate_at(ec, decorator_name, 0);
1985 
1986  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1987 
1988  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1989  tr->asciicompat_type == asciicompat_encoder)
1990  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1991 
1992  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1993 }
1994 
/*
 * NOTE(review): the line carrying the function name and parameter list
 * (listing line 1996) and the four case labels (lines 2001, 2004, 2007,
 * 2010) are absent from this copy; presumably this is
 * rb_econv_binmode(rb_econv_t *ec) -- confirm.
 *
 * Removes the newline decorator selected by the converter's flags from the
 * transcoder chain and clears the newline-decorator bits in ec->flags.
 */
1995 void
1997 {
1998  const char *dname = 0;
1999 
/* Map the active newline flag to its decorator name. */
2000  switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
2002  dname = "universal_newline";
2003  break;
2005  dname = "crlf_newline";
2006  break;
2008  dname = "cr_newline";
2009  break;
2011  dname = "lf_newline";
2012  break;
2013  }
2014 
2015  if (dname) {
/* NOTE(review): the lookup result is dereferenced without a NULL check. */
2016  const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
2017  int num_trans = ec->num_trans;
2018  int i, j = 0;
2019 
/* Compact the element array in place: matching elements are closed and
 * dropped, the others are moved down to index j. */
2020  for (i=0; i < num_trans; i++) {
2021  if (transcoder == ec->elems[i].tc->transcoder) {
2022  rb_transcoding_close(ec->elems[i].tc);
2023  xfree(ec->elems[i].out_buf_start);
2024  ec->num_trans--;
2025  }
2026  else
2027  ec->elems[j++] = ec->elems[i];
2028  }
2029  }
2030 
2031  ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2032 }
2033 
/*
 * Append a human-readable description of a conversion to `mesg` and return
 * it; a new empty string is created when `mesg` is nil.
 *
 * The text is "<sname> to <dname>" (or just the non-empty one of the two),
 * followed, when decorator flags are set, by a comma-separated decorator
 * list introduced with " with " if an encoding part was written.  When
 * nothing at all was described, "no-conversion" is appended.
 *
 * NOTE(review): listing lines 2053-2055 (the rest of the decorator flag
 * mask) are absent from this copy.
 */
2034 static VALUE
2035 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2036 {
2037  int has_description = 0;
2038 
2039  if (NIL_P(mesg))
2040  mesg = rb_str_new(NULL, 0);
2041 
2042  if (*sname != '\0' || *dname != '\0') {
2043  if (*sname == '\0')
2044  rb_str_cat2(mesg, dname);
2045  else if (*dname == '\0')
2046  rb_str_cat2(mesg, sname);
2047  else
2048  rb_str_catf(mesg, "%s to %s", sname, dname);
2049  has_description = 1;
2050  }
2051 
2052  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
/* `pre` is empty before the first decorator and "," before every later
 * one. */
2056  const char *pre = "";
2057  if (has_description)
2058  rb_str_cat2(mesg, " with ");
2059  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2060  rb_str_cat2(mesg, pre); pre = ",";
2061  rb_str_cat2(mesg, "universal_newline");
2062  }
2063  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2064  rb_str_cat2(mesg, pre); pre = ",";
2065  rb_str_cat2(mesg, "crlf_newline");
2066  }
2067  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2068  rb_str_cat2(mesg, pre); pre = ",";
2069  rb_str_cat2(mesg, "cr_newline");
2070  }
2071  if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2072  rb_str_cat2(mesg, pre); pre = ",";
2073  rb_str_cat2(mesg, "lf_newline");
2074  }
2075  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2076  rb_str_cat2(mesg, pre); pre = ",";
2077  rb_str_cat2(mesg, "xml_text");
2078  }
2079  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2080  rb_str_cat2(mesg, pre); pre = ",";
2081  rb_str_cat2(mesg, "xml_attr_content");
2082  }
2083  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2084  rb_str_cat2(mesg, pre); pre = ",";
2085  rb_str_cat2(mesg, "xml_attr_quote");
2086  }
2087  has_description = 1;
2088  }
2089  if (!has_description) {
2090  rb_str_cat2(mesg, "no-conversion");
2091  }
2092 
2093  return mesg;
2094 }
2095 
2096 VALUE
2097 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2098 {
2099  VALUE mesg, exc;
2100  mesg = rb_str_new_cstr("code converter not found (");
2101  econv_description(sname, dname, ecflags, mesg);
2102  rb_str_cat2(mesg, ")");
2103  exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2104  return exc;
2105 }
2106 
/*
 * Build (without raising) the exception object that describes
 * ec->last_error.
 *
 * - econv_invalid_byte_sequence / econv_incomplete_input produce an
 *   InvalidByteSequenceError carrying the error bytes, the read-again bytes
 *   (nil when there are none or the input was incomplete) and an
 *   incomplete-input flag.
 * - econv_undefined_conversion produces an UndefinedConversionError
 *   carrying the offending character.
 * - Any other result yields Qnil.
 *
 * Both exception kinds also receive the source/destination encoding names
 * and, when the names resolve to known encodings, the encoding objects.
 */
2107 static VALUE
2108 make_econv_exception(rb_econv_t *ec)
2109 {
2110  VALUE mesg, exc;
2111  if (ec->last_error.result == econv_invalid_byte_sequence ||
2112  ec->last_error.result == econv_incomplete_input) {
2113  const char *err = (const char *)ec->last_error.error_bytes_start;
2114  size_t error_len = ec->last_error.error_bytes_len;
2115  VALUE bytes = rb_str_new(err, error_len);
2116  VALUE dumped = rb_str_dump(bytes);
2117  size_t readagain_len = ec->last_error.readagain_len;
2118  VALUE bytes2 = Qnil;
2119  VALUE dumped2;
/* Three message shapes: incomplete input, invalid bytes followed by
 * read-again bytes, and invalid bytes alone. */
2120  if (ec->last_error.result == econv_incomplete_input) {
2121  mesg = rb_sprintf("incomplete %s on %s",
2122  StringValueCStr(dumped),
2123  ec->last_error.source_encoding);
2124  }
2125  else if (readagain_len) {
/* The read-again bytes directly follow the error bytes. */
2126  bytes2 = rb_str_new(err+error_len, readagain_len);
2127  dumped2 = rb_str_dump(bytes2);
2128  mesg = rb_sprintf("%s followed by %s on %s",
2129  StringValueCStr(dumped),
2130  StringValueCStr(dumped2),
2131  ec->last_error.source_encoding);
2132  }
2133  else {
2134  mesg = rb_sprintf("%s on %s",
2135  StringValueCStr(dumped),
2136  ec->last_error.source_encoding);
2137  }
2138 
2139  exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2140  rb_ivar_set(exc, id_error_bytes, bytes);
2141  rb_ivar_set(exc, id_readagain_bytes, bytes2);
2142  rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2143  goto set_encs;
2144  }
2145  if (ec->last_error.result == econv_undefined_conversion) {
2146  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2147  ec->last_error.error_bytes_len);
2148  VALUE dumped = Qnil;
2149  int idx;
/* For UTF-8 sources, show the character as "U+XXXX" when the error bytes
 * are exactly one complete character. */
2150  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2151  rb_encoding *utf8 = rb_utf8_encoding();
2152  const char *start, *end;
2153  int n;
2154  start = (const char *)ec->last_error.error_bytes_start;
2155  end = start + ec->last_error.error_bytes_len;
2156  n = rb_enc_precise_mbclen(start, end, utf8);
2157  if (MBCLEN_CHARFOUND_P(n) &&
2158  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2159  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2160  dumped = rb_sprintf("U+%04X", cc);
2161  }
2162  }
/* Otherwise fall back to the dumped byte string. */
2163  if (NIL_P(dumped))
2164  dumped = rb_str_dump(bytes);
/* Short message when the failing step spans the converter's own source and
 * destination names; otherwise spell out the conversion path. */
2165  if (strcmp(ec->last_error.source_encoding,
2166  ec->source_encoding_name) == 0 &&
2167  strcmp(ec->last_error.destination_encoding,
2168  ec->destination_encoding_name) == 0) {
2169  mesg = rb_sprintf("%s from %s to %s",
2170  StringValueCStr(dumped),
2171  ec->last_error.source_encoding,
2172  ec->last_error.destination_encoding);
2173  }
2174  else {
2175  int i;
2176  mesg = rb_sprintf("%s to %s in conversion from %s",
2177  StringValueCStr(dumped),
2178  ec->last_error.destination_encoding,
2179  ec->source_encoding_name);
/* Append the destination of every non-decorator step of the chain. */
2180  for (i = 0; i < ec->num_trans; i++) {
2181  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2182  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2183  rb_str_catf(mesg, " to %s",
2184  ec->elems[i].tc->transcoder->dst_encoding);
2185  }
2186  }
2187  exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
/* Tag the error character with its source encoding when that encoding is
 * known. */
2188  idx = rb_enc_find_index(ec->last_error.source_encoding);
2189  if (0 <= idx)
2190  rb_enc_associate_index(bytes, idx);
2191  rb_ivar_set(exc, id_error_char, bytes);
2192  goto set_encs;
2193  }
2194  return Qnil;
2195 
/* Shared tail: attach encoding names and, when resolvable, encoding
 * objects. */
2196  set_encs:
2197  rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2198  rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2199  int idx = rb_enc_find_index(ec->last_error.source_encoding);
2200  if (0 <= idx)
2201  rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2202  idx = rb_enc_find_index(ec->last_error.destination_encoding);
2203  if (0 <= idx)
2204  rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2205  return exc;
2206 }
2207 
2208 static void
2209 more_output_buffer(
2210  VALUE destination,
2211  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2212  int max_output,
2213  unsigned char **out_start_ptr,
2214  unsigned char **out_pos,
2215  unsigned char **out_stop_ptr)
2216 {
2217  size_t len = (*out_pos - *out_start_ptr);
2218  size_t new_len = (len + max_output) * 2;
2219  *out_start_ptr = resize_destination(destination, len, new_len);
2220  *out_pos = *out_start_ptr + len;
2221  *out_stop_ptr = *out_start_ptr + new_len;
2222 }
2223 
2224 static int
2225 make_replacement(rb_econv_t *ec)
2226 {
2227  rb_transcoding *tc;
2228  const rb_transcoder *tr;
2229  const unsigned char *replacement;
2230  const char *repl_enc;
2231  const char *ins_enc;
2232  size_t len;
2233 
2234  if (ec->replacement_str)
2235  return 0;
2236 
2237  ins_enc = rb_econv_encoding_to_insert_output(ec);
2238 
2239  tc = ec->last_tc;
2240  if (*ins_enc) {
2241  tr = tc->transcoder;
2242  rb_enc_find(tr->dst_encoding);
2243  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2244  }
2245  else {
2246  replacement = (unsigned char *)"?";
2247  len = 1;
2248  repl_enc = "";
2249  }
2250 
2251  ec->replacement_str = replacement;
2252  ec->replacement_len = len;
2253  ec->replacement_enc = repl_enc;
2254  ec->replacement_allocated = 0;
2255  return 0;
2256 }
2257 
2258 int
2260  const unsigned char *str, size_t len, const char *encname)
2261 {
2262  unsigned char *str2;
2263  size_t len2;
2264  const char *encname2;
2265 
2266  encname2 = rb_econv_encoding_to_insert_output(ec);
2267 
2268  if (!*encname2 || encoding_equal(encname, encname2)) {
2269  str2 = xmalloc(len);
2270  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2271  len2 = len;
2272  encname2 = encname;
2273  }
2274  else {
2275  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2276  if (!str2)
2277  return -1;
2278  }
2279 
2280  if (ec->replacement_allocated) {
2281  xfree((void *)ec->replacement_str);
2282  }
2283  ec->replacement_allocated = 1;
2284  ec->replacement_str = str2;
2285  ec->replacement_len = len2;
2286  ec->replacement_enc = encname2;
2287  return 0;
2288 }
2289 
2290 static int
2291 output_replacement_character(rb_econv_t *ec)
2292 {
2293  int ret;
2294 
2295  if (make_replacement(ec) == -1)
2296  return -1;
2297 
2298  ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2299  if (ret == -1)
2300  return -1;
2301 
2302  return 0;
2303 }
2304 
2305 #if 1
2306 #define hash_fallback rb_hash_aref
2307 
2308 static VALUE
2309 proc_fallback(VALUE fallback, VALUE c)
2310 {
2311  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2312 }
2313 
2314 static VALUE
2315 method_fallback(VALUE fallback, VALUE c)
2316 {
2317  return rb_method_call(1, &c, fallback);
2318 }
2319 
2320 static VALUE
2321 aref_fallback(VALUE fallback, VALUE c)
2322 {
2323  return rb_funcallv_public(fallback, idAREF, 1, &c);
2324 }
2325 
2326 static void
2327 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2328  const unsigned char *in_stop, unsigned char *out_stop,
2329  VALUE destination,
2330  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2331  const char *src_encoding,
2332  const char *dst_encoding,
2333  int ecflags,
2334  VALUE ecopts)
2335 {
2336  rb_econv_t *ec;
2337  rb_transcoding *last_tc;
2338  rb_econv_result_t ret;
2339  unsigned char *out_start = *out_pos;
2340  int max_output;
2341  VALUE exc;
2342  VALUE fallback = Qnil;
2343  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2344 
2345  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2346  if (!ec)
2347  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2348 
2349  if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2350  fallback = rb_hash_aref(ecopts, sym_fallback);
2351  if (RB_TYPE_P(fallback, T_HASH)) {
2352  fallback_func = hash_fallback;
2353  }
2354  else if (rb_obj_is_proc(fallback)) {
2355  fallback_func = proc_fallback;
2356  }
2357  else if (rb_obj_is_method(fallback)) {
2358  fallback_func = method_fallback;
2359  }
2360  else {
2361  fallback_func = aref_fallback;
2362  }
2363  }
2364  last_tc = ec->last_tc;
2365  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2366 
2367  resume:
2368  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2369 
2370  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2371  VALUE rep = rb_enc_str_new(
2372  (const char *)ec->last_error.error_bytes_start,
2373  ec->last_error.error_bytes_len,
2374  rb_enc_find(ec->last_error.source_encoding));
2375  rep = (*fallback_func)(fallback, rep);
2376  if (!UNDEF_P(rep) && !NIL_P(rep)) {
2377  StringValue(rep);
2378  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2379  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2380  if ((int)ret == -1) {
2381  rb_raise(rb_eArgError, "too big fallback string");
2382  }
2383  goto resume;
2384  }
2385  }
2386 
2387  if (ret == econv_invalid_byte_sequence ||
2388  ret == econv_incomplete_input ||
2389  ret == econv_undefined_conversion) {
2390  exc = make_econv_exception(ec);
2391  rb_econv_close(ec);
2392  rb_exc_raise(exc);
2393  }
2394 
2395  if (ret == econv_destination_buffer_full) {
2396  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2397  goto resume;
2398  }
2399 
2400  rb_econv_close(ec);
2401  return;
2402 }
2403 #else
2404 /* sample transcode_loop implementation in byte-by-byte stream style */
2405 static void
2406 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2407  const unsigned char *in_stop, unsigned char *out_stop,
2408  VALUE destination,
2409  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2410  const char *src_encoding,
2411  const char *dst_encoding,
2412  int ecflags,
2413  VALUE ecopts)
2414 {
2415  rb_econv_t *ec;
2416  rb_transcoding *last_tc;
2417  rb_econv_result_t ret;
2418  unsigned char *out_start = *out_pos;
2419  const unsigned char *ptr;
2420  int max_output;
2421  VALUE exc;
2422 
2423  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2424  if (!ec)
2425  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2426 
2427  last_tc = ec->last_tc;
2428  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2429 
2431  ptr = *in_pos;
2432  while (ret != econv_finished) {
2433  unsigned char input_byte;
2434  const unsigned char *p = &input_byte;
2435 
2436  if (ret == econv_source_buffer_empty) {
2437  if (ptr < in_stop) {
2438  input_byte = *ptr;
2439  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2440  }
2441  else {
2442  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2443  }
2444  }
2445  else {
2446  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2447  }
2448  if (&input_byte != p)
2449  ptr += p - &input_byte;
2450  switch (ret) {
2454  exc = make_econv_exception(ec);
2455  rb_econv_close(ec);
2456  rb_exc_raise(exc);
2457  break;
2458 
2460  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2461  break;
2462 
2464  break;
2465 
2466  case econv_finished:
2467  break;
2468  }
2469  }
2470  rb_econv_close(ec);
2471  *in_pos = in_stop;
2472  return;
2473 }
2474 #endif
2475 
2476 
2477 /*
2478  * String-specific code
2479  */
2480 
2481 static unsigned char *
2482 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2483 {
2484  rb_str_resize(destination, new_len);
2485  return (unsigned char *)RSTRING_PTR(destination);
2486 }
2487 
2488 static int
2489 econv_opts(VALUE opt, int ecflags)
2490 {
2491  VALUE v;
2492  int newlineflag = 0;
2493 
2494  v = rb_hash_aref(opt, sym_invalid);
2495  if (NIL_P(v)) {
2496  }
2497  else if (v==sym_replace) {
2498  ecflags |= ECONV_INVALID_REPLACE;
2499  }
2500  else {
2501  rb_raise(rb_eArgError, "unknown value for invalid character option");
2502  }
2503 
2504  v = rb_hash_aref(opt, sym_undef);
2505  if (NIL_P(v)) {
2506  }
2507  else if (v==sym_replace) {
2508  ecflags |= ECONV_UNDEF_REPLACE;
2509  }
2510  else {
2511  rb_raise(rb_eArgError, "unknown value for undefined character option");
2512  }
2513 
2514  v = rb_hash_aref(opt, sym_replace);
2515  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2516  ecflags |= ECONV_UNDEF_REPLACE;
2517  }
2518 
2519  v = rb_hash_aref(opt, sym_xml);
2520  if (!NIL_P(v)) {
2521  if (v==sym_text) {
2523  }
2524  else if (v==sym_attr) {
2526  }
2527  else if (SYMBOL_P(v)) {
2528  rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2529  }
2530  else {
2531  rb_raise(rb_eArgError, "unexpected value for xml option");
2532  }
2533  }
2534 
2535 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2536  v = rb_hash_aref(opt, sym_newline);
2537  if (!NIL_P(v)) {
2538  newlineflag = 2;
2539  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2540  if (v == sym_universal) {
2542  }
2543  else if (v == sym_crlf) {
2544  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2545  }
2546  else if (v == sym_cr) {
2547  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2548  }
2549  else if (v == sym_lf) {
2550  ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2551  }
2552  else if (SYMBOL_P(v)) {
2553  rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2554  rb_sym2str(v));
2555  }
2556  else {
2557  rb_raise(rb_eArgError, "unexpected value for newline option");
2558  }
2559  }
2560 #endif
2561  {
2562  int setflags = 0;
2563 
2564  v = rb_hash_aref(opt, sym_universal_newline);
2565  if (RTEST(v))
2567  newlineflag |= !NIL_P(v);
2568 
2569  v = rb_hash_aref(opt, sym_crlf_newline);
2570  if (RTEST(v))
2571  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2572  newlineflag |= !NIL_P(v);
2573 
2574  v = rb_hash_aref(opt, sym_cr_newline);
2575  if (RTEST(v))
2576  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2577  newlineflag |= !NIL_P(v);
2578 
2579  v = rb_hash_aref(opt, sym_lf_newline);
2580  if (RTEST(v))
2581  setflags |= ECONV_LF_NEWLINE_DECORATOR;
2582  newlineflag |= !NIL_P(v);
2583 
2584  switch (newlineflag) {
2585  case 1:
2586  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2587  ecflags |= setflags;
2588  break;
2589 
2590  case 3:
2591  rb_warning(":newline option precedes other newline options");
2592  break;
2593  }
2594  }
2595 
2596  return ecflags;
2597 }
2598 
2599 int
2600 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2601 {
2602  VALUE newhash = Qnil;
2603  VALUE v;
2604 
2605  if (NIL_P(opthash)) {
2606  *opts = Qnil;
2607  return ecflags;
2608  }
2609  ecflags = econv_opts(opthash, ecflags);
2610 
2611  v = rb_hash_aref(opthash, sym_replace);
2612  if (!NIL_P(v)) {
2613  StringValue(v);
2614  if (is_broken_string(v)) {
2615  VALUE dumped = rb_str_dump(v);
2616  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2617  StringValueCStr(dumped),
2618  rb_enc_name(rb_enc_get(v)));
2619  }
2620  v = rb_str_new_frozen(v);
2621  newhash = rb_hash_new();
2622  rb_hash_aset(newhash, sym_replace, v);
2623  }
2624 
2625  v = rb_hash_aref(opthash, sym_fallback);
2626  if (!NIL_P(v)) {
2627  VALUE h = rb_check_hash_type(v);
2628  if (NIL_P(h)
2629  ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2630  : (v = h, 1)) {
2631  if (NIL_P(newhash))
2632  newhash = rb_hash_new();
2633  rb_hash_aset(newhash, sym_fallback, v);
2634  }
2635  }
2636 
2637  if (!NIL_P(newhash))
2638  rb_hash_freeze(newhash);
2639  *opts = newhash;
2640 
2641  return ecflags;
2642 }
2643 
2644 int
2646 {
2647  return rb_econv_prepare_options(opthash, opts, 0);
2648 }
2649 
2650 rb_econv_t *
2651 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2652 {
2653  rb_econv_t *ec;
2654  VALUE replacement;
2655 
2656  if (NIL_P(opthash)) {
2657  replacement = Qnil;
2658  }
2659  else {
2660  if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2661  rb_bug("rb_econv_open_opts called with invalid opthash");
2662  replacement = rb_hash_aref(opthash, sym_replace);
2663  }
2664 
2665  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2666  if (!ec)
2667  return ec;
2668 
2669  if (!NIL_P(replacement)) {
2670  int ret;
2671  rb_encoding *enc = rb_enc_get(replacement);
2672 
2673  ret = rb_econv_set_replacement(ec,
2674  (const unsigned char *)RSTRING_PTR(replacement),
2675  RSTRING_LEN(replacement),
2676  rb_enc_name(enc));
2677  if (ret == -1) {
2678  rb_econv_close(ec);
2679  return NULL;
2680  }
2681  }
2682  return ec;
2683 }
2684 
2685 static int
2686 enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2687 {
2688  rb_encoding *enc;
2689  const char *n;
2690  int encidx;
2691  VALUE encval;
2692 
2693  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2694  !(enc = rb_enc_from_index(encidx))) {
2695  enc = NULL;
2696  encidx = 0;
2697  n = StringValueCStr(*arg);
2698  }
2699  else {
2700  n = rb_enc_name(enc);
2701  }
2702 
2703  *name_p = n;
2704  *enc_p = enc;
2705 
2706  return encidx;
2707 }
2708 
2709 static int
2710 str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2711  const char **sname_p, rb_encoding **senc_p,
2712  const char **dname_p, rb_encoding **denc_p)
2713 {
2714  rb_encoding *senc, *denc;
2715  const char *sname, *dname;
2716  int sencidx, dencidx;
2717 
2718  dencidx = enc_arg(arg1, &dname, &denc);
2719 
2720  if (NIL_P(*arg2)) {
2721  sencidx = rb_enc_get_index(str);
2722  senc = rb_enc_from_index(sencidx);
2723  sname = rb_enc_name(senc);
2724  }
2725  else {
2726  sencidx = enc_arg(arg2, &sname, &senc);
2727  }
2728 
2729  *sname_p = sname;
2730  *senc_p = senc;
2731  *dname_p = dname;
2732  *denc_p = denc;
2733  return dencidx;
2734 }
2735 
2736 static int
2737 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2738 {
2739  VALUE dest;
2740  VALUE str = *self;
2741  VALUE arg1, arg2;
2742  long blen, slen;
2743  unsigned char *buf, *bp, *sp;
2744  const unsigned char *fromp;
2745  rb_encoding *senc, *denc;
2746  const char *sname, *dname;
2747  int dencidx;
2748  int explicitly_invalid_replace = TRUE;
2749 
2750  rb_check_arity(argc, 0, 2);
2751 
2752  if (argc == 0) {
2753  arg1 = rb_enc_default_internal();
2754  if (NIL_P(arg1)) {
2755  if (!ecflags) return -1;
2756  arg1 = rb_obj_encoding(str);
2757  }
2758  if (!(ecflags & ECONV_INVALID_MASK)) {
2759  explicitly_invalid_replace = FALSE;
2760  }
2762  }
2763  else {
2764  arg1 = argv[0];
2765  }
2766  arg2 = argc<=1 ? Qnil : argv[1];
2767  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2768 
2769  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2773  if (senc && senc == denc) {
2774  if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2775  VALUE rep = Qnil;
2776  if (!NIL_P(ecopts)) {
2777  rep = rb_hash_aref(ecopts, sym_replace);
2778  }
2779  dest = rb_enc_str_scrub(senc, str, rep);
2780  if (NIL_P(dest)) dest = str;
2781  *self = dest;
2782  return dencidx;
2783  }
2784  return NIL_P(arg2) ? -1 : dencidx;
2785  }
2786  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2787  if (is_ascii_string(str)) {
2788  return dencidx;
2789  }
2790  }
2791  if (encoding_equal(sname, dname)) {
2792  return NIL_P(arg2) ? -1 : dencidx;
2793  }
2794  }
2795  else {
2796  if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2797  rb_encoding *utf8 = rb_utf8_encoding();
2798  str = rb_str_conv_enc(str, senc, utf8);
2799  senc = utf8;
2800  sname = "UTF-8";
2801  }
2802  if (encoding_equal(sname, dname)) {
2803  sname = "";
2804  dname = "";
2805  }
2806  }
2807 
2808  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2809  slen = RSTRING_LEN(str);
2810  blen = slen + 30; /* len + margin */
2811  dest = rb_str_tmp_new(blen);
2812  bp = (unsigned char *)RSTRING_PTR(dest);
2813 
2814  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2815  if (fromp != sp+slen) {
2816  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2817  }
2818  buf = (unsigned char *)RSTRING_PTR(dest);
2819  *bp = '\0';
2820  rb_str_set_len(dest, bp - buf);
2821 
2822  /* set encoding */
2823  if (!denc) {
2824  dencidx = rb_define_dummy_encoding(dname);
2825  RB_GC_GUARD(arg1);
2826  RB_GC_GUARD(arg2);
2827  }
2828  *self = dest;
2829 
2830  return dencidx;
2831 }
2832 
2833 static int
2834 str_transcode(int argc, VALUE *argv, VALUE *self)
2835 {
2836  VALUE opt;
2837  int ecflags = 0;
2838  VALUE ecopts = Qnil;
2839 
2840  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2841  if (!NIL_P(opt)) {
2842  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2843  }
2844  return str_transcode0(argc, argv, self, ecflags, ecopts);
2845 }
2846 
2847 static inline VALUE
2848 str_encode_associate(VALUE str, int encidx)
2849 {
2850  int cr = 0;
2851 
2852  rb_enc_associate_index(str, encidx);
2853 
2854  /* transcoded string never be broken. */
2855  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2857  }
2858  else {
2859  cr = ENC_CODERANGE_VALID;
2860  }
2861  ENC_CODERANGE_SET(str, cr);
2862  return str;
2863 }
2864 
2865 /*
2866  * call-seq:
2867  * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2868  * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2869  *
2870  * Like #encode, but applies encoding changes to +self+; returns +self+.
2871  *
2872  */
2873 
2874 static VALUE
2875 str_encode_bang(int argc, VALUE *argv, VALUE str)
2876 {
2877  VALUE newstr;
2878  int encidx;
2879 
2880  rb_check_frozen(str);
2881 
2882  newstr = str;
2883  encidx = str_transcode(argc, argv, &newstr);
2884 
2885  if (encidx < 0) return str;
2886  if (newstr == str) {
2887  rb_enc_associate_index(str, encidx);
2888  return str;
2889  }
2890  rb_str_shared_replace(str, newstr);
2891  return str_encode_associate(str, encidx);
2892 }
2893 
2894 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2895 
2896 /*
2897  * call-seq:
2898  * encode(dst_encoding = Encoding.default_internal, **enc_opts) -> string
2899  * encode(dst_encoding, src_encoding, **enc_opts) -> string
2900  *
2901  * :include: doc/string/encode.rdoc
2902  *
2903  */
2904 
2905 static VALUE
2906 str_encode(int argc, VALUE *argv, VALUE str)
2907 {
2908  VALUE newstr = str;
2909  int encidx = str_transcode(argc, argv, &newstr);
2910  return encoded_dup(newstr, str, encidx);
2911 }
2912 
2913 VALUE
2914 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2915 {
2916  int argc = 1;
2917  VALUE *argv = &to;
2918  VALUE newstr = str;
2919  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2920  return encoded_dup(newstr, str, encidx);
2921 }
2922 
2923 static VALUE
2924 encoded_dup(VALUE newstr, VALUE str, int encidx)
2925 {
2926  if (encidx < 0) return rb_str_dup(str);
2927  if (newstr == str) {
2928  newstr = rb_str_dup(str);
2929  rb_enc_associate_index(newstr, encidx);
2930  return newstr;
2931  }
2932  else {
2933  RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2934  }
2935  return str_encode_associate(newstr, encidx);
2936 }
2937 
2938 /*
2939  * Document-class: Encoding::Converter
2940  *
2941  * Encoding conversion class.
2942  */
2943 static void
2944 econv_free(void *ptr)
2945 {
2946  rb_econv_t *ec = ptr;
2947  rb_econv_close(ec);
2948 }
2949 
2950 static size_t
2951 econv_memsize(const void *ptr)
2952 {
2953  return sizeof(rb_econv_t);
2954 }
2955 
2956 static const rb_data_type_t econv_data_type = {
2957  "econv",
2958  {0, econv_free, econv_memsize,},
2959  0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2960 };
2961 
2962 static VALUE
2963 econv_s_allocate(VALUE klass)
2964 {
2965  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2966 }
2967 
2968 static rb_encoding *
2969 make_dummy_encoding(const char *name)
2970 {
2971  rb_encoding *enc;
2972  int idx;
2973  idx = rb_define_dummy_encoding(name);
2974  enc = rb_enc_from_index(idx);
2975  return enc;
2976 }
2977 
2978 static rb_encoding *
2979 make_encoding(const char *name)
2980 {
2981  rb_encoding *enc;
2982  enc = rb_enc_find(name);
2983  if (!enc)
2984  enc = make_dummy_encoding(name);
2985  return enc;
2986 }
2987 
2988 static VALUE
2989 make_encobj(const char *name)
2990 {
2991  return rb_enc_from_encoding(make_encoding(name));
2992 }
2993 
2994 /*
2995  * call-seq:
2996  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2997  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2998  *
2999  * Returns the corresponding ASCII compatible encoding.
3000  *
3001  * Returns nil if the argument is an ASCII compatible encoding.
3002  *
3003  * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
3004  * can represent exactly the same characters as the given ASCII incompatible encoding.
3005  * So, no conversion undefined error occurs when converting between the two encodings.
3006  *
3007  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
3008  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
3009  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
3010  *
3011  */
3012 static VALUE
3013 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
3014 {
3015  const char *arg_name, *result_name;
3016  rb_encoding *arg_enc, *result_enc;
3017 
3018  enc_arg(&arg, &arg_name, &arg_enc);
3019 
3020  result_name = rb_econv_asciicompat_encoding(arg_name);
3021 
3022  if (result_name == NULL)
3023  return Qnil;
3024 
3025  result_enc = make_encoding(result_name);
3026 
3027  return rb_enc_from_encoding(result_enc);
3028 }
3029 
3030 static void
3031 econv_args(int argc, VALUE *argv,
3032  VALUE *snamev_p, VALUE *dnamev_p,
3033  const char **sname_p, const char **dname_p,
3034  rb_encoding **senc_p, rb_encoding **denc_p,
3035  int *ecflags_p,
3036  VALUE *ecopts_p)
3037 {
3038  VALUE opt, flags_v, ecopts;
3039  int sidx, didx;
3040  const char *sname, *dname;
3041  rb_encoding *senc, *denc;
3042  int ecflags;
3043 
3044  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3045 
3046  if (!NIL_P(flags_v)) {
3047  if (!NIL_P(opt)) {
3048  rb_error_arity(argc + 1, 2, 3);
3049  }
3050  ecflags = NUM2INT(rb_to_int(flags_v));
3051  ecopts = Qnil;
3052  }
3053  else if (!NIL_P(opt)) {
3054  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3055  }
3056  else {
3057  ecflags = 0;
3058  ecopts = Qnil;
3059  }
3060 
3061  senc = NULL;
3062  sidx = rb_to_encoding_index(*snamev_p);
3063  if (0 <= sidx) {
3064  senc = rb_enc_from_index(sidx);
3065  }
3066  else {
3067  StringValue(*snamev_p);
3068  }
3069 
3070  denc = NULL;
3071  didx = rb_to_encoding_index(*dnamev_p);
3072  if (0 <= didx) {
3073  denc = rb_enc_from_index(didx);
3074  }
3075  else {
3076  StringValue(*dnamev_p);
3077  }
3078 
3079  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3080  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3081 
3082  *sname_p = sname;
3083  *dname_p = dname;
3084  *senc_p = senc;
3085  *denc_p = denc;
3086  *ecflags_p = ecflags;
3087  *ecopts_p = ecopts;
3088 }
3089 
3090 static int
3091 decorate_convpath(VALUE convpath, int ecflags)
3092 {
3093  int num_decorators;
3094  const char *decorators[MAX_ECFLAGS_DECORATORS];
3095  int i;
3096  int n, len;
3097 
3098  num_decorators = decorator_names(ecflags, decorators);
3099  if (num_decorators == -1)
3100  return -1;
3101 
3102  len = n = RARRAY_LENINT(convpath);
3103  if (n != 0) {
3104  VALUE pair = RARRAY_AREF(convpath, n-1);
3105  if (RB_TYPE_P(pair, T_ARRAY)) {
3106  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3107  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3108  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3109  const rb_transcoder *tr = load_transcoder_entry(entry);
3110  if (!tr)
3111  return -1;
3112  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3113  tr->asciicompat_type == asciicompat_encoder) {
3114  n--;
3115  rb_ary_store(convpath, len + num_decorators - 1, pair);
3116  }
3117  }
3118  else {
3119  rb_ary_store(convpath, len + num_decorators - 1, pair);
3120  }
3121  }
3122 
3123  for (i = 0; i < num_decorators; i++)
3124  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3125 
3126  return 0;
3127 }
3128 
3129 static void
3130 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3131 {
3132  VALUE *ary_p = arg;
3133  VALUE v;
3134 
3135  if (NIL_P(*ary_p)) {
3136  *ary_p = rb_ary_new();
3137  }
3138 
3139  if (DECORATOR_P(sname, dname)) {
3140  v = rb_str_new_cstr(dname);
3141  }
3142  else {
3143  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3144  }
3145  rb_ary_store(*ary_p, depth, v);
3146 }
3147 
3148 /*
3149  * call-seq:
3150  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3151  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3152  *
3153  * Returns a conversion path.
3154  *
3155  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3156  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3157  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3158  *
3159  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3160  * or
3161  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3162  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3163  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3164  * # "universal_newline"]
3165  *
3166  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3167  * or
3168  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3169  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3170  * # "universal_newline",
3171  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3172  */
3173 static VALUE
3174 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3175 {
3176  VALUE snamev, dnamev;
3177  const char *sname, *dname;
3178  rb_encoding *senc, *denc;
3179  int ecflags;
3180  VALUE ecopts;
3181  VALUE convpath;
3182 
3183  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3184 
3185  convpath = Qnil;
3186  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3187 
3188  if (NIL_P(convpath)) {
3189  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3190  RB_GC_GUARD(snamev);
3191  RB_GC_GUARD(dnamev);
3192  rb_exc_raise(exc);
3193  }
3194 
3195  if (decorate_convpath(convpath, ecflags) == -1) {
3196  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3197  RB_GC_GUARD(snamev);
3198  RB_GC_GUARD(dnamev);
3199  rb_exc_raise(exc);
3200  }
3201 
3202  return convpath;
3203 }
3204 
3205 /*
3206  * Check the existence of a conversion path.
3207  * Returns the number of converters in the conversion path.
3208  * result: >=0:success -1:failure
3209  */
3210 int
3211 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3212 {
3213  VALUE convpath = Qnil;
3214  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3215  &convpath);
3216  return RTEST(convpath);
3217 }
3218 
3220  rb_econv_t *ec;
3221  int index;
3222  int ret;
3223 };
3224 
3225 static void
3226 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3227 {
3228  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3229  int ret;
3230 
3231  if (a->ret == -1)
3232  return;
3233 
3234  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3235 
3236  a->ret = ret;
3237  return;
3238 }
3239 
3240 static rb_econv_t *
3241 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3242  const char **sname_p, const char **dname_p,
3243  rb_encoding **senc_p, rb_encoding**denc_p)
3244 {
3245  rb_econv_t *ec;
3246  long i;
3247  int ret, first=1;
3248  VALUE elt;
3249  rb_encoding *senc = 0, *denc = 0;
3250  const char *sname, *dname;
3251 
3252  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3253  DATA_PTR(self) = ec;
3254 
3255  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3256  VALUE snamev, dnamev;
3257  VALUE pair;
3258  elt = rb_ary_entry(convpath, i);
3259  if (!NIL_P(pair = rb_check_array_type(elt))) {
3260  if (RARRAY_LEN(pair) != 2)
3261  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3262  snamev = rb_ary_entry(pair, 0);
3263  enc_arg(&snamev, &sname, &senc);
3264  dnamev = rb_ary_entry(pair, 1);
3265  enc_arg(&dnamev, &dname, &denc);
3266  }
3267  else {
3268  sname = "";
3269  dname = StringValueCStr(elt);
3270  }
3271  if (DECORATOR_P(sname, dname)) {
3272  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3273  if (ret == -1) {
3274  VALUE msg = rb_sprintf("decoration failed: %s", dname);
3275  RB_GC_GUARD(snamev);
3276  RB_GC_GUARD(dnamev);
3278  }
3279  }
3280  else {
3281  int j = ec->num_trans;
3282  struct rb_econv_init_by_convpath_t arg;
3283  arg.ec = ec;
3284  arg.index = ec->num_trans;
3285  arg.ret = 0;
3286  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3287  if (ret == -1 || arg.ret == -1) {
3288  VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3289  RB_GC_GUARD(snamev);
3290  RB_GC_GUARD(dnamev);
3292  }
3293  if (first) {
3294  first = 0;
3295  *senc_p = senc;
3296  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3297  }
3298  *denc_p = denc;
3299  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3300  }
3301  }
3302 
3303  if (first) {
3304  *senc_p = NULL;
3305  *denc_p = NULL;
3306  *sname_p = "";
3307  *dname_p = "";
3308  }
3309 
3310  ec->source_encoding_name = *sname_p;
3311  ec->destination_encoding_name = *dname_p;
3312 
3313  return ec;
3314 }
3315 
3316 /*
3317  * call-seq:
3318  * Encoding::Converter.new(source_encoding, destination_encoding)
3319  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3320  * Encoding::Converter.new(convpath)
3321  *
3322  * possible options elements:
3323  * hash form:
3324  * :invalid => nil # raise error on invalid byte sequence (default)
3325  * :invalid => :replace # replace invalid byte sequence
3326  * :undef => nil # raise error on undefined conversion (default)
3327  * :undef => :replace # replace undefined conversion
3328  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3329  * :newline => :universal # decorator for converting CRLF and CR to LF
3330  * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3331  * :newline => :crlf # decorator for converting LF to CRLF
3332  * :newline => :cr # decorator for converting LF to CR
3333  * :universal_newline => true # decorator for converting CRLF and CR to LF
3334  * :crlf_newline => true # decorator for converting LF to CRLF
3335  * :cr_newline => true # decorator for converting LF to CR
3336  * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3337  * :xml => :text # escape as XML CharData.
3338  * :xml => :attr # escape as XML AttValue
3339  * integer form:
3340  * Encoding::Converter::INVALID_REPLACE
3341  * Encoding::Converter::UNDEF_REPLACE
3342  * Encoding::Converter::UNDEF_HEX_CHARREF
3343  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3344  * Encoding::Converter::LF_NEWLINE_DECORATOR
3345  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3346  * Encoding::Converter::CR_NEWLINE_DECORATOR
3347  * Encoding::Converter::XML_TEXT_DECORATOR
3348  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3349  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3350  *
3351  * Encoding::Converter.new creates an instance of Encoding::Converter.
3352  *
3353  * Source_encoding and destination_encoding should be a string or
3354  * Encoding object.
3355  *
3356  * opt should be nil, a hash or an integer.
3357  *
3358  * convpath should be an array.
3359  * convpath may contain
3360  * - two-element arrays which contain encodings or encoding names, or
3361  * - strings representing decorator names.
3362  *
3363  * Encoding::Converter.new optionally takes an option.
3364  * The option should be a hash or an integer.
3365  * The option hash can contain :invalid => nil, etc.
3366  * The option integer should be logical-or of constants such as
3367  * Encoding::Converter::INVALID_REPLACE, etc.
3368  *
3369  * [:invalid => nil]
3370  * Raise error on invalid byte sequence. This is a default behavior.
3371  * [:invalid => :replace]
3372  * Replace invalid byte sequence by replacement string.
3373  * [:undef => nil]
3374  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3375  * This is a default behavior.
3376  * [:undef => :replace]
3377  * Replace undefined character in destination_encoding with replacement string.
3378  * [:replace => string]
3379  * Specify the replacement string.
3380  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3381  * [:universal_newline => true]
3382  * Convert CRLF and CR to LF.
3383  * [:crlf_newline => true]
3384  * Convert LF to CRLF.
3385  * [:cr_newline => true]
3386  * Convert LF to CR.
3387  * [:lf_newline => true]
3388  * Convert CRLF and CR to LF (when writing).
3389  * [:xml => :text]
3390  * Escape as XML CharData.
3391  * This form can be used as an HTML 4.0 #PCDATA.
3392  * - '&' -> '&amp;'
3393  * - '<' -> '&lt;'
3394  * - '>' -> '&gt;'
3395  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3396  * [:xml => :attr]
3397  * Escape as XML AttValue.
3398  * The converted result is quoted as "...".
3399  * This form can be used as an HTML 4.0 attribute value.
3400  * - '&' -> '&amp;'
3401  * - '<' -> '&lt;'
3402  * - '>' -> '&gt;'
3403  * - '"' -> '&quot;'
3404  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3405  *
3406  * Examples:
3407  * # UTF-16BE to UTF-8
3408  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3409  *
3410  * # Usually, decorators such as newline conversion are inserted last.
3411  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3412  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3413  * # "universal_newline"]
3414  *
3415  * # But, if the last encoding is ASCII incompatible,
3416  * # decorators are inserted before the last conversion.
3417  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3418  * p ec.convpath #=> ["crlf_newline",
3419  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3420  *
3421  * # Conversion path can be specified directly.
3422  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3423  * p ec.convpath #=> ["universal_newline",
3424  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3425  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3426  */
3427 static VALUE
3428 econv_init(int argc, VALUE *argv, VALUE self)
3429 {
3430  VALUE ecopts;
3431  VALUE snamev, dnamev;
3432  const char *sname, *dname;
3433  rb_encoding *senc, *denc;
3434  rb_econv_t *ec;
3435  int ecflags;
3436  VALUE convpath;
3437 
3438  if (rb_check_typeddata(self, &econv_data_type)) {
3439  rb_raise(rb_eTypeError, "already initialized");
3440  }
3441 
3442  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3443  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3444  ecflags = 0;
3445  ecopts = Qnil;
3446  }
3447  else {
3448  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3449  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3450  }
3451 
3452  if (!ec) {
3453  VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3454  RB_GC_GUARD(snamev);
3455  RB_GC_GUARD(dnamev);
3456  rb_exc_raise(exc);
3457  }
3458 
3459  if (!DECORATOR_P(sname, dname)) {
3460  if (!senc)
3461  senc = make_dummy_encoding(sname);
3462  if (!denc)
3463  denc = make_dummy_encoding(dname);
3464  RB_GC_GUARD(snamev);
3465  RB_GC_GUARD(dnamev);
3466  }
3467 
3468  ec->source_encoding = senc;
3469  ec->destination_encoding = denc;
3470 
3471  DATA_PTR(self) = ec;
3472 
3473  return self;
3474 }
3475 
3476 /*
3477  * call-seq:
3478  * ec.inspect -> string
3479  *
3480  * Returns a printable version of <i>ec</i>
3481  *
3482  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3483  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3484  *
3485  */
3486 static VALUE
3487 econv_inspect(VALUE self)
3488 {
3489  const char *cname = rb_obj_classname(self);
3490  rb_econv_t *ec;
3491 
3492  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3493  if (!ec)
3494  return rb_sprintf("#<%s: uninitialized>", cname);
3495  else {
3496  const char *sname = ec->source_encoding_name;
3497  const char *dname = ec->destination_encoding_name;
3498  VALUE str;
3499  str = rb_sprintf("#<%s: ", cname);
3500  econv_description(sname, dname, ec->flags, str);
3501  rb_str_cat2(str, ">");
3502  return str;
3503  }
3504 }
3505 
3506 static rb_econv_t *
3507 check_econv(VALUE self)
3508 {
3509  rb_econv_t *ec;
3510 
3511  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3512  if (!ec) {
3513  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3514  }
3515  return ec;
3516 }
3517 
3518 static VALUE
3519 econv_get_encoding(rb_encoding *encoding)
3520 {
3521  if (!encoding)
3522  return Qnil;
3523  return rb_enc_from_encoding(encoding);
3524 }
3525 
3526 /*
3527  * call-seq:
3528  * ec.source_encoding -> encoding
3529  *
3530  * Returns the source encoding as an Encoding object.
3531  */
3532 static VALUE
3533 econv_source_encoding(VALUE self)
3534 {
3535  rb_econv_t *ec = check_econv(self);
3536  return econv_get_encoding(ec->source_encoding);
3537 }
3538 
3539 /*
3540  * call-seq:
3541  * ec.destination_encoding -> encoding
3542  *
3543  * Returns the destination encoding as an Encoding object.
3544  */
3545 static VALUE
3546 econv_destination_encoding(VALUE self)
3547 {
3548  rb_econv_t *ec = check_econv(self);
3549  return econv_get_encoding(ec->destination_encoding);
3550 }
3551 
3552 /*
3553  * call-seq:
3554  * ec.convpath -> ary
3555  *
3556  * Returns the conversion path of ec.
3557  *
3558  * The result is an array of conversions.
3559  *
3560  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3561  * p ec.convpath
3562  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3563  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3564  * # "crlf_newline"]
3565  *
3566  * Each element of the array is a pair of encodings or a string.
3567  * A pair means an encoding conversion.
3568  * A string means a decorator.
3569  *
3570  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3571  * a converter from ISO-8859-1 to UTF-8.
3572  * "crlf_newline" means newline converter from LF to CRLF.
3573  */
3574 static VALUE
3575 econv_convpath(VALUE self)
3576 {
3577  rb_econv_t *ec = check_econv(self);
3578  VALUE result;
3579  int i;
3580 
3581  result = rb_ary_new();
3582  for (i = 0; i < ec->num_trans; i++) {
3583  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3584  VALUE v;
3585  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3586  v = rb_str_new_cstr(tr->dst_encoding);
3587  else
3588  v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3589  rb_ary_push(result, v);
3590  }
3591  return result;
3592 }
3593 
3594 /*
3595  * call-seq:
3596  * ec == other -> true, false, or nil
3597  */
3598 static VALUE
3599 econv_equal(VALUE self, VALUE other)
3600 {
3601  rb_econv_t *ec1 = check_econv(self);
3602  rb_econv_t *ec2;
3603  int i;
3604 
3605  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3606  return Qnil;
3607  }
3608  ec2 = DATA_PTR(other);
3609  if (!ec2) return Qfalse;
3610  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3611  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3612  return Qfalse;
3613  if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3614  strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3615  return Qfalse;
3616  if (ec1->flags != ec2->flags) return Qfalse;
3617  if (ec1->replacement_enc != ec2->replacement_enc &&
3618  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3619  return Qfalse;
3620  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3621  if (ec1->replacement_str != ec2->replacement_str &&
3622  memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3623  return Qfalse;
3624 
3625  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3626  for (i = 0; i < ec1->num_trans; i++) {
3627  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3628  return Qfalse;
3629  }
3630  return Qtrue;
3631 }
3632 
3633 static VALUE
3634 econv_result_to_symbol(rb_econv_result_t res)
3635 {
3636  switch (res) {
3637  case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3638  case econv_incomplete_input: return sym_incomplete_input;
3639  case econv_undefined_conversion: return sym_undefined_conversion;
3640  case econv_destination_buffer_full: return sym_destination_buffer_full;
3641  case econv_source_buffer_empty: return sym_source_buffer_empty;
3642  case econv_finished: return sym_finished;
3643  case econv_after_output: return sym_after_output;
3644  default: return INT2NUM(res); /* should not be reached */
3645  }
3646 }
3647 
3648 /*
3649  * call-seq:
3650  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3651  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3652  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3653  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3654  *
3655  * possible opt elements:
3656  * hash form:
3657  * :partial_input => true # source buffer may be part of larger source
3658  * :after_output => true # stop conversion after output before input
3659  * integer form:
3660  * Encoding::Converter::PARTIAL_INPUT
3661  * Encoding::Converter::AFTER_OUTPUT
3662  *
3663  * possible results:
3664  * :invalid_byte_sequence
3665  * :incomplete_input
3666  * :undefined_conversion
3667  * :after_output
3668  * :destination_buffer_full
3669  * :source_buffer_empty
3670  * :finished
3671  *
3672  * primitive_convert converts source_buffer into destination_buffer.
3673  *
3674  * source_buffer should be a string or nil.
3675  * nil means an empty string.
3676  *
3677  * destination_buffer should be a string.
3678  *
3679  * destination_byteoffset should be an integer or nil.
3680  * nil means the end of destination_buffer.
3681  * If it is omitted, nil is assumed.
3682  *
3683  * destination_bytesize should be an integer or nil.
3684  * nil means unlimited.
3685  * If it is omitted, nil is assumed.
3686  *
3687  * opt should be nil, a hash or an integer.
3688  * nil means no flags.
3689  * If it is omitted, nil is assumed.
3690  *
3691  * primitive_convert converts the content of source_buffer from beginning
3692  * and store the result into destination_buffer.
3693  *
3694  * destination_byteoffset and destination_bytesize specify the region which
3695  * the converted result is stored.
3696  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3697  * If destination_byteoffset is nil,
3698  * destination_buffer.bytesize is used for appending the result.
3699  * destination_bytesize specifies maximum number of bytes.
3700  * If destination_bytesize is nil,
3701  * destination size is unlimited.
3702  * After conversion, destination_buffer is resized to
3703  * destination_byteoffset + actually produced number of bytes.
3704  * Also destination_buffer's encoding is set to destination_encoding.
3705  *
3706  * primitive_convert drops the converted part of source_buffer.
3707  * the dropped part is converted in destination_buffer or
3708  * buffered in Encoding::Converter object.
3709  *
3710  * primitive_convert stops conversion when one of the following conditions is met.
3711  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3712  * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3713  * - unexpected end of source buffer (:incomplete_input)
3714  * this occurs only when :partial_input is not specified.
3715  * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3716  * - character not representable in output encoding (:undefined_conversion)
3717  * +primitive_errinfo+ and +last_error+ methods return the detail of the error.
3718  * - after some output is generated, before input is done (:after_output)
3719  * this occurs only when :after_output is specified.
3720  * - destination buffer is full (:destination_buffer_full)
3721  * this occurs only when destination_bytesize is non-nil.
3722  * - source buffer is empty (:source_buffer_empty)
3723  * this occurs only when :partial_input is specified.
3724  * - conversion is finished (:finished)
3725  *
3726  * example:
3727  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3728  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3729  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3730  *
3731  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3732  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3733  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3734  * ret = ec.primitive_convert(src, dst="", nil, 1)
3735  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3736  * ret = ec.primitive_convert(src, dst="", nil, 1)
3737  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3738  * ret = ec.primitive_convert(src, dst="", nil, 1)
3739  * p [ret, src, dst] #=> [:finished, "", "i"]
3740  *
3741  */
3742 static VALUE
3743 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3744 {
3745  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3746  rb_econv_t *ec = check_econv(self);
3747  rb_econv_result_t res;
3748  const unsigned char *ip, *is;
3749  unsigned char *op, *os;
3750  long output_byteoffset, output_bytesize;
3751  unsigned long output_byteend;
3752  int flags;
3753 
3754  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3755 
3756  if (NIL_P(output_byteoffset_v))
3757  output_byteoffset = 0; /* dummy */
3758  else
3759  output_byteoffset = NUM2LONG(output_byteoffset_v);
3760 
3761  if (NIL_P(output_bytesize_v))
3762  output_bytesize = 0; /* dummy */
3763  else
3764  output_bytesize = NUM2LONG(output_bytesize_v);
3765 
3766  if (!NIL_P(flags_v)) {
3767  if (!NIL_P(opt)) {
3768  rb_error_arity(argc + 1, 2, 5);
3769  }
3770  flags = NUM2INT(rb_to_int(flags_v));
3771  }
3772  else if (!NIL_P(opt)) {
3773  VALUE v;
3774  flags = 0;
3775  v = rb_hash_aref(opt, sym_partial_input);
3776  if (RTEST(v))
3777  flags |= ECONV_PARTIAL_INPUT;
3778  v = rb_hash_aref(opt, sym_after_output);
3779  if (RTEST(v))
3780  flags |= ECONV_AFTER_OUTPUT;
3781  }
3782  else {
3783  flags = 0;
3784  }
3785 
3786  StringValue(output);
3787  if (!NIL_P(input))
3788  StringValue(input);
3789  rb_str_modify(output);
3790 
3791  if (NIL_P(output_bytesize_v)) {
3792  output_bytesize = rb_str_capacity(output);
3793 
3794  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3795  output_bytesize = RSTRING_LEN(input);
3796  }
3797 
3798  retry:
3799 
3800  if (NIL_P(output_byteoffset_v))
3801  output_byteoffset = RSTRING_LEN(output);
3802 
3803  if (output_byteoffset < 0)
3804  rb_raise(rb_eArgError, "negative output_byteoffset");
3805 
3806  if (RSTRING_LEN(output) < output_byteoffset)
3807  rb_raise(rb_eArgError, "output_byteoffset too big");
3808 
3809  if (output_bytesize < 0)
3810  rb_raise(rb_eArgError, "negative output_bytesize");
3811 
3812  output_byteend = (unsigned long)output_byteoffset +
3813  (unsigned long)output_bytesize;
3814 
3815  if (output_byteend < (unsigned long)output_byteoffset ||
3816  LONG_MAX < output_byteend)
3817  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3818 
3819  if (rb_str_capacity(output) < output_byteend)
3820  rb_str_resize(output, output_byteend);
3821 
3822  if (NIL_P(input)) {
3823  ip = is = NULL;
3824  }
3825  else {
3826  ip = (const unsigned char *)RSTRING_PTR(input);
3827  is = ip + RSTRING_LEN(input);
3828  }
3829 
3830  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3831  os = op + output_bytesize;
3832 
3833  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3834  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3835  if (!NIL_P(input)) {
3836  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3837  }
3838 
3839  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3840  if (LONG_MAX / 2 < output_bytesize)
3841  rb_raise(rb_eArgError, "too long conversion result");
3842  output_bytesize *= 2;
3843  output_byteoffset_v = Qnil;
3844  goto retry;
3845  }
3846 
3847  if (ec->destination_encoding) {
3848  rb_enc_associate(output, ec->destination_encoding);
3849  }
3850 
3851  return econv_result_to_symbol(res);
3852 }
3853 
3854 /*
3855  * call-seq:
3856  * ec.convert(source_string) -> destination_string
3857  *
3858  * Convert source_string and return destination_string.
3859  *
3860  * source_string is assumed as a part of source.
3861  * i.e. :partial_input=>true is specified internally.
3862  * finish method should be used last.
3863  *
3864  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3865  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3866  * puts ec.finish.dump #=> ""
3867  *
3868  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3869  * puts ec.convert("\xA4").dump #=> ""
3870  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3871  * puts ec.finish.dump #=> ""
3872  *
3873  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3874  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3875  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3876  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3877  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3878  *
3879  * If a conversion error occurs,
3880  * Encoding::UndefinedConversionError or
3881  * Encoding::InvalidByteSequenceError is raised.
3882  * Encoding::Converter#convert doesn't supply methods to recover or restart
3883  * from these exceptions.
3884  * When you want to handle these conversion errors,
3885  * use Encoding::Converter#primitive_convert.
3886  *
3887  */
3888 static VALUE
3889 econv_convert(VALUE self, VALUE source_string)
3890 {
3891  VALUE ret, dst;
3892  VALUE av[5];
3893  int ac;
3894  rb_econv_t *ec = check_econv(self);
3895 
3896  StringValue(source_string);
3897 
3898  dst = rb_str_new(NULL, 0);
3899 
3900  av[0] = rb_str_dup(source_string);
3901  av[1] = dst;
3902  av[2] = Qnil;
3903  av[3] = Qnil;
3904  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3905  ac = 5;
3906 
3907  ret = econv_primitive_convert(ac, av, self);
3908 
3909  if (ret == sym_invalid_byte_sequence ||
3910  ret == sym_undefined_conversion ||
3911  ret == sym_incomplete_input) {
3912  VALUE exc = make_econv_exception(ec);
3913  rb_exc_raise(exc);
3914  }
3915 
3916  if (ret == sym_finished) {
3917  rb_raise(rb_eArgError, "converter already finished");
3918  }
3919 
3920  if (ret != sym_source_buffer_empty) {
3921  rb_bug("unexpected result of econv_primitive_convert");
3922  }
3923 
3924  return dst;
3925 }
3926 
3927 /*
3928  * call-seq:
3929  * ec.finish -> string
3930  *
3931  * Finishes the converter.
3932  * It returns the last part of the converted string.
3933  *
3934  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3935  * p ec.convert("\u3042") #=> "\e$B$\""
3936  * p ec.finish #=> "\e(B"
3937  */
3938 static VALUE
3939 econv_finish(VALUE self)
3940 {
3941  VALUE ret, dst;
3942  VALUE av[5];
3943  int ac;
3944  rb_econv_t *ec = check_econv(self);
3945 
3946  dst = rb_str_new(NULL, 0);
3947 
3948  av[0] = Qnil;
3949  av[1] = dst;
3950  av[2] = Qnil;
3951  av[3] = Qnil;
3952  av[4] = INT2FIX(0);
3953  ac = 5;
3954 
3955  ret = econv_primitive_convert(ac, av, self);
3956 
3957  if (ret == sym_invalid_byte_sequence ||
3958  ret == sym_undefined_conversion ||
3959  ret == sym_incomplete_input) {
3960  VALUE exc = make_econv_exception(ec);
3961  rb_exc_raise(exc);
3962  }
3963 
3964  if (ret != sym_finished) {
3965  rb_bug("unexpected result of econv_primitive_convert");
3966  }
3967 
3968  return dst;
3969 }
3970 
3971 /*
3972  * call-seq:
3973  * ec.primitive_errinfo -> array
3974  *
3975  * primitive_errinfo returns important information regarding the last error
3976  * as a 5-element array:
3977  *
3978  * [result, enc1, enc2, error_bytes, readagain_bytes]
3979  *
3980  * result is the last result of primitive_convert.
3981  *
3982  * Other elements are only meaningful when result is
3983  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3984  *
3985  * enc1 and enc2 indicate a conversion step as a pair of strings.
3986  * For example, a converter from EUC-JP to ISO-8859-1 converts
3987  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3988  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3989  *
3990  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3991  * error_bytes is discarded portion.
3992  * readagain_bytes is buffered portion which is read again on next conversion.
3993  *
3994  * Example:
3995  *
3996  * # \xff is invalid as EUC-JP.
3997  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3998  * ec.primitive_convert(src="\xff", dst="", nil, 10)
3999  * p ec.primitive_errinfo
4000  * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
4001  *
4002  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
4003  * # Since this error occurs in UTF-8 to ISO-8859-1 conversion,
4004  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
4005  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4006  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
4007  * p ec.primitive_errinfo
4008  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
4009  *
4010  * # partial character is invalid
4011  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4012  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
4013  * p ec.primitive_errinfo
4014  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
4015  *
4016  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
4017  * # partial characters.
4018  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4019  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
4020  * p ec.primitive_errinfo
4021  * #=> [:source_buffer_empty, nil, nil, nil, nil]
4022  *
4023  * # \xd8\x00\x00@ is invalid as UTF-16BE because
4024  * # no low surrogate after high surrogate (\xd8\x00).
4025  * # It is detected by 3rd byte (\x00) which is part of next character.
4026  * # So the high surrogate (\xd8\x00) is discarded and
4027  * # the 3rd byte is read again later.
4028  * # Since the byte is buffered in ec, it is dropped from src.
4029  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4030  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4031  * p ec.primitive_errinfo
4032  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4033  * p src
4034  * #=> "@"
4035  *
4036  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4037  * # The problem is detected by 4th byte.
4038  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4039  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4040  * p ec.primitive_errinfo
4041  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4042  * p src
4043  * #=> ""
4044  *
4045  */
4046 static VALUE
4047 econv_primitive_errinfo(VALUE self)
4048 {
4049  rb_econv_t *ec = check_econv(self);
4050 
4051  VALUE ary;
4052 
4053  ary = rb_ary_new2(5);
4054 
4055  rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4056  rb_ary_store(ary, 4, Qnil);
4057 
4058  if (ec->last_error.source_encoding)
4059  rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4060 
4061  if (ec->last_error.destination_encoding)
4062  rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4063 
4064  if (ec->last_error.error_bytes_start) {
4065  rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4066  rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4067  }
4068 
4069  return ary;
4070 }
4071 
4072 /*
4073  * call-seq:
4074  * ec.insert_output(string) -> nil
4075  *
4076  * Inserts string into the encoding converter.
4077  * The string will be converted to the destination encoding and
4078  * output on later conversions.
4079  *
4080  * If the destination encoding is stateful,
4081  * string is converted according to the state and the state is updated.
4082  *
4083  * This method should be used only when a conversion error occurs.
4084  *
4085  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4086  * src = "HIRAGANA LETTER A is \u{3042}."
4087  * dst = ""
4088  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4089  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4090  * ec.insert_output("<err>")
4091  * p ec.primitive_convert(src, dst) #=> :finished
4092  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4093  *
4094  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4095  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4096  * dst = ""
4097  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4098  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4099  * ec.insert_output "?" # state change required to output "?".
4100  * p ec.primitive_convert(src, dst) #=> :finished
4101  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4102  *
4103  */
4104 static VALUE
4105 econv_insert_output(VALUE self, VALUE string)
4106 {
4107  const char *insert_enc;
4108 
4109  int ret;
4110 
4111  rb_econv_t *ec = check_econv(self);
4112 
4113  StringValue(string);
4114  insert_enc = rb_econv_encoding_to_insert_output(ec);
4115  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4116 
4117  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4118  if (ret == -1) {
4119  rb_raise(rb_eArgError, "too big string");
4120  }
4121 
4122  return Qnil;
4123 }
4124 
4125 /*
4126  * call-seq:
4127  * ec.putback -> string
4128  * ec.putback(max_numbytes) -> string
4129  *
4130  * Put back the bytes which will be converted.
4131  *
4132  * The bytes originate from an invalid_byte_sequence error.
4133  * When an invalid_byte_sequence error occurs, some bytes are discarded and
4134  * some bytes are buffered to be converted later.
4135  * The latter bytes can be put back.
4136  * It can be observed by
4137  * Encoding::InvalidByteSequenceError#readagain_bytes and
4138  * Encoding::Converter#primitive_errinfo.
4139  *
4140  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4141  * src = "\x00\xd8\x61\x00"
4142  * dst = ""
4143  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4144  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4145  * p ec.putback #=> "a\x00"
4146  * p ec.putback #=> "" # no more bytes to put back
4147  *
4148  */
4149 static VALUE
4150 econv_putback(int argc, VALUE *argv, VALUE self)
4151 {
4152  rb_econv_t *ec = check_econv(self);
4153  int n;
4154  int putbackable;
4155  VALUE str, max;
4156 
4157  if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4158  n = rb_econv_putbackable(ec);
4159  }
4160  else {
4161  n = NUM2INT(max);
4162  putbackable = rb_econv_putbackable(ec);
4163  if (putbackable < n)
4164  n = putbackable;
4165  }
4166 
4167  str = rb_str_new(NULL, n);
4168  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4169 
4170  if (ec->source_encoding) {
4171  rb_enc_associate(str, ec->source_encoding);
4172  }
4173 
4174  return str;
4175 }
4176 
4177 /*
4178  * call-seq:
4179  * ec.last_error -> exception or nil
4180  *
4181  * Returns an exception object for the last conversion.
4182  * Returns nil if the last conversion did not produce an error.
4183  *
4184  * "error" means that
4185  * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4186  * Encoding::Converter#convert and
4187  * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4188  * Encoding::Converter#primitive_convert.
4189  *
4190  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4191  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4192  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4193  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4194  * p ec.last_error #=> nil
4195  *
4196  */
4197 static VALUE
4198 econv_last_error(VALUE self)
4199 {
4200  rb_econv_t *ec = check_econv(self);
4201  VALUE exc;
4202 
4203  exc = make_econv_exception(ec);
4204  if (NIL_P(exc))
4205  return Qnil;
4206  return exc;
4207 }
4208 
4209 /*
4210  * call-seq:
4211  * ec.replacement -> string
4212  *
4213  * Returns the replacement string.
4214  *
4215  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4216  * p ec.replacement #=> "?"
4217  *
4218  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4219  * p ec.replacement #=> "\uFFFD"
4220  */
4221 static VALUE
4222 econv_get_replacement(VALUE self)
4223 {
4224  rb_econv_t *ec = check_econv(self);
4225  int ret;
4226  rb_encoding *enc;
4227 
4228  ret = make_replacement(ec);
4229  if (ret == -1) {
4230  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4231  }
4232 
4233  enc = rb_enc_find(ec->replacement_enc);
4234  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4235 }
4236 
4237 /*
4238  * call-seq:
4239  * ec.replacement = string
4240  *
4241  * Sets the replacement string.
4242  *
4243  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4244  * ec.replacement = "<undef>"
4245  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4246  */
4247 static VALUE
4248 econv_set_replacement(VALUE self, VALUE arg)
4249 {
4250  rb_econv_t *ec = check_econv(self);
4251  VALUE string = arg;
4252  int ret;
4253  rb_encoding *enc;
4254 
4255  StringValue(string);
4256  enc = rb_enc_get(string);
4257 
4258  ret = rb_econv_set_replacement(ec,
4259  (const unsigned char *)RSTRING_PTR(string),
4260  RSTRING_LEN(string),
4261  rb_enc_name(enc));
4262 
4263  if (ret == -1) {
4264  /* xxx: rb_eInvalidByteSequenceError? */
4265  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4266  }
4267 
4268  return arg;
4269 }
4270 
4271 VALUE
4273 {
4274  return make_econv_exception(ec);
4275 }
4276 
4277 void
4279 {
4280  VALUE exc;
4281 
4282  exc = make_econv_exception(ec);
4283  if (NIL_P(exc))
4284  return;
4285  rb_exc_raise(exc);
4286 }
4287 
4288 /*
4289  * call-seq:
4290  * ecerr.source_encoding_name -> string
4291  *
4292  * Returns the source encoding name as a string.
4293  */
4294 static VALUE
4295 ecerr_source_encoding_name(VALUE self)
4296 {
4297  return rb_attr_get(self, id_source_encoding_name);
4298 }
4299 
4300 /*
4301  * call-seq:
4302  * ecerr.source_encoding -> encoding
4303  *
4304  * Returns the source encoding as an encoding object.
4305  *
4306  * Note that the result may not be equal to the source encoding of
4307  * the encoding converter if the conversion has multiple steps.
4308  *
4309  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4310  * begin
4311  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4312  * rescue Encoding::UndefinedConversionError
4313  * p $!.source_encoding #=> #<Encoding:UTF-8>
4314  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4315  * p $!.source_encoding_name #=> "UTF-8"
4316  * p $!.destination_encoding_name #=> "EUC-JP"
4317  * end
4318  *
4319  */
4320 static VALUE
4321 ecerr_source_encoding(VALUE self)
4322 {
4323  return rb_attr_get(self, id_source_encoding);
4324 }
4325 
4326 /*
4327  * call-seq:
4328  * ecerr.destination_encoding_name -> string
4329  *
4330  * Returns the destination encoding name as a string.
4331  */
4332 static VALUE
4333 ecerr_destination_encoding_name(VALUE self)
4334 {
4335  return rb_attr_get(self, id_destination_encoding_name);
4336 }
4337 
4338 /*
4339  * call-seq:
4340  * ecerr.destination_encoding -> string
4341  *
4342  * Returns the destination encoding as an encoding object.
4343  */
4344 static VALUE
4345 ecerr_destination_encoding(VALUE self)
4346 {
4347  return rb_attr_get(self, id_destination_encoding);
4348 }
4349 
4350 /*
4351  * call-seq:
4352  * ecerr.error_char -> string
4353  *
4354  * Returns the one-character string which cause Encoding::UndefinedConversionError.
4355  *
4356  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4357  * begin
4358  * ec.convert("\xa0")
4359  * rescue Encoding::UndefinedConversionError
4360  * puts $!.error_char.dump #=> "\xC2\xA0"
4361  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4362  * end
4363  *
4364  */
4365 static VALUE
4366 ecerr_error_char(VALUE self)
4367 {
4368  return rb_attr_get(self, id_error_char);
4369 }
4370 
4371 /*
4372  * call-seq:
4373  * ecerr.error_bytes -> string
4374  *
4375  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4376  *
4377  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4378  * begin
4379  * ec.convert("abc\xA1\xFFdef")
4380  * rescue Encoding::InvalidByteSequenceError
4381  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4382  * puts $!.error_bytes.dump #=> "\xA1"
4383  * puts $!.readagain_bytes.dump #=> "\xFF"
4384  * end
4385  */
4386 static VALUE
4387 ecerr_error_bytes(VALUE self)
4388 {
4389  return rb_attr_get(self, id_error_bytes);
4390 }
4391 
4392 /*
4393  * call-seq:
4394  * ecerr.readagain_bytes -> string
4395  *
4396  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4397  */
4398 static VALUE
4399 ecerr_readagain_bytes(VALUE self)
4400 {
4401  return rb_attr_get(self, id_readagain_bytes);
4402 }
4403 
4404 /*
4405  * call-seq:
4406  * ecerr.incomplete_input? -> true or false
4407  *
4408  * Returns true if the invalid byte sequence error is caused by
4409  * premature end of string.
4410  *
4411  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4412  *
4413  * begin
4414  * ec.convert("abc\xA1z")
4415  * rescue Encoding::InvalidByteSequenceError
4416  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4417  * p $!.incomplete_input? #=> false
4418  * end
4419  *
4420  * begin
4421  * ec.convert("abc\xA1")
4422  * ec.finish
4423  * rescue Encoding::InvalidByteSequenceError
4424  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4425  * p $!.incomplete_input? #=> true
4426  * end
4427  */
4428 static VALUE
4429 ecerr_incomplete_input(VALUE self)
4430 {
4431  return rb_attr_get(self, id_incomplete_input);
4432 }
4433 
4434 /*
4435  * Document-class: Encoding::UndefinedConversionError
4436  *
4437  * Raised by Encoding and String methods when a transcoding operation
4438  * fails.
4439  */
4440 
4441 /*
4442  * Document-class: Encoding::InvalidByteSequenceError
4443  *
4444  * Raised by Encoding and String methods when the string being
4445  * transcoded contains a byte invalid for either the source or
4446  * target encoding.
4447  */
4448 
4449 /*
4450  * Document-class: Encoding::ConverterNotFoundError
4451  *
4452  * Raised by transcoding methods when a named encoding does not
4453  * correspond with a known converter.
4454  */
4455 
4456 void
4457 Init_transcode(void)
4458 {
4459  transcoder_table = st_init_strcasetable();
4460 
4461  id_destination_encoding = rb_intern_const("destination_encoding");
4462  id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4463  id_error_bytes = rb_intern_const("error_bytes");
4464  id_error_char = rb_intern_const("error_char");
4465  id_incomplete_input = rb_intern_const("incomplete_input");
4466  id_readagain_bytes = rb_intern_const("readagain_bytes");
4467  id_source_encoding = rb_intern_const("source_encoding");
4468  id_source_encoding_name = rb_intern_const("source_encoding_name");
4469 
4470  sym_invalid = ID2SYM(rb_intern_const("invalid"));
4471  sym_undef = ID2SYM(rb_intern_const("undef"));
4472  sym_replace = ID2SYM(rb_intern_const("replace"));
4473  sym_fallback = ID2SYM(rb_intern_const("fallback"));
4474  sym_xml = ID2SYM(rb_intern_const("xml"));
4475  sym_text = ID2SYM(rb_intern_const("text"));
4476  sym_attr = ID2SYM(rb_intern_const("attr"));
4477 
4478  sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4479  sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4480  sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4481  sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4482  sym_finished = ID2SYM(rb_intern_const("finished"));
4483  sym_after_output = ID2SYM(rb_intern_const("after_output"));
4484  sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4485  sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4486  sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4487  sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4488  sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4489  sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4490 
4491 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4492  sym_newline = ID2SYM(rb_intern_const("newline"));
4493  sym_universal = ID2SYM(rb_intern_const("universal"));
4494  sym_crlf = ID2SYM(rb_intern_const("crlf"));
4495  sym_cr = ID2SYM(rb_intern_const("cr"));
4496  sym_lf = ID2SYM(rb_intern_const("lf"));
4497 #endif
4498 
4499  InitVM(transcode);
4500 }
4501 
4502 void
4503 InitVM_transcode(void)
4504 {
4505  rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4506  rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4507  rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4508 
4509  rb_define_method(rb_cString, "encode", str_encode, -1);
4510  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4511 
4512  rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4513  rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4514  rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4515  rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4516  rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4517  rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4518  rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4519  rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4520  rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4521  rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4522  rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4523  rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4524  rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4525  rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4526  rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4527  rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4528  rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4529  rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4530  rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4531 
4532  /* Document-const: INVALID_MASK
4533  *
4534  * Mask for invalid byte sequences
4535  */
4536  rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4537 
4538  /* Document-const: INVALID_REPLACE
4539  *
4540  * Replace invalid byte sequences
4541  */
4542  rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4543 
4544  /* Document-const: UNDEF_MASK
4545  *
4546  * Mask for a valid character in the source encoding but no related
4547  * character(s) in destination encoding.
4548  */
4549  rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4550 
4551  /* Document-const: UNDEF_REPLACE
4552  *
4553  * Replace byte sequences that are undefined in the destination encoding.
4554  */
4555  rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4556 
4557  /* Document-const: UNDEF_HEX_CHARREF
4558  *
4559  * Replace byte sequences that are undefined in the destination encoding
4560  * with an XML hexadecimal character reference. This is valid for XML
4561  * conversion.
4562  */
4563  rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4564 
4565  /* Document-const: PARTIAL_INPUT
4566  *
4567  * Indicates the source may be part of a larger string. See
4568  * primitive_convert for an example.
4569  */
4570  rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4571 
4572  /* Document-const: AFTER_OUTPUT
4573  *
4574  * Stop converting after some output is complete but before all of the
4575  * input was consumed. See primitive_convert for an example.
4576  */
4577  rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4578 
4579  /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4580  *
4581  * Decorator for converting CRLF and CR to LF
4582  */
4583  rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4584 
4585  /* Document-const: LF_NEWLINE_DECORATOR
4586  *
4587  * Decorator for converting CRLF and CR to LF when writing
4588  */
4589  rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4590 
4591  /* Document-const: CRLF_NEWLINE_DECORATOR
4592  *
4593  * Decorator for converting LF to CRLF
4594  */
4595  rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4596 
4597  /* Document-const: CR_NEWLINE_DECORATOR
4598  *
4599  * Decorator for converting LF to CR
4600  */
4601  rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4602 
4603  /* Document-const: XML_TEXT_DECORATOR
4604  *
4605  * Escape as XML CharData
4606  */
4607  rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4608 
4609  /* Document-const: XML_ATTR_CONTENT_DECORATOR
4610  *
4611  * Escape as XML AttValue
4612  */
4613  rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4614 
4615  /* Document-const: XML_ATTR_QUOTE_DECORATOR
4616  *
4617  * Escape as XML AttValue
4618  */
4619  rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4620 
4621  rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4622  rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4623  rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4624  rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4625  rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4626 
4627  rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4628  rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4629  rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4630  rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4631  rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4632  rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4633  rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4634 
4635  Init_newline();
4636 }
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition: coderange.h:33
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
Definition: cxxanyargs.hpp:685
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:1012
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition: class.c:2635
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a method.
Definition: class.c:2142
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition: transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition: transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition: string.h:1675
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition: coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition: coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition: transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition: memory.h:398
#define ALLOC
Old name of RB_ALLOC.
Definition: memory.h:395
#define xfree
Old name of ruby_xfree.
Definition: xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition: long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition: fl_type.h:137
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition: transcode.h:537
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition: transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition: transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition: xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition: symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition: fl_type.h:135
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition: transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition: transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition: array.h:659
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition: coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition: transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition: xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition: transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition: value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition: memory.h:394
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition: encoding.h:517
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition: error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition: transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition: transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition: int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition: transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition: int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition: coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition: transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition: value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition: encoding.h:516
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition: transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition: long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition: transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition: array.h:657
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition: coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition: value_type.h:88
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition: error.c:1375
void rb_raise(VALUE exc_class, const char *fmt,...)
Exception entry point.
Definition: error.c:3635
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition: eval.c:676
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition: error.c:1358
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition: error.c:1089
VALUE rb_eTypeError
TypeError exception.
Definition: error.c:1408
VALUE rb_eRuntimeError
RuntimeError exception.
Definition: error.c:1406
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby string instead of a C string.
Definition: error.c:1459
VALUE rb_eArgError
ArgumentError exception.
Definition: error.c:1409
VALUE rb_eEncodingError
EncodingError exception.
Definition: error.c:1414
void rb_warning(const char *fmt,...)
Issues a warning.
Definition: error.c:497
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition: object.c:247
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
VALUE rb_cString
String class.
Definition: string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition: object.c:3188
Encoding related APIs.
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1191
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:920
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:261
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1022
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.
Definition: encoding.c:859
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:323
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:182
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:768
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:566
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1475
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:402
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of...
Definition: encoding.c:1685
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:994
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1028
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:591
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:417
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:824
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition: string.c:1285
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition: string.c:900
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition: string.c:1068
VALUE rb_obj_encoding(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1163
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition: string.c:784
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition: transcode.c:2600
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition: transcode.c:2097
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition: transcode.c:2645
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition: transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition: transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition: transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition: transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition: transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition: transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition: transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition: transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition: transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition: transcode.c:1770
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition: transcode.c:3211
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition: transcode.c:1919
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition: transcode.c:1910
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition: transcode.c:1616
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition: transcode.c:1931
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition: transcode.c:1979
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition: transcode.c:1996
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition: transcode.c:1962
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition: transcode.c:2914
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition: transcode.c:4272
struct rb_econv_t rb_econv_t
An opaque struct that represents a lowest level of encoding conversion.
Definition: transcode.h:73
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition: transcode.c:4278
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition: transcode.c:1814
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition: transcode.c:1925
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition: transcode.c:2651
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition: transcode.c:1731
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition: transcode.c:1847
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition: transcode.c:1781
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition: transcode.c:2259
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition: transcode.c:1098
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition: transcode.c:1532
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition: vm_eval.c:1150
VALUE rb_check_array_type(VALUE obj)
Try converting an object to its array representation using its to_ary method, if any.
Definition: array.c:1008
VALUE rb_ary_new(void)
Allocates a new, empty array.
Definition: array.c:741
VALUE rb_ary_push(VALUE ary, VALUE elem)
Special case of rb_ary_cat() that it adds only one element.
Definition: array.c:1378
VALUE rb_ary_entry(VALUE ary, long off)
Queries an element of an array.
Definition: array.c:1731
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Identical to rb_ary_new_from_values(), except it expects exactly two parameters.
Definition: array.c:995
void rb_ary_store(VALUE ary, long key, VALUE val)
Destructively stores the passed value to the passed array's passed index.
Definition: array.c:1201
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition: error.h:284
VALUE rb_check_hash_type(VALUE obj)
Try converting an object to its hash representation using its to_hash method, if any.
Definition: hash.c:1864
VALUE rb_hash_freeze(VALUE obj)
Just another name of rb_obj_freeze.
Definition: hash.c:108
VALUE rb_hash_aref(VALUE hash, VALUE key)
Queries the given key in the given hash table.
Definition: hash.c:2073
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Inserts or replaces ("upsert"s) the objects into the given hash table.
Definition: hash.c:2893
VALUE rb_hash_new(void)
Creates a new, empty hash object.
Definition: hash.c:1475
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition: proc.c:971
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition: proc.c:1610
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition: proc.c:2488
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition: proc.c:119
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition: string.c:1671
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition: string.c:1713
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition: string.c:954
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition: string.c:1461
VALUE rb_str_cat2(VALUE, const char *)
Just another name of rb_str_cat_cstr.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition: string.c:1916
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition: string.c:2640
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3267
VALUE rb_str_new(const char *ptr, long len)
Allocates an instance of rb_cString.
Definition: string.c:1050
VALUE rb_str_new_cstr(const char *ptr)
Identical to rb_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition: string.c:1074
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition: string.c:3315
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition: string.c:2648
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition: string.c:7310
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition: string.c:1643
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition: string.c:5695
VALUE rb_attr_get(VALUE obj, ID name)
Identical to rb_ivar_get().
Definition: variable.c:1370
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition: variable.c:1871
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition: vm_method.c:2960
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition: symbol.h:277
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition: symbol.c:823
VALUE rb_sym2str(VALUE symbol)
Obtain a frozen string representation of a symbol (not including the leading colon).
Definition: symbol.c:970
void rb_define_const(VALUE klass, const char *name, VALUE val)
Defines a Ruby level constant under a namespace.
Definition: variable.c:3726
char * ptr
Pointer to the underlying memory region, of at least capa bytes.
Definition: io.h:2
int off
Offset inside of ptr.
Definition: io.h:5
int len
Length of the buffer.
Definition: io.h:8
VALUE rb_sprintf(const char *fmt,...)
Ruby's extended sprintf(3).
Definition: sprintf.c:1217
VALUE rb_str_catf(VALUE dst, const char *fmt,...)
Identical to rb_sprintf(), except it renders the output to the specified object rather than creating a new one.
Definition: sprintf.c:1240
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition: memory.h:367
#define ALLOCA_N(type, n)
Definition: memory.h:287
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition: memory.h:162
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition: memory.h:379
int st_foreach(st_table *q, int_type *w, st_data_t e)
Iteration over the given table.
Definition: cxxanyargs.hpp:432
#define RARRAY_LEN
Just another name of rb_array_len.
Definition: rarray.h:51
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition: rarray.h:281
#define RARRAY_AREF(a, i)
Definition: rarray.h:403
#define DATA_PTR(obj)
Convenient getter macro.
Definition: rdata.h:67
#define StringValue(v)
Ensures that the parameter object is a String.
Definition: rstring.h:66
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition: rstring.h:442
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition: rstring.h:416
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition: rstring.h:367
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C string.
Definition: rstring.h:89
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition: rtypeddata.h:515
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition: rtypeddata.h:449
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition: variable.c:427
#define InitVM(ext)
This macro is for internal use.
Definition: ruby.h:231
#define RTEST
This is an old name of RB_TEST.
rb_data_type_struct
This is the struct that holds necessary info for a struct.
Definition: rtypeddata.h:200
Definition: st.h:79
Definition: string.c:8268
Definition: transcode.c:175
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition: value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition: value_type.h:376