Ruby  3.1.0dev(2021-09-10revisionb76ad15ed0da636161de0243c547ee1e6fc95681)
nkf.c
Go to the documentation of this file.
1 /*
2  * NKF - Ruby extension for Network Kanji Filter
3  *
4  * original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
5  *
6  * $Id$
7  *
8  */
9 
10 #define RUBY_NKF_REVISION "$Revision$"
11 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
12 
13 #include "ruby/ruby.h"
14 #include "ruby/encoding.h"
15 
16 /* Replace nkf's getchar/putchar for variable modification */
17 /* we never use getc, ungetc */
18 
19 #undef getc /* drop any stdio definition so the in-memory version below is used */
20 #undef ungetc /* likewise for ungetc */
21 #define getc(f) (input_ctr>=i_len?-1:input[input_ctr++]) /* next byte of input[], or -1 once input_ctr reaches i_len; f is ignored */
22 #define ungetc(c,f) input_ctr-- /* step the read index back by one; both arguments are ignored */
23 
24 #define INCSIZE 32 /* initial growth step for the output buffer (copied into incsize by rb_nkf_convert) */
25 #undef putchar /* drop any stdio definition before redirecting putchar below */
26 #undef TRUE /* NOTE(review): presumably cleared so the included nkf sources can define their own - confirm */
27 #undef FALSE /* NOTE(review): same assumption as TRUE above */
28 #define putchar(c) rb_nkf_putchar(c) /* every putchar in the included nkf code appends to the Ruby result string */
29 
30 /* Input/Output pointers */
31 
32 static unsigned char *output; /* start of the result string's byte buffer; refreshed after every resize */
33 static unsigned char *input; /* bytes of the source string being read by getc */
34 static int input_ctr; /* index of the next byte getc will return from input */
35 static int i_len; /* number of bytes available in input */
36 static int output_ctr; /* index of the next byte rb_nkf_putchar will write in output */
37 static int o_len; /* current capacity of output in bytes */
38 static int incsize; /* how many bytes to add on the next growth; doubled after each growth */
39 
40 static VALUE result; /* Ruby String receiving the output while kanji_convert runs; reset to Qnil afterwards */
41 
42 static int
43 rb_nkf_putchar(unsigned int c)
44 {
45  if (output_ctr >= o_len) {
46  o_len += incsize;
47  rb_str_resize(result, o_len);
48  incsize *= 2;
49  output = (unsigned char *)RSTRING_PTR(result);
50  }
51  output[output_ctr++] = c;
52 
53  return c;
54 }
55 
56 /* Include kanji filter main part */
57 /* getchar and putchar will be replaced during inclusion */
58 
59 #define PERL_XS 1
60 #include "nkf-utf8/config.h"
61 #include "nkf-utf8/utf8tbl.c"
62 #include "nkf-utf8/nkf.c"
63 
65 {
66  int idx = rb_enc_find_index(name);
67  if (idx < 0) {
68  nkf_encoding *nkf_enc = nkf_enc_find(name);
70  if (idx < 0) {
72  }
73  }
74  return rb_enc_from_index(idx);
75 }
76 
77 int nkf_split_options(const char *arg)
78 {
79  int count = 0;
80  unsigned char option[256];
81  int i = 0, j = 0;
82  int is_escaped = FALSE;
83  int is_single_quoted = FALSE;
84  int is_double_quoted = FALSE;
85  for(i = 0; arg[i]; i++){
86  if(j == 255){
87  return -1;
88  }else if(is_single_quoted){
89  if(arg[i] == '\''){
90  is_single_quoted = FALSE;
91  }else{
92  option[j++] = arg[i];
93  }
94  }else if(is_escaped){
95  is_escaped = FALSE;
96  option[j++] = arg[i];
97  }else if(arg[i] == '\\'){
98  is_escaped = TRUE;
99  }else if(is_double_quoted){
100  if(arg[i] == '"'){
101  is_double_quoted = FALSE;
102  }else{
103  option[j++] = arg[i];
104  }
105  }else if(arg[i] == '\''){
106  is_single_quoted = TRUE;
107  }else if(arg[i] == '"'){
108  is_double_quoted = TRUE;
109  }else if(arg[i] == ' '){
110  option[j] = '\0';
111  options(option);
112  j = 0;
113  }else{
114  option[j++] = arg[i];
115  }
116  }
117  if(j){
118  option[j] = '\0';
119  options(option);
120  }
121  return count;
122 }
123 
124 /*
125  * call-seq:
126  * NKF.nkf(opt, str) => string
127  *
128  * Convert _str_ and return converted result.
129  * Conversion details are specified by _opt_ as String.
130  *
131  * require 'nkf'
132  * output = NKF.nkf("-s", input)
133  */
134 
135 static VALUE
136 rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
137 {
138  VALUE tmp;
139  reinit();
141  if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
142 
143  switch (nkf_enc_to_index(output_encoding)) {
144  case UTF_8_BOM: output_encoding = nkf_enc_from_index(UTF_8); break;
145  case UTF_16BE_BOM: output_encoding = nkf_enc_from_index(UTF_16BE); break;
146  case UTF_16LE_BOM: output_encoding = nkf_enc_from_index(UTF_16LE); break;
147  case UTF_32BE_BOM: output_encoding = nkf_enc_from_index(UTF_32BE); break;
148  case UTF_32LE_BOM: output_encoding = nkf_enc_from_index(UTF_32LE); break;
149  }
150  output_bom_f = FALSE;
151 
152  incsize = INCSIZE;
153 
154  input_ctr = 0;
155  input = (unsigned char *)StringValuePtr(src);
156  i_len = RSTRING_LENINT(src);
157  tmp = rb_str_new(0, i_len*3 + 10);
158 
159  output_ctr = 0;
160  output = (unsigned char *)RSTRING_PTR(tmp);
161  o_len = RSTRING_LENINT(tmp);
162  *output = '\0';
163 
164  /* use _result_ begin*/
165  result = tmp;
166  kanji_convert(NULL);
167  result = Qnil;
168  /* use _result_ end */
169 
170  rb_str_set_len(tmp, output_ctr);
171 
172  if (mimeout_f)
174  else
175  rb_enc_associate(tmp, rb_nkf_enc_get(nkf_enc_name(output_encoding)));
176 
177  return tmp;
178 }
179 
180 
181 /*
182  * call-seq:
183  * NKF.guess(str) => encoding
184  *
185  * Returns guessed encoding of _str_ by nkf routine.
186  *
187  */
188 
189 static VALUE
190 rb_nkf_guess(VALUE obj, VALUE src)
191 {
192  reinit();
193 
194  input_ctr = 0;
195  input = (unsigned char *)StringValuePtr(src);
196  i_len = RSTRING_LENINT(src);
197 
198  guess_f = TRUE;
199  kanji_convert( NULL );
200  guess_f = FALSE;
201 
202  return rb_enc_from_encoding(rb_nkf_enc_get(get_guessed_code()));
203 }
204 
205 
206 /*
207  * NKF - Ruby extension for Network Kanji Filter
208  *
209  * == Description
210  *
211  * This is a Ruby Extension version of nkf (Network Kanji Filter).
212  * It converts the first argument and returns converted result. Conversion
213  * details are specified by flags as the first argument.
214  *
215  * *Nkf* is yet another kanji code converter among networks, hosts and terminals.
216  * It converts input kanji code to designated kanji code
217  * such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16.
218  *
219  * One of the most distinctive features of *nkf* is that it guesses the input kanji encoding.
220  * It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16.
221  * So users needn't set the input kanji code explicitly.
222  *
223  * By default, X0201 kana is converted into X0208 kana.
224  * For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
225  * For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
226  * To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>.
227  *
228  * == Flags
229  *
230  * === -b -u
231  *
232  * Output is buffered (DEFAULT), Output is unbuffered.
233  *
234  * === -j -s -e -w -w16 -w32
235  *
236  * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
237  * UTF-8N, UTF-16BE, UTF-32BE.
238  * Without this option and compile option, ISO-2022-JP is assumed.
239  *
240  * === -J -S -E -W -W16 -W32
241  *
242  * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
243  * UTF-8, UTF-16, UTF-32.
244  *
245  * ==== -J
246  *
247  * Assume JIS input. It also accepts EUC-JP.
248  * This is the default. This flag does not exclude Shift_JIS.
249  *
250  * ==== -S
251  *
252  * Assume Shift_JIS and X0201 kana input. It also accepts JIS.
253  * EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag,
254  * X0201 kana (halfwidth kana) is converted into X0208.
255  *
256  * ==== -E
257  *
258  * Assume EUC-JP input. It also accepts JIS.
259  * Same as -J.
260  *
261  * === -t
262  *
263  * No conversion.
264  *
265  * === -i_
266  *
267  * Output sequence to designate JIS-kanji. (DEFAULT B)
268  *
269  * === -o_
270  *
271  * Output sequence to designate ASCII. (DEFAULT B)
272  *
273  * === -r
274  *
275  * {de/en}crypt ROT13/47
276  *
277  * === -h[123] --hiragana --katakana --katakana-hiragana
278  *
279  * [-h1 --hiragana] Katakana to Hiragana conversion.
280  *
281  * [-h2 --katakana] Hiragana to Katakana conversion.
282  *
283  * [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion.
284  *
285  * === -T
286  *
287  * Text mode output (MS-DOS)
288  *
289  * === -l
290  *
291  * ISO8859-1 (Latin-1) support
292  *
293  * === -f[<code>m</code> [- <code>n</code>]]
294  *
295  * Folding on <code>m</code> length with <code>n</code> margin in a line.
296  * Without this option, fold length is 60 and fold margin is 10.
297  *
298  * === -F
299  *
300  * New line preserving line folding.
301  *
302  * === -Z[0-3]
303  *
304  * Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
305  *
306  * [-Z -Z0] Convert X0208 alphabet to ASCII.
307  *
308  * [-Z1] Converts X0208 kankaku to single ASCII space.
309  *
310  * [-Z2] Converts X0208 kankaku to double ASCII spaces.
311  *
312  * [-Z3] Replacing Fullwidth >, <, ", & into '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
313  *
314  * === -X -x
315  *
316  * Assume X0201 kana in MS-Kanji.
317  * With <b>-X</b> or without this option, X0201 is converted into X0208 Kana.
318  * With <b>-x</b>, try to preserve X0208 kana and do not convert X0201 kana to X0208.
319  * In JIS output, ESC-(-I is used. In EUC output, SSO is used.
320  *
321  * === -B[0-2]
322  *
323  * Assume broken JIS-Kanji input, which lost ESC.
324  * Useful when your site is using old B-News Nihongo patch.
325  *
326  * [-B1] allows any char after ESC-( or ESC-$.
327  *
328  * [-B2] forces ASCII after NL.
329  *
330  * === -I
331  *
332  * Replacing non iso-2022-jp char into a geta character
333  * (substitute character in Japanese).
334  *
335  * === -d -c
336  *
337  * Delete \r in line feed, Add \r in line feed.
338  *
339  * === -m[BQN0]
340  *
341  * MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
342  * To see ISO8859-1 (Latin-1) -l is necessary.
343  *
344  * [-mB] Decode MIME base64 encoded stream. Remove header or other part before
345  * conversion.
346  *
347  * [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
348  *
349  * [-mN] Non-strict decoding.
350  * It allows line break in the middle of the base64 encoding.
351  *
352  * [-m0] No MIME decode.
353  *
354  * === -M
355  *
356  * MIME encode. Header style. All ASCII code and control characters are intact.
357  * Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
358  *
359  * [-MB] MIME encode Base64 stream.
360  *
361  * [-MQ] Perform quoted encoding.
362  *
363  * === -l
364  *
365  * Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
366  * <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option.
367  *
368  * === -L[uwm]
369  *
370  * new line mode
371  * Without this option, nkf doesn't convert line breaks.
372  *
373  * [-Lu] unix (LF)
374  *
375  * [-Lw] windows (CRLF)
376  *
377  * [-Lm] mac (CR)
378  *
379  * === --fj --unix --mac --msdos --windows
380  *
381  * Convert for these systems.
382  *
383  * === --jis --euc --sjis --mime --base64
384  *
385  * convert for named code
386  *
387  * === --jis-input --euc-input --sjis-input --mime-input --base64-input
388  *
389  * assume input system
390  *
391  * === --ic=<code>input codeset</code> --oc=<code>output codeset</code>
392  *
393  * Set the input or output codeset.
394  * NKF supports the following codesets; codeset names are case insensitive.
395  *
396  * [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET
397  *
398  * [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS
399  *
400  * [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii
401  *
402  * [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms
403  *
404  * [CP51932] Microsoft Version of EUC-JP.
405  *
406  * [Shift_JIS] SJIS, MS-Kanji
407  *
408  * [Windows-31J] a.k.a. CP932
409  *
410  * [UTF-8] same as UTF-8N
411  *
412  * [UTF-8N] UTF-8 without BOM
413  *
414  * [UTF-8-BOM] UTF-8 with BOM
415  *
416  * [UTF-16] same as UTF-16BE
417  *
418  * [UTF-16BE] UTF-16 Big Endian without BOM
419  *
420  * [UTF-16BE-BOM] UTF-16 Big Endian with BOM
421  *
422  * [UTF-16LE] UTF-16 Little Endian without BOM
423  *
424  * [UTF-16LE-BOM] UTF-16 Little Endian with BOM
425  *
426  * [UTF-32] same as UTF-32BE
427  *
428  * [UTF-32BE] UTF-32 Big Endian without BOM
429  *
430  * [UTF-32BE-BOM] UTF-32 Big Endian with BOM
431  *
432  * [UTF-32LE] UTF-32 Little Endian without BOM
433  *
434  * [UTF-32LE-BOM] UTF-32 Little Endian with BOM
435  *
436  * [UTF8-MAC] NFDed UTF-8, a.k.a. UTF8-NFD (input only)
437  *
438  * === --fb-{skip, html, xml, perl, java, subchar}
439  *
440  * Specify the way that nkf handles unassigned characters.
441  * Without this option, --fb-skip is assumed.
442  *
443  * === --prefix= <code>escape character</code> <code>target character</code> ..
444  *
445  * When nkf converts to Shift_JIS,
446  * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
447  * 1st byte of argument is the escape character and following bytes are target characters.
448  *
449  * === --no-cp932ext
450  *
451  * Handle the characters extended in CP932 as unassigned characters.
452  *
453  * === --no-best-fit-chars
454  *
455  * When converting from Unicode to an encoded byte sequence,
456  * don't convert characters which are not round-trip safe.
457  * When converting from Unicode to Unicode,
458  * nkf can be used as a UTF converter with this and the -x option.
459  * (In other words, without this and the -x option, nkf doesn't preserve some characters.)
460  *
461  * When nkf converts a string that is related to a path, you should use this option.
462  *
463  * === --cap-input
464  *
465  * Decode hex encoded characters.
466  *
467  * === --url-input
468  *
469  * Unescape percent escaped characters.
470  *
471  * === --
472  *
473  * Ignore rest of -option.
474  */
475 
476 void
477 Init_nkf(void)
478 {
479  VALUE mNKF = rb_define_module("NKF");
480 
481  rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
482  rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
483  rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
484 
485  rb_define_const(mNKF, "AUTO", Qnil);
486  rb_define_const(mNKF, "NOCONV", Qnil);
487  rb_define_const(mNKF, "UNKNOWN", Qnil);
488  rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
489  rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
490  rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
491  rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
492  rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
494  rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
495  rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
496 
497  /* Full version string of nkf */
498  rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
499  /* Version of nkf */
500  rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
501  /* Release date of nkf */
502  rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
503 }
rb_nkf_enc_get
rb_encoding * rb_nkf_enc_get(const char *name)
Definition: nkf.c:64
TRUE
#define TRUE
Definition: nkf.h:175
UTF_16BE_BOM
@ UTF_16BE_BOM
Definition: nkf.c:114
StringValuePtr
#define StringValuePtr(v)
Definition: rstring.h:51
encoding.h
rb_eArgError
VALUE rb_eArgError
Definition: error.c:1094
rb_str_resize
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2850
rb_define_module
VALUE rb_define_module(const char *name)
Definition: class.c:887
config.h
NKF_VERSION
#define NKF_VERSION
Definition: nkf.c:23
rb_str_set_len
void rb_str_set_len(VALUE, long)
Definition: string.c:2833
INCSIZE
#define INCSIZE
Definition: nkf.c:24
UTF_8
@ UTF_8
Definition: nkf.c:108
RUBY_NKF_VERSION
#define RUBY_NKF_VERSION
Definition: nkf.c:11
UTF_32BE
@ UTF_32BE
Definition: nkf.c:118
nkf_enc_name
#define nkf_enc_name(enc)
Definition: nkf.c:758
rb_enc_from_encoding
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:188
ruby.h
rb_define_alias
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:2050
UTF_32BE_BOM
@ UTF_32BE_BOM
Definition: nkf.c:119
rb_str_new
#define rb_str_new(str, len)
Definition: string.h:213
UTF_16LE_BOM
@ UTF_16LE_BOM
Definition: nkf.c:116
rb_raise
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:3022
Init_nkf
void Init_nkf(void)
Definition: nkf.c:477
UTF_16LE
@ UTF_16LE
Definition: nkf.c:115
rb_enc_from_index
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:414
rb_define_dummy_encoding
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:616
UTF_8_BOM
@ UTF_8_BOM
Definition: nkf.c:110
UTF_32LE_BOM
@ UTF_32LE_BOM
Definition: nkf.c:121
Qnil
#define Qnil
Definition: special_consts.h:51
input
unsigned int input
Definition: nkf.c:4325
NULL
#define NULL
Definition: regenc.h:69
FALSE
#define FALSE
Definition: nkf.h:174
nkf_split_options
int nkf_split_options(const char *arg)
Definition: nkf.c:77
UTF_16BE
@ UTF_16BE
Definition: nkf.c:113
nkf_enc_to_base_encoding
#define nkf_enc_to_base_encoding(enc)
Definition: nkf.c:760
VALUE
unsigned long VALUE
Definition: value.h:38
nkf_encoding
Definition: nkf.c:161
rb_utf8_encoding
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1539
RSTRING_PTR
#define RSTRING_PTR(string)
Definition: fbuffer.h:19
rb_enc_find_index
int rb_enc_find_index(const char *name)
Definition: encoding.c:879
nkf_enc_to_index
#define nkf_enc_to_index(enc)
Definition: nkf.c:759
rb_define_const
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:3168
rb_singleton_class
VALUE rb_singleton_class(VALUE obj)
Returns the singleton class of obj.
Definition: class.c:1975
rb_str_new2
#define rb_str_new2
Definition: string.h:276
rb_encoding
const typedef OnigEncodingType rb_encoding
Definition: encoding.h:104
count
int count
Definition: nkf.c:5055
StringValueCStr
#define StringValueCStr(v)
Definition: rstring.h:52
rb_define_module_function
#define rb_define_module_function(klass, mid, func, arity)
Defines klass#mid and makes it a module function.
Definition: cxxanyargs.hpp:674
NKF_RELEASE_DATE
#define NKF_RELEASE_DATE
Definition: nkf.c:24
utf8tbl.c
UTF_32LE
@ UTF_32LE
Definition: nkf.c:120
nkf.c
rb_enc_associate
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:1064
rb_usascii_encoding
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1551
name
const char * name
Definition: nkf.c:208