Ruby 3.1.0dev (2021-09-10 revision b76ad15ed0da636161de0243c547ee1e6fc95681)
escape.c
Go to the documentation of this file.
1 #include "ruby.h"
2 #include "ruby/encoding.h"
3 
4 RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow);
7 #define lower_hexdigits (ruby_hexdigits+0)
8 #define upper_hexdigits (ruby_hexdigits+16)
9 #define char_to_number(c) ruby_digit36_to_number_table[(unsigned char)(c)]
10 
11 static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
12 static ID id_accept_charset;
13 
14 #define HTML_ESCAPE_MAX_LEN 6
15 
16 static const struct {
19 } html_escape_table[UCHAR_MAX+1] = {
20 #define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str}
21  HTML_ESCAPE('\'', "'"),
22  HTML_ESCAPE('&', "&"),
23  HTML_ESCAPE('"', """),
24  HTML_ESCAPE('<', "&lt;"),
25  HTML_ESCAPE('>', "&gt;"),
26 #undef HTML_ESCAPE
27 };
28 
29 static inline void
30 preserve_original_state(VALUE orig, VALUE dest)
31 {
32  rb_enc_associate(dest, rb_enc_get(orig));
33 }
34 
35 static VALUE
36 optimized_escape_html(VALUE str)
37 {
38  VALUE vbuf;
39  char *buf = ALLOCV_N(char, vbuf, RSTRING_LEN(str) * HTML_ESCAPE_MAX_LEN);
40  const char *cstr = RSTRING_PTR(str);
41  const char *end = cstr + RSTRING_LEN(str);
42 
43  char *dest = buf;
44  while (cstr < end) {
45  const unsigned char c = *cstr++;
46  uint8_t len = html_escape_table[c].len;
47  if (len) {
48  memcpy(dest, html_escape_table[c].str, len);
49  dest += len;
50  }
51  else {
52  *dest++ = c;
53  }
54  }
55 
56  VALUE escaped;
57  if (RSTRING_LEN(str) < (dest - buf)) {
58  escaped = rb_str_new(buf, dest - buf);
59  preserve_original_state(str, escaped);
60  }
61  else {
62  escaped = rb_str_dup(str);
63  }
64  ALLOCV_END(vbuf);
65  return escaped;
66 }
67 
68 static VALUE
69 optimized_unescape_html(VALUE str)
70 {
71  enum {UNICODE_MAX = 0x10ffff};
72  rb_encoding *enc = rb_enc_get(str);
73  unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX :
74  strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 :
75  128);
76  long i, len, beg = 0;
77  size_t clen, plen;
78  int overflow;
79  const char *cstr;
80  char buf[6];
81  VALUE dest = 0;
82 
83  len = RSTRING_LEN(str);
84  cstr = RSTRING_PTR(str);
85 
86  for (i = 0; i < len; i++) {
87  unsigned long cc;
88  char c = cstr[i];
89  if (c != '&') continue;
90  plen = i - beg;
91  if (++i >= len) break;
92  c = (unsigned char)cstr[i];
93 #define MATCH(s) (len - i >= (int)rb_strlen_lit(s) && \
94  memcmp(&cstr[i], s, rb_strlen_lit(s)) == 0 && \
95  (i += rb_strlen_lit(s) - 1, 1))
96  switch (c) {
97  case 'a':
98  ++i;
99  if (MATCH("pos;")) {
100  c = '\'';
101  }
102  else if (MATCH("mp;")) {
103  c = '&';
104  }
105  else continue;
106  break;
107  case 'q':
108  ++i;
109  if (MATCH("uot;")) {
110  c = '"';
111  }
112  else continue;
113  break;
114  case 'g':
115  ++i;
116  if (MATCH("t;")) {
117  c = '>';
118  }
119  else continue;
120  break;
121  case 'l':
122  ++i;
123  if (MATCH("t;")) {
124  c = '<';
125  }
126  else continue;
127  break;
128  case '#':
129  if (len - ++i >= 2 && ISDIGIT(cstr[i])) {
130  cc = ruby_scan_digits(&cstr[i], len-i, 10, &clen, &overflow);
131  }
132  else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) {
133  cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow);
134  }
135  else continue;
136  i += clen;
137  if (overflow || cc >= charlimit || cstr[i] != ';') continue;
138  if (!dest) {
139  dest = rb_str_buf_new(len);
140  }
141  rb_str_cat(dest, cstr + beg, plen);
142  if (charlimit > 256) {
143  rb_str_cat(dest, buf, rb_enc_mbcput((OnigCodePoint)cc, buf, enc));
144  }
145  else {
146  c = (unsigned char)cc;
147  rb_str_cat(dest, &c, 1);
148  }
149  beg = i + 1;
150  continue;
151  default:
152  --i;
153  continue;
154  }
155  if (!dest) {
156  dest = rb_str_buf_new(len);
157  }
158  rb_str_cat(dest, cstr + beg, plen);
159  rb_str_cat(dest, &c, 1);
160  beg = i + 1;
161  }
162 
163  if (dest) {
164  rb_str_cat(dest, cstr + beg, len - beg);
165  preserve_original_state(str, dest);
166  return dest;
167  }
168  else {
169  return rb_str_dup(str);
170  }
171 }
172 
/*
 * Return 1 when +c+ may appear unescaped in URL-encoded output
 * (ASCII letters, ASCII digits, '-', '.', '_' and '~'), 0 otherwise.
 *
 * The range comparisons assume an ASCII-compatible execution character
 * set, where 'a'..'z' and 'A'..'Z' are contiguous.
 */
static unsigned char
url_unreserved_char(unsigned char c)
{
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'a' && c <= 'z') return 1;
    if (c >= 'A' && c <= 'Z') return 1;

    return (c == '-' || c == '.' || c == '_' || c == '~') ? 1 : 0;
}
191 
192 static VALUE
193 optimized_escape(VALUE str)
194 {
195  long i, len, beg = 0;
196  VALUE dest = 0;
197  const char *cstr;
198  char buf[4] = {'%'};
199 
200  len = RSTRING_LEN(str);
201  cstr = RSTRING_PTR(str);
202 
203  for (i = 0; i < len; ++i) {
204  const unsigned char c = (unsigned char)cstr[i];
205  if (!url_unreserved_char(c)) {
206  if (!dest) {
207  dest = rb_str_buf_new(len);
208  }
209 
210  rb_str_cat(dest, cstr + beg, i - beg);
211  beg = i + 1;
212 
213  if (c == ' ') {
214  rb_str_cat_cstr(dest, "+");
215  }
216  else {
217  buf[1] = upper_hexdigits[(c >> 4) & 0xf];
218  buf[2] = upper_hexdigits[c & 0xf];
219  rb_str_cat(dest, buf, 3);
220  }
221  }
222  }
223 
224  if (dest) {
225  rb_str_cat(dest, cstr + beg, len - beg);
226  preserve_original_state(str, dest);
227  return dest;
228  }
229  else {
230  return rb_str_dup(str);
231  }
232 }
233 
234 static VALUE
235 optimized_unescape(VALUE str, VALUE encoding)
236 {
237  long i, len, beg = 0;
238  VALUE dest = 0;
239  const char *cstr;
240  rb_encoding *enc = rb_to_encoding(encoding);
241  int cr, origenc, encidx = rb_enc_to_index(enc);
242 
243  len = RSTRING_LEN(str);
244  cstr = RSTRING_PTR(str);
245 
246  for (i = 0; i < len; ++i) {
247  char buf[1];
248  const char c = cstr[i];
249  int clen = 0;
250  if (c == '%') {
251  if (i + 3 > len) break;
252  if (!ISXDIGIT(cstr[i+1])) continue;
253  if (!ISXDIGIT(cstr[i+2])) continue;
254  buf[0] = ((char_to_number(cstr[i+1]) << 4)
255  | char_to_number(cstr[i+2]));
256  clen = 2;
257  }
258  else if (c == '+') {
259  buf[0] = ' ';
260  }
261  else {
262  continue;
263  }
264 
265  if (!dest) {
266  dest = rb_str_buf_new(len);
267  }
268 
269  rb_str_cat(dest, cstr + beg, i - beg);
270  i += clen;
271  beg = i + 1;
272 
273  rb_str_cat(dest, buf, 1);
274  }
275 
276  if (dest) {
277  rb_str_cat(dest, cstr + beg, len - beg);
278  preserve_original_state(str, dest);
280  }
281  else {
282  dest = rb_str_dup(str);
283  cr = ENC_CODERANGE(str);
284  }
285  origenc = rb_enc_get_index(str);
286  if (origenc != encidx) {
287  rb_enc_associate_index(dest, encidx);
289  rb_enc_associate_index(dest, origenc);
290  if (cr != ENC_CODERANGE_UNKNOWN)
291  ENC_CODERANGE_SET(dest, cr);
292  }
293  }
294  return dest;
295 }
296 
297 /*
298  * call-seq:
299  * CGI.escapeHTML(string) -> string
300  *
301  * Returns HTML-escaped string.
302  *
303  */
304 static VALUE
305 cgiesc_escape_html(VALUE self, VALUE str)
306 {
307  StringValue(str);
308 
310  return optimized_escape_html(str);
311  }
312  else {
313  return rb_call_super(1, &str);
314  }
315 }
316 
317 /*
318  * call-seq:
319  * CGI.unescapeHTML(string) -> string
320  *
321  * Returns HTML-unescaped string.
322  *
323  */
324 static VALUE
325 cgiesc_unescape_html(VALUE self, VALUE str)
326 {
327  StringValue(str);
328 
330  return optimized_unescape_html(str);
331  }
332  else {
333  return rb_call_super(1, &str);
334  }
335 }
336 
337 /*
338  * call-seq:
339  * CGI.escape(string) -> string
340  *
341  * Returns URL-escaped string.
342  *
343  */
344 static VALUE
345 cgiesc_escape(VALUE self, VALUE str)
346 {
347  StringValue(str);
348 
350  return optimized_escape(str);
351  }
352  else {
353  return rb_call_super(1, &str);
354  }
355 }
356 
357 static VALUE
358 accept_charset(int argc, VALUE *argv, VALUE self)
359 {
360  if (argc > 0)
361  return argv[0];
362  return rb_cvar_get(CLASS_OF(self), id_accept_charset);
363 }
364 
365 /*
366  * call-seq:
367  * CGI.unescape(string, encoding=@@accept_charset) -> string
368  *
369  * Returns URL-unescaped string.
370  *
371  */
372 static VALUE
373 cgiesc_unescape(int argc, VALUE *argv, VALUE self)
374 {
375  VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
376 
377  StringValue(str);
378 
380  VALUE enc = accept_charset(argc-1, argv+1, self);
381  return optimized_unescape(str, enc);
382  }
383  else {
384  return rb_call_super(argc, argv);
385  }
386 }
387 
388 void
390 {
391 #ifdef HAVE_RB_EXT_RACTOR_SAFE
392  rb_ext_ractor_safe(true);
393 #endif
394 
395  id_accept_charset = rb_intern_const("@@accept_charset");
396  InitVM(escape);
397 }
398 
399 void
401 {
402  rb_cCGI = rb_define_class("CGI", rb_cObject);
403  rb_mEscape = rb_define_module_under(rb_cCGI, "Escape");
404  rb_mUtil = rb_define_module_under(rb_cCGI, "Util");
405  rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
406  rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
407  rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
408  rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
409  rb_prepend_module(rb_mUtil, rb_mEscape);
410  rb_extend_object(rb_cCGI, rb_mEscape);
411 }
rb_prepend_module
void rb_prepend_module(VALUE klass, VALUE module)
Definition: class.c:1218
rb_define_class
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:759
strcasecmp
#define strcasecmp
Definition: win32.h:207
StringValue
#define StringValue(v)
Definition: rstring.h:50
rb_define_module_under
VALUE rb_define_module_under(VALUE outer, const char *name)
Definition: class.c:914
encoding.h
CLASS_OF
#define CLASS_OF
Definition: globals.h:154
ruby_hexdigits
RUBY_EXTERN const char ruby_hexdigits[]
Definition: escape.c:5
Init_escape
void Init_escape(void)
Definition: escape.c:389
ALLOCV_N
#define ALLOCV_N
Definition: memory.h:139
rb_enc_get
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:1070
OnigCodePoint
unsigned int OnigCodePoint
Definition: onigmo.h:80
argv
char ** argv
Definition: ruby.c:243
ID
unsigned long ID
Definition: value.h:39
ENC_CODERANGE
#define ENC_CODERANGE(obj)
Definition: encoding.h:97
RSTRING_LEN
#define RSTRING_LEN(string)
Definition: fbuffer.h:22
ruby_scan_digits
RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow)
Definition: util.c:98
ISXDIGIT
#define ISXDIGIT
Definition: ctype.h:44
ENC_CODERANGE_CLEAN_P
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:96
InitVM_escape
void InitVM_escape(void)
Definition: escape.c:400
rb_str_new
#define rb_str_new(str, len)
Definition: string.h:213
HTML_ESCAPE
#define HTML_ESCAPE(c, str)
MATCH
#define MATCH(s)
rb_enc_get_index
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:977
ISDIGIT
#define ISDIGIT(c)
Definition: dtoa.c:211
ruby_digit36_to_number_table
RUBY_EXTERN const signed char ruby_digit36_to_number_table[]
Definition: escape.c:6
rb_cObject
VALUE rb_cObject
Object class.
Definition: object.c:50
rb_ext_ractor_safe
void rb_ext_ractor_safe(bool flag)
Definition: load.c:1039
rb_check_arity
#define rb_check_arity
Definition: error.h:34
len
uint8_t len
Definition: escape.c:17
rb_enc_name
#define rb_enc_name(enc)
Definition: encoding.h:168
ruby.h
UNICODE_MAX
#define UNICODE_MAX
Definition: nkf.c:427
rb_enc_to_index
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:197
memcpy
#define memcpy
Definition: memory.h:278
uint8_t
unsigned char uint8_t
Definition: sha2.h:100
upper_hexdigits
#define upper_hexdigits
Definition: escape.c:8
rb_call_super
VALUE rb_call_super(int, const VALUE *)
Definition: vm_eval.c:331
InitVM
#define InitVM(ext)
Definition: ruby.h:107
rb_extend_object
void rb_extend_object(VALUE obj, VALUE module)
Extend the object with the module.
Definition: eval.c:1723
rb_str_dup
VALUE rb_str_dup(VALUE)
Definition: string.c:1624
rb_enc_mbcput
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:208
rb_str_cat
VALUE rb_str_cat(VALUE, const char *, long)
Definition: string.c:2953
VALUE
unsigned long VALUE
Definition: value.h:38
ALLOCV_END
#define ALLOCV_END
Definition: memory.h:140
buf
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4322
rb_to_encoding
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:329
rb_str_buf_new
VALUE rb_str_buf_new(long)
Definition: string.c:1391
RSTRING_PTR
#define RSTRING_PTR(string)
Definition: fbuffer.h:19
str
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
ENC_CODERANGE_SET
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:99
RUBY_EXTERN
#define RUBY_EXTERN
Definition: dllexport.h:36
HTML_ESCAPE_MAX_LEN
#define HTML_ESCAPE_MAX_LEN
Definition: escape.c:14
ENC_CODERANGE_UNKNOWN
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:92
argc
int argc
Definition: ruby.c:242
rb_str_cat_cstr
#define rb_str_cat_cstr(buf, str)
Definition: string.h:266
rb_encoding
typedef const OnigEncodingType rb_encoding
Definition: encoding.h:104
rb_enc_str_coderange
int rb_enc_str_coderange(VALUE)
Definition: string.c:718
rb_enc_str_asciicompat_p
#define rb_enc_str_asciicompat_p(str)
Definition: encoding.h:246
char_to_number
#define char_to_number(c)
Definition: escape.c:9
rb_cvar_get
VALUE rb_cvar_get(VALUE, ID)
Definition: variable.c:3471
rb_define_method
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
Definition: cxxanyargs.hpp:655
rb_enc_associate
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:1064
rb_enc_associate_index
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:1036