Ruby  3.4.0dev (2024-12-06 revision 892c46283a5ea4179500d951c9d4866c0051f27b)
encoding.h
Go to the documentation of this file.
1 #ifndef RUBY_INTERNAL_ENCODING_ENCODING_H /*-*-C++-*-vi:se ft=cpp:*/
2 #define RUBY_INTERNAL_ENCODING_ENCODING_H
24 #include "ruby/oniguruma.h"
32 #include "ruby/internal/value.h"
34 #include "ruby/internal/fl_type.h"
35 
37 
38 
44 
/**
 * Constants describing how an encoding index is stored inline in the flag
 * bits of an object (see RB_ENCODING_SET_INLINED / RB_ENCODING_GET_INLINED).
 */
51 enum ruby_encoding_consts {
52 
    /**
     * Largest value the inline field can hold.  RB_ENCODING_GET() treats this
     * exact value as "not stored inline" and asks rb_enc_get_index() instead.
     */
54  RUBY_ENCODING_INLINE_MAX = 127,
55 
    /** Bit offset of the inline field inside the flags. */
57  RUBY_ENCODING_SHIFT = (RUBY_FL_USHIFT+10),
58 
    /** The flag bits occupied by the inline field (seven user flags). */
60  RUBY_ENCODING_MASK = (RUBY_ENCODING_INLINE_MAX<<RUBY_ENCODING_SHIFT
61  /* RUBY_FL_USER10..RUBY_FL_USER16 */),
62 
    /* NOTE(review): presumably the maximum length of an encoding name; it is
     * not used anywhere in this listing -- confirm against encoding.c. */
64  RUBY_ENCODING_MAXNAMELEN = 42
65 };
66 
/* Unprefixed aliases of the RUBY_ENCODING_* enumerators. */
67 #define ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX
68 #define ENCODING_SHIFT RUBY_ENCODING_SHIFT
69 #define ENCODING_MASK RUBY_ENCODING_MASK
/**
 * Destructively assigns the passed encoding to the passed object by writing
 * the index straight into the object's flag bits.
 *
 * @param[out]  obj       Object whose flags are rewritten.
 * @param[in]   encindex  Encoding index to store.  It is shifted into place
 *                        without being range-checked here.
 */
80 static inline void
81 RB_ENCODING_SET_INLINED(VALUE obj, int encindex)
82 {
83  VALUE f = /* upcast */ RBIMPL_CAST((VALUE)encindex);
84 
    /* Move the index to the position of the inline field ... */
85  f <<= RUBY_ENCODING_SHIFT;
    /* ... wipe whatever was stored there before, then store the new bits. */
86  RB_FL_UNSET_RAW(obj, RUBY_ENCODING_MASK);
87  RB_FL_SET_RAW(obj, f);
88 }
89 
/**
 * Queries the encoding of the passed object, as stored inline in its flags.
 *
 * @param[in]  obj  Object whose flags are inspected (via RB_FL_TEST_RAW).
 * @return     The value held in the RUBY_ENCODING_MASK bits of obj, shifted
 *             down to a plain index.
 */
98 static inline int
99 RB_ENCODING_GET_INLINED(VALUE obj)
100 {
101  VALUE ret = RB_FL_TEST_RAW(obj, RUBY_ENCODING_MASK) >> RUBY_ENCODING_SHIFT;
102 
103  return RBIMPL_CAST((int)ret);
104 }
105 
/* Unprefixed aliases of the RB_ENCODING_* helpers and of RUBY_ENCODING_MAXNAMELEN. */
106 #define ENCODING_SET_INLINED(obj,i) RB_ENCODING_SET_INLINED(obj,i)
107 #define ENCODING_SET(obj,i) RB_ENCODING_SET(obj,i)
108 #define ENCODING_GET_INLINED(obj) RB_ENCODING_GET_INLINED(obj)
109 #define ENCODING_GET(obj) RB_ENCODING_GET(obj)
110 #define ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj)
111 #define ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN
118 
/** Converts a character option to its encoding. */
140 int rb_char_to_option_kcode(int c, int *option, int *kcode);
141 
/** Creates a new "dummy" encoding. */
153 int rb_define_dummy_encoding(const char *name);
154 
/** Queries if the passed encoding is dummy. */
163 int rb_enc_dummy_p(rb_encoding *enc);
164 
/** Queries the index of the encoding. */
175 int rb_enc_to_index(rb_encoding *enc);
176 
/** Queries the index of the encoding of the passed object, if any. */
184 int rb_enc_get_index(VALUE obj);
185 
/**
 * Queries the encoding index of the passed object.
 *
 * The inline field of the flags is consulted first.  When it holds
 * RUBY_ENCODING_INLINE_MAX the answer is taken from rb_enc_get_index()
 * instead; any other value is returned as is.
 *
 * @param[in]  obj  Object to query.
 * @return     The encoding index of obj.
 */
194 static inline int
195 RB_ENCODING_GET(VALUE obj)
196 {
197  int encindex = RB_ENCODING_GET_INLINED(obj);
198 
199  if (encindex == RUBY_ENCODING_INLINE_MAX) {
200  return rb_enc_get_index(obj);
201  }
202  else {
203  return encindex;
204  }
205 }
206 
/** Destructively assigns an encoding (via its index) to an object. */
217 void rb_enc_set_index(VALUE obj, int encindex);
218 
/**
 * Just another name of rb_enc_set_index(): both arguments are forwarded
 * unchanged.
 *
 * @param[out]  obj       Object to modify.
 * @param[in]   encindex  Encoding index to assign.
 */
220 static inline void
221 RB_ENCODING_SET(VALUE obj, int encindex)
222 {
223  rb_enc_set_index(obj, encindex);
224 }
225 
/**
 * This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo: the encoding is
 * assigned first, then the code range.
 *
 * @param[out]  obj       Object to modify.
 * @param[in]   encindex  Encoding index to assign.
 * @param[in]   cr        Code range to assign.
 */
237 static inline void
238 RB_ENCODING_CODERANGE_SET(VALUE obj, int encindex, enum ruby_coderange_type cr)
239 {
240  RB_ENCODING_SET(obj, encindex);
241  RB_ENC_CODERANGE_SET(obj, cr);
242 }
243 
/** Queries if the passed object can have its encoding. */
252 int rb_enc_capable(VALUE obj);
253 
/** Queries the index of the encoding, looked up by name. */
262 int rb_enc_find_index(const char *name);
263 
/** Registers an "alias" name. */
277 int rb_enc_alias(const char *alias, const char *orig);
278 
/** Obtains an encoding index from a wider range of objects (than rb_enc_find_index()). */
287 int rb_to_encoding_index(VALUE obj);
288 
299 
309 
318 
332 
/** Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL. */
343 rb_encoding *rb_enc_check(VALUE str1,VALUE str2);
344 
/** Identical to rb_enc_set_index(), except it additionally does contents fix-up. */
359 VALUE rb_enc_associate_index(VALUE obj, int encindex);
360 
373 
/** Destructively copies the encoding of the latter object to that of the former one. */
387 void rb_enc_copy(VALUE dst, VALUE src);
388 
389 
399 
/** Identical to rb_find_encoding(), except it takes a C string instead of a Ruby object. */
408 rb_encoding *rb_enc_find(const char *name);
409 
/**
 * Queries the (canonical) name of the passed encoding.
 *
 * @param[in]  enc  An encoding; it is dereferenced without a NULL check.
 * @return     The `name` member of enc.
 */
416 static inline const char *
417 rb_enc_name(rb_encoding *enc)
418 {
419  return enc->name;
420 }
421 
/**
 * Queries the minimum number of bytes that the passed encoding needs to
 * represent a character.
 *
 * @param[in]  enc  An encoding; it is dereferenced without a NULL check.
 * @return     The `min_enc_len` member of enc.
 */
431 static inline int
432 rb_enc_mbminlen(rb_encoding *enc)
433 {
434  return enc->min_enc_len;
435 }
436 
/**
 * Queries the maximum number of bytes that the passed encoding needs to
 * represent a character.
 *
 * @param[in]  enc  An encoding; it is dereferenced without a NULL check.
 * @return     The `max_enc_len` member of enc.
 */
446 static inline int
447 rb_enc_mbmaxlen(rb_encoding *enc)
448 {
449  return enc->max_enc_len;
450 }
451 
/** Queries the number of bytes of the character at the passed pointer. */
468 int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc);
469 
/** Identical to rb_enc_mbclen() unless the character at p overruns e. */
486 int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc);
487 
/** Queries the number of bytes of the character at the passed pointer. */
514 int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc);
515 
/* Shorthands that forward to the ONIGENC_MBCLEN_* macros of the same name. */
516 #define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret)
517 #define MBCLEN_CHARFOUND_LEN(ret) ONIGENC_MBCLEN_CHARFOUND_LEN(ret)
518 #define MBCLEN_INVALID_P(ret) ONIGENC_MBCLEN_INVALID_P(ret)
519 #define MBCLEN_NEEDMORE_P(ret) ONIGENC_MBCLEN_NEEDMORE_P(ret)
520 #define MBCLEN_NEEDMORE_LEN(ret) ONIGENC_MBCLEN_NEEDMORE_LEN(ret)
/** Queries the code point of character pointed by the passed pointer. */
536 int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc);
537 
/** Queries the code point of character pointed by the passed pointer. */
550 unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc);
551 
/**
 * Queries the code point of character pointed by the passed pointer.
 * Forwards to rb_enc_codepoint_len() with a null `len` argument, i.e. the
 * caller does not receive the length.
 *
 * @param[in]  p    Pointer forwarded as the `p` argument.
 * @param[in]  e    Pointer forwarded as the `e` argument.
 * @param[in]  enc  Encoding forwarded as the `enc` argument.
 * @return     Whatever rb_enc_codepoint_len() returns.
 */
570 static inline unsigned int
571 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
572 {
573  return rb_enc_codepoint_len(p, e, 0, enc);
574  /* ^^^
575  * This can be `NULL` in C, `nullptr` in C++, and `0` for both.
576  * We choose the most portable one here.
577  */
578 }
579 
580 
/**
 * Identical to rb_enc_codepoint(), except it assumes the passed character is
 * not broken.  The char pointers are converted to OnigUChar pointers and
 * handed to ONIGENC_MBC_TO_CODE.
 *
 * @param[in]  p    Pointer passed on as the second macro argument.
 * @param[in]  e    Pointer passed on as the third macro argument.
 * @param[in]  enc  Encoding passed on as the first macro argument.
 * @return     The result of ONIGENC_MBC_TO_CODE.
 */
590 static inline OnigCodePoint
591 rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
592 {
593  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
594  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
595 
596  return ONIGENC_MBC_TO_CODE(enc, up, ue);
597 }
598 
/** Queries the number of bytes required to represent the passed code point using the passed encoding. */
608 int rb_enc_codelen(int code, rb_encoding *enc);
609 
/**
 * Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
 * The code point is converted to OnigCodePoint and handed to
 * ONIGENC_CODE_TO_MBCLEN.
 *
 * @param[in]  c    Code point in question.
 * @param[in]  enc  Encoding to use.
 * @return     The result of ONIGENC_CODE_TO_MBCLEN.
 */
618 static inline int
619 rb_enc_code_to_mbclen(int c, rb_encoding *enc)
620 {
621  OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c);
622 
623  return ONIGENC_CODE_TO_MBCLEN(enc, uc);
624 }
625 
/**
 * Identical to rb_enc_uint_chr(), except it writes back to the passed buffer
 * instead of allocating one.  The arguments are converted to Onigmo's types
 * and handed to ONIGENC_CODE_TO_MBC.
 *
 * @param[in]   c    Code point to write.
 * @param[out]  buf  Destination buffer; its size is not checked here.
 * @param[in]   enc  Encoding to use.
 * @return      The result of ONIGENC_CODE_TO_MBC.
 */
642 static inline int
643 rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
644 {
645  OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c);
646  OnigUChar *ubuf = RBIMPL_CAST((OnigUChar *)buf);
647 
648  return ONIGENC_CODE_TO_MBC(enc, uc, ubuf);
649 }
650 
/**
 * Queries the previous (left) character.  Thin wrapper over
 * onigenc_get_prev_char_head(): the char pointers are converted to OnigUChar
 * pointers on the way in, and the result is converted back on the way out.
 *
 * @param[in]  s    Forwarded as the second argument.
 * @param[in]  p    Forwarded as the third argument.
 * @param[in]  e    Forwarded as the fourth argument.
 * @param[in]  enc  Encoding, forwarded as the first argument.
 * @return     The pointer returned by onigenc_get_prev_char_head().
 */
661 static inline char *
662 rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
663 {
664  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
665  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
666  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
667  OnigUChar *ur = onigenc_get_prev_char_head(enc, us, up, ue);
668 
669  return RBIMPL_CAST((char *)ur);
670 }
671 
/**
 * Queries the left boundary of a character.  Thin wrapper over
 * onigenc_get_left_adjust_char_head(): the char pointers are converted to
 * OnigUChar pointers on the way in, and the result is converted back.
 *
 * @param[in]  s    Forwarded as the second argument.
 * @param[in]  p    Forwarded as the third argument.
 * @param[in]  e    Forwarded as the fourth argument.
 * @param[in]  enc  Encoding, forwarded as the first argument.
 * @return     The pointer returned by onigenc_get_left_adjust_char_head().
 */
682 static inline char *
683 rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
684 {
685  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
686  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
687  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
688  OnigUChar *ur = onigenc_get_left_adjust_char_head(enc, us, up, ue);
689 
690  return RBIMPL_CAST((char *)ur);
691 }
692 
/**
 * Queries the right boundary of a character.  Thin wrapper over
 * onigenc_get_right_adjust_char_head(): the char pointers are converted to
 * OnigUChar pointers on the way in, and the result is converted back.
 *
 * @param[in]  s    Forwarded as the second argument.
 * @param[in]  p    Forwarded as the third argument.
 * @param[in]  e    Forwarded as the fourth argument.
 * @param[in]  enc  Encoding, forwarded as the first argument.
 * @return     The pointer returned by onigenc_get_right_adjust_char_head().
 */
703 static inline char *
704 rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
705 {
706  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
707  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
708  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
709  OnigUChar *ur = onigenc_get_right_adjust_char_head(enc, us, up, ue);
710 
711  return RBIMPL_CAST((char *)ur);
712 }
713 
/**
 * Scans the string backwards for n characters.  Thin wrapper over
 * onigenc_step_back(): the char pointers are converted to OnigUChar pointers
 * on the way in, and the result is converted back.
 *
 * @param[in]  s    Forwarded as the second argument.
 * @param[in]  p    Forwarded as the third argument.
 * @param[in]  e    Forwarded as the fourth argument.
 * @param[in]  n    Number of characters, forwarded as the fifth argument.
 * @param[in]  enc  Encoding, forwarded as the first argument.
 * @return     The pointer returned by onigenc_step_back().
 */
725 static inline char *
726 rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
727 {
728  const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s);
729  const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p);
730  const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e);
731  const OnigUChar *ur = onigenc_step_back(enc, us, up, ue, n);
732 
    /* The const qualifier of the library's result is dropped by this cast. */
733  return RBIMPL_CAST((char *)ur);
734 }
735 
/**
 * Tests the same condition as rb_enc_asciicompat(), but yields an `int`:
 * the minimum character length is one byte and the encoding is not dummy.
 *
 * @param[in]  enc  Encoding in question.
 * @retval     1    Both conditions hold.
 * @retval     0    Otherwise.
 */
746 static inline int
747 rb_enc_asciicompat_inline(rb_encoding *enc)
748 {
749  return rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc);
750 }
751 
/**
 * Queries if the passed encoding is in some sense compatible with ASCII.
 * Here that means: its minimum character length is exactly one byte, and it
 * is not a dummy encoding.
 *
 * @param[in]  enc    Encoding in question.
 * @retval     true   Both conditions hold.
 * @retval     false  Otherwise.
 */
767 static inline bool
768 rb_enc_asciicompat(rb_encoding *enc)
769 {
770  if (rb_enc_mbminlen(enc) != 1) {
771  return false;
772  }
773  else if (rb_enc_dummy_p(enc)) {
774  return false;
775  }
776  else {
777  return true;
778  }
779 }
780 
/**
 * Queries if the passed string is in an ASCII-compatible encoding.  The
 * encoding is obtained with rb_enc_get() and tested with
 * rb_enc_asciicompat().
 *
 * @param[in]  str    Object whose encoding is tested.
 * @retval     true   rb_enc_asciicompat() accepts its encoding.
 * @retval     false  Otherwise.
 */
788 static inline bool
789 rb_enc_str_asciicompat_p(VALUE str)
790 {
791  rb_encoding *enc = rb_enc_get(str);
792 
793  return rb_enc_asciicompat(enc);
794 }
795 
805 
/** Queries if the passed encoding is either one of UTF-8/16/32. */
821 int rb_enc_unicode_p(rb_encoding *enc);
822 
834 
846 
858 
872 
883 
892 
901 
/* Declared only when no macro of the same name is already defined. */
902 #ifndef rb_ascii8bit_encindex
/** Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself. */
914 int rb_ascii8bit_encindex(void);
915 #endif
916 
/**
 * Queries if the passed object is in ascii 8bit (== binary) encoding, judging
 * by the encoding index stored inline in its flags.
 *
 * NOTE(review): the declarator and the return statement were absent from this
 * listing and have been restored to match upstream encoding.h -- confirm
 * against the real header.
 *
 * @param[in]  obj    Object to check.
 * @retval     true   Its inline encoding index is that of ASCII-8BIT.
 * @retval     false  Otherwise.
 */
926 static inline bool
927 RB_ENCODING_IS_ASCII8BIT(VALUE obj)
928 {
929  return RB_ENCODING_GET_INLINED(obj) == rb_ascii8bit_encindex();
930 }
931 
/* Declared only when no macro of the same name is already defined. */
932 #ifndef rb_utf8_encindex
/** Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself. */
940 int rb_utf8_encindex(void);
941 #endif
942 
/* Declared only when no macro of the same name is already defined. */
943 #ifndef rb_usascii_encindex
/** Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself. */
951 int rb_usascii_encindex(void);
952 #endif
953 
/** Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself. */
960 int rb_locale_encindex(void);
961 
/** Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding itself. */
968 int rb_filesystem_encindex(void);
969 
978 
987 
/** Destructively assigns the passed encoding as the default external encoding. */
997 void rb_enc_set_default_external(VALUE encoding);
998 
/** Destructively assigns the passed encoding as the default internal encoding. */
1008 void rb_enc_set_default_internal(VALUE encoding);
1009 
1020 
1022 
1023 
/*
 * Every inline function of this header is additionally defined as a macro
 * that expands to its own name.  Calls are unaffected; presumably this exists
 * so that `#ifdef` / `#ifndef` checks (like the ones guarding
 * rb_utf8_encindex in this header) can detect the names -- confirm.
 */
1024 #define RB_ENCODING_GET RB_ENCODING_GET
1025 #define RB_ENCODING_GET_INLINED RB_ENCODING_GET_INLINED
1026 #define RB_ENCODING_IS_ASCII8BIT RB_ENCODING_IS_ASCII8BIT
1027 #define RB_ENCODING_SET RB_ENCODING_SET
1028 #define RB_ENCODING_SET_INLINED RB_ENCODING_SET_INLINED
1029 #define rb_enc_asciicompat rb_enc_asciicompat
1030 #define rb_enc_code_to_mbclen rb_enc_code_to_mbclen
1031 #define rb_enc_codepoint rb_enc_codepoint
1032 #define rb_enc_left_char_head rb_enc_left_char_head
1033 #define rb_enc_mbc_to_codepoint rb_enc_mbc_to_codepoint
1034 #define rb_enc_mbcput rb_enc_mbcput
1035 #define rb_enc_mbmaxlen rb_enc_mbmaxlen
1036 #define rb_enc_mbminlen rb_enc_mbminlen
1037 #define rb_enc_name rb_enc_name
1038 #define rb_enc_prev_char rb_enc_prev_char
1039 #define rb_enc_right_char_head rb_enc_right_char_head
1040 #define rb_enc_step_back rb_enc_step_back
1041 #define rb_enc_str_asciicompat_p rb_enc_str_asciicompat_p
1044 #endif /* RUBY_INTERNAL_ENCODING_ENCODING_H */
Routines for code ranges.
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition: coderange.h:33
static void RB_ENC_CODERANGE_SET(VALUE obj, enum ruby_coderange_type cr)
Destructively modifies the passed object so that its (inline) code range is the passed one.
Definition: coderange.h:129
Defines RBIMPL_ATTR_CONST.
Defines RBIMPL_ATTR_DEPRECATED.
Tweaking visibility of C variables/functions.
#define RUBY_EXTERN
Declaration of externally visible global variables.
Definition: dllexport.h:45
#define RBIMPL_SYMBOL_EXPORT_END()
Counterpart of RBIMPL_SYMBOL_EXPORT_BEGIN.
Definition: dllexport.h:74
#define RBIMPL_SYMBOL_EXPORT_BEGIN()
Shortcut macro equivalent to RUBY_SYMBOL_EXPORT_BEGIN extern "C" {.
Definition: dllexport.h:65
Defines enum ruby_fl_type.
@ RUBY_FL_USHIFT
Number of bits in ruby_fl_type that are not open to users.
Definition: fl_type.h:159
static VALUE RB_FL_TEST_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_TEST().
Definition: fl_type.h:469
static void RB_FL_SET_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_SET().
Definition: fl_type.h:606
static void RB_FL_UNSET_RAW(VALUE obj, VALUE flags)
This is an implementation detail of RB_FL_UNSET().
Definition: fl_type.h:666
VALUE rb_cEncoding
Encoding class.
Definition: encoding.c:57
rb_encoding * rb_locale_encoding(void)
Queries the encoding that represents the current locale.
Definition: encoding.c:1523
rb_encoding * rb_default_external_encoding(void)
Queries the "default external" encoding.
Definition: encoding.c:1589
int rb_enc_dummy_p(rb_encoding *enc)
Queries if the passed encoding is dummy.
Definition: encoding.c:197
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1191
int rb_enc_get_index(VALUE obj)
Queries the index of the encoding of the passed object, if any.
Definition: encoding.c:920
int rb_to_encoding_index(VALUE obj)
Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).
Definition: encoding.c:261
int rb_filesystem_encindex(void)
Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1529
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.
Definition: encoding.c:1022
rb_encoding * rb_usascii_encoding(void)
Queries the encoding that represents US-ASCII.
Definition: encoding.c:1487
static void RB_ENCODING_SET_INLINED(VALUE obj, int encindex)
Destructively assigns the passed encoding to the passed object.
Definition: encoding.h:81
static bool RB_ENCODING_IS_ASCII8BIT(VALUE obj)
Queries if the passed object is in ascii 8bit (== binary) encoding.
Definition: encoding.h:927
int rb_enc_codelen(int code, rb_encoding *enc)
Queries the number of bytes required to represent the passed code point using the passed encoding.
Definition: encoding.c:1241
const OnigEncodingType rb_encoding
The type of encoding.
Definition: encoding.h:117
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition: encoding.h:683
void rb_enc_copy(VALUE dst, VALUE src)
Destructively copies the encoding of the latter object to that of former one.
Definition: encoding.c:1149
int rb_utf8_encindex(void)
Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1481
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_mbclen() unless the character at p overruns e.
Definition: encoding.c:1173
static int RB_ENCODING_GET(VALUE obj)
Just another name of rb_enc_get_index.
Definition: encoding.h:195
int rb_ascii8bit_encindex(void)
Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1469
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Look for the "common" encoding between the two.
Definition: encoding.c:1140
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1227
int rb_enc_unicode_p(rb_encoding *enc)
Queries if the passed encoding is either one of UTF-8/16/32.
Definition: encoding.c:638
rb_encoding * rb_default_internal_encoding(void)
Queries the "default internal" encoding.
Definition: encoding.c:1676
int rb_enc_to_index(rb_encoding *enc)
Queries the index of the encoding.
Definition: encoding.c:191
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition: encoding.h:704
rb_encoding * rb_find_encoding(VALUE obj)
Identical to rb_to_encoding_index(), except the return type.
Definition: encoding.c:330
rb_encoding * rb_enc_find(const char *name)
Identical to rb_find_encoding(), except it takes a C string instead of a Ruby object.
Definition: encoding.c:859
rb_encoding * rb_to_encoding(VALUE obj)
Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.
Definition: encoding.c:323
void rb_enc_set_index(VALUE obj, int encindex)
Destructively assigns an encoding (via its index) to an object.
Definition: encoding.c:986
VALUE rb_locale_charmap(VALUE klass)
Returns a platform-dependent "charmap" of the current locale.
Definition: localeinit.c:91
rb_encoding * rb_ascii8bit_encoding(void)
Queries the encoding that represents ASCII-8BIT, a.k.a. binary.
Definition: encoding.c:1463
void rb_enc_set_default_internal(VALUE encoding)
Destructively assigns the passed encoding as the default internal encoding.
Definition: encoding.c:1726
VALUE rb_enc_default_external(void)
Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding.
Definition: encoding.c:1603
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.
Definition: encoding.c:1062
VALUE rb_enc_from_encoding(rb_encoding *enc)
Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.
Definition: encoding.c:182
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition: encoding.h:768
int rb_define_dummy_encoding(const char *name)
Creates a new "dummy" encoding.
Definition: encoding.c:566
rb_encoding * rb_utf8_encoding(void)
Queries the encoding that represents UTF-8.
Definition: encoding.c:1475
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition: encoding.h:662
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.h:571
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition: encoding.h:643
int rb_locale_encindex(void)
Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1501
int rb_char_to_option_kcode(int c, int *option, int *kcode)
Converts a character option to its encoding.
Definition: re.c:333
rb_encoding * rb_enc_from_index(int idx)
Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.
Definition: encoding.c:402
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:447
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1179
static void RB_ENCODING_SET(VALUE obj, int encindex)
Just another name of rb_enc_set_index.
Definition: encoding.h:221
int rb_enc_capable(VALUE obj)
Queries if the passed object can have its encoding.
Definition: encoding.c:884
static void RB_ENCODING_CODERANGE_SET(VALUE obj, int encindex, enum ruby_coderange_type cr)
This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo.
Definition: encoding.h:238
VALUE rb_enc_default_internal(void)
Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding.
Definition: encoding.c:1685
VALUE rb_enc_associate_index(VALUE obj, int encindex)
Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed ...
Definition: encoding.c:994
rb_encoding * rb_enc_get(VALUE obj)
Identical to rb_enc_get_index(), except the return type.
Definition: encoding.c:1028
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition: encoding.h:99
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition: encoding.h:591
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition: encoding.h:417
void rb_enc_set_default_external(VALUE encoding)
Destructively assigns the passed encoding as the default external encoding.
Definition: encoding.c:1643
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition: encoding.h:726
int rb_enc_find_index(const char *name)
Queries the index of the encoding.
Definition: encoding.c:824
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition: encoding.h:432
int rb_enc_alias(const char *alias, const char *orig)
Registers an "alias" name.
Definition: encoding.c:670
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition: encoding.h:619
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition: encoding.c:1203
static bool rb_enc_str_asciicompat_p(VALUE str)
Queries if the passed string is in an ASCII-compatible encoding.
Definition: encoding.h:789
int rb_usascii_encindex(void)
Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.
Definition: encoding.c:1493
rb_encoding * rb_filesystem_encoding(void)
Queries the "filesystem" encoding.
Definition: encoding.c:1537
RBIMPL_ATTR_CONST() int rb_io_oflags_fmode(int oflags)
Converts an oflags (that rb_io_modestr_oflags() returns) to a fmode (that rb_io_mode_flags() returns)...
RBIMPL_ATTR_PURE() int rb_io_read_pending(rb_io_t *fptr)
Queries if the passed IO has any pending reads.
int len
Length of the buffer.
Definition: io.h:8
Defines RBIMPL_ATTR_NOALIAS.
#define RBIMPL_ATTR_NOALIAS()
Wraps (or simulates) __declspec(noalias)
Definition: noalias.h:62
#define inline
Old Visual Studio versions do not support the inline keyword, so we need to define it to be __inline.
Definition: defines.h:88
Defines RBIMPL_ATTR_PURE.
Defines struct RBasic.
Defines RBIMPL_ATTR_RETURNS_NONNULL.
#define RBIMPL_ATTR_RETURNS_NONNULL()
Wraps (or simulates) __attribute__((returns_nonnull))
Defines VALUE and ID.
uintptr_t VALUE
Type that represents a Ruby object.
Definition: value.h:40