Ruby  3.4.0dev (2024-12-06 revision 892c46283a5ea4179500d951c9d4866c0051f27b)
Macros | Typedefs | Functions | Variables
encoding.h File Reference

(892c46283a5ea4179500d951c9d4866c0051f27b)

Defines rb_encoding. More...

#include "ruby/oniguruma.h"
#include "ruby/internal/attr/const.h"
#include "ruby/internal/attr/deprecated.h"
#include "ruby/internal/attr/noalias.h"
#include "ruby/internal/attr/pure.h"
#include "ruby/internal/attr/returns_nonnull.h"
#include "ruby/internal/dllexport.h"
#include "ruby/internal/encoding/coderange.h"
#include "ruby/internal/value.h"
#include "ruby/internal/core/rbasic.h"
#include "ruby/internal/fl_type.h"
Include dependency graph for encoding.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros

#define ENCODING_INLINE_MAX   RUBY_ENCODING_INLINE_MAX
 Old name of RUBY_ENCODING_INLINE_MAX. More...
 
#define ENCODING_SHIFT   RUBY_ENCODING_SHIFT
 Old name of RUBY_ENCODING_SHIFT. More...
 
#define ENCODING_MASK   RUBY_ENCODING_MASK
 Old name of RUBY_ENCODING_MASK. More...
 
#define ENCODING_SET_INLINED(obj, i)   RB_ENCODING_SET_INLINED(obj,i)
 Old name of RB_ENCODING_SET_INLINED. More...
 
#define ENCODING_SET(obj, i)   RB_ENCODING_SET(obj,i)
 Old name of RB_ENCODING_SET. More...
 
#define ENCODING_GET_INLINED(obj)   RB_ENCODING_GET_INLINED(obj)
 Old name of RB_ENCODING_GET_INLINED. More...
 
#define ENCODING_GET(obj)   RB_ENCODING_GET(obj)
 Old name of RB_ENCODING_GET. More...
 
#define ENCODING_IS_ASCII8BIT(obj)   RB_ENCODING_IS_ASCII8BIT(obj)
 Old name of RB_ENCODING_IS_ASCII8BIT. More...
 
#define ENCODING_MAXNAMELEN   RUBY_ENCODING_MAXNAMELEN
 Old name of RUBY_ENCODING_MAXNAMELEN. More...
 
#define MBCLEN_CHARFOUND_P(ret)   ONIGENC_MBCLEN_CHARFOUND_P(ret)
 Old name of ONIGENC_MBCLEN_CHARFOUND_P. More...
 
#define MBCLEN_CHARFOUND_LEN(ret)   ONIGENC_MBCLEN_CHARFOUND_LEN(ret)
 Old name of ONIGENC_MBCLEN_CHARFOUND_LEN. More...
 
#define MBCLEN_INVALID_P(ret)   ONIGENC_MBCLEN_INVALID_P(ret)
 Old name of ONIGENC_MBCLEN_INVALID_P. More...
 
#define MBCLEN_NEEDMORE_P(ret)   ONIGENC_MBCLEN_NEEDMORE_P(ret)
 Old name of ONIGENC_MBCLEN_NEEDMORE_P. More...
 
#define MBCLEN_NEEDMORE_LEN(ret)   ONIGENC_MBCLEN_NEEDMORE_LEN(ret)
 Old name of ONIGENC_MBCLEN_NEEDMORE_LEN. More...
 

Typedefs

typedef const OnigEncodingType rb_encoding
 The type of encoding. More...
 

Functions

static void RB_ENCODING_SET_INLINED (VALUE obj, int encindex)
 Destructively assigns the passed encoding to the passed object. More...
 
static int RB_ENCODING_GET_INLINED (VALUE obj)
 Queries the encoding of the passed object. More...
 
int rb_char_to_option_kcode (int c, int *option, int *kcode)
 Converts a character option to its encoding. More...
 
int rb_define_dummy_encoding (const char *name)
 Creates a new "dummy" encoding. More...
 
int rb_enc_dummy_p (rb_encoding *enc)
 Queries if the passed encoding is dummy. More...
 
int rb_enc_to_index (rb_encoding *enc)
 Queries the index of the encoding. More...
 
int rb_enc_get_index (VALUE obj)
 Queries the index of the encoding of the passed object, if any. More...
 
static int RB_ENCODING_GET (VALUE obj)
 Just another name of rb_enc_get_index. More...
 
void rb_enc_set_index (VALUE obj, int encindex)
 Destructively assigns an encoding (via its index) to an object. More...
 
static void RB_ENCODING_SET (VALUE obj, int encindex)
 Just another name of rb_enc_set_index. More...
 
static void RB_ENCODING_CODERANGE_SET (VALUE obj, int encindex, enum ruby_coderange_type cr)
 This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo. More...
 
int rb_enc_capable (VALUE obj)
 Queries if the passed object can have its encoding. More...
 
int rb_enc_find_index (const char *name)
 Queries the index of the encoding. More...
 
int rb_enc_alias (const char *alias, const char *orig)
 Registers an "alias" name. More...
 
int rb_to_encoding_index (VALUE obj)
 Obtains an encoding index from a wider range of objects (than rb_enc_find_index()). More...
 
rb_encoding * rb_to_encoding (VALUE obj)
 Identical to rb_find_encoding(), except it raises an exception instead of returning NULL. More...
 
rb_encoding * rb_find_encoding (VALUE obj)
 Identical to rb_to_encoding_index(), except the return type. More...
 
rb_encoding * rb_enc_get (VALUE obj)
 Identical to rb_enc_get_index(), except the return type. More...
 
rb_encoding * rb_enc_compatible (VALUE str1, VALUE str2)
 Look for the "common" encoding between the two. More...
 
rb_encoding * rb_enc_check (VALUE str1, VALUE str2)
 Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL. More...
 
VALUE rb_enc_associate_index (VALUE obj, int encindex)
 Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object. More...
 
VALUE rb_enc_associate (VALUE obj, rb_encoding *enc)
 Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index. More...
 
void rb_enc_copy (VALUE dst, VALUE src)
 Destructively copies the encoding of the latter object to that of former one. More...
 
rb_encoding * rb_enc_from_index (int idx)
 Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object. More...
 
rb_encoding * rb_enc_find (const char *name)
 Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's. More...
 
static const char * rb_enc_name (rb_encoding *enc)
 Queries the (canonical) name of the passed encoding. More...
 
static int rb_enc_mbminlen (rb_encoding *enc)
 Queries the minimum number of bytes that the passed encoding needs to represent a character. More...
 
static int rb_enc_mbmaxlen (rb_encoding *enc)
 Queries the maximum number of bytes that the passed encoding needs to represent a character. More...
 
int rb_enc_mbclen (const char *p, const char *e, rb_encoding *enc)
 Queries the number of bytes of the character at the passed pointer. More...
 
int rb_enc_fast_mbclen (const char *p, const char *e, rb_encoding *enc)
 Identical to rb_enc_mbclen() unless the character at p overruns e. More...
 
int rb_enc_precise_mbclen (const char *p, const char *e, rb_encoding *enc)
 Queries the number of bytes of the character at the passed pointer. More...
 
int rb_enc_ascget (const char *p, const char *e, int *len, rb_encoding *enc)
 Queries the code point of character pointed by the passed pointer. More...
 
unsigned int rb_enc_codepoint_len (const char *p, const char *e, int *len, rb_encoding *enc)
 Queries the code point of character pointed by the passed pointer. More...
 
static unsigned int rb_enc_codepoint (const char *p, const char *e, rb_encoding *enc)
 Queries the code point of character pointed by the passed pointer. More...
 
static OnigCodePoint rb_enc_mbc_to_codepoint (const char *p, const char *e, rb_encoding *enc)
 Identical to rb_enc_codepoint(), except it assumes the passed character is not broken. More...
 
int rb_enc_codelen (int code, rb_encoding *enc)
 Queries the number of bytes requested to represent the passed code point using the passed encoding. More...
 
static int rb_enc_code_to_mbclen (int c, rb_encoding *enc)
 Identical to rb_enc_codelen(), except it returns 0 for invalid code points. More...
 
static int rb_enc_mbcput (unsigned int c, void *buf, rb_encoding *enc)
 Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one. More...
 
static char * rb_enc_prev_char (const char *s, const char *p, const char *e, rb_encoding *enc)
 Queries the previous (left) character. More...
 
static char * rb_enc_left_char_head (const char *s, const char *p, const char *e, rb_encoding *enc)
 Queries the left boundary of a character. More...
 
static char * rb_enc_right_char_head (const char *s, const char *p, const char *e, rb_encoding *enc)
 Queries the right boundary of a character. More...
 
static char * rb_enc_step_back (const char *s, const char *p, const char *e, int n, rb_encoding *enc)
 Scans the string backwards for n characters. More...
 
static bool rb_enc_asciicompat (rb_encoding *enc)
 Queries if the passed encoding is in some sense compatible with ASCII. More...
 
static bool rb_enc_str_asciicompat_p (VALUE str)
 Queries if the passed string is in an ASCII-compatible encoding. More...
 
VALUE rb_enc_from_encoding (rb_encoding *enc)
 Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding. More...
 
int rb_enc_unicode_p (rb_encoding *enc)
 Queries if the passed encoding is either one of UTF-8/16/32. More...
 
rb_encoding * rb_ascii8bit_encoding (void)
 Queries the encoding that represents ASCII-8BIT a.k.a. binary. More...
 
rb_encoding * rb_utf8_encoding (void)
 Queries the encoding that represents UTF-8. More...
 
rb_encoding * rb_usascii_encoding (void)
 Queries the encoding that represents US-ASCII. More...
 
rb_encoding * rb_locale_encoding (void)
 Queries the encoding that represents the current locale. More...
 
rb_encoding * rb_filesystem_encoding (void)
 Queries the "filesystem" encoding. More...
 
rb_encoding * rb_default_external_encoding (void)
 Queries the "default external" encoding. More...
 
rb_encoding * rb_default_internal_encoding (void)
 Queries the "default internal" encoding. More...
 
int rb_ascii8bit_encindex (void)
 Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself. More...
 
static bool RB_ENCODING_IS_ASCII8BIT (VALUE obj)
 Queries if the passed object is in ascii 8bit (== binary) encoding. More...
 
int rb_utf8_encindex (void)
 Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself. More...
 
int rb_usascii_encindex (void)
 Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself. More...
 
int rb_locale_encindex (void)
 Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself. More...
 
int rb_filesystem_encindex (void)
 Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding itself. More...
 
VALUE rb_enc_default_external (void)
 Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default external encoding. More...
 
VALUE rb_enc_default_internal (void)
 Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default internal encoding. More...
 
void rb_enc_set_default_external (VALUE encoding)
 Destructively assigns the passed encoding as the default external encoding. More...
 
void rb_enc_set_default_internal (VALUE encoding)
 Destructively assigns the passed encoding as the default internal encoding. More...
 
VALUE rb_locale_charmap (VALUE klass)
 Returns a platform-dependent "charmap" of the current locale. More...
 

Variables

VALUE rb_cEncoding
 Encoding class. More...
 

Detailed Description

Defines rb_encoding.

Author
Ruby developers ruby-core@ruby-lang.org
Warning
Symbols prefixed with either RBIMPL or rbimpl are implementation details. Don't take them as canon. They could rapidly appear then vanish. The name (path) of this header file is also an implementation detail. Do not expect it to persist at the place it is now. Developers are free to move it anywhere anytime at will.
Note
To ruby-core: remember that this header can possibly be recursively included from extension libraries written in C++. Do not expect, for instance, that __VA_ARGS__ is always available. We assume C99 for ruby itself but we don't assume languages of extension libraries. They could be written in C++98.

Definition in file encoding.h.

Typedef Documentation

◆ rb_encoding

The type of encoding.

Our design here is we take Oniguruma/Onigmo's multilingualisation schema as our base data structure.

Definition at line 117 of file encoding.h.

Function Documentation

◆ rb_ascii8bit_encindex()

int rb_ascii8bit_encindex ( void  )

Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns
The index of encoding of ASCII-8BIT.

Definition at line 1469 of file encoding.c.

Referenced by rb_char_to_option_kcode(), RB_ENCODING_IS_ASCII8BIT(), rb_external_str_new_with_enc(), rb_str_coderange_scan_restartable(), and rb_str_dump().

◆ rb_ascii8bit_encoding()

rb_encoding* rb_ascii8bit_encoding ( void  )

Queries the encoding that represents ASCII-8BIT a.k.a. binary.

Returns
The encoding that represents ASCII-8BIT.

Definition at line 1463 of file encoding.c.

Referenced by rb_define_dummy_encoding(), rb_io_extract_modeenc(), rb_reg_new(), rb_symname_p(), ruby_brace_glob(), ruby_glob(), and ruby_init_loadpath().

◆ rb_char_to_option_kcode()

int rb_char_to_option_kcode ( int  c,
int *  option,
int *  kcode 
)

Converts a character option to its encoding.

It only supports a very limited set of Japanese encodings due to its Japanese origin. Ruby still has this in-core for backwards compatibility. But new code should not rely on concepts such as one-character encoding options. Consider it deprecated in practice.

Parameters
[in]cOne of ['n', 'e', 's', 'u', 'i', 'x', 'm'].
[out]optionReturn buffer.
[out]kcodeReturn buffer.
Return values
1c understood properly.
0c is not understood.
Postcondition
option is a ::OnigOptionType.
kcode is an enum ruby_preserved_encindex.

Definition at line 333 of file re.c.

◆ rb_default_external_encoding()

rb_encoding* rb_default_external_encoding ( void  )

Queries the "default external" encoding.

This is used to interact with outer-process things such as File. Though not recommended, you can set this using rb_enc_set_default_external().

Returns
The "default external" encoding.

Definition at line 1589 of file encoding.c.

Referenced by rb_enc_default_external(), rb_external_str_new(), rb_external_str_new_cstr(), rb_inspect(), rb_str_export(), and rb_str_inspect().

◆ rb_default_internal_encoding()

rb_encoding* rb_default_internal_encoding ( void  )

Queries the "default internal" encoding.

This could be a null pointer. Otherwise, outer-process info is transcoded from the default external encoding to this one during reading from an IO.

Returns
The "default internal" encoding (if any).

Definition at line 1676 of file encoding.c.

Referenced by rb_enc_default_internal(), rb_external_str_new_with_enc(), rb_inspect(), and rb_str_inspect().

◆ rb_define_dummy_encoding()

int rb_define_dummy_encoding ( const char *  name)

Creates a new "dummy" encoding.

Roughly speaking, an encoding is dummy when it is stateful. Notable examples of dummy encodings are those defined in ISO/IEC 2022.

Parameters
[in]nameName of the creating encoding.
Exceptions
rb_eArgErrorDuplicated or malformed name.
Returns
New dummy encoding's index.
Postcondition
Encoding named name is created, whose index is the return value.

Definition at line 566 of file encoding.c.

◆ rb_enc_alias()

int rb_enc_alias ( const char *  alias,
const char *  orig 
)

Registers an "alias" name.

In the wild, an encoding can be called using multiple names. For instance an encoding known as "CP932" is also called "SJIS" on occasions. This API registers such relationships.

Parameters
[in]aliasNew name.
[in]origOld name.
Exceptions
rb_eArgErroralias is duplicated or malformed.
Return values
-1Failed to load orig.
otherwiseThe index of orig and alias.
Postcondition
alias is a synonym of orig. They refer to the identical encoding.

Definition at line 670 of file encoding.c.

◆ rb_enc_ascget()

int rb_enc_ascget ( const char *  p,
const char *  e,
int *  len,
rb_encoding * enc
)

Queries the code point of character pointed by the passed pointer.

If that code point is included in ASCII that code point is returned. Otherwise -1. This can be different from just looking at the first byte. For instance it reads 2 bytes in case of UTF-16BE.

Parameters
[in]pPointer to the character's first byte.
[in]eEnd of the string that has p.
[out]lenReturn buffer.
[in]encEncoding of the string.
Return values
-1The character at p is not in ASCII.
otherwiseA code point of the character at p.
Postcondition
len (if set) is the number of bytes of p.

Definition at line 1203 of file encoding.c.

Referenced by rb_reg_quote().

◆ rb_enc_asciicompat()

static bool rb_enc_asciicompat ( rb_encoding * enc)
inline static

Queries if the passed encoding is in some sense compatible with ASCII.

The concept of ASCII compatibility is nuanced, and private to our implementation. For instance SJIS is ASCII compatible to us, despite their having different characters at code point 0x5C. This is based on the practical consideration that Japanese people tend to regard SJIS as "upper compatible" with ASCII (which is in fact a wrong idea, but we just don't go strict here). An example of an ASCII incompatible encoding is UTF-16. UTF-16 shares code points with ASCII, but employs a completely different encoding scheme.

Parameters
[in]encEncoding in question.
Return values
0It is incompatible.
1It is compatible.

Definition at line 768 of file encoding.h.

Referenced by rb_enc_ascget(), rb_enc_str_asciicompat_p(), rb_external_str_new_with_enc(), rb_inspect(), rb_must_asciicompat(), rb_reg_regsub(), rb_str_buf_cat_ascii(), rb_str_coderange_scan_restartable(), rb_str_comparable(), rb_str_conv_enc_opts(), rb_str_dump(), rb_str_ellipsize(), rb_str_inspect(), and rb_to_encoding_index().

◆ rb_enc_associate()

VALUE rb_enc_associate ( VALUE  obj,
rb_encoding * enc
)

Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.

Parameters
[out]objObject in question.
[in]encAn encoding.
Exceptions
rb_eFrozenErrorobj is frozen.
rb_eArgErrorobj is incapable of having an encoding.
Returns
The passed obj.
Postcondition
obj's contents might be fixed according to enc.

Definition at line 1022 of file encoding.c.

Referenced by rb_ary_join(), rb_econv_append(), rb_enc_vsprintf(), rb_reg_quote(), rb_str_conv_enc_opts(), rb_str_ellipsize(), rb_str_format(), rb_str_inspect(), rb_str_intern(), and rb_str_update().

◆ rb_enc_associate_index()

VALUE rb_enc_associate_index ( VALUE  obj,
int  encindex 
)

Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object.

It for instance changes the byte length of terminating U+0000 according to the passed encoding.

Parameters
[out]objObject in question.
[in]encindexAn encoding index.
Exceptions
rb_eFrozenErrorobj is frozen.
rb_eArgErrorobj is incapable of having an encoding.
rb_eEncodingErrorencindex is out of bounds.
rb_eLoadErrorFailed to load the encoding.
Returns
The passed obj.
Postcondition
obj's contents might be fixed according to encindex.

Definition at line 994 of file encoding.c.

Referenced by rb_dir_getwd(), rb_enc_associate(), rb_enc_copy(), rb_find_file(), rb_find_file_ext(), rb_str_dump(), and rb_str_format().

◆ rb_enc_capable()

int rb_enc_capable ( VALUE  obj)

Queries if the passed object can have its encoding.

Parameters
[in]objObject in question.
Return values
1It can.
0It cannot.

Definition at line 884 of file encoding.c.

◆ rb_enc_check()

rb_encoding* rb_enc_check ( VALUE  str1,
VALUE  str2 
)

Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.

Parameters
[in]str1An object.
[in]str2Another object.
Exceptions
rb_eEncCompatErrorNo encoding can satisfy both.
Returns
Common encoding between the two.
Note
Arguments can be non-string, e.g. Regexp.

Definition at line 1062 of file encoding.c.

Referenced by rb_str_format(), and rb_str_update().

◆ rb_enc_code_to_mbclen()

static int rb_enc_code_to_mbclen ( int  c,
rb_encoding * enc
)
inline static

Identical to rb_enc_codelen(), except it returns 0 for invalid code points.

Parameters
[in]cCode point in question.
[in]encEncoding to convert c into a byte sequence.
Return values
0c is invalid.
Returns
otherwise Number of bytes needed for enc to encode c.

Definition at line 619 of file encoding.h.

◆ rb_enc_codelen()

int rb_enc_codelen ( int  code,
rb_encoding * enc
)

Queries the number of bytes requested to represent the passed code point using the passed encoding.

Parameters
[in]codeCode point in question.
[in]encEncoding to convert the code into a byte sequence.
Exceptions
rb_eArgErrorenc does not glean code.
Returns
Number of bytes requested to represent code using enc.

Definition at line 1241 of file encoding.c.

Referenced by rb_enc_uint_chr(), rb_str_buf_cat_ascii(), rb_str_concat(), and rb_str_format().

◆ rb_enc_codepoint()

static unsigned int rb_enc_codepoint ( const char *  p,
const char *  e,
rb_encoding * enc
)
inline static

Queries the code point of character pointed by the passed pointer.

Exceptions happen in case of broken input.

Deprecated:
Use rb_enc_codepoint_len() instead.
Parameters
[in]pPointer to the character's first byte.
[in]eEnd of the string that has p.
[in]encEncoding of the string.
Exceptions
rb_eArgErrorp is broken.
Returns
Code point of the character pointed by p.

Definition at line 571 of file encoding.h.

Referenced by rb_str_inspect().

◆ rb_enc_codepoint_len()

unsigned int rb_enc_codepoint_len ( const char *  p,
const char *  e,
int *  len,
rb_encoding * enc
)

Queries the code point of character pointed by the passed pointer.

Exceptions happen in case of broken input.

Parameters
[in]pPointer to the character's first byte.
[in]eEnd of the string that has p.
[out]lenReturn buffer.
[in]encEncoding of the string.
Exceptions
rb_eArgErrorp is broken.
Returns
Code point of the character pointed by p.
Postcondition
len (if set) is the number of bytes of p.

Definition at line 1227 of file encoding.c.

Referenced by rb_enc_codepoint().

◆ rb_enc_compatible()

rb_encoding* rb_enc_compatible ( VALUE  str1,
VALUE  str2 
)

Look for the "common" encoding between the two.

One character can or cannot be expressed depending on an encoding. This function finds the super-set of encodings that satisfy contents of both arguments. If that is impossible returns NULL.

Parameters
[in]str1An object.
[in]str2Another object.
Return values
NULLNo encoding can satisfy both at once.
otherwiseCommon encoding between the two.
Note
Arguments can be non-string, e.g. Regexp.

Definition at line 1140 of file encoding.c.

Referenced by rb_enc_check().

◆ rb_enc_copy()

void rb_enc_copy ( VALUE  dst,
VALUE  src 
)

Destructively copies the encoding of the latter object to that of former one.

It can also be seen as a routine identical to rb_enc_associate_index(), except it takes an object's encoding instead of an encoding's index.

Parameters
[out]dstObject to modify.
[in]srcObject to reference.
Exceptions
rb_eFrozenErrordst is frozen.
rb_eArgErrordst is incapable of having an encoding.
rb_eEncodingErrorsrc is incapable of having an encoding.
Postcondition
dst's encoding is that of src's.

Definition at line 1149 of file encoding.c.

Referenced by rb_reg_quote(), and rb_str_times().

◆ rb_enc_default_external()

VALUE rb_enc_default_external ( void  )

Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default external encoding.

Returns
An instance of rb_cEncoding of default external.

Definition at line 1603 of file encoding.c.

◆ rb_enc_default_internal()

VALUE rb_enc_default_internal ( void  )

Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default internal encoding.

Returns
An instance of rb_cEncoding of default internal.

Definition at line 1685 of file encoding.c.

◆ rb_enc_dummy_p()

int rb_enc_dummy_p ( rb_encoding * enc)

Queries if the passed encoding is dummy.

Parameters
[in]encEncoding in question.
Return values
1It is.
0It isn't.

Definition at line 197 of file encoding.c.

Referenced by rb_enc_asciicompat().

◆ rb_enc_fast_mbclen()

int rb_enc_fast_mbclen ( const char *  p,
const char *  e,
rb_encoding * enc
)

Identical to rb_enc_mbclen() unless the character at p overruns e.

That can happen for instance when you read from a socket and its partial read cuts a wide character in-between. In those situations this function "estimates" theoretical length of the character in question. Typically it tends to be possible to know how many bytes a character needs before actually reaching its end; for instance UTF-8 encodes a character's length in the first byte of it. This function returns that info.

Note
This implies that the string is not broken.
Parameters
[in]pPointer to the character's first byte.
[in]eEnd of the string that has p.
[in]encEncoding of the string.
Returns
Number of bytes of character at p, measured or estimated.

Definition at line 1173 of file encoding.c.

◆ rb_enc_find()

rb_encoding* rb_enc_find ( const char *  name)

Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.

Parameters
[in]nameName of the encoding to query.
Return values
NULLNo such encoding.
otherwiseAn encoding whose name is name.

Definition at line 859 of file encoding.c.

◆ rb_enc_find_index()

int rb_enc_find_index ( const char *  name)

Queries the index of the encoding.

Parameters
[in]nameName of the encoding to find.
Exceptions
rb_eArgErrorNo such encoding named name.
Return values
-1name exists, but unable to load.
otherwiseIndex of encoding named name.

Definition at line 824 of file encoding.c.

Referenced by rb_enc_alias(), rb_enc_find(), and rb_to_encoding_index().

◆ rb_enc_from_encoding()

VALUE rb_enc_from_encoding ( rb_encoding * enc)

Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.

Parameters
[in]encAn encoding
Return values
RUBY_Qnilenc is a null pointer.
otherwiseAn instance of rb_cEncoding.

Definition at line 182 of file encoding.c.

Referenced by rb_enc_default_external(), rb_enc_default_internal(), rb_io_extract_encoding_option(), and rb_str_ellipsize().

◆ rb_enc_from_index()

rb_encoding* rb_enc_from_index ( int  idx)

Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.

Parameters
[in]idxAn encoding index.
Return values
NULLNo such encoding.
otherwiseAn encoding whose index is idx.

Definition at line 402 of file encoding.c.

Referenced by rb_default_external_encoding(), rb_default_internal_encoding(), rb_enc_find(), rb_enc_find_index(), rb_enc_get(), rb_filesystem_encoding(), rb_find_encoding(), rb_io_extract_encoding_option(), rb_locale_encoding(), rb_must_asciicompat(), rb_str_buf_cat_ascii(), rb_str_comparable(), rb_str_dump(), rb_str_encode_ospath(), rb_str_format(), and rb_str_inspect().

◆ rb_enc_get()

rb_encoding* rb_enc_get ( VALUE  obj)

Identical to rb_enc_get_index(), except the return type.

Parameters
[in]objObject in question.
Return values
NULLobj is incapable of having an encoding.
otherwiseobj's encoding.

Definition at line 1028 of file encoding.c.

Referenced by rb_econv_append(), rb_econv_open_opts(), rb_econv_prepare_options(), rb_enc_str_asciicompat_p(), rb_inspect(), rb_io_extract_encoding_option(), rb_io_extract_modeenc(), rb_path_to_class(), rb_reg_quote(), rb_reg_regsub(), rb_str_conv_enc_opts(), rb_str_ellipsize(), rb_str_format(), rb_str_intern(), rb_str_set_len(), and rb_to_encoding_index().

◆ rb_enc_get_index()

int rb_enc_get_index ( VALUE  obj)

Queries the index of the encoding of the passed object, if any.

Parameters
[in]objObject in question.
Return values
-1obj is incapable of having an encoding.
otherwiseobj's encoding's index.

Definition at line 920 of file encoding.c.

Referenced by rb_enc_compatible(), rb_enc_copy(), rb_enc_get(), rb_must_asciicompat(), rb_obj_encoding(), and rb_str_dump().

◆ rb_enc_left_char_head()

static char* rb_enc_left_char_head ( const char *  s,
const char *  p,
const char *  e,
rb_encoding * enc
)
inline static

Queries the left boundary of a character.

This function takes a pointer that is not necessarily a head of a character, and searches for its head.

Parameters
[in]sStart of the string.
[in]pPointer to a possibly-middle of a character.
[in]eEnd of the string.
[in]encEncoding.
Returns
Pointer to the head of the character that contains p.

Definition at line 683 of file encoding.h.

◆ rb_enc_mbc_to_codepoint()

static OnigCodePoint rb_enc_mbc_to_codepoint ( const char *  p,
const char *  e,
rb_encoding * enc
)
inline static

Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.

Parameters
[in]pPointer to the character's first byte.
[in]eEnd of the string that has p.
[in]encEncoding of the string.
Returns
Code point of the character pointed by p.

Definition at line 591 of file encoding.h.

Referenced by rb_enc_ascget(), rb_enc_codepoint_len(), rb_str_dump(), and rb_str_inspect().

◆ rb_enc_mbclen()

int rb_enc_mbclen ( const char *  p,
const char *  e,
rb_encoding * enc
)

Queries the number of bytes of the character at the passed pointer.

Parameters
[in]pPointer to a character's first byte.
[in]eEnd of the string that has p.
[in]encEncoding of the string.
Returns
If the character at p does not end until e, number of bytes between p and e. Otherwise the number of bytes that the character at p is encoded.

Definition at line 1179 of file encoding.c.

Referenced by rb_str_format().

◆ rb_enc_mbcput()

static int rb_enc_mbcput ( unsigned int  c,
void *  buf,
rb_encoding * enc
)
inline static

Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.

Parameters
[in]cCode point.
[out]bufReturn buffer.
[in]encTarget encoding scheme.
Return values
<=0 c is invalid in enc.
Returns
otherwise Number of bytes written to buf.
Postcondition
c is encoded according to enc, then written to buf.

Definition at line 643 of file encoding.h.

Referenced by rb_enc_uint_chr(), rb_reg_quote(), rb_str_buf_cat_ascii(), rb_str_concat(), and rb_str_format().

◆ rb_enc_mbmaxlen()

static int rb_enc_mbmaxlen ( rb_encoding * enc)
inline static

Queries the maximum number of bytes that the passed encoding needs to represent a character.

Fixed-width encodings have the same value for this one and rb_enc_mbminlen. However there are variable-width encodings. UTF-8, for instance, takes from 1 up to 4 bytes.

Parameters
[in]encAn encoding.
Returns
Its maximum possible number of bytes of a character.

Definition at line 447 of file encoding.h.

Referenced by rb_str_buf_cat_ascii(), and rb_str_subpos().

◆ rb_enc_mbminlen()

static int rb_enc_mbminlen ( rb_encoding * enc)
inline static

Queries the minimum number of bytes that the passed encoding needs to represent a character.

For ASCII and compatible encodings this is typically 1. There are however encodings whose minimum is not 1; they are historically called wide characters.
Parameters
[in]encAn encoding.
Returns
Its least possible number of bytes except 0.

Definition at line 432 of file encoding.h.

Referenced by rb_enc_asciicompat(), rb_enc_mbclen(), rb_enc_str_new_cstr(), rb_enc_vsprintf(), rb_memsearch(), rb_str_ellipsize(), rb_str_inspect(), rb_str_plus(), and rb_str_subpos().

◆ rb_enc_name()

static const char* rb_enc_name ( rb_encoding * enc)
inline static

Queries the (canonical) name of the passed encoding.

Parameters
[in]encAn encoding.
Returns
Its name.

Definition at line 417 of file encoding.h.

Referenced by rb_econv_open_opts(), rb_econv_prepare_options(), rb_enc_codelen(), rb_enc_codepoint_len(), rb_enc_uint_chr(), rb_enc_vsprintf(), rb_must_asciicompat(), and rb_str_concat().

◆ rb_enc_precise_mbclen()

int rb_enc_precise_mbclen ( const char *  p,
const char *  e,
rb_encoding * enc
)

Queries the number of bytes of the character at the passed pointer.

This function returns 3 different types of information:

auto n = rb_enc_precise_mbclen(p, q, r);
if (ONIGENC_MBCLEN_CHARFOUND_P(n)) {
// Character found. Normal return.
auto found_length = ONIGENC_MBCLEN_CHARFOUND_LEN(n);
}
else if (ONIGENC_MBCLEN_NEEDMORE_P(n)) {
// Character overruns past `q`; needs more.
auto requested_length = ONIGENC_MBCLEN_NEEDMORE_LEN(n);
}
else {
// `p` is broken.
assert(ONIGENC_MBCLEN_INVALID_P(n));
}
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Queries the number of bytes of the character at the passed pointer.
Definition: encoding.c:1191
Parameters
[in] p: Pointer to the character's first byte.
[in] e: End of the string that has p.
[in] enc: Encoding of the string.
Returns
Encoded read/needed number of bytes (see above).

Definition at line 1191 of file encoding.c.

Referenced by rb_enc_ascget(), rb_enc_codepoint_len(), rb_enc_uint_chr(), rb_str_coderange_scan_restartable(), rb_str_concat(), rb_str_dump(), and rb_str_inspect().

◆ rb_enc_prev_char()

static char* rb_enc_prev_char ( const char *  s,
const char *  p,
const char *  e,
rb_encoding *  enc 
)
inlinestatic

Queries the previous (left) character.

Parameters
[in] s: Start of the string.
[in] p: Pointer to a character.
[in] e: End of the string.
[in] enc: Encoding.
Return values
NULL: No previous character.
otherwise: Pointer to the head of the previous character.

Definition at line 662 of file encoding.h.

Referenced by rb_str_subpos().

◆ rb_enc_right_char_head()

static char* rb_enc_right_char_head ( const char *  s,
const char *  p,
const char *  e,
rb_encoding *  enc 
)
inlinestatic

Queries the right boundary of a character.

This function takes a pointer that is not necessarily a head of a character, and searches for its tail.

Parameters
[in] s: Start of the string.
[in] p: Pointer to a possibly-middle of a character.
[in] e: End of the string.
[in] enc: Encoding.
Returns
Pointer to the end of the character that contains p.

Definition at line 704 of file encoding.h.

Referenced by rb_str_format().

◆ rb_enc_set_default_external()

void rb_enc_set_default_external ( VALUE  encoding)

Destructively assigns the passed encoding as the default external encoding.

You should not use this API. It has process-global side effects. Also it doesn't change encodings of strings that have already been read.

Parameters
[in] encoding: Ruby level encoding.
Exceptions
rb_eArgError: encoding is RUBY_Qnil.
Postcondition
The default external encoding is encoding.

Definition at line 1643 of file encoding.c.

◆ rb_enc_set_default_internal()

void rb_enc_set_default_internal ( VALUE  encoding)

Destructively assigns the passed encoding as the default internal encoding.

You should not use this API. It has process-global side effects. Also it doesn't change encodings of strings that have already been read.

Parameters
[in] encoding: Ruby level encoding.
Postcondition
The default internal encoding is encoding.
Note
Unlike rb_enc_set_default_external() you can pass RUBY_Qnil.

Definition at line 1726 of file encoding.c.

◆ rb_enc_set_index()

void rb_enc_set_index ( VALUE  obj,
int  encindex 
)

Destructively assigns an encoding (via its index) to an object.

Parameters
[out] obj: Object in question.
[in] encindex: An encoding index.
Exceptions
rb_eFrozenError: obj is frozen.
rb_eArgError: obj is incapable of having an encoding.
rb_eEncodingError: encindex is out of bounds.
rb_eLoadError: Failed to load the encoding.

Definition at line 986 of file encoding.c.

Referenced by RB_ENCODING_SET().

◆ rb_enc_step_back()

static char* rb_enc_step_back ( const char *  s,
const char *  p,
const char *  e,
int  n,
rb_encoding *  enc 
)
inlinestatic

Scans the string backwards for n characters.

Parameters
[in] s: Start of the string.
[in] p: Pointer to a character.
[in] e: End of the string.
[in] n: Steps.
[in] enc: Encoding.
Return values
NULL: There are no n characters left.
otherwise: Pointer to n characters before p.

Definition at line 726 of file encoding.h.

Referenced by rb_str_ellipsize().

◆ rb_enc_str_asciicompat_p()

static bool rb_enc_str_asciicompat_p ( VALUE  str)
inlinestatic

Queries if the passed string is in an ASCII-compatible encoding.

Parameters
[in] str: A Ruby string to query.
Return values
0: str is not a String, or an ASCII-incompatible string.
1: Otherwise.

Definition at line 789 of file encoding.h.

◆ rb_enc_to_index()

int rb_enc_to_index ( rb_encoding * enc)

Queries the index of the encoding.

An encoding's index is a Ruby-local concept. It is a (sequential) number assigned to each encoding.

Parameters
[in] enc: Encoding in question.
Returns
Its index.
Note
You can pass null pointers to this function. It is equivalent to rb_usascii_encindex() then.

Definition at line 191 of file encoding.c.

Referenced by rb_dir_getwd(), rb_enc_associate(), rb_enc_str_buf_cat(), rb_enc_str_new_static(), rb_external_str_new_with_enc(), rb_str_coderange_scan_restartable(), rb_str_format(), and rb_str_plus().

◆ rb_enc_unicode_p()

int rb_enc_unicode_p ( rb_encoding * enc)

Queries if the passed encoding is either one of UTF-8/16/32.

Note
It does not take UTF-7, which we actually support, into account.
Parameters
[in] enc: Encoding in question.
Return values
0: It is not a Unicode variant.
otherwise: It is.

Definition at line 638 of file encoding.c.

Referenced by rb_str_inspect().

◆ RB_ENCODING_CODERANGE_SET()

static void RB_ENCODING_CODERANGE_SET ( VALUE  obj,
int  encindex,
enum ruby_coderange_type  cr 
)
inlinestatic

This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo.

The object must be capable of having inline encoding. Using this macro needs deep understanding of bit level object binary layout.

Parameters
[out] obj: Target object.
[in] encindex: Encoding in encindex format.
[in] cr: An enum ruby_coderange_type.
Postcondition
obj's encoding is encindex.
obj's code range is cr.

Definition at line 238 of file encoding.h.

◆ RB_ENCODING_GET()

static int RB_ENCODING_GET ( VALUE  obj)
inlinestatic

Just another name of rb_enc_get_index.

Definition at line 195 of file encoding.h.

◆ RB_ENCODING_GET_INLINED()

static int RB_ENCODING_GET_INLINED ( VALUE  obj)
inlinestatic

Queries the encoding of the passed object.

The encoding must be smaller than ::RUBY_ENCODING_INLINE_MAX, which means you have some assumption on the return value. This means the API is for internal use only.

Parameters
[in] obj: Target object.
Returns
obj's encoding index.

Definition at line 99 of file encoding.h.

Referenced by RB_ENCODING_GET(), and RB_ENCODING_IS_ASCII8BIT().

◆ RB_ENCODING_IS_ASCII8BIT()

static bool RB_ENCODING_IS_ASCII8BIT ( VALUE  obj)
inlinestatic

Queries if the passed object is in ascii 8bit (== binary) encoding.

The object must be capable of having inline encoding. Using this macro needs deep understanding of bit level object binary layout.

Parameters
[in] obj: An object to check.
Return values
1: It is.
0: It isn't.

Definition at line 927 of file encoding.h.

◆ RB_ENCODING_SET()

static void RB_ENCODING_SET ( VALUE  obj,
int  encindex 
)
inlinestatic

Just another name of rb_enc_set_index.

Definition at line 221 of file encoding.h.

Referenced by RB_ENCODING_CODERANGE_SET().

◆ RB_ENCODING_SET_INLINED()

static void RB_ENCODING_SET_INLINED ( VALUE  obj,
int  encindex 
)
inlinestatic

Destructively assigns the passed encoding to the passed object.

The object must be capable of having inline encoding. Using this macro needs deep understanding of bit level object binary layout.

Parameters
[out] obj: Target object to modify.
[in] encindex: Encoding in encindex format.
Postcondition
obj's encoding is encindex.

Definition at line 81 of file encoding.h.

◆ rb_filesystem_encindex()

int rb_filesystem_encindex ( void  )

Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns
The index of the filesystem encoding.

Definition at line 1529 of file encoding.c.

Referenced by rb_filesystem_encoding(), and rb_str_encode_ospath().

◆ rb_filesystem_encoding()

rb_encoding* rb_filesystem_encoding ( void  )

Queries the "filesystem" encoding.

This is the encoding that Ruby expects information from the OS's file system to be in. This affects, for instance, the return value of rb_dir_getwd(). Most notably on Windows it can be an alias of the OS codepage. Most notably on Linux users can set this via the default external encoding.

Returns
The "filesystem" encoding.

Definition at line 1537 of file encoding.c.

Referenced by rb_dir_getwd(), rb_filesystem_str_new(), and rb_filesystem_str_new_cstr().

◆ rb_find_encoding()

rb_encoding* rb_find_encoding ( VALUE  obj)

Identical to rb_to_encoding_index(), except the return type.

Parameters
[in] obj: An rb_cEncoding, or its name in rb_cString.
Exceptions
rb_eTypeError: obj is neither rb_cEncoding nor rb_cString.
Return values
NULL: No such encoding.
Returns
Otherwise, the encoding of obj.

Definition at line 330 of file encoding.c.

◆ rb_locale_charmap()

VALUE rb_locale_charmap ( VALUE  klass)

Returns a platform-dependent "charmap" of the current locale.

This information is called a "Codeset name" in IEEE 1003.1 section 13 (<langinfo.h>). This is a very low-level API. The return value can have no corresponding encoding when passed to rb_find_encoding().

Parameters
[in] klass: Ignored for no reason (why...)
Returns
The low-level locale charmap, in Ruby's String.

Definition at line 91 of file localeinit.c.

◆ rb_locale_encindex()

int rb_locale_encindex ( void  )

Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns
The index of the locale encoding.

Definition at line 1501 of file encoding.c.

Referenced by rb_locale_encoding().

◆ rb_locale_encoding()

rb_encoding* rb_locale_encoding ( void  )

Queries the encoding that represents the current locale.

Returns
The encoding that represents the process' locale.

Definition at line 1523 of file encoding.c.

Referenced by rb_default_external_encoding(), rb_loaderror(), rb_loaderror_with_path(), rb_locale_str_new(), rb_locale_str_new_cstr(), and rb_str_export_locale().

◆ rb_to_encoding()

rb_encoding* rb_to_encoding ( VALUE  obj)

Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.

Parameters
[in] obj: An rb_cEncoding, or its name in rb_cString.
Exceptions
rb_eTypeError: obj is neither rb_cEncoding nor rb_cString.
rb_eArgError: obj is an unknown encoding name.
Returns
Encoding of obj.

Definition at line 323 of file encoding.c.

Referenced by rb_io_extract_encoding_option().

◆ rb_to_encoding_index()

int rb_to_encoding_index ( VALUE  obj)

Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).

Parameters
[in] obj: An rb_cEncoding, or its name in rb_cString.
Return values
-1: obj is of unexpected type/contents.
otherwise: Index corresponding to obj.

Definition at line 261 of file encoding.c.

Referenced by rb_io_extract_encoding_option().

◆ rb_usascii_encindex()

int rb_usascii_encindex ( void  )

Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns
The index of encoding of US-ASCII.

Definition at line 1493 of file encoding.c.

Referenced by rb_external_str_new_with_enc(), rb_find_file(), and rb_find_file_ext().

◆ rb_usascii_encoding()

rb_encoding* rb_usascii_encoding ( void  )

Queries the encoding that represents US-ASCII.

Returns
The encoding that represents US-ASCII.

Definition at line 1487 of file encoding.c.

Referenced by rb_ary_join(), rb_intern2(), rb_iv_get(), rb_reg_quote(), rb_str_inspect(), rb_str_intern(), rb_usascii_str_new(), and rb_usascii_str_new_cstr().

◆ rb_utf8_encindex()

int rb_utf8_encindex ( void  )

Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns
The index of encoding of UTF-8.

Definition at line 1481 of file encoding.c.

Referenced by rb_char_to_option_kcode(), and rb_str_dump().

◆ rb_utf8_encoding()

rb_encoding* rb_utf8_encoding ( void  )

Queries the encoding that represents UTF-8.

Returns
The encoding that represents UTF-8.

Definition at line 1475 of file encoding.c.

Referenced by rb_memsearch(), rb_str_encode_ospath(), rb_str_subpos(), rb_utf8_str_new(), and rb_utf8_str_new_cstr().