(bd2d6845f1c48a572e202409367a4e756c4bb3c5)

Defines rb_encoding. More...

#include "ruby/oniguruma.h"
#include "ruby/internal/attr/const.h"
#include "ruby/internal/attr/deprecated.h"
#include "ruby/internal/attr/noalias.h"
#include "ruby/internal/attr/pure.h"
#include "ruby/internal/attr/returns_nonnull.h"
#include "ruby/internal/dllexport.h"
#include "ruby/internal/encoding/coderange.h"
#include "ruby/internal/value.h"
#include "ruby/internal/core/rbasic.h"
#include "ruby/internal/fl_type.h"

Include dependency graph for encoding.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Macros
#define	ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX
	Old name of RUBY_ENCODING_INLINE_MAX.

#define	ENCODING_SHIFT RUBY_ENCODING_SHIFT
	Old name of RUBY_ENCODING_SHIFT.

#define	ENCODING_MASK RUBY_ENCODING_MASK
	Old name of RUBY_ENCODING_MASK.

#define	ENCODING_SET_INLINED(obj, i) RB_ENCODING_SET_INLINED(obj,i)
	Old name of RB_ENCODING_SET_INLINED.

#define	ENCODING_SET(obj, i) RB_ENCODING_SET(obj,i)
	Old name of RB_ENCODING_SET.

#define	ENCODING_GET_INLINED(obj) RB_ENCODING_GET_INLINED(obj)
	Old name of RB_ENCODING_GET_INLINED.

#define	ENCODING_GET(obj) RB_ENCODING_GET(obj)
	Old name of RB_ENCODING_GET.

#define	ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj)
	Old name of RB_ENCODING_IS_ASCII8BIT.

#define	ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN
	Old name of RUBY_ENCODING_MAXNAMELEN.

#define	MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret)
	Old name of ONIGENC_MBCLEN_CHARFOUND_P.

#define	MBCLEN_CHARFOUND_LEN(ret) ONIGENC_MBCLEN_CHARFOUND_LEN(ret)
	Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.

#define	MBCLEN_INVALID_P(ret) ONIGENC_MBCLEN_INVALID_P(ret)
	Old name of ONIGENC_MBCLEN_INVALID_P.

#define	MBCLEN_NEEDMORE_P(ret) ONIGENC_MBCLEN_NEEDMORE_P(ret)
	Old name of ONIGENC_MBCLEN_NEEDMORE_P.

#define	MBCLEN_NEEDMORE_LEN(ret) ONIGENC_MBCLEN_NEEDMORE_LEN(ret)
	Old name of ONIGENC_MBCLEN_NEEDMORE_LEN.

Typedefs
typedef const OnigEncodingType	rb_encoding
	The type of encoding.

Functions
static void	RB_ENCODING_SET_INLINED (VALUE obj, int encindex)
	Destructively assigns the passed encoding to the passed object.

static int	RB_ENCODING_GET_INLINED (VALUE obj)
	Queries the encoding of the passed object.

int	rb_char_to_option_kcode (int c, int *option, int *kcode)
	Converts a character option to its encoding.

int	rb_define_dummy_encoding (const char *name)
	Creates a new "dummy" encoding.

int	rb_enc_dummy_p (rb_encoding *enc)
	Queries if the passed encoding is dummy.

int	rb_enc_to_index (rb_encoding *enc)
	Queries the index of the encoding.

int	rb_enc_get_index (VALUE obj)
	Queries the index of the encoding of the passed object, if any.

static int	RB_ENCODING_GET (VALUE obj)
	Just another name of rb_enc_get_index.

void	rb_enc_set_index (VALUE obj, int encindex)
	Destructively assigns an encoding (via its index) to an object.

static void	RB_ENCODING_SET (VALUE obj, int encindex)
	Just another name of rb_enc_set_index.

static void	RB_ENCODING_CODERANGE_SET (VALUE obj, int encindex, enum ruby_coderange_type cr)
	This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo.

int	rb_enc_capable (VALUE obj)
	Queries if the passed object can have its encoding.

int	rb_enc_find_index (const char *name)
	Queries the index of the encoding.

int	rb_enc_alias (const char *alias, const char *orig)
	Registers an "alias" name.

int	rb_to_encoding_index (VALUE obj)
	Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).

rb_encoding *	rb_to_encoding (VALUE obj)
	Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.

rb_encoding *	rb_find_encoding (VALUE obj)
	Identical to rb_to_encoding_index(), except the return type.

rb_encoding *	rb_enc_get (VALUE obj)
	Identical to rb_enc_get_index(), except the return type.

rb_encoding *	rb_enc_compatible (VALUE str1, VALUE str2)
	Look for the "common" encoding between the two.

rb_encoding *	rb_enc_check (VALUE str1, VALUE str2)
	Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.

VALUE	rb_enc_associate_index (VALUE obj, int encindex)
	Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object.

VALUE	rb_enc_associate (VALUE obj, rb_encoding *enc)
	Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.

void	rb_enc_copy (VALUE dst, VALUE src)
	Destructively copies the encoding of the latter object to that of former one.

rb_encoding *	rb_enc_from_index (int idx)
	Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.

rb_encoding *	rb_enc_find (const char *name)
	Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.

static const char *	rb_enc_name (rb_encoding *enc)
	Queries the (canonical) name of the passed encoding.

static int	rb_enc_mbminlen (rb_encoding *enc)
	Queries the minimum number of bytes that the passed encoding needs to represent a character.

static int	rb_enc_mbmaxlen (rb_encoding *enc)
	Queries the maximum number of bytes that the passed encoding needs to represent a character.

int	rb_enc_mbclen (const char *p, const char *e, rb_encoding *enc)
	Queries the number of bytes of the character at the passed pointer.

int	rb_enc_fast_mbclen (const char *p, const char *e, rb_encoding *enc)
	Identical to rb_enc_mbclen() unless the character at `p` overruns `e`.

int	rb_enc_precise_mbclen (const char *p, const char *e, rb_encoding *enc)
	Queries the number of bytes of the character at the passed pointer.

int	rb_enc_ascget (const char *p, const char *e, int *len, rb_encoding *enc)
	Queries the code point of character pointed by the passed pointer.

unsigned int	rb_enc_codepoint_len (const char *p, const char *e, int *len, rb_encoding *enc)
	Queries the code point of character pointed by the passed pointer.

static unsigned int	rb_enc_codepoint (const char *p, const char *e, rb_encoding *enc)
	Queries the code point of character pointed by the passed pointer.

static OnigCodePoint	rb_enc_mbc_to_codepoint (const char *p, const char *e, rb_encoding *enc)
	Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.

int	rb_enc_codelen (int code, rb_encoding *enc)
	Queries the number of bytes requested to represent the passed code point using the passed encoding.

static int	rb_enc_code_to_mbclen (int c, rb_encoding *enc)
	Identical to rb_enc_codelen(), except it returns 0 for invalid code points.

static int	rb_enc_mbcput (unsigned int c, void *buf, rb_encoding *enc)
	Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.

static char *	rb_enc_prev_char (const char *s, const char *p, const char *e, rb_encoding *enc)
	Queries the previous (left) character.

static char *	rb_enc_left_char_head (const char *s, const char *p, const char *e, rb_encoding *enc)
	Queries the left boundary of a character.

static char *	rb_enc_right_char_head (const char *s, const char *p, const char *e, rb_encoding *enc)
	Queries the right boundary of a character.

static char *	rb_enc_step_back (const char *s, const char *p, const char *e, int n, rb_encoding *enc)
	Scans the string backwards for n characters.

static bool	rb_enc_asciicompat (rb_encoding *enc)
	Queries if the passed encoding is in some sense compatible with ASCII.

static bool	rb_enc_str_asciicompat_p (VALUE str)
	Queries if the passed string is in an ASCII-compatible encoding.

VALUE	rb_enc_from_encoding (rb_encoding *enc)
	Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.

int	rb_enc_unicode_p (rb_encoding *enc)
	Queries if the passed encoding is either one of UTF-8/16/32.

rb_encoding *	rb_ascii8bit_encoding (void)
	Queries the encoding that represents ASCII-8BIT a.k.a. binary.

rb_encoding *	rb_utf8_encoding (void)
	Queries the encoding that represents UTF-8.

rb_encoding *	rb_usascii_encoding (void)
	Queries the encoding that represents US-ASCII.

rb_encoding *	rb_locale_encoding (void)
	Queries the encoding that represents the current locale.

rb_encoding *	rb_filesystem_encoding (void)
	Queries the "filesystem" encoding.

rb_encoding *	rb_default_external_encoding (void)
	Queries the "default external" encoding.

rb_encoding *	rb_default_internal_encoding (void)
	Queries the "default internal" encoding.

int	rb_ascii8bit_encindex (void)
	Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.

static bool	RB_ENCODING_IS_ASCII8BIT (VALUE obj)
	Queries if the passed object is in ascii 8bit (== binary) encoding.

int	rb_utf8_encindex (void)
	Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.

int	rb_usascii_encindex (void)
	Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.

int	rb_locale_encindex (void)
	Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself.

int	rb_filesystem_encindex (void)
	Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding itself.

VALUE	rb_enc_default_external (void)
	Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default external encoding.

VALUE	rb_enc_default_internal (void)
	Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default internal encoding.

void	rb_enc_set_default_external (VALUE encoding)
	Destructively assigns the passed encoding as the default external encoding.

void	rb_enc_set_default_internal (VALUE encoding)
	Destructively assigns the passed encoding as the default internal encoding.

VALUE	rb_locale_charmap (VALUE klass)
	Returns a platform-dependent "charmap" of the current locale.

Variables
VALUE	rb_cEncoding
	`Encoding` class.

Detailed Description

Defines rb_encoding.

Author: Ruby developers ruby-core@ruby-lang.org

Copyright: This file is a part of the programming language Ruby. Permission is hereby granted, to either redistribute and/or modify this file, provided that the conditions mentioned in the file COPYING are met. Consult the file for details.

Warning: Symbols prefixed with either RBIMPL or rbimpl are implementation details. Don't take them as canon. They could rapidly appear then vanish. The name (path) of this header file is also an implementation detail. Do not expect it to persist at the place it is now. Developers are free to move it anywhere anytime at will.

Note: To ruby-core: remember that this header can be possibly recursively included from extension libraries written in C++. Do not expect for instance __VA_ARGS__ is always available. We assume C99 for ruby itself but we don't assume languages of extension libraries. They could be written in C++98.

Definition in file encoding.h.

Typedef Documentation

◆ rb_encoding

typedef const OnigEncodingType rb_encoding

The type of encoding.

Our design here is we take Oniguruma/Onigmo's multilingualisation schema as our base data structure.

Definition at line 117 of file encoding.h.

Function Documentation

◆ rb_ascii8bit_encindex()

int rb_ascii8bit_encindex ( void )

Identical to rb_ascii8bit_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns: The index of encoding of ASCII-8BIT.

Definition at line 1495 of file encoding.c.

Referenced by RB_ENCODING_IS_ASCII8BIT().

◆ rb_ascii8bit_encoding()

rb_encoding * rb_ascii8bit_encoding ( void )

Queries the encoding that represents ASCII-8BIT a.k.a. binary.

Returns: The encoding that represents ASCII-8BIT.

Definition at line 1489 of file encoding.c.

◆ rb_char_to_option_kcode()

int rb_char_to_option_kcode	(	int	c,
		int *	option,
		int *	kcode
	)

Converts a character option to its encoding.

It only supports a very limited set of Japanese encodings due to its Japanese origin. Ruby still has this in-core for backwards compatibility. But new code must not bother with such a concept as a one-character encoding option. Consider it deprecated in practice.

Parameters

[in]	c	One of `['n', 'e', 's', 'u', 'i', 'x', 'm']`.
[out]	option	Return buffer.
[out]	kcode	Return buffer.

Return values

1	`c` understood properly.
0	`c` is not understood.

Postcondition: `option` is a OnigOptionType.

Postcondition: `kcode` is an enum `ruby_preserved_encindex`.

Definition at line 334 of file re.c.

Referenced by rb_char_to_option_kcode().

◆ rb_default_external_encoding()

rb_encoding * rb_default_external_encoding ( void )

Queries the "default external" encoding.

This is used to interact with outer-process things such as File. Though not recommended, you can set this using rb_enc_set_default_external().

Returns: The "default external" encoding.

Definition at line 1616 of file encoding.c.

◆ rb_default_internal_encoding()

rb_encoding * rb_default_internal_encoding ( void )

Queries the "default internal" encoding.

This could be a null pointer. Otherwise, outer-process info is transcoded from the default external encoding to this one during reading from an IO.

Returns: The "default internal" encoding (if any).

Definition at line 1703 of file encoding.c.

◆ rb_define_dummy_encoding()

int rb_define_dummy_encoding ( const char * name )

Creates a new "dummy" encoding.

Roughly speaking, an encoding is dummy when it is stateful. Notable examples of dummy encodings are those defined in ISO/IEC 2022.

Parameters

[in] name Name of the creating encoding.

Exceptions

rb_eArgError Duplicated or malformed `name`.

Returns: New dummy encoding's index.

Postcondition: Encoding named name is created, whose index is the return value.

Definition at line 566 of file encoding.c.

◆ rb_enc_alias()

int rb_enc_alias	(	const char *	alias,
		const char *	orig
	)

Registers an "alias" name.

In the wild, an encoding can be called using multiple names. For instance an encoding known as "CP932" is also called "SJIS" on occasions. This API registers such relationships.

Parameters

[in]	alias	New name.
[in]	orig	Old name.

Exceptions

rb_eArgError `alias` is duplicated or malformed.

Return values

-1	Failed to load `orig`.
otherwise	The index of `orig` and `alias`.

Postcondition: alias is a synonym of orig. They refer to the identical encoding.

Definition at line 667 of file encoding.c.

◆ rb_enc_ascget()

int rb_enc_ascget	(	const char *	p,
		const char *	e,
		int *	len,
		rb_encoding *	enc
	)

Queries the code point of character pointed by the passed pointer.

If that code point is included in ASCII that code point is returned. Otherwise -1. This can be different from just looking at the first byte. For instance it reads 2 bytes in case of UTF-16BE.

Parameters

[in]	p	Pointer to the character's first byte.
[in]	e	End of the string that has `p`.
[out]	len	Return buffer.
[in]	enc	Encoding of the string.

Return values

-1	The character at `p` is not in ASCII.
otherwise	A code point of the character at `p`.

Postcondition: len (if set) is the number of bytes of p.

Definition at line 1229 of file encoding.c.

◆ rb_enc_asciicompat()

static bool rb_enc_asciicompat ( rb_encoding * enc )

inline static

Queries if the passed encoding is in some sense compatible with ASCII.

The concept of ASCII compatibility is nuanced, and private to our implementation. For instance SJIS is ASCII compatible to us, despite their having different characters at code point 0x5C. This is based on some practical consideration that Japanese people confuse SJIS to be "upper compatible" with ASCII (which is in fact a wrong idea, but we just don't go strict here). An example of ASCII incompatible encoding is UTF-16. UTF-16 shares code points with ASCII, but employs a completely different encoding scheme.

Parameters

[in] enc Encoding in question.

Return values

0	It is incompatible.
1	It is compatible.

Definition at line 768 of file encoding.h.

◆ rb_enc_associate()

VALUE rb_enc_associate	(	VALUE	obj,
		rb_encoding *	enc
	)

Identical to rb_enc_associate_index(), except it takes an encoding itself instead of its index.

Parameters

[out]	obj	Object in question.
[in]	enc	An encoding.

Exceptions

rb_eFrozenError	`obj` is frozen.
rb_eArgError	`obj` is incapable of having an encoding.

Returns: The passed obj.

Postcondition: obj's contents might be fixed according to enc.

Definition at line 1045 of file encoding.c.

◆ rb_enc_associate_index()

VALUE rb_enc_associate_index	(	VALUE	obj,
		int	encindex
	)

Identical to rb_enc_set_index(), except it additionally does contents fix-up depending on the passed object.

It for instance changes the byte length of terminating U+0000 according to the passed encoding.

Parameters

[out]	obj	Object in question.
[in]	encindex	An encoding index.

Exceptions

rb_eFrozenError	`obj` is frozen.
rb_eArgError	`obj` is incapable of having an encoding.
rb_eEncodingError	`encindex` is out of bounds.
rb_eLoadError	Failed to load the encoding.

Returns: The passed obj.

Postcondition: obj's contents might be fixed according to encindex.

Definition at line 1017 of file encoding.c.

◆ rb_enc_capable()

int rb_enc_capable ( VALUE obj )

Queries if the passed object can have its encoding.

Parameters

[in] obj Object in question.

Return values

1	It can.
0	It cannot.

Definition at line 907 of file encoding.c.

◆ rb_enc_check()

rb_encoding * rb_enc_check	(	VALUE	str1,
		VALUE	str2
	)

Identical to rb_enc_compatible(), except it raises an exception instead of returning NULL.

Parameters

[in]	str1	An object.
[in]	str2	Another object.

Exceptions

rb_eEncCompatError No encoding can satisfy both.

Returns: Common encoding between the two.

Note: Arguments can be non-string, e.g. Regexp.

Definition at line 1085 of file encoding.c.

◆ rb_enc_code_to_mbclen()

static int rb_enc_code_to_mbclen	(	int	c,
		rb_encoding *	enc
	)

inline static

Identical to rb_enc_codelen(), except it returns 0 for invalid code points.

Parameters

[in]	c	Code point in question.
[in]	enc	Encoding to convert `c` into a byte sequence.

Return values

0	`c` is invalid.
otherwise	Number of bytes needed for `enc` to encode `c`.

Definition at line 619 of file encoding.h.

◆ rb_enc_codelen()

int rb_enc_codelen	(	int	code,
		rb_encoding *	enc
	)

Queries the number of bytes requested to represent the passed code point using the passed encoding.

Parameters

[in]	code	Code point in question.
[in]	enc	Encoding to convert the code into a byte sequence.

Exceptions

rb_eArgError `enc` does not glean `code`.

Returns: Number of bytes requested to represent code using enc.

Definition at line 1267 of file encoding.c.

◆ rb_enc_codepoint()

static unsigned int rb_enc_codepoint	(	const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

inline static

Queries the code point of character pointed by the passed pointer.

Exceptions happen in case of broken input.

Deprecated: Use rb_enc_codepoint_len() instead.

Parameters

[in]	p	Pointer to the character's first byte.
[in]	e	End of the string that has `p`.
[in]	enc	Encoding of the string.

Exceptions

rb_eArgError `p` is broken.

Returns: Code point of the character pointed by p.

Definition at line 571 of file encoding.h.

Referenced by rb_str_inspect().

◆ rb_enc_codepoint_len()

unsigned int rb_enc_codepoint_len	(	const char *	p,
		const char *	e,
		int *	len,
		rb_encoding *	enc
	)

Queries the code point of character pointed by the passed pointer.

Exceptions happen in case of broken input.

Parameters

[in]	p	Pointer to the character's first byte.
[in]	e	End of the string that has `p`.
[out]	len	Return buffer.
[in]	enc	Encoding of the string.

Exceptions

rb_eArgError `p` is broken.

Returns: Code point of the character pointed by p.

Postcondition: len (if set) is the number of bytes of p.

Definition at line 1253 of file encoding.c.

Referenced by rb_enc_codepoint().

◆ rb_enc_compatible()

rb_encoding * rb_enc_compatible	(	VALUE	str1,
		VALUE	str2
	)

Look for the "common" encoding between the two.

One character can or cannot be expressed depending on an encoding. This function finds the super-set of encodings that satisfy contents of both arguments. If that is impossible returns NULL.

Parameters

[in]	str1	An object.
[in]	str2	Another object.

Return values

NULL	No encoding can satisfy both at once.
otherwise	Common encoding between the two.

Note: Arguments can be non-string, e.g. Regexp.

Definition at line 1163 of file encoding.c.

◆ rb_enc_copy()

void rb_enc_copy	(	VALUE	dst,
		VALUE	src
	)

Destructively copies the encoding of the latter object to that of former one.

It can also be seen as a routine identical to rb_enc_associate_index(), except it takes an object's encoding instead of an encoding's index.

Parameters

[out]	dst	Object to modify.
[in]	src	Object to reference.

Exceptions

rb_eFrozenError	`dst` is frozen.
rb_eArgError	`dst` is incapable of having an encoding.
rb_eEncodingError	`src` is incapable of having an encoding.

Postcondition: dst's encoding is that of src's.

Definition at line 1172 of file encoding.c.

◆ rb_enc_default_external()

VALUE rb_enc_default_external ( void )

Identical to rb_default_external_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default external encoding.

Returns: An instance of rb_cEncoding of default external.

Definition at line 1630 of file encoding.c.

◆ rb_enc_default_internal()

VALUE rb_enc_default_internal ( void )

Identical to rb_default_internal_encoding(), except it returns the Ruby-level counterpart instance of rb_cEncoding that corresponds to the default internal encoding.

Returns: An instance of rb_cEncoding of default internal.

Definition at line 1712 of file encoding.c.

◆ rb_enc_dummy_p()

int rb_enc_dummy_p ( rb_encoding * enc )

Queries if the passed encoding is dummy.

Parameters

[in] enc Encoding in question.

Return values

1	It is.
0	It isn't.

Definition at line 195 of file encoding.c.

Referenced by rb_enc_asciicompat().

◆ rb_enc_fast_mbclen()

int rb_enc_fast_mbclen	(	const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

Identical to rb_enc_mbclen() unless the character at p overruns e.

That can happen for instance when you read from a socket and its partial read cuts a wide character in-between. In those situations this function "estimates" theoretical length of the character in question. Typically it tends to be possible to know how many bytes a character needs before actually reaching its end; for instance UTF-8 encodes a character's length in the first byte of it. This function returns that info.

Note: This implies that the string is not broken.

Parameters

[in]	p	Pointer to the character's first byte.
[in]	e	End of the string that has `p`.
[in]	enc	Encoding of the string.

Returns: Number of bytes of character at p, measured or estimated.

Definition at line 1199 of file encoding.c.

◆ rb_enc_find()

rb_encoding * rb_enc_find ( const char * name )

Identical to rb_find_encoding(), except it takes a C's string instead of Ruby's.

Parameters

[in] name Name of the encoding to query.

Return values

NULL	No such encoding.
otherwise	An encoding whose name is `name`.

Definition at line 882 of file encoding.c.

◆ rb_enc_find_index()

int rb_enc_find_index ( const char * name )

Queries the index of the encoding.

Parameters

[in] name Name of the encoding to find.

Exceptions

rb_eArgError No such encoding named `name`.

Return values

-1	`name` exists, but unable to load.
otherwise	Index of encoding named `name`.

Definition at line 844 of file encoding.c.

◆ rb_enc_from_encoding()

VALUE rb_enc_from_encoding ( rb_encoding * enc )

Queries the Ruby-level counterpart instance of rb_cEncoding that corresponds to the passed encoding.

Parameters

[in] enc An encoding

Return values

RUBY_Qnil	`enc` is a null pointer.
otherwise	An instance of rb_cEncoding.

Definition at line 180 of file encoding.c.

◆ rb_enc_from_index()

rb_encoding * rb_enc_from_index ( int idx )

Identical to rb_find_encoding(), except it takes an encoding index instead of a Ruby object.

Parameters

[in] idx An encoding index.

Return values

NULL	No such encoding.
otherwise	An encoding whose index is `idx`.

Definition at line 409 of file encoding.c.

◆ rb_enc_get()

rb_encoding * rb_enc_get ( VALUE obj )

Identical to rb_enc_get_index(), except the return type.

Parameters

[in] obj Object in question.

Return values

NULL	`obj` is incapable of having an encoding.
otherwise	`obj`'s encoding.

Definition at line 1051 of file encoding.c.

◆ rb_enc_get_index()

int rb_enc_get_index ( VALUE obj )

Queries the index of the encoding of the passed object, if any.

Parameters

[in] obj Object in question.

Return values

-1	`obj` is incapable of having an encoding.
otherwise	`obj`'s encoding's index.

Definition at line 943 of file encoding.c.

Referenced by RB_ENCODING_GET().

◆ rb_enc_left_char_head()

static char * rb_enc_left_char_head	(	const char *	s,
		const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

inline static

Queries the left boundary of a character.

This function takes a pointer that is not necessarily a head of a character, and searches for its head.

Parameters

[in]	s	Start of the string.
[in]	p	Pointer to a possibly-middle of a character.
[in]	e	End of the string.
[in]	enc	Encoding.

Returns: Pointer to the head of the character that contains p.

Definition at line 683 of file encoding.h.

◆ rb_enc_mbc_to_codepoint()

static OnigCodePoint rb_enc_mbc_to_codepoint	(	const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

inline static

Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.

Parameters

[in]	p	Pointer to the character's first byte.
[in]	e	End of the string that has `p`.
[in]	enc	Encoding of the string.

Returns: Code point of the character pointed by p.

Definition at line 591 of file encoding.h.

◆ rb_enc_mbclen()

int rb_enc_mbclen	(	const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

Queries the number of bytes of the character at the passed pointer.

Parameters

[in]	p	Pointer to a character's first byte.
[in]	e	End of the string that has `p`.
[in]	enc	Encoding of the string.

Returns: If the character at p does not end until e, number of bytes between p and e. Otherwise the number of bytes that the character at p is encoded.

Definition at line 1205 of file encoding.c.

◆ rb_enc_mbcput()

static int rb_enc_mbcput	(	unsigned int	c,
		void *	buf,
		rb_encoding *	enc
	)

inline static

Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.

Parameters

[in]	c	Code point.
[out]	buf	Return buffer.
[in]	enc	Target encoding scheme.

Return values

<= 0	`c` is invalid in `enc`.
otherwise	Number of bytes written to `buf`.

Postcondition: c is encoded according to enc, then written to buf.

Definition at line 643 of file encoding.h.

◆ rb_enc_mbmaxlen()

static int rb_enc_mbmaxlen ( rb_encoding * enc )

inline static

Queries the maximum number of bytes that the passed encoding needs to represent a character.

Fixed-width encodings have the same value for this one and rb_enc_mbminlen. However there are variable-width encodings. UTF-8, for instance, takes from 1 up to 6 bytes.

Parameters

[in] enc An encoding.

Returns: Its maximum possible number of bytes of a character.

Definition at line 447 of file encoding.h.

Referenced by rb_str_buf_cat_ascii(), and rb_str_subpos().

◆ rb_enc_mbminlen()

static int rb_enc_mbminlen ( rb_encoding * enc )

inline static

Queries the minimum number of bytes that the passed encoding needs to represent a character.

For ASCII and compatible encodings this is typically 1. There are however encodings whose minimum is not 1; they are historically called wide characters.

Parameters

[in] enc An encoding.

Returns: Its least possible number of bytes except 0.

Definition at line 432 of file encoding.h.

◆ rb_enc_name()

static const char * rb_enc_name ( rb_encoding * enc )

inline static

Queries the (canonical) name of the passed encoding.

Parameters

[in] enc An encoding.

Returns: Its name.

Definition at line 417 of file encoding.h.

◆ rb_enc_precise_mbclen()

int rb_enc_precise_mbclen	(	const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

Queries the number of bytes of the character at the passed pointer.

This function returns 3 different types of information:

auto n = rb_enc_precise_mbclen(p, q, r);
 
if (ONIGENC_MBCLEN_CHARFOUND_P(n)) {
    // Character found.  Normal return.
    auto found_length = ONIGENC_MBCLEN_CHARFOUND_LEN(n);
}
else if (ONIGENC_MBCLEN_NEEDMORE_P(n)) {
    // Character overruns past `q`; needs more.
    auto requested_length = ONIGENC_MBCLEN_NEEDMORE_LEN(n);
}
else {
    // `p` is broken.
    assert(ONIGENC_MBCLEN_INVALID_P(n));
}

Parameters

[in]	p	Pointer to the character's first byte.
[in]	e	End of the string that has `p`.
[in]	enc	Encoding of the string.

Returns: Encoded read/needed number of bytes (see above).

Definition at line 1217 of file encoding.c.

◆ rb_enc_prev_char()

static char * rb_enc_prev_char	(	const char *	s,
		const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

inline static

Queries the previous (left) character.

Parameters

[in]	s	Start of the string.
[in]	p	Pointer to a character.
[in]	e	End of the string.
[in]	enc	Encoding.

Return values

NULL	No previous character.
otherwise	Pointer to the head of the previous character.

Definition at line 662 of file encoding.h.

◆ rb_enc_right_char_head()

static char * rb_enc_right_char_head	(	const char *	s,
		const char *	p,
		const char *	e,
		rb_encoding *	enc
	)

inline static

Queries the right boundary of a character.

This function takes a pointer that is not necessarily a head of a character, and searches for its tail.

Parameters

[in]	s	Start of the string.
[in]	p	Pointer to a possibly-middle of a character.
[in]	e	End of the string.
[in]	enc	Encoding.

Returns: Pointer to the end of the character that contains p.

Definition at line 704 of file encoding.h.

Referenced by rb_str_format().

◆ rb_enc_set_default_external()

void rb_enc_set_default_external ( VALUE encoding )

Destructively assigns the passed encoding as the default external encoding.

You should not use this API. It has process-global side effects. Also it doesn't change encodings of strings that have already been read.

Parameters

[in] encoding Ruby level encoding.

Exceptions

rb_eArgError `encoding` is RUBY_Qnil.

Postcondition: The default external encoding is encoding.

Definition at line 1670 of file encoding.c.

◆ rb_enc_set_default_internal()

void rb_enc_set_default_internal ( VALUE encoding )

Destructively assigns the passed encoding as the default internal encoding.

You should not use this API. It has process-global side effects. Also it doesn't change encodings of strings that have already been read.

Parameters

[in] encoding Ruby level encoding.

Postcondition: The default internal encoding is encoding.

Note: Unlike rb_enc_set_default_external() you can pass RUBY_Qnil.

Definition at line 1753 of file encoding.c.

◆ rb_enc_set_index()

void rb_enc_set_index	(	VALUE	obj,
		int	encindex
	)

Destructively assigns an encoding (via its index) to an object.

Parameters

[out]	obj	Object in question.
[in]	encindex	An encoding index.

Exceptions

rb_eFrozenError	`obj` is frozen.
rb_eArgError	`obj` is incapable of having an encoding.
rb_eEncodingError	`encindex` is out of bounds.
rb_eLoadError	Failed to load the encoding.

Definition at line 1009 of file encoding.c.

Referenced by RB_ENCODING_SET().

◆ rb_enc_step_back()

static char * rb_enc_step_back	(	const char *	s,
		const char *	p,
		const char *	e,
		int	n,
		rb_encoding *	enc
	)

inline static

Scans the string backwards for n characters.

Parameters

[in]	s	Start of the string.
[in]	p	Pointer to a character.
[in]	e	End of the string.
[in]	n	Steps.
[in]	enc	Encoding.

Return values

NULL	There are no `n` characters left.
otherwise	Pointer to the character `n` characters before `p`.

Definition at line 726 of file encoding.h.

Referenced by rb_str_ellipsize().

◆ rb_enc_str_asciicompat_p()

static bool rb_enc_str_asciicompat_p ( VALUE str )

inline static

Queries if the passed string is in an ASCII-compatible encoding.

Parameters

[in] str A Ruby string to query.

Return values

0	`str` is not a String, or an ASCII-incompatible string.
1	Otherwise.

Definition at line 789 of file encoding.h.

◆ rb_enc_to_index()

int rb_enc_to_index ( rb_encoding * enc )

Queries the index of the encoding.

An encoding's index is a Ruby-local concept. It is a (sequential) number assigned to each encoding.

Parameters

[in] enc Encoding in question.

Returns: Its index.

Note: You can pass null pointers to this function. It is equivalent to rb_usascii_encindex() then.

Definition at line 189 of file encoding.c.

◆ rb_enc_unicode_p()

int rb_enc_unicode_p ( rb_encoding * enc )

Queries if the passed encoding is either one of UTF-8/16/32.

Note: It does not take UTF-7, which we actually support, into account.

Parameters

[in] enc Encoding in question.

Return values

0	It is not a Unicode variant.
otherwise	It is.

Definition at line 634 of file encoding.c.

◆ RB_ENCODING_CODERANGE_SET()

static void RB_ENCODING_CODERANGE_SET	(	VALUE	obj,
		int	encindex,
		enum ruby_coderange_type	cr
	)

inline static

This is RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo.

The object must be capable of having inline encoding. Using this macro needs deep understanding of bit level object binary layout.

Parameters

[out]	obj	Target object.
[in]	encindex	Encoding in encindex format.
[in]	cr	An enum ruby_coderange_type.

Postcondition: obj's encoding is encindex; obj's code range is cr.

Definition at line 238 of file encoding.h.

◆ RB_ENCODING_GET()

static int RB_ENCODING_GET ( VALUE obj )

inline static

Just another name of rb_enc_get_index.

Definition at line 195 of file encoding.h.

◆ RB_ENCODING_GET_INLINED()

static int RB_ENCODING_GET_INLINED ( VALUE obj )

inline static

Queries the encoding of the passed object.

The encoding must be smaller than RUBY_ENCODING_INLINE_MAX, which means you have some assumption on the return value. This means the API is for internal use only.

Parameters

[in] obj Target object.

Returns: obj's encoding index.

Definition at line 99 of file encoding.h.

Referenced by RB_ENCODING_GET(), and RB_ENCODING_IS_ASCII8BIT().

◆ RB_ENCODING_IS_ASCII8BIT()

static bool RB_ENCODING_IS_ASCII8BIT ( VALUE obj )

inline static

Queries if the passed object is in ascii 8bit (== binary) encoding.

The object must be capable of having inline encoding. Using this macro needs deep understanding of bit level object binary layout.

Parameters

[in] obj An object to check.

Return values

1	It is.
0	It isn't.

Definition at line 927 of file encoding.h.

◆ RB_ENCODING_SET()

static void RB_ENCODING_SET	(	VALUE	obj,
		int	encindex
	)

inline static

Just another name of rb_enc_set_index.

Definition at line 221 of file encoding.h.

Referenced by RB_ENCODING_CODERANGE_SET().

◆ RB_ENCODING_SET_INLINED()

static void RB_ENCODING_SET_INLINED	(	VALUE	obj,
		int	encindex
	)

inline static

Destructively assigns the passed encoding to the passed object.

The object must be capable of having inline encoding. Using this macro needs deep understanding of bit level object binary layout.

Parameters

[out]	obj	Target object to modify.
[in]	encindex	Encoding in encindex format.

Postcondition: obj's encoding is encindex.

Definition at line 81 of file encoding.h.

◆ rb_filesystem_encindex()

int rb_filesystem_encindex ( void )

Identical to rb_filesystem_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns: The index of the filesystem encoding.

Definition at line 1555 of file encoding.c.

◆ rb_filesystem_encoding()

rb_encoding * rb_filesystem_encoding ( void )

Queries the "filesystem" encoding.

This is the encoding that Ruby expects information from the OS's file system to be in. This affects, for instance, the return value of rb_dir_getwd(). Most notably, on Windows it can be an alias of the OS codepage. Most notably, on Linux users can set this via the default external encoding.

Returns: The "filesystem" encoding.

Definition at line 1566 of file encoding.c.

◆ rb_find_encoding()

rb_encoding * rb_find_encoding ( VALUE obj )

Identical to rb_to_encoding_index(), except the return type.

Parameters

[in] obj An rb_cEncoding, or its name in rb_cString.

Exceptions

rb_eTypeError `obj` is neither rb_cEncoding nor rb_cString.

Return values

NULL	No such encoding.
otherwise	Encoding of obj.

Definition at line 328 of file encoding.c.

◆ rb_locale_charmap()

VALUE rb_locale_charmap ( VALUE klass )

Returns a platform-dependent "charmap" of the current locale.

This information is called a "Codeset name" in IEEE 1003.1 section 13 (<langinfo.h>). This is a very low-level API. The return value can have no corresponding encoding when passed to rb_find_encoding().

Parameters

[in] klass Ignored for no reason (why...)

Returns: The low-level locale charmap, in Ruby's String.

Definition at line 91 of file localeinit.c.

Referenced by rb_locale_charmap().

◆ rb_locale_encindex()

int rb_locale_encindex ( void )

Identical to rb_locale_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns: The index of the locale encoding.

Definition at line 1527 of file encoding.c.

◆ rb_locale_encoding()

rb_encoding * rb_locale_encoding ( void )

Queries the encoding that represents the current locale.

Returns: The encoding that represents the process' locale.

Definition at line 1549 of file encoding.c.

◆ rb_to_encoding()

rb_encoding * rb_to_encoding ( VALUE obj )

Identical to rb_find_encoding(), except it raises an exception instead of returning NULL.

Parameters

[in] obj An rb_cEncoding, or its name in rb_cString.

Exceptions

rb_eTypeError	`obj` is neither rb_cEncoding nor rb_cString.
rb_eArgError	`obj` is an unknown encoding name.

Returns: Encoding of obj.

Definition at line 321 of file encoding.c.

◆ rb_to_encoding_index()

int rb_to_encoding_index ( VALUE obj )

Obtains an encoding index from a wider range of objects (than rb_enc_find_index()).

Parameters

[in] obj An rb_cEncoding, or its name in rb_cString.

Return values

-1	`obj` is unexpected type/contents.
otherwise	Index corresponding to `obj`.

Definition at line 259 of file encoding.c.

◆ rb_usascii_encindex()

int rb_usascii_encindex ( void )

Identical to rb_usascii_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns: The index of encoding of US-ASCII.

Definition at line 1519 of file encoding.c.

◆ rb_usascii_encoding()

rb_encoding * rb_usascii_encoding ( void )

Queries the encoding that represents US-ASCII.

Returns: The encoding that represents US-ASCII.

Definition at line 1513 of file encoding.c.

◆ rb_utf8_encindex()

int rb_utf8_encindex ( void )

Identical to rb_utf8_encoding(), except it returns the encoding's index instead of the encoding itself.

Returns: The index of encoding of UTF-8.

Definition at line 1507 of file encoding.c.

◆ rb_utf8_encoding()

rb_encoding * rb_utf8_encoding ( void )

Queries the encoding that represents UTF-8.

Returns: The encoding that represents UTF-8.

Definition at line 1501 of file encoding.c.

(bd2d6845f1c48a572e202409367a4e756c4bb3c5)

Macros

Typedefs

Functions

Variables

Detailed Description

Typedef Documentation

◆ rb_encoding

Function Documentation

◆ rb_ascii8bit_encindex()

◆ rb_ascii8bit_encoding()

◆ rb_char_to_option_kcode()

◆ rb_default_external_encoding()

◆ rb_default_internal_encoding()

◆ rb_define_dummy_encoding()

◆ rb_enc_alias()

◆ rb_enc_ascget()

◆ rb_enc_asciicompat()

◆ rb_enc_associate()

◆ rb_enc_associate_index()

◆ rb_enc_capable()

◆ rb_enc_check()

◆ rb_enc_code_to_mbclen()

◆ rb_enc_codelen()

◆ rb_enc_codepoint()

◆ rb_enc_codepoint_len()

◆ rb_enc_compatible()

◆ rb_enc_copy()

◆ rb_enc_default_external()

◆ rb_enc_default_internal()

◆ rb_enc_dummy_p()

◆ rb_enc_fast_mbclen()

◆ rb_enc_find()

◆ rb_enc_find_index()

◆ rb_enc_from_encoding()

◆ rb_enc_from_index()

◆ rb_enc_get()

◆ rb_enc_get_index()

◆ rb_enc_left_char_head()

◆ rb_enc_mbc_to_codepoint()

◆ rb_enc_mbclen()

◆ rb_enc_mbcput()

◆ rb_enc_mbmaxlen()

◆ rb_enc_mbminlen()

◆ rb_enc_name()

◆ rb_enc_precise_mbclen()

◆ rb_enc_prev_char()

◆ rb_enc_right_char_head()

◆ rb_enc_set_default_external()

◆ rb_enc_set_default_internal()

◆ rb_enc_set_index()

◆ rb_enc_step_back()

◆ rb_enc_str_asciicompat_p()

◆ rb_enc_to_index()

◆ rb_enc_unicode_p()

◆ RB_ENCODING_CODERANGE_SET()

◆ RB_ENCODING_GET()

◆ RB_ENCODING_GET_INLINED()

◆ RB_ENCODING_IS_ASCII8BIT()

◆ RB_ENCODING_SET()

◆ RB_ENCODING_SET_INLINED()

◆ rb_filesystem_encindex()

◆ rb_filesystem_encoding()

◆ rb_find_encoding()

◆ rb_locale_charmap()

◆ rb_locale_encindex()

◆ rb_locale_encoding()

◆ rb_to_encoding()

◆ rb_to_encoding_index()

◆ rb_usascii_encindex()

◆ rb_usascii_encoding()

◆ rb_utf8_encindex()

◆ rb_utf8_encoding()