Ruby
3.4.0dev (2024-11-05 revision 348a53415339076afc4a02fcd09f3ae36e9c4c61)
|
Routines to manipulate encodings of strings. More...
#include "ruby/internal/dllexport.h"
#include "ruby/internal/value.h"
#include "ruby/internal/encoding/encoding.h"
#include "ruby/internal/attr/nonnull.h"
#include "ruby/internal/intern/string.h"
Go to the source code of this file.
Functions | |
VALUE | rb_enc_str_new (const char *ptr, long len, rb_encoding *enc) |
Identical to rb_str_new(), except it additionally takes an encoding. More... | |
VALUE | rb_enc_str_new_cstr (const char *ptr, rb_encoding *enc) |
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string. More... | |
VALUE | rb_enc_str_new_static (const char *ptr, long len, rb_encoding *enc) |
Identical to rb_enc_str_new(), except it takes a C string literal. More... | |
VALUE | rb_enc_interned_str (const char *ptr, long len, rb_encoding *enc) |
Identical to rb_enc_str_new(), except it returns a "f"string. More... | |
VALUE | rb_enc_interned_str_cstr (const char *ptr, rb_encoding *enc) |
Identical to rb_enc_str_new_cstr(), except it returns a "f"string. More... | |
long | rb_enc_strlen (const char *head, const char *tail, rb_encoding *enc) |
Counts the number of characters of the passed string, according to the passed encoding. More... | |
char * | rb_enc_nth (const char *head, const char *tail, long nth, rb_encoding *enc) |
Queries the n-th character. More... | |
VALUE | rb_obj_encoding (VALUE obj) |
Identical to rb_enc_get_index(), except the return type. More... | |
VALUE | rb_enc_str_buf_cat (VALUE str, const char *ptr, long len, rb_encoding *enc) |
Identical to rb_str_cat(), except it additionally takes an encoding. More... | |
VALUE | rb_enc_uint_chr (unsigned int code, rb_encoding *enc) |
Encodes the passed code point into a series of bytes. More... | |
VALUE | rb_external_str_new_with_enc (const char *ptr, long len, rb_encoding *enc) |
Identical to rb_external_str_new(), except it additionally takes an encoding. More... | |
VALUE | rb_str_export_to_enc (VALUE obj, rb_encoding *enc) |
Identical to rb_str_export(), except it additionally takes an encoding. More... | |
VALUE | rb_str_conv_enc (VALUE str, rb_encoding *from, rb_encoding *to) |
Encoding conversion main routine. More... | |
VALUE | rb_str_conv_enc_opts (VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) |
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options. More... | |
int | rb_enc_str_coderange (VALUE str) |
Scans the passed string to collect its code range. More... | |
long | rb_str_coderange_scan_restartable (const char *str, const char *end, rb_encoding *enc, int *cr) |
Scans the passed string until it finds something odd. More... | |
int | rb_enc_str_asciionly_p (VALUE str) |
Queries if the passed string is "ASCII only". More... | |
long | rb_memsearch (const void *x, long m, const void *y, long n, rb_encoding *enc) |
Looks for the passed string in the passed buffer. More... | |
Routines to manipulate encodings of strings.
Warning: Symbols prefixed with either RBIMPL or rbimpl are implementation details. Don't take them as canon. They could rapidly appear then vanish. The name (path) of this header file is also an implementation detail. Do not expect it to persist at the place it is now. Developers are free to move it anywhere anytime at will.
Note: To ruby-core: remember that this header can possibly be recursively included from extension libraries written in C++. Do not expect, for instance, that __VA_ARGS__ is always available. We assume C99 for ruby itself but we don't assume languages of extension libraries. They could be written in C++98.
Definition in file string.h.
VALUE rb_enc_interned_str | ( | const char * | ptr, |
long | len, | ||
rb_encoding * | enc | ||
) |
Identical to rb_enc_str_new(), except it returns a "f"string.
It can also be seen as a routine identical to rb_interned_str(), except it additionally takes an encoding.
[in] | ptr | A memory region of len bytes length. |
[in] | len | Length of ptr , in bytes, not including the terminating NUL character. |
[in] | enc | Encoding of ptr . |
rb_eArgError | len is negative. |
Returns: A found or created instance of rb_cString, of len bytes length, of enc encoding, whose contents are identical to that of ptr.
Precondition: At least len bytes of continuous memory region shall be accessible via ptr.
Note: enc can be a null pointer.
Definition at line 12506 of file string.c.
Referenced by rb_enc_interned_str_cstr().
VALUE rb_enc_interned_str_cstr | ( | const char * | ptr, |
rb_encoding * | enc | ||
) |
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
It can also be seen as a routine identical to rb_interned_str_cstr(), except it additionally takes an encoding.
[in] | ptr | A C string. |
[in] | enc | Encoding of ptr . |
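Returns: A found or created instance of rb_cString, of enc encoding, whose contents are identical to that of ptr.
Precondition: ptr shall point to a NUL-terminated C string; at least strlen(ptr) bytes of continuous memory region shall be accessible via ptr.
Note: enc can be a null pointer.
char* rb_enc_nth | ( | const char * | head, |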
const char * | tail, | ||
long | nth, | ||
rb_encoding * | enc | ||
) |
Queries the n-th character.
Like rb_enc_strlen() this function can be fast or slow depending on the contents. Don't expect characters to be uniformly distributed across the entire string.
[in] | head | Leftmost pointer to the string. |
[in] | tail | Rightmost pointer to the string. |
[in] | nth | Requested index of characters. |
[in] | enc | Encoding of the string. |
Returns: Pointer to the first byte of the character that is the nth character ahead of head, or tail if there is no such character (OOB etc).
Note: The definition of "character" depends on the passed enc.
Definition at line 2921 of file string.c.
Referenced by rb_str_ellipsize(), and rb_str_format().
int rb_enc_str_asciionly_p | ( | VALUE | str | ) |
Queries if the passed string is "ASCII only".
An ASCII only string is a string that doesn't have any non-ASCII characters at all. This doesn't necessarily mean the string is in ASCII encoding. For instance, a String of CP932 encoding can very well be ASCII only, depending on its contents.
[in] | str | String in question. |
1 | It doesn't have non-ASCII characters. |
0 | It has characters that are out of ASCII. |
Definition at line 899 of file string.c.
Referenced by rb_inspect(), and rb_reg_quote().
VALUE rb_enc_str_buf_cat | ( | VALUE | str, |
const char * | ptr, | ||
long | len, | ||
rb_encoding * | enc | ||
) |
Identical to rb_str_cat(), except it additionally takes an encoding.
[out] | str | Destination object. |
[in] | ptr | Contents to append. |
[in] | len | Length of ptr , in bytes. |
[in] | enc | Encoding of ptr . |
rb_eArgError | len is negative. |
rb_eEncCompatError | enc is not compatible with str . |
Returns: The passed str.
Postcondition: The contents of ptr are copied, transcoded into str's encoding, then pasted at str's end.
Definition at line 3597 of file string.c.
Referenced by rb_reg_regsub().
int rb_enc_str_coderange | ( | VALUE | str | ) |
Scans the passed string to collect its code range.
Because Ruby's strings are mutable, their contents change from time to time; so do their code ranges. A long-lived string tends to fall back to RUBY_ENC_CODERANGE_UNKNOWN. This API scans it and re-assigns a fine-grained code range constant.
[out] | str | A string. |
Definition at line 880 of file string.c.
Referenced by rb_econv_append(), rb_str_buf_append(), and rb_str_comparable().
VALUE rb_enc_str_new | ( | const char * | ptr, |
long | len, | ||
rb_encoding * | enc | ||
) |
Identical to rb_str_new(), except it additionally takes an encoding.
[in] | ptr | A memory region of len bytes length. |
[in] | len | Length of ptr , in bytes, not including the terminating NUL character. |
[in] | enc | Encoding of ptr . |
rb_eNoMemError | Failed to allocate len+1 bytes. |
rb_eArgError | len is negative. |
Returns: An instance of rb_cString, of len bytes length, of enc encoding, whose contents are a verbatim copy of ptr.
Precondition: At least len bytes of continuous memory region shall be accessible via ptr.
Note: enc can be a null pointer. It can also be seen as a routine identical to rb_usascii_str_new() then.
Definition at line 1042 of file string.c.
Referenced by rb_enc_str_new_cstr(), rb_enc_uint_chr(), rb_external_str_new_with_enc(), and rb_intern3().
VALUE rb_enc_str_new_cstr | ( | const char * | ptr, |
rb_encoding * | enc | ||
) |
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
It can also be seen as a routine identical to rb_str_new_cstr(), except it additionally takes an encoding.
[in] | ptr | A C string. |
[in] | enc | Encoding of ptr . |
rb_eNoMemError | Failed to allocate memory. |
Returns: An instance of rb_cString, of enc encoding, whose contents are a verbatim copy of ptr.
Precondition: ptr must not be a null pointer.
Note: Because ptr is a C string it makes no sense for enc to be something like UTF-32.
Note: enc can be a null pointer. It can also be seen as a routine identical to rb_usascii_str_new_cstr() then.
VALUE rb_enc_str_new_static | ( | const char * | ptr, |
long | len, | ||
rb_encoding * | enc | ||
) |
Identical to rb_enc_str_new(), except it takes a C string literal.
It can also be seen as a routine identical to rb_str_new_static(), except it additionally takes an encoding.
[in] | ptr | A C string literal. |
[in] | len | strlen(ptr) . |
[in] | enc | Encoding of ptr . |
rb_eArgError | len out of range of size_t . |
Precondition: ptr must be a C string constant.
Returns: An instance of rb_cString, of enc encoding, whose backend storage is the passed C string literal.
Note: enc can be a null pointer. It can also be seen as a routine identical to rb_usascii_str_new_static() then.
long rb_enc_strlen | ( | const char * | head, |
const char * | tail, | ||
rb_encoding * | enc | ||
) |
Counts the number of characters of the passed string, according to the passed encoding.
This has to be complicated. The passed string could be invalid and/or broken. This routine would scan from the beginning until the end, byte by byte, to seek out character boundaries. Could be super slow.
[in] | head | Leftmost pointer to the string. |
[in] | tail | Rightmost pointer to the string. |
[in] | enc | Encoding of the string. |
Returns: The number of characters that exist in head .. tail.
Note: The definition of "character" depends on the passed enc.
Definition at line 2251 of file string.c.
Referenced by rb_str_format().
VALUE rb_enc_uint_chr | ( | unsigned int | code, |
rb_encoding * | enc | ||
) |
Encodes the passed code point into a series of bytes.
[in] | code | Code point. |
[in] | enc | Target encoding scheme. |
rb_eRangeError | enc does not glean code . |
Returns: An instance of rb_cString, of enc encoding, whose sole content is code represented in enc.
Definition at line 3803 of file numeric.c.
Referenced by rb_io_ungetc().
VALUE rb_external_str_new_with_enc | ( | const char * | ptr, |
long | len, | ||
rb_encoding * | enc | ||
) |
Identical to rb_external_str_new(), except it additionally takes an encoding.
However, the whole point of rb_external_str_new() is to encode a string into the default external encoding. Being able to specify an arbitrary encoding seems to defeat the designed purpose of the function.
[in] | ptr | A memory region of len bytes length. |
[in] | len | Length of ptr , in bytes, not including the terminating NUL character. |
[in] | enc | Target encoding scheme. |
rb_eArgError | len is negative. |
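Returns: An instance of rb_cString. In case encoding conversion to enc is fully defined over the given contents, then the return value is a string of enc encoding, whose contents are the converted ones. Otherwise the string is junk.
Warning: It doesn't raise on a conversion failure and silently ends up with corrupted output. You can detect the failure by querying valid_encoding? of the result object.
Definition at line 1276 of file string.c.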
Referenced by rb_external_str_new(), rb_external_str_new_cstr(), rb_filesystem_str_new(), rb_filesystem_str_new_cstr(), rb_locale_str_new(), and rb_locale_str_new_cstr().
long rb_memsearch | ( | const void * | x, |
long | m, | ||
const void * | y, | ||
long | n, | ||
rb_encoding * | enc | ||
) |
Looks for the passed string in the passed buffer.
[in] | x | Query string. |
[in] | m | Number of bytes of x . |
[in] | y | Buffer that potentially includes x . |
[in] | n | Number of bytes of y . |
[in] | enc | Encoding of both x and y . |
-1 | Not found. |
otherwise | Found index in y . |
VALUE rb_obj_encoding | ( | VALUE | obj | ) |
Identical to rb_enc_get_index(), except the return type.
[in] | obj | Object in question. |
rb_eTypeError | obj is incapable of having an encoding. |
Returns: obj's encoding.
Definition at line 1148 of file encoding.c.
long rb_str_coderange_scan_restartable | ( | const char * | str, |
const char * | end, | ||
rb_encoding * | enc, | ||
int * | cr | ||
) |
Scans the passed string until it finds something odd.
Returns the number of bytes scanned. As the name implies, this is suitable for repeated calls. One of its applications is IO#readlines. The method reads from its receiver's read buffer, maybe more than once, looking for newlines. But "newline" can be different among encodings. This API is used to detect broken contents to properly mark them as such.
[in] | str | String to scan. |
[in] | end | End of str . |
[in] | enc | str 's encoding. |
[out] | cr | Return buffer. |
Returns: Distance between str and the first byte where the contents are broken.
Postcondition: cr has the code range type.
Definition at line 764 of file string.c.
Referenced by rb_econv_append(), and rb_str_set_len().
VALUE rb_str_conv_enc | ( | VALUE | str, |
rb_encoding * | from, | ||
rb_encoding * | to | ||
) |
Encoding conversion main routine.
[in] | str | String to convert. |
[in] | from | Source encoding. |
[in] | to | Destination encoding. |
Returns: A copy of str, with conversion from from to to applied.
Note: from can be a null pointer. str's encoding is taken then.
Note: to can be a null pointer. No-op then.
Definition at line 1270 of file string.c.
Referenced by rb_dir_getwd(), rb_str_encode_ospath(), and rb_str_export_to_enc().
VALUE rb_str_conv_enc_opts | ( | VALUE | str, |
rb_encoding * | from, | ||
rb_encoding * | to, | ||
int | ecflags, | ||
VALUE | ecopts | ||
) |
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
The extra arguments can be constructed using io_extract_modeenc() etc.
[in] | str | String to convert. |
[in] | from | Source encoding. |
[in] | to | Destination encoding. |
[in] | ecflags | A set of enum ruby_econv_flag_type. |
[in] | ecopts | Optional hash. |
Returns: A copy of str, with conversion from from to to applied.
Note: from can be a null pointer. str's encoding is taken then.
Note: to can be a null pointer. No-op then.
Note: ecopts can be RUBY_Qnil, which is equivalent to passing an empty hash.
Definition at line 1154 of file string.c.
Referenced by rb_str_conv_enc().
VALUE rb_str_export_to_enc | ( | VALUE | obj, |
rb_encoding * | enc | ||
) |
Identical to rb_str_export(), except it additionally takes an encoding.
[in] | obj | Target object. |
[in] | enc | Target encoding. |
rb_eTypeError | No implicit conversion to String. |
Returns: Converted Ruby string of enc encoding.
Definition at line 1375 of file string.c.
Referenced by rb_str_export(), and rb_str_export_locale().