xentara-utils v1.2.1
Xentara utilities library
Loading...
Searching...
No Matches
xentara::utils::string::unicode Namespace Reference

Functions

constexpr auto isHighSurrogate (char16_t codeUnit) noexcept -> bool
 Determines if a UTF-16 code unit is a high surrogate.
 
constexpr auto isLowSurrogate (char16_t codeUnit) noexcept -> bool
 Determines if a UTF-16 code unit is a low surrogate.
 
constexpr auto isSurrogate (char16_t codeUnit) noexcept -> bool
 Determines if a UTF-16 code unit is a high or low surrogate.
 
constexpr auto combineSurrogates (char16_t high, char16_t low) noexcept -> char32_t
 Combines a high and a low surrogate into a single code point.
 
constexpr auto makeSurrogatePair (char32_t codePoint) noexcept -> std::array< char16_t, 2 >
 Splits a code point into high and low surrogates.
 
constexpr auto isBmp (char32_t codePoint) noexcept -> bool
 Determines if a code point lies in the basic multilingual plane.
 
constexpr auto isOutOfRange (char32_t codePoint) noexcept -> bool
 Determines if a code point is outside the valid range for Unicode scalar values.
 
constexpr auto isReserved (char32_t codeUnit) noexcept -> bool
 Determines if a UTF-32 code unit is reserved for a high or a low surrogate.
 
constexpr auto isIllegal (char32_t codePoint) noexcept -> bool
 Determines if a code point is not a legal Unicode scalar value.
 
constexpr auto replacementCharacter () noexcept -> char32_t
 Returns the replacement character.
 
constexpr auto isSingleByteUtf8 (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a single UTF-8 code unit.
 
constexpr auto isTwoByteUtf8 (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a sequence of two UTF-8 code units.
 
constexpr auto isThreeByteUtf8 (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a sequence of three UTF-8 code units.
 
constexpr auto isFourByteUtf8 (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a sequence of four UTF-8 code units.
 
constexpr auto isTwoByteUtf8OrLess (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a sequence of two UTF-8 code units or less.
 
constexpr auto isThreeByteUtf8OrLess (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a sequence of three UTF-8 code units or less.
 
constexpr auto isFourByteUtf8OrLess (char32_t codePoint) noexcept -> bool
 Determines if a code point encodes to a sequence of four UTF-8 code units or less.
 
constexpr auto utf8Size (char32_t codePoint) noexcept -> std::size_t
 Determines how many UTF-8 code units are needed to encode a code point.
 
constexpr auto utf16Size (char32_t codePoint) noexcept -> std::size_t
 Determines how many UTF-16 code units are needed to encode a code point.
 

Function Documentation

◆ combineSurrogates()

constexpr auto xentara::utils::string::unicode::combineSurrogates ( char16_t  high,
char16_t  low 
) -> char32_t
constexprnoexcept

Combines a high and a low surrogate into a single code point.

Parameters
highThe high surrogate
lowThe low surrogate
Returns
the combined code point

◆ isBmp()

constexpr auto xentara::utils::string::unicode::isBmp ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point lies in the basic multilingual plane.

The basic multilingual plane consists of all code points less than or equal to U+FFFF. These characters can be represented by a single UTF-16 code unit. Characters outside the basic multilingual plane must be split up into high and low surrogates.

Parameters
codePointThe code point
Returns
true if the code point is in the basic multilingual plane, and can thus be represented by a single UTF-16 code unit.

◆ isFourByteUtf8()

constexpr auto xentara::utils::string::unicode::isFourByteUtf8 ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a sequence of four UTF-8 code units.

◆ isFourByteUtf8OrLess()

constexpr auto xentara::utils::string::unicode::isFourByteUtf8OrLess ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a sequence of four UTF-8 code units or less.

This function can be used instead of isFourByteUtf8() if it is already known that the code point is not a three byte sequence or less, e.g. in chained if statements.

Note
No legal Unicode code point encodes to more than four UTF-8 code units, so this function is equivalent to !isOutOfRange().

◆ isHighSurrogate()

constexpr auto xentara::utils::string::unicode::isHighSurrogate ( char16_t  codeUnit) -> bool
constexprnoexcept

Determines if a UTF-16 code unit is a high surrogate.

Parameters
codeUnitThe code unit
Returns
true if the code unit is a high surrogate

◆ isIllegal()

constexpr auto xentara::utils::string::unicode::isIllegal ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point is not a legal Unicode scalar value.

Parameters
codePointThe code point
Returns
true if the code point is a high or low surrogate, or if it is out of range

◆ isLowSurrogate()

constexpr auto xentara::utils::string::unicode::isLowSurrogate ( char16_t  codeUnit) -> bool
constexprnoexcept

Determines if a UTF-16 code unit is a low surrogate.

Parameters
codeUnitThe code unit
Returns
true if the code unit is a low surrogate

◆ isOutOfRange()

constexpr auto xentara::utils::string::unicode::isOutOfRange ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point is outside the valid range for Unicode scalar values.

Parameters
codePointThe code point
Returns
true if the code point is out of range (i.e. greater than U+10FFFF)

◆ isReserved()

constexpr auto xentara::utils::string::unicode::isReserved ( char32_t  codeUnit) -> bool
constexprnoexcept

Determines if a UTF-32 code unit is reserved for a high or a low surrogate.

Parameters
codeUnitThe code unit
Returns
true if the code unit is a high surrogate or a low surrogate

◆ isSingleByteUtf8()

constexpr auto xentara::utils::string::unicode::isSingleByteUtf8 ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a single UTF-8 code unit.

◆ isSurrogate()

constexpr auto xentara::utils::string::unicode::isSurrogate ( char16_t  codeUnit) -> bool
constexprnoexcept

Determines if a UTF-16 code unit is a high or low surrogate.

Parameters
codeUnitThe code unit
Returns
true if the code unit is a high surrogate or a low surrogate

◆ isThreeByteUtf8()

constexpr auto xentara::utils::string::unicode::isThreeByteUtf8 ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a sequence of three UTF-8 code units.

◆ isThreeByteUtf8OrLess()

constexpr auto xentara::utils::string::unicode::isThreeByteUtf8OrLess ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a sequence of three UTF-8 code units or less.

This function can be used instead of isThreeByteUtf8() if it is already known that the code point is not a two byte sequence or less, e.g. in chained if statements.

◆ isTwoByteUtf8()

constexpr auto xentara::utils::string::unicode::isTwoByteUtf8 ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a sequence of two UTF-8 code units.

◆ isTwoByteUtf8OrLess()

constexpr auto xentara::utils::string::unicode::isTwoByteUtf8OrLess ( char32_t  codePoint) -> bool
constexprnoexcept

Determines if a code point encodes to a sequence of two UTF-8 code units or less.

This function can be used instead of isTwoByteUtf8() if it is already known that the code point is not a single byte sequence, e.g. in chained if statements.

◆ makeSurrogatePair()

constexpr auto xentara::utils::string::unicode::makeSurrogatePair ( char32_t  codePoint) -> std::array<char16_t, 2>
constexprnoexcept

Splits a code point into high and low surrogates.

Parameters
codePointThe code point. Must be between U+10000 and U+10FFFF (supplementary planes).
Returns
The code point's high surrogate followed by its low surrogate

◆ replacementCharacter()

constexpr auto xentara::utils::string::unicode::replacementCharacter ( ) -> char32_t
constexprnoexcept

Returns the replacement character.

Returns
The Unicode replacement character (U+FFFD)

◆ utf16Size()

constexpr auto xentara::utils::string::unicode::utf16Size ( char32_t  codePoint) -> std::size_t
constexprnoexcept

Determines how many UTF-16 code units are needed to encode a code point.

Parameters
codePointThe code point. Must be a legal Unicode scalar value (less than or equal to U+10FFFF, and not a surrogate).
Returns
the number of UTF-16 code units needed to represent the code point. The return value will always be 1 or 2, as all Unicode code points encode to either a single code unit, or a surrogate pair.

◆ utf8Size()

constexpr auto xentara::utils::string::unicode::utf8Size ( char32_t  codePoint) -> std::size_t
constexprnoexcept

Determines how many UTF-8 code units are needed to encode a code point.

Parameters
codePointThe code point. Must be a legal Unicode scalar value (less than or equal to U+10FFFF, and not a surrogate).
Returns
the number of UTF-8 code units needed to represent the code point. The return value will always be between 1 and 4, as all Unicode code points encode to between one and four bytes.