From 3ce4735d50e69fc72bb8f99b466169bbb4f180bc Mon Sep 17 00:00:00 2001 From: Dav999-v Date: Thu, 23 Feb 2023 03:41:36 +0100 Subject: [PATCH] Add UTF8.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a small library I wrote to handle UTF-8. Usage is meant to be as simple as possible - see for example decoding a UTF-8 string: const char* str = "asdf"; uint32_t codepoint; while ((codepoint = UTF8_next(&str))) { // you have a codepoint congrats } Or encoding a single codepoint to add it to a string: std::string result; result.append(UTF8_encode(0x1234).bytes); There are some other functions (UTF8_total_codepoints() to get the total number of codepoints in a string, UTF8_backspace() to get the length of a string after backspacing one character, and UTF8_peek_next() as a slightly less fancy version of UTF8_next()), but more functions could always be added if we need them. This will allow us to replace utfcpp (utf8::unchecked) and also fix some less-than-ideal code: - Some places have to resort to ignoring UTF-8 (next_wrap) or using UCS-4→UTF-8 functions (VFormat had to use PHYSFS ones, and one other place has four lines of code including a std::back_inserter just for one character) - The iterator stuff is kinda confusing and verbose anyway --- desktop_version/CMakeLists.txt | 1 + desktop_version/src/UTF8.c | 202 +++++++++++++++++++++++++++++++++ desktop_version/src/UTF8.h | 35 ++++++ 3 files changed, 238 insertions(+) create mode 100644 desktop_version/src/UTF8.c create mode 100644 desktop_version/src/UTF8.h diff --git a/desktop_version/CMakeLists.txt b/desktop_version/CMakeLists.txt index 7f181ef3..094fab4d 100644 --- a/desktop_version/CMakeLists.txt +++ b/desktop_version/CMakeLists.txt @@ -111,6 +111,7 @@ set(VVV_SRC src/Network.c src/Textbook.c src/ThirdPartyDeps.c + src/UTF8.c src/VFormat.c src/Vlogging.c src/Xoshiro.c diff --git a/desktop_version/src/UTF8.c b/desktop_version/src/UTF8.c new file mode 100644 index 
00000000..528b9964
--- /dev/null
+++ b/desktop_version/src/UTF8.c
@@ -0,0 +1,231 @@
+#include "UTF8.h"
+
+/* Classify a byte by its leading bits; TAKE keeps the low nbits bits of it */
+#define STARTS_0(byte) (((byte) & 0x80) == 0x00)
+#define STARTS_10(byte) (((byte) & 0xC0) == 0x80)
+#define STARTS_110(byte) (((byte) & 0xE0) == 0xC0)
+#define STARTS_1110(byte) (((byte) & 0xF0) == 0xE0)
+#define STARTS_11110(byte) (((byte) & 0xF8) == 0xF0)
+#define TAKE(byte, nbits) ((byte) & ((1 << (nbits))-1))
+
+static inline bool is_illegal(uint32_t codepoint)
+{
+    // True for UTF-16 surrogates and for anything after 10FFFF
+    return (codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF;
+}
+
+uint32_t UTF8_peek_next(const char* s_str, uint8_t* codepoint_nbytes)
+{
+    /* Get the next codepoint from a string, but instead of advancing the
+     * pointer, give the number of bytes the index will need to advance.
+     *
+     * Malformed, overlong or illegal sequences decode to U+FFFD.
+     * A NULL string gives codepoint 0 and an advance of 0 bytes.
+     * codepoint_nbytes may be NULL if the byte count isn't needed. */
+    uint8_t nbytes_unused;
+    if (codepoint_nbytes == NULL)
+    {
+        codepoint_nbytes = &nbytes_unused;
+    }
+
+    if (s_str == NULL)
+    {
+        // Always write the count, so callers never read an uninitialized one
+        *codepoint_nbytes = 0;
+        return 0;
+    }
+
+    // Pointer conversion to avoid all those brilliant signedness plot twists...
+    const unsigned char* str = (const unsigned char*) s_str;
+    uint32_t codepoint;
+    *codepoint_nbytes = 1;
+
+    if (STARTS_0(str[0]))
+    {
+        // 0xxx xxxx - ASCII
+        codepoint = str[0];
+    }
+    else if (STARTS_10(str[0]))
+    {
+        // 10xx xxxx - unexpected continuation byte
+        codepoint = 0xFFFD;
+    }
+    else if (STARTS_110(str[0]))
+    {
+        // 110x xxxx - 2-byte sequence
+        if (!STARTS_10(str[1]))
+        {
+            codepoint = 0xFFFD;
+        }
+        else
+        {
+            codepoint =
+                (TAKE(str[0], 5) << 6) |
+                (TAKE(str[1], 6));
+            *codepoint_nbytes = 2;
+        }
+    }
+    else if (STARTS_1110(str[0]))
+    {
+        // 1110 xxxx - 3-byte sequence
+        if (!STARTS_10(str[1]) || !STARTS_10(str[2]))
+        {
+            codepoint = 0xFFFD;
+        }
+        else
+        {
+            codepoint =
+                (TAKE(str[0], 4) << 12) |
+                (TAKE(str[1], 6) << 6) |
+                (TAKE(str[2], 6));
+            *codepoint_nbytes = 3;
+        }
+    }
+    else if (STARTS_11110(str[0]))
+    {
+        // 1111 0xxx - 4-byte sequence
+        if (!STARTS_10(str[1]) || !STARTS_10(str[2]) || !STARTS_10(str[3]))
+        {
+            codepoint = 0xFFFD;
+        }
+        else
+        {
+            codepoint =
+                (TAKE(str[0], 3) << 18) |
+                (TAKE(str[1], 6) << 12) |
+                (TAKE(str[2], 6) << 6) |
+                (TAKE(str[3], 6));
+            *codepoint_nbytes = 4;
+        }
+    }
+    else
+    {
+        // 1111 1xxx - invalid
+        codepoint = 0xFFFD;
+    }
+
+    // Overlong sequence?
+    if (
+        (codepoint <= 0x7F && *codepoint_nbytes > 1) ||
+        (codepoint > 0x7F && codepoint <= 0x7FF && *codepoint_nbytes > 2) ||
+        (codepoint > 0x7FF && codepoint <= 0xFFFF && *codepoint_nbytes > 3)
+    ) {
+        codepoint = 0xFFFD;
+    }
+
+    // UTF-16 surrogates are invalid, so are codepoints after 10FFFF
+    if (is_illegal(codepoint))
+    {
+        codepoint = 0xFFFD;
+    }
+
+    return codepoint;
+}
+
+uint32_t UTF8_next(const char** p_str)
+{
+    /* Get the next codepoint from a string, and advance the pointer.
+     * Example usage:
+     *
+     * const char* str = "asdf";
+     * uint32_t codepoint;
+     * while ((codepoint = UTF8_next(&str)))
+     * {
+     *     // you have a codepoint congrats
+     * }
+     *
+     * The pointer also steps over the terminating \0 when 0 is returned,
+     * so stop iterating at that point, like the loop above does. */
+    if (p_str == NULL || *p_str == NULL)
+    {
+        // Nothing to decode, and a NULL pointer must not be advanced
+        return 0;
+    }
+
+    uint8_t codepoint_nbytes = 0;
+    uint32_t codepoint = UTF8_peek_next(*p_str, &codepoint_nbytes);
+    *p_str += codepoint_nbytes;
+    return codepoint;
+}
+
+UTF8_encoding UTF8_encode(uint32_t codepoint)
+{
+    /* Encode a single codepoint as a \0-terminated UTF-8 sequence.
+     * Surrogates and codepoints after 10FFFF are encoded as U+FFFD,
+     * with the error flag set. */
+    UTF8_encoding enc = {0};
+
+    // Pretend the bytes array is unsigned...
+    unsigned char* bytes = (unsigned char*) enc.bytes;
+
+    if (is_illegal(codepoint))
+    {
+        codepoint = 0xFFFD;
+        enc.error = true;
+    }
+
+    if (codepoint <= 0x7F)
+    {
+        enc.nbytes = 1;
+        bytes[0] = codepoint;
+    }
+    else if (codepoint <= 0x7FF)
+    {
+        enc.nbytes = 2;
+        bytes[0] = 0xC0 | (codepoint >> 6);
+        bytes[1] = 0x80 | (codepoint & 0x3F);
+    }
+    else if (codepoint <= 0xFFFF)
+    {
+        enc.nbytes = 3;
+        bytes[0] = 0xE0 | (codepoint >> 12);
+        bytes[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+        bytes[2] = 0x80 | (codepoint & 0x3F);
+    }
+    else
+    {
+        enc.nbytes = 4;
+        bytes[0] = 0xF0 | (codepoint >> 18);
+        bytes[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+        bytes[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+        bytes[3] = 0x80 | (codepoint & 0x3F);
+    }
+
+    return enc;
+}
+
+size_t UTF8_total_codepoints(const char* str)
+{
+    /* Count the codepoints in a \0-terminated string (0 for NULL).
+     * Each U+FFFD decoded from invalid input counts as a codepoint too. */
+    size_t total = 0;
+    while (UTF8_next(&str))
+    {
+        total++;
+    }
+    return total;
+}
+
+size_t UTF8_backspace(const char* str, size_t len)
+{
+    /* Given a string of length len,
+     * give the new length after removing the last character.
+     * In other words, the index at which to write a \0 byte.
+     * An empty or NULL string has nothing to remove, so that gives 0. */
+
+    if (str == NULL || len == 0)
+    {
+        // Don't let len - 1 wrap around and read far out of bounds
+        return 0;
+    }
+
+    for (len -= 1; len > 0; len--)
+    {
+        if (!STARTS_10(str[len]))
+        {
+            break;
+        }
+    }
+
+    return len;
+}
diff --git a/desktop_version/src/UTF8.h b/desktop_version/src/UTF8.h
new file mode 100644
index 00000000..0f9b1cf1
--- /dev/null
+++ b/desktop_version/src/UTF8.h
@@ -0,0 +1,42 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Result of UTF8_encode(): a \0-terminated sequence of nbytes bytes.
+ * error is set if the codepoint was illegal and U+FFFD was encoded instead. */
+typedef struct
+{
+    char bytes[5];
+    uint8_t nbytes;
+    bool error;
+}
+UTF8_encoding;
+
+
+/* Decode the codepoint at s_str; its size in bytes goes in *codepoint_nbytes */
+uint32_t UTF8_peek_next(const char* s_str, uint8_t* codepoint_nbytes);
+
+/* Decode the codepoint at *p_str and advance *p_str past it */
+uint32_t UTF8_next(const char** p_str);
+/* Encode one codepoint as UTF-8 */
+UTF8_encoding UTF8_encode(uint32_t codepoint);
+
+/* Number of codepoints in a \0-terminated string */
+size_t UTF8_total_codepoints(const char* str);
+/* Length of str (currently len bytes) after removing its last character */
+size_t UTF8_backspace(const char* str, size_t len);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // UTF8_H