From 3ce4735d50e69fc72bb8f99b466169bbb4f180bc Mon Sep 17 00:00:00 2001
From: Dav999-v <dav999.tolp@gmail.com>
Date: Thu, 23 Feb 2023 03:41:36 +0100
Subject: [PATCH] Add UTF8.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a small library I wrote to handle UTF-8.

Usage is meant to be as simple as possible - see for example decoding
a UTF-8 string:

  const char* str = "asdf";
  uint32_t codepoint;
  while ((codepoint = UTF8_next(&str)))
  {
      // you have a codepoint congrats
  }

Or encoding a single codepoint to add it to a string:

  std::string result;
  result.append(UTF8_encode(0x1234).bytes);

There are some other functions (UTF8_total_codepoints() to get the
total number of codepoints in a string, UTF8_backspace() to get the
length of a string after backspacing one character, and
UTF8_peek_next() as a slightly less fancy version of UTF8_next()), but
more functions could always be added if we need them.

This will allow us to replace utfcpp (utf8::unchecked) and also fix
some less-than-ideal code:

- Some places have to resort to ignoring UTF-8 (next_wrap) or using
  UCS-4→UTF-8 functions (VFormat had to use PHYSFS ones, and one other
  place has four lines of code including a std::back_inserter just for
  one character)

- The iterator stuff is kinda confusing and verbose anyway
---
 desktop_version/CMakeLists.txt |   1 +
 desktop_version/src/UTF8.c     | 202 +++++++++++++++++++++++++++++++++
 desktop_version/src/UTF8.h     |  35 ++++++
 3 files changed, 238 insertions(+)
 create mode 100644 desktop_version/src/UTF8.c
 create mode 100644 desktop_version/src/UTF8.h

diff --git a/desktop_version/CMakeLists.txt b/desktop_version/CMakeLists.txt
index 7f181ef3..094fab4d 100644
--- a/desktop_version/CMakeLists.txt
+++ b/desktop_version/CMakeLists.txt
@@ -111,6 +111,7 @@ set(VVV_SRC
     src/Network.c
     src/Textbook.c
     src/ThirdPartyDeps.c
+    src/UTF8.c
     src/VFormat.c
     src/Vlogging.c
     src/Xoshiro.c
diff --git a/desktop_version/src/UTF8.c b/desktop_version/src/UTF8.c
new file mode 100644
index 00000000..528b9964
--- /dev/null
+++ b/desktop_version/src/UTF8.c
@@ -0,0 +1,202 @@
+#include "UTF8.h"
+
+#define STARTS_0(byte) (((byte) & 0x80) == 0x00) /* 0xxxxxxx: ASCII */
+#define STARTS_10(byte) (((byte) & 0xC0) == 0x80) /* 10xxxxxx: continuation */
+#define STARTS_110(byte) (((byte) & 0xE0) == 0xC0) /* 110xxxxx: 2-byte lead */
+#define STARTS_1110(byte) (((byte) & 0xF0) == 0xE0) /* 1110xxxx: 3-byte lead */
+#define STARTS_11110(byte) (((byte) & 0xF8) == 0xF0) /* 11110xxx: 4-byte lead */
+#define TAKE(byte, nbits) ((byte) & ((1 << (nbits)) - 1)) /* low nbits bits */
+
+static inline bool is_illegal(uint32_t codepoint) /* surrogate or > 10FFFF? */
+{
+    return codepoint > 0x10FFFF || (codepoint <= 0xDFFF && codepoint >= 0xD800);
+}
+
+uint32_t UTF8_peek_next(const char* s_str, uint8_t* codepoint_nbytes)
+{
+    /* Get the next codepoint from a string, but instead of advancing the
+     * pointer, give the number of bytes the index will need to advance.
+     * Malformed input decodes to U+FFFD. codepoint_nbytes must not be NULL;
+     * it is always written, and is 0 only when s_str is NULL (returns 0).
+     * At the terminating \0 this returns 0 and reports 1 byte. Continuation
+     * checks short-circuit, so bytes past a terminating \0 are never read. */
+    *codepoint_nbytes = 0;
+    if (s_str == NULL)
+    {
+        return 0;
+    }
+
+    // Pointer conversion to avoid all those brilliant signedness plot twists...
+    const unsigned char* str = (const unsigned char*) s_str;
+    uint32_t codepoint;
+    *codepoint_nbytes = 1;
+
+    if (STARTS_0(str[0]))
+    {
+        // 0xxx xxxx - ASCII
+        codepoint = str[0];
+    }
+    else if (STARTS_10(str[0]))
+    {
+        // 10xx xxxx - unexpected continuation byte
+        codepoint = 0xFFFD;
+    }
+    else if (STARTS_110(str[0]))
+    {
+        // 110x xxxx - 2-byte sequence
+        if (!STARTS_10(str[1]))
+        {
+            codepoint = 0xFFFD;
+        }
+        else
+        {
+            codepoint =
+                (TAKE(str[0], 5) << 6) |
+                (TAKE(str[1], 6));
+            *codepoint_nbytes = 2;
+        }
+    }
+    else if (STARTS_1110(str[0]))
+    {
+        // 1110 xxxx - 3-byte sequence
+        if (!STARTS_10(str[1]) || !STARTS_10(str[2]))
+        {
+            codepoint = 0xFFFD;
+        }
+        else
+        {
+            codepoint =
+                (TAKE(str[0], 4) << 12) |
+                (TAKE(str[1], 6) << 6) |
+                (TAKE(str[2], 6));
+            *codepoint_nbytes = 3;
+        }
+    }
+    else if (STARTS_11110(str[0]))
+    {
+        // 1111 0xxx - 4-byte sequence
+        if (!STARTS_10(str[1]) || !STARTS_10(str[2]) || !STARTS_10(str[3]))
+        {
+            codepoint = 0xFFFD;
+        }
+        else
+        {
+            codepoint =
+                (TAKE(str[0], 3) << 18) |
+                (TAKE(str[1], 6) << 12) |
+                (TAKE(str[2], 6) << 6) |
+                (TAKE(str[3], 6));
+            *codepoint_nbytes = 4;
+        }
+    }
+    else
+    {
+        // 1111 1xxx - invalid
+        codepoint = 0xFFFD;
+    }
+
+    // Overlong sequences, surrogates and codepoints after 10FFFF are invalid
+    if (
+        (codepoint <= 0x7F && *codepoint_nbytes > 1) ||
+        (codepoint > 0x7F && codepoint <= 0x7FF && *codepoint_nbytes > 2) ||
+        (codepoint > 0x7FF && codepoint <= 0xFFFF && *codepoint_nbytes > 3) ||
+        is_illegal(codepoint)
+    ) {
+        codepoint = 0xFFFD;
+    }
+
+    return codepoint;
+}
+
+uint32_t UTF8_next(const char** p_str)
+{
+    /* Get the next codepoint from a string, and advance the pointer.
+     * Returns 0 without advancing if p_str or *p_str is NULL. The terminating
+     * \0 is consumed as well, so stop calling once 0 is returned. Example:
+     *  const char* str = "asdf";
+     *  uint32_t codepoint;
+     *  while ((codepoint = UTF8_next(&str)))
+     *  {
+     *      // you have a codepoint congrats
+     *  }
+     */
+    if (p_str == NULL || *p_str == NULL)
+    {
+        return 0;
+    }
+
+    uint8_t codepoint_nbytes = 0;
+    uint32_t codepoint = UTF8_peek_next(*p_str, &codepoint_nbytes);
+    *p_str += codepoint_nbytes;
+    return codepoint;
+}
+
+UTF8_encoding UTF8_encode(uint32_t codepoint)
+{
+    /* Encode one codepoint as UTF-8. UTF-16 surrogates and codepoints after
+     * 10FFFF are replaced with U+FFFD, with the error flag set. */
+    UTF8_encoding result = {0};
+    unsigned char* out = (unsigned char*) &result.bytes; // unsigned view
+
+    if (is_illegal(codepoint))
+    {
+        result.error = true;
+        codepoint = 0xFFFD;
+    }
+
+    if (codepoint > 0xFFFF)
+    {
+        result.nbytes = 4;
+        out[3] = 0x80 | (codepoint & 0x3F);
+        out[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+        out[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+        out[0] = 0xF0 | (codepoint >> 18);
+    }
+    else if (codepoint > 0x7FF)
+    {
+        result.nbytes = 3;
+        out[2] = 0x80 | (codepoint & 0x3F);
+        out[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+        out[0] = 0xE0 | (codepoint >> 12);
+    }
+    else if (codepoint > 0x7F)
+    {
+        result.nbytes = 2;
+        out[1] = 0x80 | (codepoint & 0x3F);
+        out[0] = 0xC0 | (codepoint >> 6);
+    }
+    else
+    {
+        result.nbytes = 1;
+        out[0] = codepoint;
+    }
+
+    return result;
+}
+
+size_t UTF8_total_codepoints(const char* str)
+{
+    /* Count the codepoints in str, stopping at the first 0 codepoint */
+    size_t count = 0;
+    for (; UTF8_next(&str) != 0; count++)
+    {
+    }
+    return count;
+}
+
+size_t UTF8_backspace(const char* str, size_t len)
+{
+    /* Given a string of length len, give the new length after removing the
+     * last character (the index at which to write a \0 byte). Returns 0 for
+     * a NULL or empty string instead of wrapping len around below zero. */
+    if (str == NULL || len == 0)
+    {
+        return 0;
+    }
+
+    for (len -= 1; len > 0 && STARTS_10(str[len]); len--)
+    {
+        /* Skip continuation bytes until the character's lead byte */
+    }
+    return len;
+}
diff --git a/desktop_version/src/UTF8.h b/desktop_version/src/UTF8.h
new file mode 100644
index 00000000..0f9b1cf1
--- /dev/null
+++ b/desktop_version/src/UTF8.h
@@ -0,0 +1,35 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef struct /* Result of UTF8_encode() */
+{
+    char bytes[5]; /* encoded bytes; zero-filled past nbytes, so \0-terminated */
+    uint8_t nbytes; /* number of bytes in the encoding (1 to 4) */
+    bool error; /* true if the codepoint was illegal and U+FFFD was encoded */
+}
+UTF8_encoding;
+
+/* Decode the codepoint at s_str; its byte length goes to *codepoint_nbytes. */
+uint32_t UTF8_peek_next(const char* s_str, uint8_t* codepoint_nbytes);
+/* UTF8_next decodes like UTF8_peek_next, but advances *p_str itself. */
+uint32_t UTF8_next(const char** p_str);
+UTF8_encoding UTF8_encode(uint32_t codepoint); /* codepoint -> UTF-8 bytes */
+/* Codepoint count of str, and the length of str minus its last character. */
+size_t UTF8_total_codepoints(const char* str);
+size_t UTF8_backspace(const char* str, size_t len);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // UTF8_H