// VVVVVV/desktop_version/src/FontBidi.cpp
// 553 lines, 17 KiB, C++

#include "FontBidi.h"
#include <SDL.h>
#include "Alloc.h"
#include "UTF8.h"
extern "C"
{
#include <c-hashmap/map.h>
#include <SheenBidi.h>
}
namespace font
{
// One row of the Arabic reshaping table: a base letter and the codepoints of
// its contextual presentation forms. The reshaping code in bidi_transform
// treats a form of 0 as "this letter has no such form".
struct ArabicLetter
{
// Base codepoint as found in the input text; used as the hashmap key
uint32_t letter;
// Form used when the letter connects to neither neighbour
uint32_t isolated;
// Form used when the letter connects only to the following letter
uint32_t initial;
// Form used when the letter connects to both neighbours
uint32_t medial;
// Form used when the letter connects only to the preceding letter
uint32_t final;
};
// Arabic reshaping lookup table from https://github.com/TerryCavanagh/hx_arabic_shaper
// Each row is {letter, isolated, initial, medial, final}, matching the field
// order of ArabicLetter. A 0 in the initial/medial columns marks a letter that
// does not connect to the letter after it; a 0 in the medial/final columns
// marks a letter that does not connect to the letter before it.
// bidi_init stores pointers to these rows in arabic_letters_map, keyed by `letter`.
static ArabicLetter arabic_letters[] = {
// ARABIC LETTER HAMZA
{0x0621, 0xFE80, 0, 0, 0},
// ARABIC LETTER ALEF WITH MADDA ABOVE
{0x0622, 0xFE81, 0, 0, 0xFE82},
// ARABIC LETTER ALEF WITH HAMZA ABOVE
{0x0623, 0xFE83, 0, 0, 0xFE84},
// ARABIC LETTER WAW WITH HAMZA ABOVE
{0x0624, 0xFE85, 0, 0, 0xFE86},
// ARABIC LETTER ALEF WITH HAMZA BELOW
{0x0625, 0xFE87, 0, 0, 0xFE88},
// ARABIC LETTER YEH WITH HAMZA ABOVE
{0x0626, 0xFE89, 0xFE8B, 0xFE8C, 0xFE8A},
// ARABIC LETTER ALEF
{0x0627, 0xFE8D, 0, 0, 0xFE8E},
// ARABIC LETTER BEH
{0x0628, 0xFE8F, 0xFE91, 0xFE92, 0xFE90},
// ARABIC LETTER TEH MARBUTA
{0x0629, 0xFE93, 0, 0, 0xFE94},
// ARABIC LETTER TEH
{0x062A, 0xFE95, 0xFE97, 0xFE98, 0xFE96},
// ARABIC LETTER THEH
{0x062B, 0xFE99, 0xFE9B, 0xFE9C, 0xFE9A},
// ARABIC LETTER JEEM
{0x062C, 0xFE9D, 0xFE9F, 0xFEA0, 0xFE9E},
// ARABIC LETTER HAH
{0x062D, 0xFEA1, 0xFEA3, 0xFEA4, 0xFEA2},
// ARABIC LETTER KHAH
{0x062E, 0xFEA5, 0xFEA7, 0xFEA8, 0xFEA6},
// ARABIC LETTER DAL
{0x062F, 0xFEA9, 0, 0, 0xFEAA},
// ARABIC LETTER THAL
{0x0630, 0xFEAB, 0, 0, 0xFEAC},
// ARABIC LETTER REH
{0x0631, 0xFEAD, 0, 0, 0xFEAE},
// ARABIC LETTER ZAIN
{0x0632, 0xFEAF, 0, 0, 0xFEB0},
// ARABIC LETTER SEEN
{0x0633, 0xFEB1, 0xFEB3, 0xFEB4, 0xFEB2},
// ARABIC LETTER SHEEN
{0x0634, 0xFEB5, 0xFEB7, 0xFEB8, 0xFEB6},
// ARABIC LETTER SAD
{0x0635, 0xFEB9, 0xFEBB, 0xFEBC, 0xFEBA},
// ARABIC LETTER DAD
{0x0636, 0xFEBD, 0xFEBF, 0xFEC0, 0xFEBE},
// ARABIC LETTER TAH
{0x0637, 0xFEC1, 0xFEC3, 0xFEC4, 0xFEC2},
// ARABIC LETTER ZAH
{0x0638, 0xFEC5, 0xFEC7, 0xFEC8, 0xFEC6},
// ARABIC LETTER AIN
{0x0639, 0xFEC9, 0xFECB, 0xFECC, 0xFECA},
// ARABIC LETTER GHAIN
{0x063A, 0xFECD, 0xFECF, 0xFED0, 0xFECE},
// ARABIC TATWEEL
{0x0640, 0x0640, 0x0640, 0x0640, 0x0640},
// ARABIC LETTER FEH
{0x0641, 0xFED1, 0xFED3, 0xFED4, 0xFED2},
// ARABIC LETTER QAF
{0x0642, 0xFED5, 0xFED7, 0xFED8, 0xFED6},
// ARABIC LETTER KAF
{0x0643, 0xFED9, 0xFEDB, 0xFEDC, 0xFEDA},
// ARABIC LETTER LAM
{0x0644, 0xFEDD, 0xFEDF, 0xFEE0, 0xFEDE},
// ARABIC LETTER MEEM
{0x0645, 0xFEE1, 0xFEE3, 0xFEE4, 0xFEE2},
// ARABIC LETTER NOON
{0x0646, 0xFEE5, 0xFEE7, 0xFEE8, 0xFEE6},
// ARABIC LETTER HEH
{0x0647, 0xFEE9, 0xFEEB, 0xFEEC, 0xFEEA},
// ARABIC LETTER WAW
{0x0648, 0xFEED, 0, 0, 0xFEEE},
// ARABIC LETTER [UIGHUR KAZAKH KIRGHIZ]? ALEF MAKSURA
{0x0649, 0xFEEF, 0xFBE8, 0xFBE9, 0xFEF0},
// ARABIC LETTER YEH
{0x064A, 0xFEF1, 0xFEF3, 0xFEF4, 0xFEF2},
// ARABIC LETTER ALEF WASLA
{0x0671, 0xFB50, 0, 0, 0xFB51},
// ARABIC LETTER U WITH HAMZA ABOVE
{0x0677, 0xFBDD, 0, 0, 0},
// ARABIC LETTER TTEH
{0x0679, 0xFB66, 0xFB68, 0xFB69, 0xFB67},
// ARABIC LETTER TTEHEH
{0x067A, 0xFB5E, 0xFB60, 0xFB61, 0xFB5F},
// ARABIC LETTER BEEH
{0x067B, 0xFB52, 0xFB54, 0xFB55, 0xFB53},
// ARABIC LETTER PEH
{0x067E, 0xFB56, 0xFB58, 0xFB59, 0xFB57},
// ARABIC LETTER TEHEH
{0x067F, 0xFB62, 0xFB64, 0xFB65, 0xFB63},
// ARABIC LETTER BEHEH
{0x0680, 0xFB5A, 0xFB5C, 0xFB5D, 0xFB5B},
// ARABIC LETTER NYEH
{0x0683, 0xFB76, 0xFB78, 0xFB79, 0xFB77},
// ARABIC LETTER DYEH
{0x0684, 0xFB72, 0xFB74, 0xFB75, 0xFB73},
// ARABIC LETTER TCHEH
{0x0686, 0xFB7A, 0xFB7C, 0xFB7D, 0xFB7B},
// ARABIC LETTER TCHEHEH
{0x0687, 0xFB7E, 0xFB80, 0xFB81, 0xFB7F},
// ARABIC LETTER DDAL
{0x0688, 0xFB88, 0, 0, 0xFB89},
// ARABIC LETTER DAHAL
{0x068C, 0xFB84, 0, 0, 0xFB85},
// ARABIC LETTER DDAHAL
{0x068D, 0xFB82, 0, 0, 0xFB83},
// ARABIC LETTER DUL
{0x068E, 0xFB86, 0, 0, 0xFB87},
// ARABIC LETTER RREH
{0x0691, 0xFB8C, 0, 0, 0xFB8D},
// ARABIC LETTER JEH
{0x0698, 0xFB8A, 0, 0, 0xFB8B},
// ARABIC LETTER VEH
{0x06A4, 0xFB6A, 0xFB6C, 0xFB6D, 0xFB6B},
// ARABIC LETTER PEHEH
{0x06A6, 0xFB6E, 0xFB70, 0xFB71, 0xFB6F},
// ARABIC LETTER KEHEH
{0x06A9, 0xFB8E, 0xFB90, 0xFB91, 0xFB8F},
// ARABIC LETTER NG
{0x06AD, 0xFBD3, 0xFBD5, 0xFBD6, 0xFBD4},
// ARABIC LETTER GAF
{0x06AF, 0xFB92, 0xFB94, 0xFB95, 0xFB93},
// ARABIC LETTER NGOEH
{0x06B1, 0xFB9A, 0xFB9C, 0xFB9D, 0xFB9B},
// ARABIC LETTER GUEH
{0x06B3, 0xFB96, 0xFB98, 0xFB99, 0xFB97},
// ARABIC LETTER NOON GHUNNA
{0x06BA, 0xFB9E, 0, 0, 0xFB9F},
// ARABIC LETTER RNOON
{0x06BB, 0xFBA0, 0xFBA2, 0xFBA3, 0xFBA1},
// ARABIC LETTER HEH DOACHASHMEE
{0x06BE, 0xFBAA, 0xFBAC, 0xFBAD, 0xFBAB},
// ARABIC LETTER HEH WITH YEH ABOVE
{0x06C0, 0xFBA4, 0, 0, 0xFBA5},
// ARABIC LETTER HEH GOAL
{0x06C1, 0xFBA6, 0xFBA8, 0xFBA9, 0xFBA7},
// ARABIC LETTER KIRGHIZ OE
{0x06C5, 0xFBE0, 0, 0, 0xFBE1},
// ARABIC LETTER OE
{0x06C6, 0xFBD9, 0, 0, 0xFBDA},
// ARABIC LETTER U
{0x06C7, 0xFBD7, 0, 0, 0xFBD8},
// ARABIC LETTER YU
{0x06C8, 0xFBDB, 0, 0, 0xFBDC},
// ARABIC LETTER KIRGHIZ YU
{0x06C9, 0xFBE2, 0, 0, 0xFBE3},
// ARABIC LETTER VE
{0x06CB, 0xFBDE, 0, 0, 0xFBDF},
// ARABIC LETTER FARSI YEH
{0x06CC, 0xFBFC, 0xFBFE, 0xFBFF, 0xFBFD},
// ARABIC LETTER E
{0x06D0, 0xFBE4, 0xFBE6, 0xFBE7, 0xFBE5},
// ARABIC LETTER YEH BARREE
{0x06D2, 0xFBAE, 0, 0, 0xFBAF},
// ARABIC LETTER YEH BARREE WITH HAMZA ABOVE
{0x06D3, 0xFBB0, 0, 0, 0xFBB1},
// ZWJ
{0x200D, 0x200D, 0x200D, 0x200D, 0x200D},
};
// Our ligatures are all simple A+B -> C conversions
struct ArabicLigature
{
// The two adjacent codepoints (in logical order) that form the ligature.
// bidi_init uses these 8 bytes directly as the hashmap key.
uint32_t source[2];
// The single codepoint that replaces the pair
uint32_t target;
// Mandatory ligatures are applied in a first pass, before the optional ones
bool mandatory;
};
// Ligature table, each row being {{first, second}, target, mandatory}.
// The source codepoints are presentation forms, because bidi_transform looks
// ligatures up only after reshaping has replaced the base letters.
// The mandatory rows combine LAM (initial 0xFEDF / medial 0xFEE0) with the
// final forms of the ALEF variants.
// NOTE(review): the optional rows target 0xFBF0-0xFBF9; presumably the game's
// font provides custom ligature glyphs at those codepoints - confirm.
static ArabicLigature arabic_ligatures[] = {
{{0xFEDF, 0xFE8E}, 0xFEFB, true},
{{0xFEDF, 0xFE82}, 0xFEF5, true},
{{0xFEDF, 0xFE84}, 0xFEF7, true},
{{0xFEDF, 0xFE88}, 0xFEF9, true},
{{0xFEE0, 0xFE8E}, 0xFEFC, true},
{{0xFEE0, 0xFE82}, 0xFEF6, true},
{{0xFEE0, 0xFE84}, 0xFEF8, true},
{{0xFEE0, 0xFE88}, 0xFEFA, true},
{{0xFE8D, 0xFEDF}, 0xFBF0, false},
{{0xFE8E, 0xFEDF}, 0xFBF0, false},
{{0xFEDF, 0xFEDF}, 0xFBF1, false},
{{0xFEE0, 0xFEDF}, 0xFBF1, false},
{{0xFE8B, 0xFE8E}, 0xFBF2, false},
{{0xFE8C, 0xFE8E}, 0xFBF2, false},
{{0xFEE7, 0xFE8E}, 0xFBF3, false},
{{0xFEE8, 0xFE8E}, 0xFBF3, false},
{{0xFE91, 0xFE8E}, 0xFBF4, false},
{{0xFE92, 0xFE8E}, 0xFBF4, false},
{{0xFEF3, 0xFE8E}, 0xFBF5, false},
{{0xFEF4, 0xFE8E}, 0xFBF5, false},
{{0xFEAD, 0xFE8D}, 0xFBF6, false},
{{0xFEAE, 0xFE8D}, 0xFBF6, false},
{{0xFEAF, 0xFE8D}, 0xFBF7, false},
{{0xFEB0, 0xFE8D}, 0xFBF7, false},
{{0xFEAD, 0xFEDF}, 0xFBF8, false},
{{0xFEAE, 0xFEDF}, 0xFBF8, false},
{{0xFEAF, 0xFEDF}, 0xFBF9, false},
{{0xFEB0, 0xFEDF}, 0xFBF9, false},
{{0xFEDF, 0xFEE0}, 0xFBF1, false},
{{0xFEE0, 0xFEE0}, 0xFBF1, false},
};
// Maps a base letter codepoint (one uint32_t) to its row in arabic_letters.
// Created and filled by bidi_init, released by bidi_destroy.
static hashmap* arabic_letters_map;
// Maps a pair of codepoints (two consecutive uint32_t) to its row in
// arabic_ligatures. Created and filled by bidi_init, released by bidi_destroy.
static hashmap* arabic_ligatures_map;
void bidi_init(void)
{
arabic_letters_map = hashmap_create();
for (size_t i = 0; i < sizeof(arabic_letters)/sizeof(ArabicLetter); i++)
{
hashmap_set(
arabic_letters_map,
&arabic_letters[i].letter,
sizeof(uint32_t),
(uintptr_t) &arabic_letters[i]
);
}
arabic_ligatures_map = hashmap_create();
for (size_t i = 0; i < sizeof(arabic_ligatures)/sizeof(ArabicLigature); i++)
{
hashmap_set(
arabic_ligatures_map,
&arabic_ligatures[i].source,
sizeof(uint32_t) * 2,
(uintptr_t) &arabic_ligatures[i]
);
}
}
// Releases the lookup maps created by bidi_init by handing each one to hashmap_free.
// NOTE(review): VVV_freefunc comes from Alloc.h; presumably it skips NULL
// pointers and resets the variable to NULL afterwards - confirm.
void bidi_destroy(void)
{
VVV_freefunc(hashmap_free, arabic_ligatures_map);
VVV_freefunc(hashmap_free, arabic_letters_map);
}
/* Tells whether `codepoint` is one of the explicit bidi control characters:
 * the directional marks, the embedding/override controls, or the isolates. */
bool is_directional_character(const uint32_t codepoint)
{
    switch (codepoint)
    {
    // LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK
    case 0x200E:
    case 0x200F:
    // Some other directional formatting: LRE, RLE, PDF, LRO, RLO
    case 0x202A:
    case 0x202B:
    case 0x202C:
    case 0x202D:
    case 0x202E:
    // The more recent isolates: LRI, RLI, FSI, PDI
    case 0x2066:
    case 0x2067:
    case 0x2068:
    case 0x2069:
        return true;
    default:
        return false;
    }
}
/* Tells whether `codepoint` is ZERO WIDTH NON-JOINER (U+200C)
 * or ZERO WIDTH JOINER (U+200D). */
bool is_joiner(const uint32_t codepoint)
{
    const bool is_zwnj = (codepoint == 0x200C);
    const bool is_zwj = (codepoint == 0x200D);

    return is_zwnj || is_zwj;
}
/* Cheap pre-check deciding whether `text` needs to go through bidi_transform.
 *
 * Returns true right away when `rtl` is set (the language is an RTL one).
 * Otherwise scans `text` codepoint by codepoint and returns true as soon as
 * a right-to-left script character or a directional control is found;
 * returns false if the end of the string is reached without finding one. */
bool bidi_should_transform(const bool rtl, const char* text)
{
    if (rtl)
    {
        return true;
    }

    const char* cursor = text;
    for (uint32_t cp = UTF8_next(&cursor); cp != 0; cp = UTF8_next(&cursor))
    {
        // The standard Hebrew and Arabic blocks
        const bool hebrew_or_arabic = cp >= 0x590 && cp <= 0x77F;
        // Extended Arabic B and A
        const bool arabic_extended = cp >= 0x870 && cp <= 0x8FF;
        // Hebrew presentation forms
        const bool hebrew_presentation = cp >= 0xFB1D && cp <= 0xFB4F;
        // Arabic presentation forms A
        const bool arabic_presentation_a = cp >= 0xFB50 && cp <= 0xFDFF;
        // Arabic presentation forms B
        const bool arabic_presentation_b = cp >= 0xFE70 && cp <= 0xFEFE;

        if (hebrew_or_arabic
        || arabic_extended
        || is_directional_character(cp) // Any directional control character
        || hebrew_presentation
        || arabic_presentation_a
        || arabic_presentation_b)
        {
            return true;
        }
    }

    return false;
}
const char* bidi_transform(const bool rtl, const char* text)
{
uint32_t utf32_in[1024];
int n_codepoints = 0;
const char* text_ptr = text;
uint32_t codepoint;
while ((codepoint = UTF8_next(&text_ptr)))
{
if (codepoint == '\r' || codepoint == '\n')
{
// Don't treat newlines in font::print differently in bidi
codepoint = ' ';
}
utf32_in[n_codepoints++] = codepoint;
if (n_codepoints >= 1023)
{
break;
}
}
utf32_in[n_codepoints] = 0;
if (n_codepoints == 0)
{
return text;
}
static char utf8_out[1024];
size_t utf8_out_cur = 0;
SBCodepointSequence codepoint_sequence = {SBStringEncodingUTF32, (void*) utf32_in, (SBUInteger) n_codepoints};
SBAlgorithmRef algorithm = SBAlgorithmCreate(&codepoint_sequence);
if (algorithm == NULL)
{
return text;
}
SBParagraphRef paragraph = SBAlgorithmCreateParagraph(
algorithm,
0,
INT32_MAX,
rtl ? SBLevelDefaultRTL : SBLevelDefaultLTR
);
SDL_assert(paragraph != NULL);
SBUInteger paragraph_len = SBParagraphGetLength(paragraph);
SBLineRef paragraph_line = SBParagraphCreateLine(paragraph, 0, paragraph_len);
SDL_assert(paragraph_line != NULL);
// Make sure )brackets( are mirrored correctly...
SBMirrorLocatorRef mirror_locator = SBMirrorLocatorCreate();
if (mirror_locator != NULL)
{
SBMirrorLocatorLoadLine(mirror_locator, paragraph_line, (void*) utf32_in);
const SBMirrorAgent *mirror_agent = SBMirrorLocatorGetAgent(mirror_locator);
while (SBMirrorLocatorMoveNext(mirror_locator))
{
utf32_in[mirror_agent->index] = mirror_agent->mirror;
}
VVV_freefunc(SBMirrorLocatorRelease, mirror_locator);
}
SBUInteger n_runs = SBLineGetRunCount(paragraph_line);
const SBRun *runs = SBLineGetRunsPtr(paragraph_line);
for (SBUInteger i = 0; i < n_runs; i++)
{
bool is_ltr = runs[i].level % 2 == 0;
if (!is_ltr)
{
// Time for reshaping!
enum arabic_form { NONE, ISOLATED, INITIAL, MEDIAL, FINAL };
arabic_form forms[1024];
uint32_t replacements[1024];
const ArabicLetter* letter;
const ArabicLetter* previous_letter = NULL;
arabic_form previous_form = NONE;
for (size_t c = 0; c < runs[i].length; c++)
{
uintptr_t letter_ptr;
bool found = hashmap_get(arabic_letters_map, &utf32_in[runs[i].offset + c], sizeof(uint32_t), &letter_ptr);
if (!found)
{
forms[c] = NONE;
replacements[c] = 0;
previous_form = NONE;
previous_letter = NULL;
continue;
}
letter = (const ArabicLetter*) letter_ptr;
if (previous_form == NONE)
{
// Maybe the first letter, or the one after an unknown one
forms[c] = ISOLATED;
replacements[c] = letter->isolated;
}
else if (letter->final == 0 && letter->medial == 0)
{
// letter doesn't connect with the one before
forms[c] = ISOLATED;
replacements[c] = letter->isolated;
}
else if (previous_letter->initial == 0 && previous_letter->medial == 0)
{
// previous_letter doesn't connect with the one after
forms[c] = ISOLATED;
replacements[c] = letter->isolated;
}
else if (previous_form == FINAL && previous_letter->medial == 0)
{
// previous_letter doesn't connect with the ones before and after
forms[c] = ISOLATED;
replacements[c] = letter->isolated;
}
else if (previous_form == ISOLATED)
{
forms[c-1] = INITIAL;
forms[c] = FINAL;
replacements[c-1] = previous_letter->initial;
replacements[c] = letter->final;
}
else
{
/* Otherwise, we will change the previous letter
* to connect to the current letter */
forms[c-1] = MEDIAL;
forms[c] = FINAL;
replacements[c-1] = previous_letter->medial;
replacements[c] = letter->final;
}
previous_form = forms[c];
previous_letter = (const ArabicLetter*) letter;
}
// Now that we have all the forms, time to change the codepoints!
for (size_t c = 0; c < runs[i].length; c++)
{
if (replacements[c] != 0)
{
utf32_in[runs[i].offset + c] = replacements[c];
}
}
/* Ligature time! We have to do these after the reshaping process, that is, now!
* Again, all our ligatures are just A+B -> C, so we can just do a single pass,
* up until the second-to-last character (because the last character can't form
* a ligature with the character after).
* Actually, did I say single pass... The mandatory ligatures must be prioritized
* over the optional ones... */
for (char pass = 0; pass < 2; pass++)
{
for (size_t c = 0; c < runs[i].length - 1; c++)
{
if (pass == 1 && utf32_in[runs[i].offset + c + 1] == 0xFFFFFFFF)
{
c++;
continue;
}
uintptr_t ligature_ptr;
bool found = hashmap_get(arabic_ligatures_map, &utf32_in[runs[i].offset + c], sizeof(uint32_t)*2, &ligature_ptr);
if (!found)
{
continue;
}
const ArabicLigature* ligature = (const ArabicLigature*) ligature_ptr;
if (pass == 0 && !ligature->mandatory)
{
continue;
}
/* We have a match, that means [c]+[c+1] needs to be replaced!
* We'll use 0xFFFFFFFF as a special tombstone character,
* otherwise we'd have to keep shifting the array contents... */
utf32_in[runs[i].offset + c] = ligature->target;
utf32_in[runs[i].offset + c + 1] = 0xFFFFFFFF;
// Don't bother comparing the tombstone with the next letter
c++;
}
}
}
for (size_t c = 0; c < runs[i].length; c++)
{
size_t ix;
if (is_ltr)
{
ix = runs[i].offset + c;
}
else
{
ix = runs[i].offset + runs[i].length - 1 - c;
}
if (utf32_in[ix] == 0xFFFFFFFF)
{
continue;
}
int out_room_left = sizeof(utf8_out) - 1 - utf8_out_cur;
if (out_room_left <= 0)
{
goto no_more_runs;
}
UTF8_encoding enc = UTF8_encode(utf32_in[ix]);
size_t n_copy = SDL_min(enc.nbytes, (size_t) out_room_left);
SDL_memcpy(
&utf8_out[utf8_out_cur],
enc.bytes,
n_copy
);
utf8_out_cur += n_copy;
}
}
no_more_runs:
utf8_out[utf8_out_cur] = '\0';
VVV_freefunc(SBLineRelease, paragraph_line);
VVV_freefunc(SBParagraphRelease, paragraph);
VVV_freefunc(SBAlgorithmRelease, algorithm);
return utf8_out;
}
} // namespace font