LaTeX reader: simplified accent code using unicode-transforms.

New dependency on unicode-transforms package for normalization.
This commit is contained in:
John MacFarlane 2018-09-30 10:25:17 -07:00
parent 966bd94ba2
commit aebe5fe99e
2 changed files with 34 additions and 322 deletions

View file

@ -388,6 +388,7 @@ library
http-client-tls >= 0.2.4 && < 0.4,
http-types >= 0.8 && < 0.13,
case-insensitive >= 1.2 && < 1.3,
unicode-transforms >= 0.3 && < 0.4,
HsYAML >= 0.1.1.1 && < 0.2
if impl(ghc < 8.0)
build-depends: semigroups == 0.18.*,

View file

@ -76,6 +76,7 @@ import Text.Pandoc.Shared
import qualified Text.Pandoc.Translations as Translations
import Text.Pandoc.Walk
import qualified Text.Pandoc.Builder as B
import qualified Data.Text.Normalize as Normalize
-- for debugging:
-- import Text.Pandoc.Extensions (getDefaultExtensions)
@ -589,308 +590,18 @@ keyval = try $ do
keyvals :: PandocMonad m => LP m [(String, String)]
keyvals = try $ symbol '[' >> manyTill keyval (symbol ']')
accent :: PandocMonad m => Char -> (Char -> String) -> LP m Inlines
accent c f = try $ do
accent :: PandocMonad m => Char -> Maybe Char -> LP m Inlines
accent combiningAccent fallBack = try $ do
ils <- tok
case toList ils of
(Str (x:xs) : ys) -> return $ fromList $
case f x of
[z] | z == x -> Str ([z,c] ++ xs) : ys -- combining accent
zs -> Str (zs ++ xs) : ys
[Space] -> return $ str [c]
[] -> return $ str [c]
-- try to normalize to the combined character:
Str (T.unpack
(Normalize.normalize Normalize.NFC
(T.pack [x, combiningAccent])) ++ xs) : ys
[Space] -> return $ str [fromMaybe combiningAccent fallBack]
[] -> return $ str [fromMaybe combiningAccent fallBack]
_ -> return ils
grave :: Char -> String
grave 'A' = "À"
grave 'E' = "È"
grave 'I' = "Ì"
grave 'O' = "Ò"
grave 'U' = "Ù"
grave 'a' = "à"
grave 'e' = "è"
grave 'i' = "ì"
grave 'o' = "ò"
grave 'u' = "ù"
grave c = [c]
acute :: Char -> String
acute 'A' = "Á"
acute 'E' = "É"
acute 'I' = "Í"
acute 'O' = "Ó"
acute 'U' = "Ú"
acute 'Y' = "Ý"
acute 'a' = "á"
acute 'e' = "é"
acute 'i' = "í"
acute 'o' = "ó"
acute 'u' = "ú"
acute 'y' = "ý"
acute 'C' = "Ć"
acute 'c' = "ć"
acute 'L' = "Ĺ"
acute 'l' = "ĺ"
acute 'N' = "Ń"
acute 'n' = "ń"
acute 'R' = "Ŕ"
acute 'r' = "ŕ"
acute 'S' = "Ś"
acute 's' = "ś"
acute 'Z' = "Ź"
acute 'z' = "ź"
acute c = [c]
circ :: Char -> String
circ 'A' = "Â"
circ 'E' = "Ê"
circ 'I' = "Î"
circ 'O' = "Ô"
circ 'U' = "Û"
circ 'a' = "â"
circ 'e' = "ê"
circ 'i' = "î"
circ 'o' = "ô"
circ 'u' = "û"
circ 'C' = "Ĉ"
circ 'c' = "ĉ"
circ 'G' = "Ĝ"
circ 'g' = "ĝ"
circ 'H' = "Ĥ"
circ 'h' = "ĥ"
circ 'J' = "Ĵ"
circ 'j' = "ĵ"
circ 'S' = "Ŝ"
circ 's' = "ŝ"
circ 'W' = "Ŵ"
circ 'w' = "ŵ"
circ 'Y' = "Ŷ"
circ 'y' = "ŷ"
circ c = [c]
tilde :: Char -> String
tilde 'A' = "Ã"
tilde 'a' = "ã"
tilde 'O' = "Õ"
tilde 'o' = "õ"
tilde 'I' = "Ĩ"
tilde 'i' = "ĩ"
tilde 'U' = "Ũ"
tilde 'u' = "ũ"
tilde 'N' = "Ñ"
tilde 'n' = "ñ"
tilde c = [c]
umlaut :: Char -> String
umlaut 'A' = "Ä"
umlaut 'E' = "Ë"
umlaut 'I' = "Ï"
umlaut 'O' = "Ö"
umlaut 'U' = "Ü"
umlaut 'a' = "ä"
umlaut 'e' = "ë"
umlaut 'i' = "ï"
umlaut 'o' = "ö"
umlaut 'u' = "ü"
umlaut c = [c]
hungarumlaut :: Char -> String
hungarumlaut 'A' = ""
hungarumlaut 'E' = ""
hungarumlaut 'I' = ""
hungarumlaut 'O' = "Ő"
hungarumlaut 'U' = "Ű"
hungarumlaut 'Y' = "ӳ"
hungarumlaut 'a' = ""
hungarumlaut 'e' = ""
hungarumlaut 'i' = ""
hungarumlaut 'o' = "ő"
hungarumlaut 'u' = "ű"
hungarumlaut 'y' = "ӳ"
hungarumlaut c = [c]
dot :: Char -> String
dot 'C' = "Ċ"
dot 'c' = "ċ"
dot 'E' = "Ė"
dot 'e' = "ė"
dot 'G' = "Ġ"
dot 'g' = "ġ"
dot 'I' = "İ"
dot 'Z' = "Ż"
dot 'z' = "ż"
dot c = [c]
macron :: Char -> String
macron 'A' = "Ā"
macron 'E' = "Ē"
macron 'I' = "Ī"
macron 'O' = "Ō"
macron 'U' = "Ū"
macron 'a' = "ā"
macron 'e' = "ē"
macron 'i' = "ī"
macron 'o' = "ō"
macron 'u' = "ū"
macron c = [c]
ringabove :: Char -> String
ringabove 'A' = "Å"
ringabove 'a' = "å"
ringabove 'U' = "Ů"
ringabove 'u' = "ů"
ringabove c = [c]
dotbelow :: Char -> String
dotbelow 'B' = ""
dotbelow 'b' = ""
dotbelow 'D' = ""
dotbelow 'd' = ""
dotbelow 'H' = ""
dotbelow 'h' = ""
dotbelow 'K' = ""
dotbelow 'k' = ""
dotbelow 'L' = ""
dotbelow 'l' = ""
dotbelow 'M' = ""
dotbelow 'm' = ""
dotbelow 'N' = ""
dotbelow 'n' = ""
dotbelow 'R' = ""
dotbelow 'r' = ""
dotbelow 'S' = ""
dotbelow 's' = ""
dotbelow 'T' = ""
dotbelow 't' = ""
dotbelow 'V' = ""
dotbelow 'v' = "ṿ"
dotbelow 'W' = ""
dotbelow 'w' = ""
dotbelow 'Z' = ""
dotbelow 'z' = ""
dotbelow 'A' = ""
dotbelow 'a' = ""
dotbelow 'E' = ""
dotbelow 'e' = ""
dotbelow 'I' = ""
dotbelow 'i' = ""
dotbelow 'O' = ""
dotbelow 'o' = ""
dotbelow 'U' = ""
dotbelow 'u' = ""
dotbelow 'Y' = ""
dotbelow 'y' = ""
dotbelow c = [c]
doublegrave :: Char -> String
doublegrave 'A' = "Ȁ"
doublegrave 'a' = "ȁ"
doublegrave 'E' = "Ȅ"
doublegrave 'e' = "ȅ"
doublegrave 'I' = "Ȉ"
doublegrave 'i' = "ȉ"
doublegrave 'O' = "Ȍ"
doublegrave 'o' = "ȍ"
doublegrave 'R' = "Ȑ"
doublegrave 'r' = "ȑ"
doublegrave 'U' = "Ȕ"
doublegrave 'u' = "ȕ"
doublegrave c = [c]
hookabove :: Char -> String
hookabove 'A' = ""
hookabove 'a' = ""
hookabove 'E' = ""
hookabove 'e' = ""
hookabove 'I' = ""
hookabove 'i' = ""
hookabove 'O' = ""
hookabove 'o' = ""
hookabove 'U' = ""
hookabove 'u' = ""
hookabove 'Y' = ""
hookabove 'y' = ""
hookabove c = [c]
cedilla :: Char -> String
cedilla 'c' = "ç"
cedilla 'C' = "Ç"
cedilla 's' = "ş"
cedilla 'S' = "Ş"
cedilla 't' = "ţ"
cedilla 'T' = "Ţ"
cedilla 'e' = "ȩ"
cedilla 'E' = "Ȩ"
cedilla 'h' = ""
cedilla 'H' = ""
cedilla 'o' = ""
cedilla 'O' = ""
cedilla c = [c]
hacek :: Char -> String
hacek 'A' = "Ǎ"
hacek 'a' = "ǎ"
hacek 'C' = "Č"
hacek 'c' = "č"
hacek 'D' = "Ď"
hacek 'd' = "ď"
hacek 'E' = "Ě"
hacek 'e' = "ě"
hacek 'G' = "Ǧ"
hacek 'g' = "ǧ"
hacek 'H' = "Ȟ"
hacek 'h' = "ȟ"
hacek 'I' = "Ǐ"
hacek 'i' = "ǐ"
hacek 'j' = "ǰ"
hacek 'K' = "Ǩ"
hacek 'k' = "ǩ"
hacek 'L' = "Ľ"
hacek 'l' = "ľ"
hacek 'N' = "Ň"
hacek 'n' = "ň"
hacek 'O' = "Ǒ"
hacek 'o' = "ǒ"
hacek 'R' = "Ř"
hacek 'r' = "ř"
hacek 'S' = "Š"
hacek 's' = "š"
hacek 'T' = "Ť"
hacek 't' = "ť"
hacek 'U' = "Ǔ"
hacek 'u' = "ǔ"
hacek 'Z' = "Ž"
hacek 'z' = "ž"
hacek c = [c]
ogonek :: Char -> String
ogonek 'a' = "ą"
ogonek 'e' = "ę"
ogonek 'o' = "ǫ"
ogonek 'i' = "į"
ogonek 'u' = "ų"
ogonek 'A' = "Ą"
ogonek 'E' = "Ę"
ogonek 'I' = "Į"
ogonek 'O' = "Ǫ"
ogonek 'U' = "Ų"
ogonek c = [c]
breve :: Char -> String
breve 'A' = "Ă"
breve 'a' = "ă"
breve 'E' = "Ĕ"
breve 'e' = "ĕ"
breve 'G' = "Ğ"
breve 'g' = "ğ"
breve 'I' = "Ĭ"
breve 'i' = "ĭ"
breve 'O' = "Ŏ"
breve 'o' = "ŏ"
breve 'U' = "Ŭ"
breve 'u' = "ŭ"
breve c = [c]
mathDisplay :: String -> Inlines
mathDisplay = displayMath . trim
@ -1228,32 +939,32 @@ inlineCommands = M.union inlineLanguageCommands $ M.fromList
, ("copyright", lit "©")
, ("textasciicircum", lit "^")
, ("textasciitilde", lit "~")
, ("H", accent '\779' hungarumlaut)
, ("`", accent '`' grave)
, ("'", accent '\'' acute)
, ("^", accent '^' circ)
, ("~", accent '~' tilde)
, ("\"", accent '\776' umlaut)
, (".", accent '\775' dot)
, ("=", accent '\772' macron)
, ("|", accent '\781' (:[])) -- vertical line above
, ("b", accent '\817' (:[])) -- macron below
, ("c", accent '\807' cedilla)
, ("G", accent '\783' doublegrave)
, ("h", accent '\777' hookabove)
, ("d", accent '\803' dotbelow)
, ("f", accent '\785' (:[])) -- inverted breve
, ("r", accent '\778' ringabove)
, ("t", accent '\865' (:[])) -- double inverted breve
, ("U", accent '\782' (:[])) -- double vertical line above
, ("v", accent 'ˇ' hacek)
, ("u", accent '\774' breve)
, ("k", accent '\808' ogonek)
, ("textogonekcentered", accent '\808' ogonek)
, ("H", accent '\779' Nothing) -- hungarumlaut
, ("`", accent '\768' (Just '`')) -- grave
, ("'", accent '\769' (Just '\'')) -- acute
, ("^", accent '\770' (Just '^')) -- circ
, ("~", accent '\771' (Just '~')) -- tilde
, ("\"", accent '\776' Nothing) -- umlaut
, (".", accent '\775' Nothing) -- dot
, ("=", accent '\772' Nothing) -- macron
, ("|", accent '\781' Nothing) -- vertical line above
, ("b", accent '\817' Nothing) -- macron below
, ("c", accent '\807' Nothing) -- cedilla
, ("G", accent '\783' Nothing) -- doublegrave
, ("h", accent '\777' Nothing) -- hookabove
, ("d", accent '\803' Nothing) -- dotbelow
, ("f", accent '\785' Nothing) -- inverted breve
, ("r", accent '\778' Nothing) -- ringabove
, ("t", accent '\865' Nothing) -- double inverted breve
, ("U", accent '\782' Nothing) -- double vertical line above
, ("v", accent '\780' Nothing) -- hacek
, ("u", accent '\774' Nothing) -- breve
, ("k", accent '\808' Nothing) -- ogonek
, ("textogonekcentered", accent '\808' Nothing) -- ogonek
, ("i", lit "ı") -- dotless i
, ("j", lit "ȷ") -- dotless j
, ("newtie", accent '\785' (:[])) -- inverted breve
, ("textcircled", accent '\8413' (:[])) -- combining circle
, ("newtie", accent '\785' Nothing) -- inverted breve
, ("textcircled", accent '\8413' Nothing) -- combining circle
, ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState
guard $ not inTableCell
optional opt