From b2fe009d8fee618cbcd837976b6f2dea7c0a9837 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Mon, 26 Jun 2017 15:04:22 +0200
Subject: [PATCH] LaTeX writer: use BCP47 parser.

---
 src/Text/Pandoc/Writers/LaTeX.hs | 194 +++++++++++++++++--------------
 1 file changed, 105 insertions(+), 89 deletions(-)

diff --git a/src/Text/Pandoc/Writers/LaTeX.hs b/src/Text/Pandoc/Writers/LaTeX.hs
index 53a67a27a..5d505ed15 100644
--- a/src/Text/Pandoc/Writers/LaTeX.hs
+++ b/src/Text/Pandoc/Writers/LaTeX.hs
@@ -39,12 +39,13 @@ import Control.Monad.State.Strict
 import Data.Aeson (FromJSON, object, (.=))
 import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isPunctuation, ord,
                   toLower)
-import Data.List (foldl', intercalate, intersperse, isInfixOf, nub, nubBy,
+import Data.List (foldl', intercalate, intersperse, isInfixOf, nubBy,
                   stripPrefix, (\\))
 import Data.Maybe (catMaybes, fromMaybe, isJust)
 import Data.Text (Text)
 import qualified Data.Text as T
 import Network.URI (unEscapeString)
+import Text.Pandoc.BCP47 (Lang(..), toLang, getLang, renderLang)
 import Text.Pandoc.Class (PandocMonad, report)
 import Text.Pandoc.Definition
 import Text.Pandoc.Highlighting (formatLaTeXBlock, formatLaTeXInline, highlight,
@@ -188,7 +189,8 @@ pandocToLaTeX options (Pandoc meta blocks) = do
   st <- get
   titleMeta <- stringToLaTeX TextString $ stringify $ docTitle meta
   authorsMeta <- mapM (stringToLaTeX TextString . stringify) $ docAuthors meta
-  let docLangs = nub $ query (extract "lang") blocks
+  docLangs <- catMaybes <$>
+      mapM (toLang . Just) (ordNub (query (extract "lang") blocks))
   let hasStringValue x = isJust (getField x metadata :: Maybe String)
   let geometryFromMargins = intercalate [','] $ catMaybes $
                               map (\(x,y) ->
@@ -198,6 +200,18 @@ pandocToLaTeX options (Pandoc meta blocks) = do
                               ,("tmargin","margin-top")
                               ,("bmargin","margin-bottom")
                               ]
+  let toPolyObj lang = object [ "name"    .= T.pack name
+                              , "options" .= T.pack opts ]
+        where
+          (name, opts) = toPolyglossia lang
+  mblang <- toLang $ case getLang options meta of
+                          Just l -> Just l
+                          Nothing | null docLangs -> Nothing
+                                  | otherwise     -> Just "en"
+  -- we need a default here since lang is used in template conditionals
+
+  let dirs = query (extract "dir") blocks
+
   let context  =  defField "toc" (writerTableOfContents options) $
                   defField "toc-depth" (show (writerTOCDepth options -
                                               if stBook st
@@ -235,26 +249,20 @@ pandocToLaTeX options (Pandoc meta blocks) = do
                          Biblatex -> defField "biblio-title" biblioTitle .
                                      defField "biblatex" True
                          _        -> id) $
-                  -- set lang to something so polyglossia/babel is included
-                  defField "lang" (if null docLangs then ""::String else "en") $
-                  defField "otherlangs" docLangs $
                   defField "colorlinks" (any hasStringValue
                            ["citecolor", "urlcolor", "linkcolor", "toccolor"]) $
-                  defField "dir" (if (null $ query (extract "dir") blocks)
-                                     then ""::String
-                                     else "ltr") $
+                  (if null dirs
+                     then id
+                     else defField "dir" ("ltr" :: String)) $
                   defField "section-titles" True $
                   defField "geometry" geometryFromMargins $
                   metadata
-  let toPolyObj lang = object [ "name"    .= T.pack name
-                              , "options" .= T.pack opts ]
-        where
-          (name, opts) = toPolyglossia lang
-  let lang = maybe [] (splitBy (=='-')) $ getField "lang" context
-      otherlangs = maybe [] (map $ splitBy (=='-')) $ getField "otherlangs" context
   let context' =
-          defField "babel-lang" (toBabel lang)
-        $ defField "babel-otherlangs" (map toBabel otherlangs)
+          -- note: lang is used in some conditionals in the template,
+          -- so we need to set it if we have any babel/polyglossia:
+          maybe id (defField "lang" . renderLang) mblang
+        $ maybe id (defField "babel-lang" . toBabel) mblang
+        $ defField "babel-otherlangs" (map toBabel docLangs)
         $ defField "babel-newcommands" (concatMap (\(poly, babel) ->
             -- \textspanish and \textgalician are already used by babel
             -- save them as \oritext... and let babel use that
@@ -274,16 +282,12 @@ pandocToLaTeX options (Pandoc meta blocks) = do
             -- eliminate duplicates that have same polyglossia name
             $ nubBy (\a b -> fst a == fst b)
             -- find polyglossia and babel names of languages used in the document
-            $ map (\l ->
-              let lng = splitBy (=='-') l
-              in  (fst $ toPolyglossia lng, toBabel lng)
-              )
-            docLangs )
-        $ defField "polyglossia-lang" (toPolyObj lang)
-        $ defField "polyglossia-otherlangs" (map toPolyObj otherlangs)
-        $ defField "latex-dir-rtl" (case (getField "dir" context)::Maybe String of
-                                      Just "rtl" -> True
-                                      _          -> False)
+            $ map (\l -> (fst $ toPolyglossia l, toBabel l)) docLangs
+          )
+        $ maybe id (defField "polyglossia-lang" . toPolyObj) mblang
+        $ defField "polyglossia-otherlangs" (map toPolyObj docLangs)
+        $ defField "latex-dir-rtl"
+           (getField "dir" context == Just ("rtl" :: String))
         $ context
   case writerTemplate options of
        Nothing  -> return main
@@ -443,11 +447,12 @@ blockToLaTeX (Div (identifier,classes,kvs) bs) = do
               -> "\\leavevmode" <> linkAnchor' <> "%"
             _ -> linkAnchor'
   let align dir txt = inCmd "begin" dir $$ txt $$ inCmd "end" dir
+  lang <- toLang $ lookup "lang" kvs
   let wrapDir = case lookup "dir" kvs of
                   Just "rtl" -> align "RTL"
                   Just "ltr" -> align "LTR"
                   _          -> id
-      wrapLang txt = case lookup "lang" kvs of
+      wrapLang txt = case lang of
                        Just lng -> let (l, o) = toPolyglossiaEnv lng
                                        ops = if null o
                                                 then ""
@@ -918,13 +923,14 @@ inlineToLaTeX :: PandocMonad m
               -> LW m Doc
 inlineToLaTeX (Span (id',classes,kvs) ils) = do
   linkAnchor <- hypertarget False id' empty
+  lang <- toLang $ lookup "lang" kvs
   let cmds = ["textup" | "csl-no-emph" `elem` classes] ++
              ["textnormal" | "csl-no-strong" `elem` classes ||
                              "csl-no-smallcaps" `elem` classes] ++
              ["RL" | ("dir", "rtl") `elem` kvs] ++
              ["LR" | ("dir", "ltr") `elem` kvs] ++
-             (case lookup "lang" kvs of
-                Just lng -> let (l, o) = toPolyglossia $ splitBy (=='-') lng
+             (case lang of
+                Just lng -> let (l, o) = toPolyglossia lng
                                 ops = if null o then "" else ("[" ++ o ++ "]")
                             in  ["text" ++ l ++ ops]
                 Nothing  -> [])
@@ -1254,85 +1260,95 @@ lookKey :: String -> Attr -> [String]
 lookKey key (_,_,kvs) =  maybe [] words $ lookup key kvs
 
 -- In environments \Arabic instead of \arabic is used
-toPolyglossiaEnv :: String -> (String, String)
+toPolyglossiaEnv :: Lang -> (String, String)
 toPolyglossiaEnv l =
-  case toPolyglossia $ (splitBy (=='-')) l of
+  case toPolyglossia l of
     ("arabic", o) -> ("Arabic", o)
     x             -> x
 
 -- Takes a list of the constituents of a BCP 47 language code and
 -- converts it to a Polyglossia (language, options) tuple
 -- http://mirrors.ctan.org/macros/latex/contrib/polyglossia/polyglossia.pdf
-toPolyglossia :: [String] -> (String, String)
-toPolyglossia ("ar":"DZ":_)        = ("arabic", "locale=algeria")
-toPolyglossia ("ar":"IQ":_)        = ("arabic", "locale=mashriq")
-toPolyglossia ("ar":"JO":_)        = ("arabic", "locale=mashriq")
-toPolyglossia ("ar":"LB":_)        = ("arabic", "locale=mashriq")
-toPolyglossia ("ar":"LY":_)        = ("arabic", "locale=libya")
-toPolyglossia ("ar":"MA":_)        = ("arabic", "locale=morocco")
-toPolyglossia ("ar":"MR":_)        = ("arabic", "locale=mauritania")
-toPolyglossia ("ar":"PS":_)        = ("arabic", "locale=mashriq")
-toPolyglossia ("ar":"SY":_)        = ("arabic", "locale=mashriq")
-toPolyglossia ("ar":"TN":_)        = ("arabic", "locale=tunisia")
-toPolyglossia ("de":"1901":_)      = ("german", "spelling=old")
-toPolyglossia ("de":"AT":"1901":_) = ("german", "variant=austrian, spelling=old")
-toPolyglossia ("de":"AT":_)        = ("german", "variant=austrian")
-toPolyglossia ("de":"CH":"1901":_) = ("german", "variant=swiss, spelling=old")
-toPolyglossia ("de":"CH":_)        = ("german", "variant=swiss")
-toPolyglossia ("de":_)             = ("german", "")
-toPolyglossia ("dsb":_)            = ("lsorbian", "")
-toPolyglossia ("el":"polyton":_)   = ("greek",   "variant=poly")
-toPolyglossia ("en":"AU":_)        = ("english", "variant=australian")
-toPolyglossia ("en":"CA":_)        = ("english", "variant=canadian")
-toPolyglossia ("en":"GB":_)        = ("english", "variant=british")
-toPolyglossia ("en":"NZ":_)        = ("english", "variant=newzealand")
-toPolyglossia ("en":"UK":_)        = ("english", "variant=british")
-toPolyglossia ("en":"US":_)        = ("english", "variant=american")
-toPolyglossia ("grc":_)            = ("greek",   "variant=ancient")
-toPolyglossia ("hsb":_)            = ("usorbian", "")
-toPolyglossia ("la":"x":"classic":_) = ("latin", "variant=classic")
-toPolyglossia ("sl":_)             = ("slovenian", "")
-toPolyglossia x                    = (commonFromBcp47 x, "")
+toPolyglossia :: Lang -> (String, String)
+toPolyglossia (Lang "ar" _ "DZ" _)        = ("arabic", "locale=algeria")
+toPolyglossia (Lang "ar" _ "IQ" _)        = ("arabic", "locale=mashriq")
+toPolyglossia (Lang "ar" _ "JO" _)        = ("arabic", "locale=mashriq")
+toPolyglossia (Lang "ar" _ "LB" _)        = ("arabic", "locale=mashriq")
+toPolyglossia (Lang "ar" _ "LY" _)        = ("arabic", "locale=libya")
+toPolyglossia (Lang "ar" _ "MA" _)        = ("arabic", "locale=morocco")
+toPolyglossia (Lang "ar" _ "MR" _)        = ("arabic", "locale=mauritania")
+toPolyglossia (Lang "ar" _ "PS" _)        = ("arabic", "locale=mashriq")
+toPolyglossia (Lang "ar" _ "SY" _)        = ("arabic", "locale=mashriq")
+toPolyglossia (Lang "ar" _ "TN" _)        = ("arabic", "locale=tunisia")
+toPolyglossia (Lang "de" _ _ vars)
+  | "1901" `elem` vars                    = ("german", "spelling=old")
+toPolyglossia (Lang "de" _ "AT" vars)
+  | "1901" `elem` vars                    = ("german", "variant=austrian, spelling=old")
+toPolyglossia (Lang "de" _ "AT" _)        = ("german", "variant=austrian")
+toPolyglossia (Lang "de" _ "CH" vars)
+  | "1901" `elem` vars                    = ("german", "variant=swiss, spelling=old")
+toPolyglossia (Lang "de" _ "CH" _)        = ("german", "variant=swiss")
+toPolyglossia (Lang "de" _ _ _)           = ("german", "")
+toPolyglossia (Lang "dsb" _ _ _)          = ("lsorbian", "")
+toPolyglossia (Lang "el" _ "polyton" _)   = ("greek",   "variant=poly")
+toPolyglossia (Lang "en" _ "AU" _)        = ("english", "variant=australian")
+toPolyglossia (Lang "en" _ "CA" _)        = ("english", "variant=canadian")
+toPolyglossia (Lang "en" _ "GB" _)        = ("english", "variant=british")
+toPolyglossia (Lang "en" _ "NZ" _)        = ("english", "variant=newzealand")
+toPolyglossia (Lang "en" _ "UK" _)        = ("english", "variant=british")
+toPolyglossia (Lang "en" _ "US" _)        = ("english", "variant=american")
+toPolyglossia (Lang "grc" _ _ _)          = ("greek",   "variant=ancient")
+toPolyglossia (Lang "hsb" _ _  _)         = ("usorbian", "")
+toPolyglossia (Lang "la" _ _ vars)
+  | "x-classic" `elem` vars               = ("latin", "variant=classic")
+toPolyglossia (Lang "sl" _ _ _)           = ("slovenian", "")
+toPolyglossia x                           = (commonFromBcp47 x, "")
 
 -- Takes a list of the constituents of a BCP 47 language code and
 -- converts it to a Babel language string.
 -- http://mirrors.ctan.org/macros/latex/required/babel/base/babel.pdf
 -- List of supported languages (slightly outdated):
 -- http://tug.ctan.org/language/hyph-utf8/doc/generic/hyph-utf8/hyphenation.pdf
-toBabel :: [String] -> String
-toBabel ("de":"1901":_)        = "german"
-toBabel ("de":"AT":"1901":_)   = "austrian"
-toBabel ("de":"AT":_)          = "naustrian"
-toBabel ("de":"CH":"1901":_)   = "swissgerman"
-toBabel ("de":"CH":_)          = "nswissgerman"
-toBabel ("de":_)               = "ngerman"
-toBabel ("dsb":_)              = "lowersorbian"
-toBabel ("el":"polyton":_)     = "polutonikogreek"
-toBabel ("en":"AU":_)          = "australian"
-toBabel ("en":"CA":_)          = "canadian"
-toBabel ("en":"GB":_)          = "british"
-toBabel ("en":"NZ":_)          = "newzealand"
-toBabel ("en":"UK":_)          = "british"
-toBabel ("en":"US":_)          = "american"
-toBabel ("fr":"CA":_)          = "canadien"
-toBabel ("fra":"aca":_)        = "acadian"
-toBabel ("grc":_)              = "polutonikogreek"
-toBabel ("hsb":_)              = "uppersorbian"
-toBabel ("la":"x":"classic":_) = "classiclatin"
-toBabel ("sl":_)               = "slovene"
-toBabel x                      = commonFromBcp47 x
+toBabel :: Lang -> String
+toBabel (Lang "de" _ "AT" vars)
+  | "1901" `elem` vars                  = "austrian"
+  | otherwise                           = "naustrian"
+toBabel (Lang "de" _ "CH" vars)
+  | "1901" `elem` vars                  = "swissgerman"
+  | otherwise                           = "nswissgerman"
+toBabel (Lang "de" _ _ vars)
+  | "1901" `elem` vars                  = "german"
+  | otherwise                           = "ngerman"
+toBabel (Lang "dsb" _ _ _)              = "lowersorbian"
+toBabel (Lang "el" _ _ vars)
+  | "polyton" `elem` vars               = "polutonikogreek"
+toBabel (Lang "en" _ "AU" _)            = "australian"
+toBabel (Lang "en" _ "CA" _)            = "canadian"
+toBabel (Lang "en" _ "GB" _)            = "british"
+toBabel (Lang "en" _ "NZ" _)            = "newzealand"
+toBabel (Lang "en" _ "UK" _)            = "british"
+toBabel (Lang "en" _ "US" _)            = "american"
+toBabel (Lang "fr" _ "CA" _)            = "canadien"
+toBabel (Lang "fra" _ _ vars)
+  | "aca" `elem` vars                   = "acadian"
+toBabel (Lang "grc" _ _ _)              = "polutonikogreek"
+toBabel (Lang "hsb" _ _ _)              = "uppersorbian"
+toBabel (Lang "la" _ _ vars)
+  | "x-classic" `elem` vars             = "classiclatin"
+toBabel (Lang "sl" _ _ _)               = "slovene"
+toBabel x                               = commonFromBcp47 x
 
 -- Takes a list of the constituents of a BCP 47 language code
 -- and converts it to a string shared by Babel and Polyglossia.
 -- https://tools.ietf.org/html/bcp47#section-2.1
-commonFromBcp47 :: [String] -> String
-commonFromBcp47 [] = ""
-commonFromBcp47 ("pt":"BR":_)            = "brazil"
+commonFromBcp47 :: Lang -> String
+commonFromBcp47 (Lang "pt" _ "BR" _)            = "brazil"
 -- Note: documentation says "brazilian" works too, but it doesn't seem to work
 -- on some systems.  See #2953.
-commonFromBcp47 ("sr":"Cyrl":_)          = "serbianc"
-commonFromBcp47 ("zh":"Latn":"pinyin":_) = "pinyin"
-commonFromBcp47 x = fromIso $ head x
+commonFromBcp47 (Lang "sr" "Cyrl" _ _)          = "serbianc"
+commonFromBcp47 (Lang "zh" "Latn" _ vars)
+  | "pinyin" `elem` vars                        = "pinyin"
+commonFromBcp47 (Lang l _ _ _) = fromIso l
   where
     fromIso "af"  = "afrikaans"
     fromIso "am"  = "amharic"