lang variable is now in BCP47 format

strings are converted for LaTeX and ConTeXt output, closes #1614
2015-08-20 17:18:14 +02:00 · 2015-08-20 17:18:14 +02:00 · 622df7034c
commit 622df7034c
parent a0ddabb206
3 changed files with 192 additions and 16 deletions
--- a/20
+++ b/20
@ -945,7 +945,19 @@ as `title`, `author`, and `date`) as well as the following:
 :   body of document
 `lang`
-:   language code for HTML or LaTeX documents
+:   The `lang` variable should be set by the user to a language
    code according to [BCP 47] (e.g. `en` or `en-GB`).
    For some output formats, pandoc will convert it to an appropriate
    format stored in the additional variables `babel-lang`,
    `polyglossia-lang`, `polyglossia-variant` (LaTeX)
    and `context-lang` (ConTeXt).
 `otherlangs`
 :   Should be set to a list of other languages used in the document
    in the YAML metadata, according to [BCP 47]. For example:
    `otherlangs: [en-GB, fr]`.
    Currently only used by XeTeX through the generated
    `polyglossia-otherlangs` variable.
 `slidy-url`
 :   base URL for Slidy documents (defaults to
@ -3264,8 +3276,8 @@ The following fields are recognized:
  ~ A string value in `YYYY-MM-DD` format.  (Only the year is necessary.)
    Pandoc will attempt to convert other common date formats.
-`language`
+`lang` (or legacy: `language`)
-  ~ A string value in [RFC5646] format.  Pandoc will default to the local
+  ~ A string value in [BCP 47] format.  Pandoc will default to the local
    language if nothing is specified.
 `subject`
@ -3549,7 +3561,7 @@ Xavier Olive.
 [FictionBook2]: http://www.fictionbook.org/index.php/Eng:XML_Schema_Fictionbook_2.1
 [lua]: http://www.lua.org
 [marc relators]: http://www.loc.gov/marc/relators/relaterm.html
-[RFC5646]: http://tools.ietf.org/html/rfc5646
+[BCP 47]: https://tools.ietf.org/html/bcp47
 [InDesign ICML]: https://www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf
 [txt2tags]: http://txt2tags.org/
 [EPUB]: http://idpf.org/epub
--- a/src/Text/Pandoc/Writers/ConTeXt.hs
+++ b/src/Text/Pandoc/Writers/ConTeXt.hs
@ -80,12 +80,12 @@ pandocToConTeXt options (Pandoc meta blocks) = do
                        "subsubsubsection","subsubsubsubsection"])
                $ defField "body" main
                $ defField "number-sections" (writerNumberSections options)
                $ defField "mainlang" (maybe ""
                    (reverse . takeWhile (/=',') . reverse)
                    (lookup "lang" $ writerVariables options))
                $ metadata
  let context' =  defField "context-lang" (maybe "" (fromBcp47 . splitBy (=='-')) $
                    getField "lang" context)
                  context
  return $ if writerStandalone options
-              then renderTemplate' (writerTemplate options) context
+              then renderTemplate' (writerTemplate options) context'
              else main
 -- escape things as needed for ConTeXt
@ -362,3 +362,35 @@ sectionHeader (ident,classes,_) hdrLevel lst = do
                       then char '\\' <> chapter <> braces contents
                       else contents <> blankline
 -- | Convert a BCP 47 language tag, given as the list of its subtags
 -- (e.g. @["de","AT"]@), into the language code used by ConTeXt.
 --
 -- Specific language+region and language+variant combinations are matched
 -- first. Any other tag is reduced to its first subtag, which is returned
 -- unchanged unless it is one of the exceptions in @contextCode@.
 -- An empty list yields the empty string.
 --
 -- https://tools.ietf.org/html/bcp47#section-2.1
 -- http://wiki.contextgarden.net/Language_Codes
 fromBcp47 :: [String] -> String
 fromBcp47 subtags = case subtags of
    []            -> ""
    "ar":"SY":_   -> "ar-sy"
    "ar":"IQ":_   -> "ar-iq"
    "ar":"JO":_   -> "ar-jo"
    "ar":"LB":_   -> "ar-lb"
    "ar":"DZ":_   -> "ar-dz"
    "ar":"MA":_   -> "ar-ma"
    "de":"1901":_ -> "deo"
    "de":"DE":_   -> "de-de"
    "de":"AT":_   -> "de-at"
    "de":"CH":_   -> "de-ch"
    "el":"poly":_ -> "agr"
    "en":"US":_   -> "en-us"
    "en":"GB":_   -> "en-gb"
    "grc":_       -> "agr"
    primary:_     -> contextCode primary
  where
    -- Primary subtags for which a different code is emitted;
    -- every other subtag is passed through as-is.
    contextCode code = case code of
      "cz"  -> "cs"
      "el"  -> "gr"
      "eu"  -> "ba"
      "he"  -> "il"
      "jp"  -> "ja"
      "uk"  -> "ua"
      "vi"  -> "vn"
      "zh"  -> "cn"
      other -> other
--- a/src/Text/Pandoc/Writers/LaTeX.hs
+++ b/src/Text/Pandoc/Writers/LaTeX.hs
@ -144,11 +144,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do
  st <- get
  titleMeta <- stringToLaTeX TextString $ stringify $ docTitle meta
  authorsMeta <- mapM (stringToLaTeX TextString . stringify) $ docAuthors meta
  let (mainlang, otherlang) =
       case (reverse . splitBy (==',') . filter (/=' ')) `fmap`
            getField "lang" metadata of
              Just (m:os) -> (m, reverse os)
              _           -> ("", [])
  let context  =  defField "toc" (writerTableOfContents options) $
                  defField "toc-depth" (show (writerTOCDepth options -
                                              if stBook st
@ -173,8 +168,6 @@ pandocToLaTeX options (Pandoc meta blocks) = do
                  defField "euro" (stUsesEuro st) $
                  defField "listings" (writerListings options || stLHS st) $
                  defField "beamer" (writerBeamer options) $
                  defField "mainlang" mainlang $
                  defField "otherlang" otherlang $
                  (if stHighlighting st
                      then defField "highlighting-macros" (styleToLaTeX
                                $ writerHighlightStyle options )
@ -186,8 +179,18 @@ pandocToLaTeX options (Pandoc meta blocks) = do
                                     defField "biblatex" True
                         _        -> id) $
                  metadata
  let lang = maybe [] (splitBy (=='-')) $ getField "lang" context
      (polyLang, polyVar) = toPolyglossia lang
  let context' =
          defField "babel-lang" (toBabel lang)
        $ defField "polyglossia-lang" polyLang
        $ defField "polyglossia-variant" polyVar
        $ defField "polyglossia-otherlangs"
            (maybe [] (map $ fst . toPolyglossia . splitBy (=='-')) $
            getField "otherlangs" context)
        $ context
  return $ if writerStandalone options
-              then renderTemplate' template context
+              then renderTemplate' template context'
              else main
 -- | Convert Elements to LaTeX
@ -980,3 +983,132 @@ citationsToBiblatex _ = return empty
 getListingsLanguage :: [String] -> Maybe String
 getListingsLanguage [] = Nothing
 getListingsLanguage (x:xs) = toListingsLanguage x <|> getListingsLanguage xs
 -- | Convert a BCP 47 language tag, given as the list of its subtags,
 -- into the @(language, variant)@ pair used by Polyglossia.
 --
 -- The variant is the empty string when none applies. Tags that have no
 -- special case here are resolved through 'commonFromBcp47' and get an
 -- empty variant.
 -- http://mirrors.concertpass.com/tex-archive/macros/latex/contrib/polyglossia/polyglossia.pdf
 toPolyglossia :: [String] -> (String, String)
 toPolyglossia subtags = case subtags of
    "de":"AT":_   -> ("german", "austrian")
    "de":"CH":_   -> ("german", "swiss")
    "de":_        -> ("german", "")
    "dsb":_       -> ("lsorbian", "")
    "el":"poly":_ -> ("greek", "poly")
    "en":"AU":_   -> ("english", "australian")
    "en":"CA":_   -> ("english", "canadian")
    "en":"GB":_   -> ("english", "british")
    "en":"NZ":_   -> ("english", "newzealand")
    "en":"UK":_   -> ("english", "british")
    "en":"US":_   -> ("english", "american")
    "grc":_       -> ("greek", "ancient")
    "hsb":_       -> ("usorbian", "")
    "sl":_        -> ("slovenian", "")
    _             -> (commonFromBcp47 subtags, "")
 -- | Convert a BCP 47 language tag, given as the list of its subtags,
 -- into a Babel language name.
 --
 -- More specific tags are matched before more general ones (for example
 -- @de-AT-1901@ before @de-AT@ before @de@). Tags that have no special
 -- case here are resolved through 'commonFromBcp47'.
 -- http://mirrors.concertpass.com/tex-archive/macros/latex/required/babel/base/babel.pdf
 -- Note that the PDF unfortunately does not contain a complete list of supported languages.
 toBabel :: [String] -> String
 toBabel subtags = case subtags of
    "de":"1901":_      -> "german"
    "de":"AT":"1901":_ -> "austrian"
    "de":"AT":_        -> "naustrian"
    "de":_             -> "ngerman"
    "dsb":_            -> "lowersorbian"
    "el":"poly":_      -> "polutonikogreek"
    "en":"AU":_        -> "australian"
    "en":"CA":_        -> "canadian"
    "en":"GB":_        -> "british"
    "en":"NZ":_        -> "newzealand"
    "en":"UK":_        -> "british"
    "en":"US":_        -> "american"
    "fr":"CA":_        -> "canadien"
    "fra":"aca":_      -> "acadian"
    "grc":_            -> "polutonikogreek"
    "hsb":_            -> "uppersorbian"
    "sl":_             -> "slovene"
    _                  -> commonFromBcp47 subtags
 -- | Convert a BCP 47 language tag, given as the list of its subtags,
 -- into the language name shared by Babel and Polyglossia.
 --
 -- Only the primary (first) subtag is inspected, except for @pt-BR@,
 -- which has its own name. An empty list or an unknown primary subtag
 -- yields the empty string.
 --
 -- Czech and Japanese are accepted under their ISO 639-1 / BCP 47 codes
 -- (@cs@, @ja@) and, for backward compatibility, under the country-code
 -- spellings @cz@ and @jp@ that earlier versions of this table used.
 -- https://tools.ietf.org/html/bcp47#section-2.1
 commonFromBcp47 :: [String] -> String
 commonFromBcp47 [] = ""
 commonFromBcp47 ("pt":"BR":_) = "brazilian"
 commonFromBcp47 (primary:_) = fromIso primary
  where
    fromIso "af"  = "afrikaans"
    fromIso "am"  = "amharic"
    fromIso "ar"  = "arabic"
    fromIso "ast" = "asturian"
    fromIso "bg"  = "bulgarian"
    fromIso "bn"  = "bengali"
    fromIso "bo"  = "tibetan"
    fromIso "br"  = "breton"
    fromIso "ca"  = "catalan"
    fromIso "cy"  = "welsh"
    fromIso "cs"  = "czech"
    -- legacy spelling: "cz" is not the language code for Czech
    fromIso "cz"  = "czech"
    fromIso "cop" = "coptic"
    fromIso "da"  = "danish"
    fromIso "dv"  = "divehi"
    fromIso "el"  = "greek"
    fromIso "en"  = "english"
    fromIso "eo"  = "esperanto"
    fromIso "es"  = "spanish"
    fromIso "et"  = "estonian"
    fromIso "eu"  = "basque"
    fromIso "fa"  = "farsi"
    fromIso "fi"  = "finnish"
    fromIso "fr"  = "french"
    fromIso "fur" = "friulan"
    fromIso "ga"  = "irish"
    fromIso "gd"  = "scottish"
    fromIso "gl"  = "galician"
    fromIso "he"  = "hebrew"
    fromIso "hi"  = "hindi"
    fromIso "hr"  = "croatian"
    fromIso "hy"  = "armenian"
    fromIso "hu"  = "magyar"
    fromIso "ia"  = "interlingua"
    fromIso "id"  = "indonesian"
    fromIso "ie"  = "interlingua"
    fromIso "is"  = "icelandic"
    fromIso "it"  = "italian"
    fromIso "ja"  = "japanese"
    -- legacy spelling: "jp" is not the language code for Japanese
    fromIso "jp"  = "japanese"
    fromIso "km"  = "khmer"
    fromIso "kn"  = "kannada"
    fromIso "ko"  = "korean"
    fromIso "la"  = "latin"
    fromIso "lo"  = "lao"
    fromIso "lt"  = "lithuanian"
    fromIso "lv"  = "latvian"
    fromIso "ml"  = "malayalam"
    fromIso "mn"  = "mongolian"
    fromIso "mr"  = "marathi"
    fromIso "nb"  = "norsk"
    fromIso "nl"  = "dutch"
    fromIso "nn"  = "nynorsk"
    fromIso "no"  = "norsk"
    fromIso "nqo" = "nko"
    fromIso "oc"  = "occitan"
    fromIso "pl"  = "polish"
    fromIso "pms" = "piedmontese"
    fromIso "pt"  = "portuguese"
    fromIso "rm"  = "romansh"
    fromIso "ro"  = "romanian"
    fromIso "ru"  = "russian"
    fromIso "sa"  = "sanskrit"
    fromIso "se"  = "samin"
    fromIso "sk"  = "slovak"
    fromIso "sq"  = "albanian"
    fromIso "sr"  = "serbian"
    fromIso "sv"  = "swedish"
    fromIso "syr" = "syriac"
    fromIso "ta"  = "tamil"
    fromIso "te"  = "telugu"
    fromIso "th"  = "thai"
    fromIso "tk"  = "turkmen"
    fromIso "tr"  = "turkish"
    fromIso "uk"  = "ukrainian"
    fromIso "ur"  = "urdu"
    fromIso "vi"  = "vietnamese"
    -- unknown language: no name is produced
    fromIso _     = ""