Implement --ascii (writerPreferAscii) in writers, not App.

Now the `write*` functions for Docbook, HTML, ICML, JATS,
Man, Ms, OPML are sensitive to `writerPreferAscii`.  Previously
the to-ascii translation was done in Text.Pandoc.App, and
thus not available to those using the writer functions
directly.

In addition, the LaTeX writer is now sensitive to
`writerPreferAscii` and to `--ascii`.  100% ASCII
output can't be guaranteed, but the writer will use
commands like `\"{a}` and `\l` whenever possible,
to avoid emiting a non-ASCII character.

A new unexported module, Text.Pandoc.Groff, has been
added to store functions used in the different groff-based
writers.
This commit is contained in:
John MacFarlane 2018-09-30 22:32:00 -07:00
parent 0a8d212a09
commit 36f1846cc3
13 changed files with 230 additions and 76 deletions

View file

@ -860,8 +860,10 @@ Options affecting specific writers {.options}
: Use only ASCII characters in output. Currently supported for
XML and HTML formats (which use numerical entities instead of
UTF-8 when this option is selected) and for groff ms and man
(which use hexadecimal escapes).
UTF-8 when this option is selected), groff ms and man
(which use hexadecimal escapes), and to a limited degree
for LaTeX (which uses standard commands for accented
characters when possible).
`--reference-links`

View file

@ -540,6 +540,7 @@ library
Text.Pandoc.Lua.Packages,
Text.Pandoc.Lua.StackInstances,
Text.Pandoc.Lua.Util,
Text.Pandoc.Groff
Text.Pandoc.CSS,
Text.Pandoc.CSV,
Text.Pandoc.UUID,

View file

@ -52,7 +52,7 @@ import Data.Aeson (defaultOptions)
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.Char (toLower, toUpper, isAscii, ord)
import Data.Char (toLower, toUpper)
import Data.List (find, intercalate, isPrefixOf, isSuffixOf, sort)
import qualified Data.Map as M
import Data.Maybe (fromMaybe, isJust, isNothing)
@ -95,7 +95,6 @@ import Text.Pandoc.Shared (eastAsianLineBreakFilter, stripEmptyParagraphs,
headerShift, isURI, ordNub, safeRead, tabFilter, uriPathToPath)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Writers.Math (defaultKaTeXURL, defaultMathJaxURL)
import Text.Pandoc.XML (toEntities)
import Text.Printf
#ifndef _WINDOWS
import System.Posix.IO (stdOutput)
@ -443,6 +442,7 @@ convertWithOpts opts = do
, writerTOCDepth = optTOCDepth opts
, writerReferenceDoc = optReferenceDoc opts
, writerSyntaxMap = syntaxMap
, writerPreferAscii = optAscii opts
}
let readerOpts = def{
@ -519,19 +519,10 @@ convertWithOpts opts = do
let htmlFormat = format `elem`
["html","html4","html5","s5","slidy",
"slideous","dzslides","revealjs"]
escape
| optAscii opts
, htmlFormat || format == "docbook4" ||
format == "docbook5" || format == "docbook" ||
format == "jats" || format == "opml" ||
format == "icml" = toEntities
| optAscii opts
, format == "ms" || format == "man" = groffEscape
| otherwise = id
addNl = if standalone
then id
else (<> T.singleton '\n')
output <- (addNl . escape) <$> f writerOptions doc
output <- addNl <$> f writerOptions doc
writerFn eol outputFile =<<
if optSelfContained opts && htmlFormat
-- TODO not maximally efficient; change type
@ -539,12 +530,6 @@ convertWithOpts opts = do
then T.pack <$> makeSelfContained (T.unpack output)
else return output
groffEscape :: Text -> Text
groffEscape = T.concatMap toUchar
where toUchar c
| isAscii c = T.singleton c
| otherwise = T.pack $ printf "\\[u%04X]" (ord c)
type Transform = Pandoc -> Pandoc
isTextFormat :: String -> Bool
@ -606,7 +591,7 @@ data Opt = Opt
, optPdfEngineArgs :: [String] -- ^ Flags to pass to the engine
, optSlideLevel :: Maybe Int -- ^ Header level that creates slides
, optSetextHeaders :: Bool -- ^ Use atx headers for markdown level 1-2
, optAscii :: Bool -- ^ Use ascii characters only in html
, optAscii :: Bool -- ^ Prefer ascii output
, optDefaultImageExtension :: String -- ^ Default image extension
, optExtractMedia :: Maybe FilePath -- ^ Path to extract embedded media
, optTrackChanges :: TrackChanges -- ^ Accept or reject MS Word track-changes.
@ -1173,7 +1158,7 @@ options =
, Option "" ["ascii"]
(NoArg
(\opt -> return opt { optAscii = True }))
"" -- "Use ascii characters only in HTML output"
"" -- "Prefer ASCII output"
, Option "" ["reference-links"]
(NoArg

43
src/Text/Pandoc/Groff.hs Normal file
View file

@ -0,0 +1,43 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-
Copyright (C) 2018 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
Module : Text.Pandoc.Groff
Copyright : Copyright (C) 2018 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Shared functions for escaping and formatting groff.
-}
module Text.Pandoc.Groff ( groffEscape )
where
import Prelude
import Data.Char (isAscii, ord)
import qualified Data.Text as T
import Text.Printf (printf)
groffEscape :: T.Text -> T.Text
groffEscape = T.concatMap toUchar
where toUchar c
| isAscii c = T.singleton c
| otherwise = T.pack $ printf "\\[u%04X]" (ord c)

View file

@ -126,9 +126,10 @@ writeDocbook opts (Pandoc meta blocks) = do
defField "mathml" (case writerHTMLMathMethod opts of
MathML -> True
_ -> False) metadata
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context
(if writerPreferAscii opts then toEntities else id) <$>
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context
-- | Convert an Element to Docbook.
elementToDocbook :: PandocMonad m => WriterOptions -> Int -> Element -> DB m Doc

View file

@ -75,7 +75,7 @@ import Text.Pandoc.Templates
import Text.Pandoc.Walk
import Text.Pandoc.Writers.Math
import Text.Pandoc.Writers.Shared
import Text.Pandoc.XML (escapeStringForXML, fromEntities)
import Text.Pandoc.XML (escapeStringForXML, fromEntities, toEntities)
#if MIN_VERSION_blaze_markup(0,6,3)
#else
import Text.Blaze.Internal (preEscapedString, preEscapedText)
@ -206,7 +206,8 @@ writeHtmlString' :: PandocMonad m
=> WriterState -> WriterOptions -> Pandoc -> m Text
writeHtmlString' st opts d = do
(body, context) <- evalStateT (pandocToHtml opts d) st
case writerTemplate opts of
(if writerPreferAscii opts then toEntities else id) <$>
case writerTemplate opts of
Nothing -> return $ renderHtml' body
Just tpl -> do
-- warn if empty lang
@ -221,16 +222,19 @@ writeHtmlString' st opts d = do
lookup "sourcefile" (writerVariables opts)
report $ NoTitleElement fallback
return $ resetField "pagetitle" fallback context
renderTemplate' tpl $
defField "body" (renderHtml' body) context'
renderTemplate' tpl
(defField "body" (renderHtml' body) context')
writeHtml' :: PandocMonad m => WriterState -> WriterOptions -> Pandoc -> m Html
writeHtml' st opts d =
case writerTemplate opts of
Just _ -> preEscapedText <$> writeHtmlString' st opts d
Nothing -> do
(body, _) <- evalStateT (pandocToHtml opts d) st
return body
Nothing
| writerPreferAscii opts
-> preEscapedText <$> writeHtmlString' st opts d
| otherwise -> do
(body, _) <- evalStateT (pandocToHtml opts d) st
return body
-- result is (title, authors, date, toc, body, new variables)
pandocToHtml :: PandocMonad m

View file

@ -149,7 +149,8 @@ writeICML opts (Pandoc meta blocks) = do
$ defField "charStyles" (render' $ charStylesToDoc st)
$ defField "parStyles" (render' $ parStylesToDoc st)
$ defField "hyperlinks" (render' $ hyperlinksToDoc $ links st) metadata
case writerTemplate opts of
(if writerPreferAscii opts then toEntities else id) <$>
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context

View file

@ -102,7 +102,8 @@ docToJATS opts (Pandoc meta blocks) = do
$ defField "mathml" (case writerHTMLMathMethod opts of
MathML -> True
_ -> False) metadata
case writerTemplate opts of
(if writerPreferAscii opts then toEntities else id) <$>
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context

View file

@ -42,8 +42,9 @@ import Data.Aeson (FromJSON, object, (.=))
import Data.Char (isAlphaNum, isAscii, isDigit, isLetter, isPunctuation, ord,
toLower)
import Data.List (foldl', intercalate, intersperse, isInfixOf, nubBy,
stripPrefix, (\\))
stripPrefix, (\\), uncons)
import Data.Maybe (catMaybes, fromMaybe, isJust, mapMaybe, isNothing)
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as T
import Network.URI (unEscapeString)
@ -63,6 +64,7 @@ import Text.Pandoc.Walk
import Text.Pandoc.Writers.Shared
import qualified Text.Parsec as P
import Text.Printf (printf)
import qualified Data.Text.Normalize as Normalize
data WriterState =
WriterState { stInNote :: Bool -- true if we're in a note
@ -318,46 +320,110 @@ data StringContext = TextString
-- escape things as needed for LaTeX
stringToLaTeX :: PandocMonad m => StringContext -> String -> LW m String
stringToLaTeX _ [] = return ""
stringToLaTeX ctx (x:xs) = do
stringToLaTeX context zs = do
opts <- gets stOptions
rest <- stringToLaTeX ctx xs
let ligatures = isEnabled Ext_smart opts && ctx == TextString
let isUrl = ctx == URLString
return $
go opts context $
if writerPreferAscii opts
then T.unpack $ Normalize.normalize Normalize.NFD $ T.pack zs
else zs
where
go _ _ [] = return ""
go opts ctx (x:xs) = do
let ligatures = isEnabled Ext_smart opts && ctx == TextString
let isUrl = ctx == URLString
let mbAccentCmd =
if writerPreferAscii opts && ctx == TextString
then uncons xs >>= \(c,_) -> M.lookup c accents
else Nothing
let emits s =
case mbAccentCmd of
Just cmd -> ((cmd ++ "{" ++ s ++ "}") ++)
<$> go opts ctx (drop 1 xs) -- drop combining accent
Nothing -> (s++) <$> go opts ctx xs
let emitc c =
case mbAccentCmd of
Just cmd -> ((cmd ++ "{" ++ [c] ++ "}") ++)
<$> go opts ctx (drop 1 xs) -- drop combining accent
Nothing -> (c:) <$> go opts ctx xs
case x of
'{' -> "\\{" ++ rest
'}' -> "\\}" ++ rest
'`' | ctx == CodeString -> "\\textasciigrave{}" ++ rest
'$' | not isUrl -> "\\$" ++ rest
'%' -> "\\%" ++ rest
'&' -> "\\&" ++ rest
'_' | not isUrl -> "\\_" ++ rest
'#' -> "\\#" ++ rest
'-' | not isUrl -> case xs of
-- prevent adjacent hyphens from forming ligatures
('-':_) -> "-\\/" ++ rest
_ -> '-' : rest
'~' | not isUrl -> "\\textasciitilde{}" ++ rest
'^' -> "\\^{}" ++ rest
'\\'| isUrl -> '/' : rest -- NB. / works as path sep even on Windows
| otherwise -> "\\textbackslash{}" ++ rest
'|' | not isUrl -> "\\textbar{}" ++ rest
'<' -> "\\textless{}" ++ rest
'>' -> "\\textgreater{}" ++ rest
'[' -> "{[}" ++ rest -- to avoid interpretation as
']' -> "{]}" ++ rest -- optional arguments
'\'' | ctx == CodeString -> "\\textquotesingle{}" ++ rest
'\160' -> "~" ++ rest
'\x202F' -> "\\," ++ rest
'\x2026' -> "\\ldots{}" ++ rest
'\x2018' | ligatures -> "`" ++ rest
'\x2019' | ligatures -> "'" ++ rest
'\x201C' | ligatures -> "``" ++ rest
'\x201D' | ligatures -> "''" ++ rest
'\x2014' | ligatures -> "---" ++ rest
'\x2013' | ligatures -> "--" ++ rest
_ -> x : rest
'{' -> emits "\\{"
'}' -> emits "\\}"
'`' | ctx == CodeString -> emits "\\textasciigrave{}"
'$' | not isUrl -> emits "\\$"
'%' -> emits "\\%"
'&' -> emits "\\&"
'_' | not isUrl -> emits "\\_"
'#' -> emits "\\#"
'-' | not isUrl -> case xs of
-- prevent adjacent hyphens from forming ligatures
('-':_) -> emits "-\\/"
_ -> emitc '-'
'~' | not isUrl -> emits "\\textasciitilde{}"
'^' -> emits "\\^{}"
'\\'| isUrl -> emitc '/' -- NB. / works as path sep even on Windows
| otherwise -> emits "\\textbackslash{}"
'|' | not isUrl -> emits "\\textbar{}"
'<' -> emits "\\textless{}"
'>' -> emits "\\textgreater{}"
'[' -> emits "{[}" -- to avoid interpretation as
']' -> emits "{]}" -- optional arguments
'\'' | ctx == CodeString -> emits "\\textquotesingle{}"
'\160' -> emits "~"
'\x202F' -> emits "\\,"
'\x2026' -> emits "\\ldots{}"
'\x2018' | ligatures -> emits "`"
'\x2019' | ligatures -> emits "'"
'\x201C' | ligatures -> emits "``"
'\x201D' | ligatures -> emits "''"
'\x2014' | ligatures -> emits "---"
'\x2013' | ligatures -> emits "--"
_ | writerPreferAscii opts
-> case x of
'ı' -> emits "\\i "
'ȷ' -> emits "\\j "
'å' -> emits "\\aa "
'Å' -> emits "\\AA "
'ß' -> emits "\\ss "
'ø' -> emits "\\o "
'Ø' -> emits "\\O "
'Ł' -> emits "\\L "
'ł' -> emits "\\l "
'æ' -> emits "\\ae "
'Æ' -> emits "\\AE "
'œ' -> emits "\\oe "
'Œ' -> emits "\\OE "
'£' -> emits "\\pounds "
'€' -> emits "\\euro "
'©' -> emits "\\copyright "
_ -> emitc x
| otherwise -> emitc x
accents :: M.Map Char String
accents = M.fromList
[ ('\779' , "\\H")
, ('\768' , "\\`")
, ('\769' , "\\'")
, ('\770' , "\\^")
, ('\771' , "\\~")
, ('\776' , "\\\"")
, ('\775' , "\\.")
, ('\772' , "\\=")
, ('\781' , "\\|")
, ('\817' , "\\b")
, ('\807' , "\\c")
, ('\783' , "\\G")
, ('\777' , "\\h")
, ('\803' , "\\d")
, ('\785' , "\\f")
, ('\778' , "\\r")
, ('\865' , "\\t")
, ('\782' , "\\U")
, ('\780' , "\\v")
, ('\774' , "\\u")
, ('\808' , "\\k")
, ('\785' , "\\newtie")
, ('\8413', "\\textcircled")
]
toLabel :: PandocMonad m => String -> LW m String
toLabel z = go `fmap` stringToLaTeX URLString z

View file

@ -45,6 +45,7 @@ import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Pretty
import Text.Pandoc.Shared
import Text.Pandoc.Groff (groffEscape)
import Text.Pandoc.Templates
import Text.Pandoc.Writers.Math
import Text.Pandoc.Writers.Shared
@ -107,7 +108,8 @@ pandocToMan opts (Pandoc meta blocks) = do
$ defField "has-tables" hasTables
$ defField "hyphenate" True
$ defField "pandoc-version" pandocVersion metadata
case writerTemplate opts of
(if writerPreferAscii opts then groffEscape else id) <$>
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context

View file

@ -60,6 +60,7 @@ import Text.Pandoc.Shared
import Text.Pandoc.Templates
import Text.Pandoc.Writers.Math
import Text.Pandoc.Writers.Shared
import Text.Pandoc.Groff (groffEscape)
import Text.Printf (printf)
import Text.TeXMath (writeEqn)
@ -127,7 +128,8 @@ pandocToMs opts (Pandoc meta blocks) = do
$ defField "title-meta" titleMeta
$ defField "author-meta" (intercalate "; " authorsMeta)
$ defField "highlighting-macros" highlightingMacros metadata
case writerTemplate opts of
(if writerPreferAscii opts then groffEscape else id) <$>
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context

View file

@ -62,7 +62,8 @@ writeOPML opts (Pandoc meta blocks) = do
meta'
main <- (render colwidth . vcat) <$> mapM (elementToOPML opts) elements
let context = defField "body" main metadata
case writerTemplate opts of
(if writerPreferAscii opts then toEntities else id) <$>
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context

45
test/command/ascii.md Normal file
View file

@ -0,0 +1,45 @@
```
pandoc -t html --ascii
äéıå
^D
<p>&#228;&#233;&#305;&#229;</p>
```
```
pandoc -t latex --ascii
äéıå
^D
\"{a}\'{e}\i \r{a}
```
```
pandoc -t man --ascii
äéıå
^D
.PP
\[u00E4]\[u00E9]\[u0131]\[u00E5]
```
```
pandoc -t ms --ascii
äéıå
^D
.LP
\[u00E4]\[u00E9]\[u0131]\[u00E5]
```
```
pandoc -t docbook --ascii
äéıå
^D
<para>
&#228;&#233;&#305;&#229;
</para>
```
```
pandoc -t jats --ascii
äéıå
^D
<p>&#228;&#233;&#305;&#229;</p>
```