Groff escaping changes.

- `--ascii` is now turned on automatically for man output, for
  portability.  All man output will be escaped to ASCII.
- In T.P.Writers.Groff, `escapeChar`, `escapeString`, and
  `escapeCode` now take a boolean parameter that selects
  ascii-only output.  This is used by the Ms writer for
  `--ascii`, instead of doing an extra pass after writing
  the document.
- In ms output without `--ascii`, unicode is used whenever
  possible (e.g. for double quotes).
- A few escapes are changed: e.g. `\[rs]` instead of `\\` for
  backslash, and `\ga]` instead of `` \` `` for backtick.
This commit is contained in:
John MacFarlane 2018-10-18 10:21:34 -07:00
parent bbd94eae2b
commit efbb329f1a
10 changed files with 118 additions and 110 deletions

View file

@ -860,10 +860,11 @@ Options affecting specific writers {.options}
: Use only ASCII characters in output. Currently supported for
XML and HTML formats (which use numerical entities instead of
UTF-8 when this option is selected), groff ms and man
UTF-8 when this option is selected), groff ms
(which use hexadecimal escapes), and to a limited degree
for LaTeX (which uses standard commands for accented
characters when possible).
LaTeX (which uses standard commands for accented
characters when possible). Groff man output uses ASCII
by default.
`--reference-links`

View file

@ -1,4 +1,5 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-
Copyright (C) 2018 John MacFarlane <jgm@berkeley.edu>
@ -400,19 +401,19 @@ characterCodes =
-- use like: \\[E a^ aa]
combiningAccents :: [(Char, String)]
combiningAccents =
[ ('˝' , "\\[a\"]")
, ('¯', "\\[a-]")
, ('˙', "\\[a.]")
, ('^', "\\[a^]")
, ('´', "\\[aa]")
, ('`', "\\[ga]")
, ('˘', "\\[ab]")
, ('¸', "\\[ac]")
, ('¨', "\\[ad]")
, ('ˇ', "\\[ah]")
, ('˚', "\\[ao]")
, ('~', "\\[a~]")
, ('˛', "\\[ho]")
, ('^', "\\[ha]")
, ('~', "\\[ti]")
[ ('˝' , "a\"")
, ('¯', "a-")
, ('˙', "a.")
, ('^', "a^")
, ('´', "aa")
, ('`', "ga")
, ('˘', "ab")
, ('¸', "ac")
, ('¨', "ad")
, ('ˇ', "ah")
, ('˚', "ao")
, ('~', "a~")
, ('˛', "ho")
, ('^', "ha")
, ('~', "ti")
]

View file

@ -37,12 +37,10 @@ module Text.Pandoc.Writers.Groff (
, escapeChar
, escapeString
, escapeCode
, groffEscape
, withFontFeature
) where
import Prelude
import qualified Data.Text as T
import Data.Char (isAscii, ord)
import Data.Char (ord, isAscii)
import Control.Monad.State.Strict
import Data.List (intercalate)
import qualified Data.Map as Map
@ -51,7 +49,7 @@ import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition
import Text.Pandoc.Pretty
import Text.Printf (printf)
import Text.Pandoc.GroffChar (essentialEscapes)
import Text.Pandoc.GroffChar (essentialEscapes, characterCodes)
data WriterState = WriterState { stHasInlineMath :: Bool
, stFirstPara :: Bool
@ -82,31 +80,35 @@ type Note = [Block]
type MS = StateT WriterState
escapeChar :: Char -> String
escapeChar c = fromMaybe [c] (Map.lookup c essentialEscapes)
escapeChar :: Bool -> Char -> String
escapeChar useAscii c =
case Map.lookup c essentialEscapes of
Just s -> s
Nothing
| useAscii
, not (isAscii c) ->
case Map.lookup c characterCodeMap of
Just t -> "\\[" <> t <> "]"
Nothing -> printf "\\[u%04X]" (ord c)
| otherwise -> [c]
-- | Escape special characters for groff.
escapeString :: String -> String
escapeString = concatMap escapeChar
escapeString :: Bool -> String -> String
escapeString useAscii = concatMap (escapeChar useAscii)
-- | Escape a literal (code) section for groff.
escapeCode :: String -> String
escapeCode = intercalate "\n" . map escapeLine . lines
escapeCode :: Bool -> String -> String
escapeCode useAScii = intercalate "\n" . map escapeLine . lines
where escapeCodeChar ' ' = "\\ "
escapeCodeChar '\t' = "\\\t"
escapeCodeChar c = escapeChar c
escapeCodeChar c = escapeChar useAScii c
escapeLine codeline =
case concatMap escapeCodeChar codeline of
a@('.':_) -> "\\&" ++ a
b -> b
-- | Escape non-ASCII characters using groff \u[..] sequences.
groffEscape :: T.Text -> T.Text
groffEscape = T.concatMap toUchar
where toUchar c
| isAscii c = T.singleton c
| otherwise = T.pack $ printf "\\[u%04X]" (ord c)
characterCodeMap :: Map.Map Char String
characterCodeMap = Map.fromList characterCodes
fontChange :: PandocMonad m => MS m Doc
fontChange = do

View file

@ -33,6 +33,7 @@ Conversion of 'Pandoc' documents to groff man page format.
module Text.Pandoc.Writers.Man ( writeMan) where
import Prelude
import Control.Monad.State.Strict
import Data.Char (isAscii)
import Data.List (intersperse, stripPrefix)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
@ -93,8 +94,7 @@ pandocToMan opts (Pandoc meta blocks) = do
$ defField "has-tables" hasTables
$ defField "hyphenate" True
$ defField "pandoc-version" pandocVersion metadata
(if writerPreferAscii opts then groffEscape else id) <$>
case writerTemplate opts of
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context
@ -148,7 +148,7 @@ blockToMan _ (CodeBlock _ str) = return $
text ".IP" $$
text ".nf" $$
text "\\f[C]" $$
text (escapeCode str) $$
text (escapeCode True str) $$
text "\\f[R]" $$
text ".fi"
blockToMan opts (BlockQuote blocks) = do
@ -296,10 +296,10 @@ inlineToMan opts (Quoted DoubleQuote lst) = do
inlineToMan opts (Cite _ lst) =
inlineListToMan opts lst
inlineToMan _ (Code _ str) =
withFontFeature 'C' (return (text $ escapeCode str))
withFontFeature 'C' (return (text $ escapeCode True str))
inlineToMan _ (Str str@('.':_)) =
return $ afterBreak "\\&" <> text (escapeString str)
inlineToMan _ (Str str) = return $ text $ escapeString str
return $ afterBreak "\\&" <> text (escapeString True str)
inlineToMan _ (Str str) = return $ text $ escapeString True str
inlineToMan opts (Math InlineMath str) =
lift (texMathToInlines InlineMath str) >>= inlineListToMan opts
inlineToMan opts (Math DisplayMath str) = do

View file

@ -69,6 +69,9 @@ writeMs :: PandocMonad m => WriterOptions -> Pandoc -> m Text
writeMs opts document =
evalStateT (pandocToMs opts document) defaultWriterState
escString :: WriterOptions -> String -> String
escString opts = escapeString (writerPreferAscii opts)
-- | Return groff ms representation of document.
pandocToMs :: PandocMonad m => WriterOptions -> Pandoc -> MS m Text
pandocToMs opts (Pandoc meta blocks) = do
@ -84,8 +87,8 @@ pandocToMs opts (Pandoc meta blocks) = do
body <- blockListToMs opts blocks
let main = render' body
hasInlineMath <- gets stHasInlineMath
let titleMeta = (escapeString . stringify) $ docTitle meta
let authorsMeta = map (escapeString . stringify) $ docAuthors meta
let titleMeta = (escString opts . stringify) $ docTitle meta
let authorsMeta = map (escString opts . stringify) $ docAuthors meta
hasHighlighting <- gets stHighlighting
let highlightingMacros = if hasHighlighting
then case writerHighlightStyle opts of
@ -101,8 +104,7 @@ pandocToMs opts (Pandoc meta blocks) = do
$ defField "title-meta" titleMeta
$ defField "author-meta" (intercalate "; " authorsMeta)
$ defField "highlighting-macros" highlightingMacros metadata
(if writerPreferAscii opts then groffEscape else id) <$>
case writerTemplate opts of
case writerTemplate opts of
Nothing -> return main
Just tpl -> renderTemplate' tpl context
@ -112,18 +114,18 @@ escapeUri = escapeURIString (\c -> c /= '@' && isAllowedInURI c)
-- | Escape | character, used to mark inline math, inside math.
escapeBar :: String -> String
escapeBar = concatMap go
where go '|' = "\\[u007C]"
where go '|' = "\\[ba]"
go c = [c]
toSmallCaps :: String -> String
toSmallCaps [] = []
toSmallCaps (c:cs)
toSmallCaps :: WriterOptions -> String -> String
toSmallCaps _ [] = []
toSmallCaps opts (c:cs)
| isLower c = let (lowers,rest) = span isLower (c:cs)
in "\\s-2" ++ escapeString (map toUpper lowers) ++
"\\s0" ++ toSmallCaps rest
in "\\s-2" ++ escString opts (map toUpper lowers) ++
"\\s0" ++ toSmallCaps opts rest
| isUpper c = let (uppers,rest) = span isUpper (c:cs)
in escapeString uppers ++ toSmallCaps rest
| otherwise = escapeChar c ++ toSmallCaps cs
in escString opts uppers ++ toSmallCaps opts rest
| otherwise = escapeChar (writerPreferAscii opts) c ++ toSmallCaps opts cs
-- We split inline lists into sentences, and print one sentence per
-- line. groff/troff treats the line-ending period differently.
@ -160,7 +162,7 @@ blockToMs opts (Para [Image attr alt (src,_tit)])
_ -> empty
capt <- inlineListToMs' opts alt
return $ nowrap (text ".PSPIC -C " <>
doubleQuotes (text (escapeString src)) <>
doubleQuotes (text (escString opts src)) <>
sizeAttrs) $$
text ".ce 1000" $$
capt $$
@ -198,7 +200,7 @@ blockToMs opts (Header level (ident,classes,_) inlines) = do
(if null secnum
then ""
else " ") ++
escapeString (stringify inlines))
escString opts (stringify inlines))
let backlink = nowrap (text ".pdfhref L -D " <>
doubleQuotes (text (toAscii ident)) <> space <> text "\\") <> cr <>
text " -- "
@ -400,14 +402,14 @@ inlineToMs opts (Cite _ lst) =
inlineToMs opts (Code attr str) = do
hlCode <- highlightCode opts attr str
withFontFeature 'C' (return hlCode)
inlineToMs _ (Str str) = do
inlineToMs opts (Str str) = do
let shim = case str of
'.':_ -> afterBreak "\\&"
_ -> empty
smallcaps <- gets stSmallCaps
if smallcaps
then return $ shim <> text (toSmallCaps str)
else return $ shim <> text (escapeString str)
then return $ shim <> text (toSmallCaps opts str)
else return $ shim <> text (escString opts str)
inlineToMs opts (Math InlineMath str) = do
modify $ \st -> st{ stHasInlineMath = True }
res <- convertMath writeEqn InlineMath str
@ -449,9 +451,10 @@ inlineToMs opts (Link _ txt (src, _)) = do
doubleQuotes (text (escapeUri src)) <> text " -A " <>
doubleQuotes (text "\\c") <> space <> text "\\") <> cr <>
text " -- " <> doubleQuotes (nowrap contents) <> cr <> text "\\&"
inlineToMs _ (Image _ alternate (_, _)) =
inlineToMs opts (Image _ alternate (_, _)) =
return $ char '[' <> text "IMAGE: " <>
text (escapeString (stringify alternate)) <> char ']'
text (escString opts (stringify alternate))
<> char ']'
inlineToMs _ (Note contents) = do
modify $ \st -> st{ stNotes = contents : stNotes st }
return $ text "\\**"
@ -531,20 +534,21 @@ toMacro sty toktype =
-- lnColor = lineNumberColor sty
-- lnBkgColor = lineNumberBackgroundColor sty
msFormatter :: FormatOptions -> [SourceLine] -> Doc
msFormatter _fmtopts =
msFormatter :: WriterOptions -> FormatOptions -> [SourceLine] -> Doc
msFormatter opts _fmtopts =
vcat . map fmtLine
where fmtLine = hcat . map fmtToken
fmtToken (toktype, tok) = text "\\*" <>
brackets (text (show toktype) <> text " \""
<> text (escapeCode (T.unpack tok)) <> text "\"")
<> text (escapeCode (writerPreferAscii opts)
(T.unpack tok)) <> text "\"")
highlightCode :: PandocMonad m => WriterOptions -> Attr -> String -> MS m Doc
highlightCode opts attr str =
case highlight (writerSyntaxMap opts) msFormatter attr str of
case highlight (writerSyntaxMap opts) (msFormatter opts) attr str of
Left msg -> do
unless (null msg) $ report $ CouldNotHighlight msg
return $ text (escapeCode str)
return $ text (escapeCode (writerPreferAscii opts) str)
Right h -> do
modify (\st -> st{ stHighlighting = True })
return h

View file

@ -3,5 +3,5 @@
A simple example
^D
.LP
A \[oq]simple\[cq] example
A simple example
```

View file

@ -17,7 +17,7 @@ pandoc -t man --ascii
äéıå
^D
.PP
\[u00E4]\[u00E9]\[u0131]\[u00E5]
\[:a]\['e]\[.i]\[oa]
```
```
@ -25,7 +25,7 @@ pandoc -t ms --ascii
äéıå
^D
.LP
\[u00E4]\[u00E9]\[u0131]\[u00E5]
\[:a]\['e]\[.i]\[oa]
```
```

View file

@ -135,7 +135,7 @@ T}
.LP
Multiline table with caption:
.PP
Here\[cq]s the caption. It may span multiple lines.
Heres the caption. It may span multiple lines.
.TS
delim(@@) tab( );
cw(10.5n) lw(9.6n) rw(11.4n) lw(24.5n).
@ -165,7 +165,7 @@ row
T} T{
5.0
T} T{
Here\[cq]s another one.
Heres another one.
Note the blank line between rows.
T}
.TE
@ -201,7 +201,7 @@ row
T} T{
5.0
T} T{
Here\[cq]s another one.
Heres another one.
Note the blank line between rows.
T}
.TE
@ -261,7 +261,7 @@ row
T} T{
5.0
T} T{
Here\[cq]s another one.
Heres another one.
Note the blank line between rows.
T}
.TE

View file

@ -104,7 +104,7 @@ And:
\f[C]
\ \ \ \ this\ code\ block\ is\ indented\ by\ two\ tabs
These\ should\ not\ be\ escaped:\ \ \\$\ \\\\\ \\>\ \\[\ \\{
These\ should\ not\ be\ escaped:\ \ \[rs]$\ \[rs]\[rs]\ \[rs]>\ \[rs][\ \[rs]{
\f[R]
.fi
.PP
@ -525,7 +525,7 @@ So is \f[B]\f[BI]this\f[B]\f[R] word.
.PP
So is \f[B]\f[BI]this\f[B]\f[R] word.
.PP
This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\\\f[R], \f[C]\\$\f[R],
This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
\f[C]<html>\f[R].
.PP
[STRIKEOUT:This is \f[I]strikeout\f[R].]
@ -563,11 +563,11 @@ Ellipses\&...and\&...and\&....
.SH LaTeX
.IP \[bu] 2
.IP \[bu] 2
2+2=4
2\[u2005]+\[u2005]2\[u2004]=\[u2004]4
.IP \[bu] 2
\f[I]x\f[R] ∈ \f[I]y\f[R]
\f[I]x\f[R]\[u2004]\[mo]\[u2004]\f[I]y\f[R]
.IP \[bu] 2
\f[I]α\f[R] ∧ \f[I]ω\f[R]
\f[I]\[*a]\f[R]\[u2005]\[AN]\[u2005]\f[I]\[*w]\f[R]
.IP \[bu] 2
223
.IP \[bu] 2
@ -575,11 +575,11 @@ Ellipses\&...and\&...and\&....
.IP \[bu] 2
Here\[cq]s some display math:
.RS
$$\\frac{d}{dx}f(x)=\\lim_{h\\to 0}\\frac{f(x+h)-f(x)}{h}$$
$$\[rs]frac{d}{dx}f(x)=\[rs]lim_{h\[rs]to 0}\[rs]frac{f(x+h)-f(x)}{h}$$
.RE
.IP \[bu] 2
Here\[cq]s one that has a line break in it:
\f[I]α\f[R]+\f[I]ω\f[R]×\f[I]x\f[R]^2^.
\f[I]\[*a]\f[R]\[u2005]+\[u2005]\f[I]\[*w]\f[R]\[u2005]\[tmu]\[u2005]\f[I]x\f[R]^2^.
.PP
These shouldn\[cq]t be math:
.IP \[bu] 2
@ -600,15 +600,15 @@ Here\[cq]s a LaTeX table:
.PP
Here is some unicode:
.IP \[bu] 2
I hat: Î
I hat: \[^I]
.IP \[bu] 2
o umlaut: ö
o umlaut: \[:o]
.IP \[bu] 2
section: §
section: \[sc]
.IP \[bu] 2
set membership:
set membership: \[mo]
.IP \[bu] 2
copyright: ©
copyright: \[co]
.PP
AT&T has an ampersand in their name.
.PP
@ -620,9 +620,9 @@ This & that.
.PP
6 > 5.
.PP
Backslash: \\
Backslash: \[rs]
.PP
Backtick: \`
Backtick: \[ga]
.PP
Asterisk: *
.PP

View file

@ -75,7 +75,7 @@ July 17, 2006
.1C
.LP
This is a set of tests for pandoc.
Most of them are adapted from John Gruber\[cq]s markdown test suite.
Most of them are adapted from John Grubers markdown test suite.
.HLINE
.SH 1
Headers
@ -126,7 +126,7 @@ Paragraphs
.pdfhref O 1 "Paragraphs"
.pdfhref M "paragraphs"
.LP
Here\[cq]s a regular paragraph.
Heres a regular paragraph.
.PP
In Markdown 1.0.0 and earlier.
Version 8.
@ -134,7 +134,7 @@ This line turns into a list item.
Because a hard-wrapped line in the middle of a paragraph looked like a list
item.
.PP
Here\[cq]s one with a bullet.
Heres one with a bullet.
* criminey.
.PP
There should be a hard line break
@ -210,7 +210,7 @@ And:
\f[C]
\ \ \ \ this\ code\ block\ is\ indented\ by\ two\ tabs
These\ should\ not\ be\ escaped:\ \ \\$\ \\\\\ \\>\ \\[\ \\{
These\ should\ not\ be\ escaped:\ \ \[rs]$\ \[rs]\[rs]\ \[rs]>\ \[rs][\ \[rs]{
\f[]
.fi
.HLINE
@ -314,7 +314,7 @@ Item 1, graf one.
.PP
Item 1.
graf two.
The quick brown fox jumped over the lazy dog\[cq]s back.
The quick brown fox jumped over the lazy dogs back.
.RE
.IP " 2." 4
Item 2.
@ -335,7 +335,7 @@ Tab
.RE
.RE
.LP
Here\[cq]s another:
Heres another:
.IP " 1." 4
First
.IP " 2." 4
@ -570,7 +570,7 @@ Interpreted markdown in a table:
This is \f[I]emphasized\f[R]
And this is \f[B]strong\f[R]
.PP
Here\[cq]s a simple block:
Heres a simple block:
.LP
foo
.LP
@ -617,7 +617,7 @@ Code:
\f[]
.fi
.LP
Hr\[cq]s:
Hrs:
.HLINE
.SH 1
Inline Markup
@ -641,7 +641,7 @@ So is \f[B]\f[BI]this\f[B]\f[R] word.
.PP
So is \f[B]\f[BI]this\f[B]\f[R] word.
.PP
This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\\\f[R], \f[C]\\$\f[R],
This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
\f[C]<html>\f[R].
.PP
\m[strikecolor]This is \f[I]strikeout\f[R].\m[]
@ -666,16 +666,16 @@ Smart quotes, ellipses, dashes
`Oak,' `elm,' and `beech' are names of trees.
So is `pine.'
.PP
`He said, \[lq]I want to go.\[rq]' Were you alive in the 70\[cq]s?
`He said, \[lq]I want to go.\[rq]' Were you alive in the 70s?
.PP
Here is some quoted `\f[C]code\f[R]' and a \[lq]\c
.pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
-- "quoted link"
\&\[rq].
.PP
Some dashes: one\[em]two \[em] three\[em]four \[em] five.
Some dashes: one—two — three—four — five.
.PP
Dashes between numbers: 5\[en]7, 255\[en]66, 1987\[en]1999.
Dashes between numbers: 57, 25566, 19871999.
.PP
Ellipses\&...and\&...and\&....
.HLINE
@ -695,14 +695,14 @@ LaTeX
.IP \[bu] 3
@p@-Tree
.IP \[bu] 3
Here\[cq]s some display math:
Heres some display math:
.EQ
d over {d x} f ( x ) = lim sub {h -> 0} {f ( x + h ) \[u2212] f ( x )} over h
.EN
.IP \[bu] 3
Here\[cq]s one that has a line break in it: @alpha + omega times x sup 2@.
Heres one that has a line break in it: @alpha + omega times x sup 2@.
.LP
These shouldn\[cq]t be math:
These shouldnt be math:
.IP \[bu] 3
To get the famous equation, write \f[C]$e\ =\ mc\[ha]2$\f[R].
.IP \[bu] 3
@ -714,7 +714,7 @@ Shoes ($20) and socks ($5).
.IP \[bu] 3
Escaped \f[C]$\f[R]: $73 \f[I]this should be emphasized\f[R] 23$.
.LP
Here\[cq]s a LaTeX table:
Heres a LaTeX table:
.HLINE
.SH 1
Special Characters
@ -743,9 +743,9 @@ This & that.
.PP
6 > 5.
.PP
Backslash: \\
Backslash: \[rs]
.PP
Backtick: \`
Backtick: \[ga]
.PP
Asterisk: *
.PP
@ -885,22 +885,22 @@ With ampersands
.pdfhref O 2 "With ampersands"
.pdfhref M "with-ampersands"
.LP
Here\[cq]s a \c
Heres a \c
.pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
-- "link with an ampersand in the URL"
\&.
.PP
Here\[cq]s a link with an amersand in the link text: \c
Heres a link with an amersand in the link text: \c
.pdfhref W -D "http://att.com/" -A "\c" \
-- "AT&T"
\&.
.PP
Here\[cq]s an \c
Heres an \c
.pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
-- "inline link"
\&.
.PP
Here\[cq]s an \c
Heres an \c
.pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
-- "inline link in pointy braces"
\&.
@ -967,7 +967,7 @@ It need not be placed at the end of the document.
.FE
and another.\**
.FS
Here\[cq]s the long note.
Heres the long note.
This one contains multiple blocks.
.PP
Subsequent blocks are indented to show that they belong to the footnote (as