Groff writer character escaping changes.

T.P.GroffChar:  replaced `essentialEscapes` with `manEscapes`,
which includes all the escapes mentioned in the groff_man manual.

T.P.Writers.Groff: removed escapeCode; changed parameter on
escapeString from Bool to new type `EscapeMode`.
Rewrote `escapeString`.
This commit is contained in:
John MacFarlane 2018-10-23 21:38:21 -07:00
parent 556e3eef4a
commit 8efb8975ed
8 changed files with 104 additions and 89 deletions

View file

@ -31,24 +31,29 @@ Groff character escaping/unescaping.
-}
module Text.Pandoc.GroffChar (
essentialEscapes
manEscapes
, characterCodes
, combiningAccents
) where
import Prelude
import qualified Data.Map as Map
essentialEscapes :: Map.Map Char String
essentialEscapes = Map.fromList
[ ('\160', "\\~")
-- | These are the escapes specifically mentioned in groff_man(7).
manEscapes :: [(Char, String)]
manEscapes =
[ ('\160', "\\ ")
, ('\'', "\\[aq]")
, ('`', "\\[ga]")
, ('', "\\[oq]")
, ('', "\\[cq]")
, ('"', "\\[dq]")
, ('~', "\\[ti]")
, ('“', "\\[lq]")
, ('”', "\\[rq]")
, ('—', "\\[em]")
, ('', "\\[en]")
, ('`', "\\[ga]")
, ('^', "\\[ha]")
, ('@', "\\[at]")
, ('\\', "\\[rs]")
, ('~', "\\[ti]")
, ('-', "\\-") -- minus; - will be interpreted as hyphen U+2010
, ('\\', "\\[rs]")
, ('\x2026', "\\&...") -- because u2026 doesn't render on tty
]

View file

@ -34,22 +34,21 @@ module Text.Pandoc.Writers.Groff (
, defaultWriterState
, MS
, Note
, EscapeMode(..)
, escapeString
, escapeCode
, withFontFeature
) where
import Prelude
import Data.Char (ord, isAscii)
import Control.Monad.State.Strict
import Data.List (intercalate)
import qualified Data.Map as Map
import Data.Maybe (fromMaybe, isJust, catMaybes)
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition
import Text.Pandoc.Pretty
import Text.Printf (printf)
import Text.Pandoc.GroffChar (essentialEscapes, characterCodes,
combiningAccents)
import Text.Pandoc.GroffChar (manEscapes,
characterCodes, combiningAccents)
data WriterState = WriterState { stHasInlineMath :: Bool
, stFirstPara :: Bool
@ -80,33 +79,38 @@ type Note = [Block]
type MS = StateT WriterState
data EscapeMode = AllowUTF8 -- ^ use preferred man escapes
| AsciiOnly -- ^ escape everything
deriving Show
combiningAccentsMap :: Map.Map Char String
combiningAccentsMap = Map.fromList combiningAccents
-- | Escape special characters for groff.
escapeString :: Bool -> String -> String
escapeString _ [] = []
escapeString useAscii (x:xs) =
case Map.lookup x essentialEscapes of
Just s -> s ++ escapeString useAscii xs
Nothing
| isAscii x || not useAscii -> x : escapeString useAscii xs
| otherwise ->
let accents = catMaybes $ takeWhile isJust
(map (\c -> Map.lookup c combiningAccentsMap) xs)
rest = drop (length accents) xs
s = case Map.lookup x characterCodeMap of
Just t -> "\\[" <> unwords (t:accents) <> "]"
Nothing -> "\\[" <> unwords
(printf "u%04X" (ord x) : accents) <> "]"
in s ++ escapeString useAscii rest
essentialEscapes :: Map.Map Char String
essentialEscapes = Map.fromList manEscapes
-- | Escape a literal (code) section for groff.
escapeCode :: Bool -> String -> String
escapeCode useAscii = intercalate "\n" . map escapeLine . lines
where escapeLine xs = case xs of
('.':_) -> "\\%" ++ escapeString useAscii xs
_ -> escapeString useAscii xs
-- | Escape special characters for groff.
escapeString :: EscapeMode -> String -> String
escapeString _ [] = []
escapeString escapeMode ('\n':'.':xs) =
'\n':'\\':'&':'.':escapeString escapeMode xs
escapeString escapeMode (x:xs) =
case Map.lookup x essentialEscapes of
Just s -> s ++ escapeString escapeMode xs
Nothing
| isAscii x -> x : escapeString escapeMode xs
| otherwise ->
case escapeMode of
AllowUTF8 -> x : escapeString escapeMode xs
AsciiOnly ->
let accents = catMaybes $ takeWhile isJust
(map (\c -> Map.lookup c combiningAccentsMap) xs)
rest = drop (length accents) xs
s = case Map.lookup x characterCodeMap of
Just t -> "\\[" <> unwords (t:accents) <> "]"
Nothing -> "\\[" <> unwords
(printf "u%04X" (ord x) : accents) <> "]"
in s ++ escapeString escapeMode rest
characterCodeMap :: Map.Map Char String
characterCodeMap = Map.fromList characterCodes

View file

@ -97,6 +97,9 @@ pandocToMan opts (Pandoc meta blocks) = do
Nothing -> return main
Just tpl -> renderTemplate' tpl context
escString :: WriterOptions -> String -> String
escString _ = escapeString AsciiOnly -- for better portability
-- | Return man representation of notes.
notesToMan :: PandocMonad m => WriterOptions -> [[Block]] -> StateT WriterState m Doc
notesToMan opts notes =
@ -143,11 +146,14 @@ blockToMan opts (Header level _ inlines) = do
1 -> ".SH "
_ -> ".SS "
return $ text heading <> contents
blockToMan _ (CodeBlock _ str) = return $
blockToMan opts (CodeBlock _ str) = return $
text ".IP" $$
text ".nf" $$
text "\\f[C]" $$
text (escapeCode True str) $$
((case str of
'.':_ -> text "\\&"
_ -> mempty) <>
text (escString opts str)) $$
text "\\f[R]" $$
text ".fi"
blockToMan opts (BlockQuote blocks) = do
@ -296,11 +302,11 @@ inlineToMan opts (Quoted DoubleQuote lst) = do
return $ text "\\[lq]" <> contents <> text "\\[rq]"
inlineToMan opts (Cite _ lst) =
inlineListToMan opts lst
inlineToMan _ (Code _ str) =
withFontFeature 'C' (return (text $ escapeCode True str))
inlineToMan _ (Str str@('.':_)) =
return $ afterBreak "\\&" <> text (escapeString True str)
inlineToMan _ (Str str) = return $ text $ escapeString True str
inlineToMan opts (Code _ str) =
withFontFeature 'C' (return (text $ escString opts str))
inlineToMan opts (Str str@('.':_)) =
return $ afterBreak "\\&" <> text (escString opts str)
inlineToMan opts (Str str) = return $ text $ escString opts str
inlineToMan opts (Math InlineMath str) =
lift (texMathToInlines InlineMath str) >>= inlineListToMan opts
inlineToMan opts (Math DisplayMath str) = do

View file

@ -69,9 +69,6 @@ writeMs :: PandocMonad m => WriterOptions -> Pandoc -> m Text
writeMs opts document =
evalStateT (pandocToMs opts document) defaultWriterState
escString :: WriterOptions -> String -> String
escString opts = escapeString (writerPreferAscii opts)
-- | Return groff ms representation of document.
pandocToMs :: PandocMonad m => WriterOptions -> Pandoc -> MS m Text
pandocToMs opts (Pandoc meta blocks) = do
@ -87,8 +84,8 @@ pandocToMs opts (Pandoc meta blocks) = do
body <- blockListToMs opts blocks
let main = render' body
hasInlineMath <- gets stHasInlineMath
let titleMeta = (escString opts . stringify) $ docTitle meta
let authorsMeta = map (escString opts . stringify) $ docAuthors meta
let titleMeta = (escapeStr opts . stringify) $ docTitle meta
let authorsMeta = map (escapeStr opts . stringify) $ docAuthors meta
hasHighlighting <- gets stHighlighting
let highlightingMacros = if hasHighlighting
then case writerHighlightStyle opts of
@ -108,6 +105,10 @@ pandocToMs opts (Pandoc meta blocks) = do
Nothing -> return main
Just tpl -> renderTemplate' tpl context
escapeStr :: WriterOptions -> String -> String
escapeStr opts =
escapeString (if writerPreferAscii opts then AsciiOnly else AllowUTF8)
escapeUri :: String -> String
escapeUri = escapeURIString (\c -> c /= '@' && isAllowedInURI c)
@ -121,11 +122,11 @@ toSmallCaps :: WriterOptions -> String -> String
toSmallCaps _ [] = []
toSmallCaps opts (c:cs)
| isLower c = let (lowers,rest) = span isLower (c:cs)
in "\\s-2" ++ escString opts (map toUpper lowers) ++
in "\\s-2" ++ escapeStr opts (map toUpper lowers) ++
"\\s0" ++ toSmallCaps opts rest
| isUpper c = let (uppers,rest) = span isUpper (c:cs)
in escString opts uppers ++ toSmallCaps opts rest
| otherwise = escapeString (writerPreferAscii opts) [c] ++ toSmallCaps opts cs
in escapeStr opts uppers ++ toSmallCaps opts rest
| otherwise = escapeStr opts [c] ++ toSmallCaps opts cs
-- We split inline lists into sentences, and print one sentence per
-- line. groff/troff treats the line-ending period differently.
@ -162,7 +163,7 @@ blockToMs opts (Para [Image attr alt (src,_tit)])
_ -> empty
capt <- inlineListToMs' opts alt
return $ nowrap (text ".PSPIC -C " <>
doubleQuotes (text (escString opts src)) <>
doubleQuotes (text (escapeStr opts src)) <>
sizeAttrs) $$
text ".ce 1000" $$
capt $$
@ -200,7 +201,7 @@ blockToMs opts (Header level (ident,classes,_) inlines) = do
(if null secnum
then ""
else " ") ++
escString opts (stringify inlines))
escapeStr opts (stringify inlines))
let backlink = nowrap (text ".pdfhref L -D " <>
doubleQuotes (text (toAscii ident)) <> space <> text "\\") <> cr <>
text " -- "
@ -409,7 +410,7 @@ inlineToMs opts (Str str) = do
smallcaps <- gets stSmallCaps
if smallcaps
then return $ shim <> text (toSmallCaps opts str)
else return $ shim <> text (escString opts str)
else return $ shim <> text (escapeStr opts str)
inlineToMs opts (Math InlineMath str) = do
modify $ \st -> st{ stHasInlineMath = True }
res <- convertMath writeEqn InlineMath str
@ -453,7 +454,7 @@ inlineToMs opts (Link _ txt (src, _)) = do
text " -- " <> doubleQuotes (nowrap contents) <> cr <> text "\\&"
inlineToMs opts (Image _ alternate (_, _)) =
return $ char '[' <> text "IMAGE: " <>
text (escString opts (stringify alternate))
text (escapeStr opts (stringify alternate))
<> char ']'
inlineToMs _ (Note contents) = do
modify $ \st -> st{ stNotes = contents : stNotes st }
@ -540,15 +541,14 @@ msFormatter opts _fmtopts =
where fmtLine = hcat . map fmtToken
fmtToken (toktype, tok) = text "\\*" <>
brackets (text (show toktype) <> text " \""
<> text (escapeCode (writerPreferAscii opts)
(T.unpack tok)) <> text "\"")
<> text (escapeStr opts (T.unpack tok)) <> text "\"")
highlightCode :: PandocMonad m => WriterOptions -> Attr -> String -> MS m Doc
highlightCode opts attr str =
case highlight (writerSyntaxMap opts) (msFormatter opts) attr str of
Left msg -> do
unless (null msg) $ report $ CouldNotHighlight msg
return $ text (escapeCode (writerPreferAscii opts) str)
return $ text (escapeStr opts str)
Right h -> do
modify (\st -> st{ stHighlighting = True })
return h

View file

@ -3,5 +3,5 @@
A simple example
^D
.LP
A simple example
A \[oq]simple\[cq] example
```

View file

@ -135,7 +135,7 @@ T}
.LP
Multiline table with caption:
.PP
Heres the caption. It may span multiple lines.
Here\[cq]s the caption. It may span multiple lines.
.TS
delim(@@) tab( );
cw(10.5n) lw(9.6n) rw(11.4n) lw(24.5n).
@ -165,7 +165,7 @@ row
T} T{
5.0
T} T{
Heres another one.
Here\[cq]s another one.
Note the blank line between rows.
T}
.TE
@ -201,7 +201,7 @@ row
T} T{
5.0
T} T{
Heres another one.
Here\[cq]s another one.
Note the blank line between rows.
T}
.TE
@ -261,7 +261,7 @@ row
T} T{
5.0
T} T{
Heres another one.
Here\[cq]s another one.
Note the blank line between rows.
T}
.TE

View file

@ -308,7 +308,7 @@ Nested.
.PP
Should not be a list item:
.PP
M.A.\~2007
M.A.\ 2007
.PP
B.
Williams
@ -492,9 +492,9 @@ This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
.PP
[STRIKEOUT:This is \f[I]strikeout\f[R].]
.PP
Superscripts: a^bc^d a^\f[I]hello\f[R]^ a^hello\~there^.
Superscripts: a^bc^d a^\f[I]hello\f[R]^ a^hello\ there^.
.PP
Subscripts: H~2~O, H~23~O, H~many\~of\~them~O.
Subscripts: H~2~O, H~23~O, H~many\ of\ them~O.
.PP
These should not be superscripts or subscripts, because of the unescaped
spaces: a\[ha]b c\[ha]d, a\[ti]b c\[ti]d.

View file

@ -75,7 +75,7 @@ July 17, 2006
.1C
.LP
This is a set of tests for pandoc.
Most of them are adapted from John Grubers markdown test suite.
Most of them are adapted from John Gruber\[cq]s markdown test suite.
.HLINE
.SH 1
Headers
@ -126,7 +126,7 @@ Paragraphs
.pdfhref O 1 "Paragraphs"
.pdfhref M "paragraphs"
.LP
Heres a regular paragraph.
Here\[cq]s a regular paragraph.
.PP
In Markdown 1.0.0 and earlier.
Version 8.
@ -134,7 +134,7 @@ This line turns into a list item.
Because a hard\-wrapped line in the middle of a paragraph looked like a list
item.
.PP
Heres one with a bullet.
Here\[cq]s one with a bullet.
* criminey.
.PP
There should be a hard line break
@ -314,7 +314,7 @@ Item 1, graf one.
.PP
Item 1.
graf two.
The quick brown fox jumped over the lazy dogs back.
The quick brown fox jumped over the lazy dog\[cq]s back.
.RE
.IP " 2." 4
Item 2.
@ -335,7 +335,7 @@ Tab
.RE
.RE
.LP
Heres another:
Here\[cq]s another:
.IP " 1." 4
First
.IP " 2." 4
@ -431,7 +431,7 @@ Nested.
.LP
Should not be a list item:
.PP
M.A.\~2007
M.A.\ 2007
.PP
B.
Williams
@ -570,7 +570,7 @@ Interpreted markdown in a table:
This is \f[I]emphasized\f[R]
And this is \f[B]strong\f[R]
.PP
Heres a simple block:
Here\[cq]s a simple block:
.LP
foo
.LP
@ -617,7 +617,7 @@ Code:
\f[]
.fi
.LP
Hrs:
Hr\[cq]s:
.HLINE
.SH 1
Inline Markup
@ -646,9 +646,9 @@ This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
.PP
\m[strikecolor]This is \f[I]strikeout\f[R].\m[]
.PP
Superscripts: a\*{bc\*}d a\*{\f[I]hello\f[R]\*} a\*{hello\~there\*}.
Superscripts: a\*{bc\*}d a\*{\f[I]hello\f[R]\*} a\*{hello\ there\*}.
.PP
Subscripts: H\*<2\*>O, H\*<23\*>O, H\*<many\~of\~them\*>O.
Subscripts: H\*<2\*>O, H\*<23\*>O, H\*<many\ of\ them\*>O.
.PP
These should not be superscripts or subscripts, because of the unescaped
spaces: a\[ha]b c\[ha]d, a\[ti]b c\[ti]d.
@ -666,16 +666,16 @@ Smart quotes, ellipses, dashes
`Oak,' `elm,' and `beech' are names of trees.
So is `pine.'
.PP
`He said, \[lq]I want to go.\[rq]' Were you alive in the 70s?
`He said, \[lq]I want to go.\[rq]' Were you alive in the 70\[cq]s?
.PP
Here is some quoted `\f[C]code\f[R]' and a \[lq]\c
.pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
-- "quoted link"
\&\[rq].
.PP
Some dashes: one—two — three—four — five.
Some dashes: one\[em]two \[em] three\[em]four \[em] five.
.PP
Dashes between numbers: 57, 25566, 19871999.
Dashes between numbers: 5\[en]7, 255\[en]66, 1987\[en]1999.
.PP
Ellipses\&...and\&...and\&....
.HLINE
@ -695,14 +695,14 @@ LaTeX
.IP \[bu] 3
@p@\-Tree
.IP \[bu] 3
Heres some display math:
Here\[cq]s some display math:
.EQ
d over {d x} f ( x ) = lim sub {h -> 0} {f ( x + h ) \[u2212] f ( x )} over h
.EN
.IP \[bu] 3
Heres one that has a line break in it: @alpha + omega times x sup 2@.
Here\[cq]s one that has a line break in it: @alpha + omega times x sup 2@.
.LP
These shouldnt be math:
These shouldn\[cq]t be math:
.IP \[bu] 3
To get the famous equation, write \f[C]$e = mc\[ha]2$\f[R].
.IP \[bu] 3
@ -714,7 +714,7 @@ Shoes ($20) and socks ($5).
.IP \[bu] 3
Escaped \f[C]$\f[R]: $73 \f[I]this should be emphasized\f[R] 23$.
.LP
Heres a LaTeX table:
Here\[cq]s a LaTeX table:
.HLINE
.SH 1
Special Characters
@ -885,22 +885,22 @@ With ampersands
.pdfhref O 2 "With ampersands"
.pdfhref M "with-ampersands"
.LP
Heres a \c
Here\[cq]s a \c
.pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
-- "link with an ampersand in the URL"
\&.
.PP
Heres a link with an amersand in the link text: \c
Here\[cq]s a link with an amersand in the link text: \c
.pdfhref W -D "http://att.com/" -A "\c" \
-- "AT&T"
\&.
.PP
Heres an \c
Here\[cq]s an \c
.pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
-- "inline link"
\&.
.PP
Heres an \c
Here\[cq]s an \c
.pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
-- "inline link in pointy braces"
\&.
@ -925,7 +925,7 @@ It should.
.LP
An e\-mail address: \c
.pdfhref W -D "mailto:nobody%40nowhere.net" -A "\c" \
-- "nobody\[at]nowhere.net"
-- "nobody@nowhere.net"
\&
.RS
.LP
@ -967,7 +967,7 @@ It need not be placed at the end of the document.
.FE
and another.\**
.FS
Heres the long note.
Here\[cq]s the long note.
This one contains multiple blocks.
.PP
Subsequent blocks are indented to show that they belong to the footnote (as