From 8efb8975ed641ddd075954e1ccc7f71eca1d3c16 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Tue, 23 Oct 2018 21:38:21 -0700
Subject: [PATCH] Groff writer character escaping changes.

T.P.GroffChar:  replaced `essentialEscapes` with `manEscapes`,
which includes all the escapes mentioned in the groff_man manual.

T.P.Writers.Groff: removed escapeCode; changed parameter on
escapeString from Bool to new type `EscapeMode`.
Rewrote `escapeString`.
---
 src/Text/Pandoc/GroffChar.hs     | 23 ++++++++-----
 src/Text/Pandoc/Writers/Groff.hs | 58 +++++++++++++++++---------------
 src/Text/Pandoc/Writers/Man.hs   | 20 +++++++----
 src/Text/Pandoc/Writers/Ms.hs    | 30 ++++++++---------
 test/command/4550.md             |  2 +-
 test/tables.ms                   |  8 ++---
 test/writer.man                  |  6 ++--
 test/writer.ms                   | 46 ++++++++++++-------------
 8 files changed, 104 insertions(+), 89 deletions(-)

diff --git a/src/Text/Pandoc/GroffChar.hs b/src/Text/Pandoc/GroffChar.hs
index 6d7991e5d..efb3cf11a 100644
--- a/src/Text/Pandoc/GroffChar.hs
+++ b/src/Text/Pandoc/GroffChar.hs
@@ -31,24 +31,29 @@ Groff character escaping/unescaping.
 -}
 
 module Text.Pandoc.GroffChar (
-    essentialEscapes
+    manEscapes
   , characterCodes
   , combiningAccents
   ) where
 import Prelude
-import qualified Data.Map as Map
 
-essentialEscapes :: Map.Map Char String
-essentialEscapes = Map.fromList
-  [ ('\160', "\\~")
+-- | These are the escapes specifically mentioned in groff_man(7).
+manEscapes :: [(Char, String)]
+manEscapes =
+  [ ('\160', "\\ ")
   , ('\'', "\\[aq]")
-  , ('`', "\\[ga]")
+  , ('‘', "\\[oq]")
+  , ('’', "\\[cq]")
   , ('"', "\\[dq]")
-  , ('~', "\\[ti]")
+  , ('“', "\\[lq]")
+  , ('”', "\\[rq]")
+  , ('—', "\\[em]")
+  , ('–', "\\[en]")
+  , ('`', "\\[ga]")
   , ('^', "\\[ha]")
-  , ('@', "\\[at]")
-  , ('\\', "\\[rs]")
+  , ('~', "\\[ti]")
   , ('-', "\\-")  -- minus; - will be interpreted as hyphen U+2010
+  , ('\\', "\\[rs]")
   , ('\x2026', "\\&...")  -- because u2026 doesn't render on tty
   ]
 
diff --git a/src/Text/Pandoc/Writers/Groff.hs b/src/Text/Pandoc/Writers/Groff.hs
index fb3cc085b..b0e8d3d06 100644
--- a/src/Text/Pandoc/Writers/Groff.hs
+++ b/src/Text/Pandoc/Writers/Groff.hs
@@ -34,22 +34,21 @@ module Text.Pandoc.Writers.Groff (
     , defaultWriterState
     , MS
     , Note
+    , EscapeMode(..)
     , escapeString
-    , escapeCode
     , withFontFeature
     ) where
 import Prelude
 import Data.Char (ord, isAscii)
 import Control.Monad.State.Strict
-import Data.List (intercalate)
 import qualified Data.Map as Map
 import Data.Maybe (fromMaybe, isJust, catMaybes)
 import Text.Pandoc.Class (PandocMonad)
 import Text.Pandoc.Definition
 import Text.Pandoc.Pretty
 import Text.Printf (printf)
-import Text.Pandoc.GroffChar (essentialEscapes, characterCodes,
-                             combiningAccents)
+import Text.Pandoc.GroffChar (manEscapes,
+                              characterCodes, combiningAccents)
 
 data WriterState = WriterState { stHasInlineMath :: Bool
                                , stFirstPara     :: Bool
@@ -80,33 +79,38 @@ type Note = [Block]
 
 type MS = StateT WriterState
 
+data EscapeMode = AllowUTF8        -- ^ use preferred man escapes
+                | AsciiOnly        -- ^ escape everything
+                deriving Show
+
 combiningAccentsMap :: Map.Map Char String
 combiningAccentsMap = Map.fromList combiningAccents
 
--- | Escape special characters for groff.
-escapeString :: Bool -> String -> String
-escapeString _ [] = []
-escapeString useAscii (x:xs) =
-  case Map.lookup x essentialEscapes of
-    Just s  -> s ++ escapeString useAscii xs
-    Nothing
-      | isAscii x || not useAscii -> x : escapeString useAscii xs
-      | otherwise ->
-        let accents = catMaybes $ takeWhile isJust
-              (map (\c -> Map.lookup c combiningAccentsMap) xs)
-            rest = drop (length accents) xs
-            s = case Map.lookup x characterCodeMap of
-                  Just t  -> "\\[" <> unwords (t:accents) <> "]"
-                  Nothing -> "\\[" <> unwords
-                   (printf "u%04X" (ord x) : accents) <> "]"
-        in  s ++ escapeString useAscii rest
+essentialEscapes :: Map.Map Char String
+essentialEscapes = Map.fromList manEscapes
 
--- | Escape a literal (code) section for groff.
-escapeCode :: Bool -> String -> String
-escapeCode useAscii = intercalate "\n" . map escapeLine . lines
-  where escapeLine xs = case xs of
-                          ('.':_) -> "\\%" ++ escapeString useAscii xs
-                          _       -> escapeString useAscii xs
+-- | Escape special characters for groff.
+escapeString :: EscapeMode -> String -> String
+escapeString _ [] = []
+escapeString escapeMode ('\n':'.':xs) =
+  '\n':'\\':'&':'.':escapeString escapeMode xs
+escapeString escapeMode (x:xs) =
+  case Map.lookup x essentialEscapes of
+    Just s  -> s ++ escapeString escapeMode xs
+    Nothing
+     | isAscii x -> x : escapeString escapeMode xs
+     | otherwise ->
+        case escapeMode of
+          AllowUTF8 -> x : escapeString escapeMode xs
+          AsciiOnly ->
+            let accents = catMaybes $ takeWhile isJust
+                  (map (\c -> Map.lookup c combiningAccentsMap) xs)
+                rest = drop (length accents) xs
+                s = case Map.lookup x characterCodeMap of
+                      Just t  -> "\\[" <> unwords (t:accents) <> "]"
+                      Nothing -> "\\[" <> unwords
+                       (printf "u%04X" (ord x) : accents) <> "]"
+            in  s ++ escapeString escapeMode rest
 
 characterCodeMap :: Map.Map Char String
 characterCodeMap = Map.fromList characterCodes
diff --git a/src/Text/Pandoc/Writers/Man.hs b/src/Text/Pandoc/Writers/Man.hs
index 645476b77..b32d2ff6c 100644
--- a/src/Text/Pandoc/Writers/Man.hs
+++ b/src/Text/Pandoc/Writers/Man.hs
@@ -97,6 +97,9 @@ pandocToMan opts (Pandoc meta blocks) = do
        Nothing  -> return main
        Just tpl -> renderTemplate' tpl context
 
+escString :: WriterOptions -> String -> String
+escString _ = escapeString AsciiOnly -- for better portability
+
 -- | Return man representation of notes.
 notesToMan :: PandocMonad m => WriterOptions -> [[Block]] -> StateT WriterState m Doc
 notesToMan opts notes =
@@ -143,11 +146,14 @@ blockToMan opts (Header level _ inlines) = do
                   1 -> ".SH "
                   _ -> ".SS "
   return $ text heading <> contents
-blockToMan _ (CodeBlock _ str) = return $
+blockToMan opts (CodeBlock _ str) = return $
   text ".IP" $$
   text ".nf" $$
   text "\\f[C]" $$
-  text (escapeCode True str) $$
+  ((case str of
+    '.':_ -> text "\\&"
+    _     -> mempty) <>
+   text (escString opts str)) $$
   text "\\f[R]" $$
   text ".fi"
 blockToMan opts (BlockQuote blocks) = do
@@ -296,11 +302,11 @@ inlineToMan opts (Quoted DoubleQuote lst) = do
   return $ text "\\[lq]" <> contents <> text "\\[rq]"
 inlineToMan opts (Cite _ lst) =
   inlineListToMan opts lst
-inlineToMan _ (Code _ str) =
-  withFontFeature 'C' (return (text $ escapeCode True str))
-inlineToMan _ (Str str@('.':_)) =
-  return $ afterBreak "\\&" <> text (escapeString True str)
-inlineToMan _ (Str str) = return $ text $ escapeString True str
+inlineToMan opts (Code _ str) =
+  withFontFeature 'C' (return (text $ escString opts str))
+inlineToMan opts (Str str@('.':_)) =
+  return $ afterBreak "\\&" <> text (escString opts str)
+inlineToMan opts (Str str) = return $ text $ escString opts str
 inlineToMan opts (Math InlineMath str) =
   lift (texMathToInlines InlineMath str) >>= inlineListToMan opts
 inlineToMan opts (Math DisplayMath str) = do
diff --git a/src/Text/Pandoc/Writers/Ms.hs b/src/Text/Pandoc/Writers/Ms.hs
index e077e9ed9..2fb949cb9 100644
--- a/src/Text/Pandoc/Writers/Ms.hs
+++ b/src/Text/Pandoc/Writers/Ms.hs
@@ -69,9 +69,6 @@ writeMs :: PandocMonad m => WriterOptions -> Pandoc -> m Text
 writeMs opts document =
   evalStateT (pandocToMs opts document) defaultWriterState
 
-escString :: WriterOptions -> String -> String
-escString opts = escapeString (writerPreferAscii opts)
-
 -- | Return groff ms representation of document.
 pandocToMs :: PandocMonad m => WriterOptions -> Pandoc -> MS m Text
 pandocToMs opts (Pandoc meta blocks) = do
@@ -87,8 +84,8 @@ pandocToMs opts (Pandoc meta blocks) = do
   body <- blockListToMs opts blocks
   let main = render' body
   hasInlineMath <- gets stHasInlineMath
-  let titleMeta = (escString opts . stringify) $ docTitle meta
-  let authorsMeta = map (escString opts . stringify) $ docAuthors meta
+  let titleMeta = (escapeStr opts . stringify) $ docTitle meta
+  let authorsMeta = map (escapeStr opts . stringify) $ docAuthors meta
   hasHighlighting <- gets stHighlighting
   let highlightingMacros = if hasHighlighting
                               then case writerHighlightStyle opts of
@@ -108,6 +105,10 @@ pandocToMs opts (Pandoc meta blocks) = do
        Nothing  -> return main
        Just tpl -> renderTemplate' tpl context
 
+escapeStr :: WriterOptions -> String -> String
+escapeStr opts =
+  escapeString (if writerPreferAscii opts then AsciiOnly else AllowUTF8)
+
 escapeUri :: String -> String
 escapeUri = escapeURIString (\c -> c /= '@' && isAllowedInURI c)
 
@@ -121,11 +122,11 @@ toSmallCaps :: WriterOptions -> String -> String
 toSmallCaps _ [] = []
 toSmallCaps opts (c:cs)
   | isLower c = let (lowers,rest) = span isLower (c:cs)
-                in  "\\s-2" ++ escString opts (map toUpper lowers) ++
+                in  "\\s-2" ++ escapeStr opts (map toUpper lowers) ++
                     "\\s0" ++ toSmallCaps opts rest
   | isUpper c = let (uppers,rest) = span isUpper (c:cs)
-                in  escString opts uppers ++ toSmallCaps opts rest
-  | otherwise = escapeString (writerPreferAscii opts) [c] ++ toSmallCaps opts cs
+                in  escapeStr opts uppers ++ toSmallCaps opts rest
+  | otherwise = escapeStr opts [c] ++ toSmallCaps opts cs
 
 -- We split inline lists into sentences, and print one sentence per
 -- line.  groff/troff treats the line-ending period differently.
@@ -162,7 +163,7 @@ blockToMs opts (Para [Image attr alt (src,_tit)])
                        _ -> empty
   capt <- inlineListToMs' opts alt
   return $ nowrap (text ".PSPIC -C " <>
-             doubleQuotes (text (escString opts src)) <>
+             doubleQuotes (text (escapeStr opts src)) <>
              sizeAttrs) $$
            text ".ce 1000" $$
            capt $$
@@ -200,7 +201,7 @@ blockToMs opts (Header level (ident,classes,_) inlines) = do
                                       (if null secnum
                                           then ""
                                           else "  ") ++
-                                      escString opts (stringify inlines))
+                                      escapeStr opts (stringify inlines))
   let backlink = nowrap (text ".pdfhref L -D " <>
        doubleQuotes (text (toAscii ident)) <> space <> text "\\") <> cr <>
        text " -- "
@@ -409,7 +410,7 @@ inlineToMs opts (Str str) = do
   smallcaps <- gets stSmallCaps
   if smallcaps
      then return $ shim <> text (toSmallCaps opts str)
-     else return $ shim <> text (escString opts str)
+     else return $ shim <> text (escapeStr opts str)
 inlineToMs opts (Math InlineMath str) = do
   modify $ \st -> st{ stHasInlineMath = True }
   res <- convertMath writeEqn InlineMath str
@@ -453,7 +454,7 @@ inlineToMs opts (Link _ txt (src, _)) = do
        text " -- " <> doubleQuotes (nowrap contents) <> cr <> text "\\&"
 inlineToMs opts (Image _ alternate (_, _)) =
   return $ char '[' <> text "IMAGE: " <>
-           text (escString opts (stringify alternate))
+           text (escapeStr opts (stringify alternate))
              <> char ']'
 inlineToMs _ (Note contents) = do
   modify $ \st -> st{ stNotes = contents : stNotes st }
@@ -540,15 +541,14 @@ msFormatter opts _fmtopts =
   where fmtLine = hcat . map fmtToken
         fmtToken (toktype, tok) = text "\\*" <>
            brackets (text (show toktype) <> text " \""
-             <> text (escapeCode (writerPreferAscii opts)
-                      (T.unpack tok)) <> text "\"")
+             <> text (escapeStr opts (T.unpack tok)) <> text "\"")
 
 highlightCode :: PandocMonad m => WriterOptions -> Attr -> String -> MS m Doc
 highlightCode opts attr str =
   case highlight (writerSyntaxMap opts) (msFormatter opts) attr str of
          Left msg -> do
            unless (null msg) $ report $ CouldNotHighlight msg
-           return $ text (escapeCode (writerPreferAscii opts) str)
+           return $ text (escapeStr opts str)
          Right h -> do
            modify (\st -> st{ stHighlighting = True })
            return h
diff --git a/test/command/4550.md b/test/command/4550.md
index bf3afce5b..45ed21a00 100644
--- a/test/command/4550.md
+++ b/test/command/4550.md
@@ -3,5 +3,5 @@
 A ‘simple’ example
 ^D
 .LP
-A ‘simple’ example
+A \[oq]simple\[cq] example
 ```
diff --git a/test/tables.ms b/test/tables.ms
index 6d9138c64..90662aaad 100644
--- a/test/tables.ms
+++ b/test/tables.ms
@@ -135,7 +135,7 @@ T}
 .LP
 Multiline table with caption:
 .PP
-Here’s the caption. It may span multiple lines.
+Here\[cq]s the caption. It may span multiple lines.
 .TS
 delim(@@) tab(	);
 cw(10.5n) lw(9.6n) rw(11.4n) lw(24.5n).
@@ -165,7 +165,7 @@ row
 T}	T{
 5.0
 T}	T{
-Here’s another one.
+Here\[cq]s another one.
 Note the blank line between rows.
 T}
 .TE
@@ -201,7 +201,7 @@ row
 T}	T{
 5.0
 T}	T{
-Here’s another one.
+Here\[cq]s another one.
 Note the blank line between rows.
 T}
 .TE
@@ -261,7 +261,7 @@ row
 T}	T{
 5.0
 T}	T{
-Here’s another one.
+Here\[cq]s another one.
 Note the blank line between rows.
 T}
 .TE
diff --git a/test/writer.man b/test/writer.man
index 4a05b74fd..4dab58c21 100644
--- a/test/writer.man
+++ b/test/writer.man
@@ -308,7 +308,7 @@ Nested.
 .PP
 Should not be a list item:
 .PP
-M.A.\~2007
+M.A.\ 2007
 .PP
 B.
 Williams
@@ -492,9 +492,9 @@ This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
 .PP
 [STRIKEOUT:This is \f[I]strikeout\f[R].]
 .PP
-Superscripts: a^bc^d a^\f[I]hello\f[R]^ a^hello\~there^.
+Superscripts: a^bc^d a^\f[I]hello\f[R]^ a^hello\ there^.
 .PP
-Subscripts: H~2~O, H~23~O, H~many\~of\~them~O.
+Subscripts: H~2~O, H~23~O, H~many\ of\ them~O.
 .PP
 These should not be superscripts or subscripts, because of the unescaped
 spaces: a\[ha]b c\[ha]d, a\[ti]b c\[ti]d.
diff --git a/test/writer.ms b/test/writer.ms
index 40ecddb27..0cb60a3bc 100644
--- a/test/writer.ms
+++ b/test/writer.ms
@@ -75,7 +75,7 @@ July 17, 2006
 .1C
 .LP
 This is a set of tests for pandoc.
-Most of them are adapted from John Gruber’s markdown test suite.
+Most of them are adapted from John Gruber\[cq]s markdown test suite.
 .HLINE
 .SH 1
 Headers
@@ -126,7 +126,7 @@ Paragraphs
 .pdfhref O 1 "Paragraphs"
 .pdfhref M "paragraphs"
 .LP
-Here’s a regular paragraph.
+Here\[cq]s a regular paragraph.
 .PP
 In Markdown 1.0.0 and earlier.
 Version 8.
@@ -134,7 +134,7 @@ This line turns into a list item.
 Because a hard\-wrapped line in the middle of a paragraph looked like a list
 item.
 .PP
-Here’s one with a bullet.
+Here\[cq]s one with a bullet.
 * criminey.
 .PP
 There should be a hard line break
@@ -314,7 +314,7 @@ Item 1, graf one.
 .PP
 Item 1.
 graf two.
-The quick brown fox jumped over the lazy dog’s back.
+The quick brown fox jumped over the lazy dog\[cq]s back.
 .RE
 .IP " 2." 4
 Item 2.
@@ -335,7 +335,7 @@ Tab
 .RE
 .RE
 .LP
-Here’s another:
+Here\[cq]s another:
 .IP " 1." 4
 First
 .IP " 2." 4
@@ -431,7 +431,7 @@ Nested.
 .LP
 Should not be a list item:
 .PP
-M.A.\~2007
+M.A.\ 2007
 .PP
 B.
 Williams
@@ -570,7 +570,7 @@ Interpreted markdown in a table:
 This is \f[I]emphasized\f[R]
 And this is \f[B]strong\f[R]
 .PP
-Here’s a simple block:
+Here\[cq]s a simple block:
 .LP
 foo
 .LP
@@ -617,7 +617,7 @@ Code:
 \f[]
 .fi
 .LP
-Hr’s:
+Hr\[cq]s:
 .HLINE
 .SH 1
 Inline Markup
@@ -646,9 +646,9 @@ This is code: \f[C]>\f[R], \f[C]$\f[R], \f[C]\[rs]\f[R], \f[C]\[rs]$\f[R],
 .PP
 \m[strikecolor]This is \f[I]strikeout\f[R].\m[]
 .PP
-Superscripts: a\*{bc\*}d a\*{\f[I]hello\f[R]\*} a\*{hello\~there\*}.
+Superscripts: a\*{bc\*}d a\*{\f[I]hello\f[R]\*} a\*{hello\ there\*}.
 .PP
-Subscripts: H\*<2\*>O, H\*<23\*>O, H\*<many\~of\~them\*>O.
+Subscripts: H\*<2\*>O, H\*<23\*>O, H\*<many\ of\ them\*>O.
 .PP
 These should not be superscripts or subscripts, because of the unescaped
 spaces: a\[ha]b c\[ha]d, a\[ti]b c\[ti]d.
@@ -666,16 +666,16 @@ Smart quotes, ellipses, dashes
 `Oak,' `elm,' and `beech' are names of trees.
 So is `pine.'
 .PP
-`He said, \[lq]I want to go.\[rq]' Were you alive in the 70’s?
+`He said, \[lq]I want to go.\[rq]' Were you alive in the 70\[cq]s?
 .PP
 Here is some quoted `\f[C]code\f[R]' and a \[lq]\c
 .pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
  -- "quoted link"
 \&\[rq].
 .PP
-Some dashes: one—two — three—four — five.
+Some dashes: one\[em]two \[em] three\[em]four \[em] five.
 .PP
-Dashes between numbers: 5–7, 255–66, 1987–1999.
+Dashes between numbers: 5\[en]7, 255\[en]66, 1987\[en]1999.
 .PP
 Ellipses\&...and\&...and\&....
 .HLINE
@@ -695,14 +695,14 @@ LaTeX
 .IP \[bu] 3
 @p@\-Tree
 .IP \[bu] 3
-Here’s some display math:
+Here\[cq]s some display math:
 .EQ
 d over {d x} f ( x ) = lim sub {h -> 0} {f ( x + h ) \[u2212] f ( x )} over h
 .EN
 .IP \[bu] 3
-Here’s one that has a line break in it: @alpha + omega times x sup 2@.
+Here\[cq]s one that has a line break in it: @alpha + omega times x sup 2@.
 .LP
-These shouldn’t be math:
+These shouldn\[cq]t be math:
 .IP \[bu] 3
 To get the famous equation, write \f[C]$e = mc\[ha]2$\f[R].
 .IP \[bu] 3
@@ -714,7 +714,7 @@ Shoes ($20) and socks ($5).
 .IP \[bu] 3
 Escaped \f[C]$\f[R]: $73 \f[I]this should be emphasized\f[R] 23$.
 .LP
-Here’s a LaTeX table:
+Here\[cq]s a LaTeX table:
 .HLINE
 .SH 1
 Special Characters
@@ -885,22 +885,22 @@ With ampersands
 .pdfhref O 2 "With ampersands"
 .pdfhref M "with-ampersands"
 .LP
-Here’s a \c
+Here\[cq]s a \c
 .pdfhref W -D "http://example.com/?foo=1&bar=2" -A "\c" \
  -- "link with an ampersand in the URL"
 \&.
 .PP
-Here’s a link with an amersand in the link text: \c
+Here\[cq]s a link with an amersand in the link text: \c
 .pdfhref W -D "http://att.com/" -A "\c" \
  -- "AT&T"
 \&.
 .PP
-Here’s an \c
+Here\[cq]s an \c
 .pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
  -- "inline link"
 \&.
 .PP
-Here’s an \c
+Here\[cq]s an \c
 .pdfhref W -D "/script?foo=1&bar=2" -A "\c" \
  -- "inline link in pointy braces"
 \&.
@@ -925,7 +925,7 @@ It should.
 .LP
 An e\-mail address: \c
 .pdfhref W -D "mailto:nobody%40nowhere.net" -A "\c" \
- -- "nobody\[at]nowhere.net"
+ -- "nobody@nowhere.net"
 \&
 .RS
 .LP
@@ -967,7 +967,7 @@ It need not be placed at the end of the document.
 .FE
 and another.\**
 .FS
-Here’s the long note.
+Here\[cq]s the long note.
 This one contains multiple blocks.
 .PP
 Subsequent blocks are indented to show that they belong to the footnote (as