CommonMark writer: add plain text fallbacks. (#4531)

Previously, the writer would unconditionally emit HTMLish output for
subscripts, superscripts, strikeouts (if the strikeout extension is
disabled) and small caps, even with raw_html disabled.

Now there are plain-text (and, where possible, fancy Unicode)
fallbacks for all of these corresponding (mostly) to the Markdown
fallbacks, and the HTMLish output is only used when raw_html is
enabled.

This commit adds exported functions `toSuperscript` and
`toSubscript` to `Text.Pandoc.Writers.Shared`.  [API change]

Closes #4528.
This commit is contained in:
quasicomputational 2018-10-06 05:33:14 +01:00 committed by John MacFarlane
parent a26b3a2d6a
commit 6207bdeb68
5 changed files with 237 additions and 41 deletions

View file

@ -3381,8 +3381,14 @@ Markdown allows it, but it has been made an extension so that it can
be disabled if desired.)
The raw HTML is passed through unchanged in HTML, S5, Slidy, Slideous,
DZSlides, EPUB, Markdown, Emacs Org mode, and Textile output, and suppressed
in other formats.
DZSlides, EPUB, Markdown, CommonMark, Emacs Org mode, and Textile
output, and suppressed in other formats.
In the CommonMark format, if `raw_html` is enabled, superscripts,
subscripts, strikeouts and small capitals will be represented as HTML.
Otherwise, plain-text fallbacks will be used. Note that even if
`raw_html` is disabled, tables will be rendered with HTML syntax if
they cannot use pipe syntax.
#### Extension: `markdown_in_html_blocks` ####

View file

@ -45,7 +45,7 @@ import Network.HTTP (urlEncode)
import Text.Pandoc.Class (PandocMonad)
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Shared (isTightList, linesToPara, substitute)
import Text.Pandoc.Shared (isTightList, linesToPara, substitute, capitalize)
import Text.Pandoc.Templates (renderTemplate')
import Text.Pandoc.Walk (query, walk, walkM)
import Text.Pandoc.Writers.HTML (writeHtml5String, tagWithAttributes)
@ -253,18 +253,34 @@ inlineToNodes opts (Strong xs) = (node STRONG (inlinesToNodes opts xs) :)
inlineToNodes opts (Strikeout xs) =
if isEnabled Ext_strikeout opts
then (node (CUSTOM_INLINE "~~" "~~") (inlinesToNodes opts xs) :)
else ((node (HTML_INLINE (T.pack "<s>")) [] : inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</s>")) []]) ++ )
else if isEnabled Ext_raw_html opts
then ((node (HTML_INLINE (T.pack "<s>")) [] : inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</s>")) []]) ++ )
else (inlinesToNodes opts xs ++)
inlineToNodes opts (Superscript xs) =
((node (HTML_INLINE (T.pack "<sup>")) [] : inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</sup>")) []]) ++ )
if isEnabled Ext_raw_html opts
then ((node (HTML_INLINE (T.pack "<sup>")) [] : inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</sup>")) []]) ++ )
else case traverse toSuperscriptInline xs of
Nothing ->
((node (TEXT (T.pack "^(")) [] : inlinesToNodes opts xs ++
[node (TEXT (T.pack ")")) []]) ++ )
Just xs' -> (inlinesToNodes opts xs' ++)
inlineToNodes opts (Subscript xs) =
((node (HTML_INLINE (T.pack "<sub>")) [] : inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</sub>")) []]) ++ )
if isEnabled Ext_raw_html opts
then ((node (HTML_INLINE (T.pack "<sub>")) [] : inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</sub>")) []]) ++ )
else case traverse toSubscriptInline xs of
Nothing ->
((node (TEXT (T.pack "_(")) [] : inlinesToNodes opts xs ++
[node (TEXT (T.pack ")")) []]) ++ )
Just xs' -> (inlinesToNodes opts xs' ++)
inlineToNodes opts (SmallCaps xs) =
((node (HTML_INLINE (T.pack "<span class=\"smallcaps\">")) []
: inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</span>")) []]) ++ )
if isEnabled Ext_raw_html opts
then ((node (HTML_INLINE (T.pack "<span class=\"smallcaps\">")) []
: inlinesToNodes opts xs ++
[node (HTML_INLINE (T.pack "</span>")) []]) ++ )
else (inlinesToNodes opts (capitalize xs) ++)
inlineToNodes opts (Link _ ils (url,tit)) =
(node (LINK (T.pack url) (T.pack tit)) (inlinesToNodes opts ils) :)
-- title beginning with fig: indicates implicit figure
@ -319,3 +335,19 @@ inlineToNodes opts (Span attr ils) =
inlineToNodes opts (Cite _ ils) = (inlinesToNodes opts ils ++)
inlineToNodes _ (Note _) = id -- should not occur
-- we remove Note elements in preprocessing
toSubscriptInline :: Inline -> Maybe Inline
toSubscriptInline Space = Just Space
toSubscriptInline (Span attr ils) = Span attr <$> traverse toSubscriptInline ils
toSubscriptInline (Str s) = Str <$> traverse toSubscript s
toSubscriptInline LineBreak = Just LineBreak
toSubscriptInline SoftBreak = Just SoftBreak
toSubscriptInline _ = Nothing
toSuperscriptInline :: Inline -> Maybe Inline
toSuperscriptInline Space = Just Space
toSuperscriptInline (Span attr ils) = Span attr <$> traverse toSuperscriptInline ils
toSuperscriptInline (Str s) = Str <$> traverse toSuperscript s
toSuperscriptInline LineBreak = Just LineBreak
toSuperscriptInline SoftBreak = Just SoftBreak
toSuperscriptInline _ = Nothing

View file

@ -38,7 +38,7 @@ module Text.Pandoc.Writers.Markdown (writeMarkdown, writePlain) where
import Prelude
import Control.Monad.Reader
import Control.Monad.State.Strict
import Data.Char (chr, isPunctuation, isSpace, ord, isAlphaNum)
import Data.Char (isPunctuation, isSpace, isAlphaNum)
import Data.Default
import qualified Data.HashMap.Strict as H
import Data.List (find, group, intersperse, sortBy, stripPrefix, transpose)
@ -1249,33 +1249,6 @@ makeMathPlainer = walk go
go (Emph xs) = Span nullAttr xs
go x = x
toSuperscript :: Char -> Maybe Char
toSuperscript '1' = Just '\x00B9'
toSuperscript '2' = Just '\x00B2'
toSuperscript '3' = Just '\x00B3'
toSuperscript '+' = Just '\x207A'
toSuperscript '-' = Just '\x207B'
toSuperscript '=' = Just '\x207C'
toSuperscript '(' = Just '\x207D'
toSuperscript ')' = Just '\x207E'
toSuperscript c
| c >= '0' && c <= '9' =
Just $ chr (0x2070 + (ord c - 48))
| isSpace c = Just c
| otherwise = Nothing
toSubscript :: Char -> Maybe Char
toSubscript '+' = Just '\x208A'
toSubscript '-' = Just '\x208B'
toSubscript '=' = Just '\x208C'
toSubscript '(' = Just '\x208D'
toSubscript ')' = Just '\x208E'
toSubscript c
| c >= '0' && c <= '9' =
Just $ chr (0x2080 + (ord c - 48))
| isSpace c = Just c
| otherwise = Nothing
lineBreakToSpace :: Inline -> Inline
lineBreakToSpace LineBreak = Space
lineBreakToSpace SoftBreak = Space

View file

@ -48,12 +48,15 @@ module Text.Pandoc.Writers.Shared (
, lookupMetaString
, stripLeadingTrailingSpace
, groffEscape
, toSubscript
, toSuperscript
)
where
import Prelude
import Control.Monad (zipWithM)
import Data.Aeson (FromJSON (..), Result (..), ToJSON (..), Value (Object),
encode, fromJSON)
import Data.Char (chr, ord, isAscii, isSpace)
import qualified Data.HashMap.Strict as H
import Data.List (groupBy, intersperse, transpose)
import qualified Data.Map as M
@ -68,7 +71,6 @@ import Text.Pandoc.Shared (stringify)
import Text.Pandoc.UTF8 (toStringLazy)
import Text.Pandoc.XML (escapeStringForXML)
import Text.Printf (printf)
import Data.Char (isAscii, ord)
-- | Create JSON value for template from a 'Meta' and an association list
-- of variables, specified at the command line or in the writer.
@ -392,3 +394,30 @@ groffEscape = T.concatMap toUchar
| isAscii c = T.singleton c
| otherwise = T.pack $ printf "\\[u%04X]" (ord c)
toSuperscript :: Char -> Maybe Char
toSuperscript '1' = Just '\x00B9'
toSuperscript '2' = Just '\x00B2'
toSuperscript '3' = Just '\x00B3'
toSuperscript '+' = Just '\x207A'
toSuperscript '-' = Just '\x207B'
toSuperscript '=' = Just '\x207C'
toSuperscript '(' = Just '\x207D'
toSuperscript ')' = Just '\x207E'
toSuperscript c
| c >= '0' && c <= '9' =
Just $ chr (0x2070 + (ord c - 48))
| isSpace c = Just c
| otherwise = Nothing
toSubscript :: Char -> Maybe Char
toSubscript '+' = Just '\x208A'
toSubscript '-' = Just '\x208B'
toSubscript '=' = Just '\x208C'
toSubscript '(' = Just '\x208D'
toSubscript ')' = Just '\x208E'
toSubscript c
| c >= '0' && c <= '9' =
Just $ chr (0x2080 + (ord c - 48))
| isSpace c = Just c
| otherwise = Nothing

156
test/command/4528.md Normal file
View file

@ -0,0 +1,156 @@
# Rendering small caps, superscripts and subscripts with and without `raw_html`
## Small caps
```
% pandoc --wrap=none -f latex -t commonmark-raw_html
This has \textsc{small caps} in it.
^D
This has SMALL CAPS in it.
```
```
% pandoc --wrap=none -f latex -t commonmark+raw_html
This has \textsc{small caps} in it.
^D
This has <span class="smallcaps">small caps</span> in it.
```
```
```
% pandoc --wrap=none -f latex -t markdown_strict+raw_html
This has \textsc{small caps} in it.
^D
This has <span class="smallcaps">small caps</span> in it.
```
## Strikeout
```
% pandoc --wrap=none -f html -t commonmark-raw_html-strikeout
This has <s>strikeout</s> in it.
^D
This has strikeout in it.
```
% pandoc --wrap=none -f html -t commonmark+raw_html-strikeout
This has <s>strikeout</s> in it.
^D
This has <s>strikeout</s> in it.
```
```
% pandoc --wrap=none -f html -t commonmark-raw_html+strikeout
This has <s>strikeout</s> in it.
^D
This has ~~strikeout~~ in it.
```
```
% pandoc --wrap=none -f html -t commonmark+raw_html+strikeout
This has <s>strikeout</s> in it.
^D
This has ~~strikeout~~ in it.
```
```
% pandoc --wrap=none -f html -t markdown_strict-raw_html-strikeout
This has <s>strikeout</s> in it.
^D
This has strikeout in it.
```
```
% pandoc --wrap=none -f html -t markdown_strict+raw_html-strikeout
This has <s>strikeout</s> in it.
^D
This has <s>strikeout</s> in it.
```
```
% pandoc --wrap=none -f html -t markdown_strict-raw_html+strikeout
This has <s>strikeout</s> in it.
^D
This has ~~strikeout~~ in it.
```
```
% pandoc --wrap=none -f html -t markdown_strict+raw_html+strikeout
This has <s>strikeout</s> in it.
^D
This has ~~strikeout~~ in it.
```
## Superscript
```
% pandoc --wrap=none -f html -t commonmark-raw_html
This has <sup>superscript</sup> in it and <sup>2 3</sup> again. With emphasis: <sup><em>2</em> 3</sup>. With letters: <sup>foo</sup>. With a span: <sup><span class=foo>2</span></sup>.
^D
This has ^(superscript) in it and ² ³ again. With emphasis: ^(*2* 3). With letters: ^(foo). With a span: ².
```
```
% pandoc --wrap=none -f html -t commonmark+raw_html
This has <sup>superscript</sup> in it and <sup>2</sup> again.
^D
This has <sup>superscript</sup> in it and <sup>2</sup> again.
```
```
% pandoc --wrap=none -f html -t markdown_strict-raw_html-superscript
This has <sup>superscript</sup> in it and <sup>2</sup> again.
^D
This has ^(superscript) in it and ² again.
```
```
% pandoc --wrap=none -f html -t markdown_strict+raw_html-superscript
This has <sup>superscript</sup> in it and <sup>2</sup> again.
^D
This has <sup>superscript</sup> in it and <sup>2</sup> again.
```
```
% pandoc --wrap=none -f html -t markdown_strict+raw_html+superscript
This has <sup>superscript</sup> in it and <sup>2</sup> again.
^D
This has ^superscript^ in it and ^2^ again.
```
## Subscript
```
% pandoc --wrap=none -f html -t commonmark-raw_html
This has <sub>subscript</sub> in it and <sub>2 3</sub> again. With emphasis: <sub><em>2</em> 3</sub>. With letters: <sub>foo</sub>. With a span: <sub><span class=foo>2</span></sub>.
^D
This has \_(subscript) in it and ₂ ₃ again. With emphasis: \_(*2* 3). With letters: \_(foo). With a span: ₂.
```
```
% pandoc --wrap=none -f html -t commonmark+raw_html
This has <sub>subscript</sub> in it and <sub>2</sub> again.
^D
This has <sub>subscript</sub> in it and <sub>2</sub> again.
```
```
% pandoc --wrap=none -f html -t markdown_strict-raw_html-subscript
This has <sub>subscript</sub> in it and <sub>2</sub> again.
^D
This has _(subscript) in it and ₂ again.
```
```
% pandoc --wrap=none -f html -t markdown_strict+raw_html-subscript
This has <sub>subscript</sub> in it and <sub>2</sub> again.
^D
This has <sub>subscript</sub> in it and <sub>2</sub> again.
```
```
% pandoc --wrap=none -f html -t markdown_strict+raw_html+subscript
This has <sub>subscript</sub> in it and <sub>2</sub> again.
^D
This has ~subscript~ in it and ~2~ again.
```