Multimarkdown sub- and superscripts (#5512) (#7188)

Added an extension `short_subsuperscripts` which modifies the behavior of `subscript` and `superscript`, allowing subscripts or superscripts containing only alphanumerics to be written without a closing delimiter, ending at the first non-alphanumeric character such as a space (e.g. `x^2 = 4` or `H~2 is combustible`). This improves support for multimarkdown. Closes #5512. Add `Ext_short_subsuperscripts` constructor to `Extension` [API change]. This is enabled by default for `markdown_mmd`.
2021-08-16 06:57:57 +02:00 · 2021-08-16 06:57:57 +02:00 · e37cf4484d
commit e37cf4484d
parent 72447a563c
4 changed files with 80 additions and 15 deletions
--- a/MANUAL.txt
+++ b/MANUAL.txt
@ -5304,6 +5304,18 @@ For elements that accept attributes, a `data-pos` attribute
 is added; other elements are placed in a surrounding
 Div or Span element with a `data-pos` attribute.

+#### Extension: `short_subsuperscripts` ####
+
+Parse multimarkdown style subscripts and superscripts, which start with
+a '~' or '^' character, respectively, and include the alphanumeric sequence
+that follows. For example:
+
+    x^2 = 4
+
+or
+
+    Oxygen is O~2.
+
 ## Markdown variants

 In addition to pandoc's extended Markdown, the following Markdown
--- a/src/Text/Pandoc/Extensions.hs
+++ b/src/Text/Pandoc/Extensions.hs
@ -124,6 +124,7 @@ data Extension =
    | Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid]
    | Ext_mmd_link_attributes     -- ^ MMD style reference link attributes
    | Ext_mmd_title_block     -- ^ Multimarkdown metadata block
+    | Ext_short_subsuperscripts -- ^ sub-&superscripts w/o closing char (v~i)
    | Ext_multiline_tables    -- ^ Pandoc-style multiline tables
    | Ext_native_divs             -- ^ Use Div blocks for contents of <div> tags
    | Ext_native_spans            -- ^ Use Span inlines for contents of <span>
@ -286,14 +287,9 @@ multimarkdownExtensions = extensionsFromList
  , Ext_auto_identifiers
  , Ext_mmd_header_identifiers
  , Ext_implicit_figures
-  -- Note: MMD's syntax for superscripts and subscripts
-  -- is a bit more permissive than pandoc's, allowing
-  -- e^2 and a~1 instead of e^2^ and a~1~, so even with
-  -- these options we don't have full support for MMD
-  -- superscripts and subscripts, but there's no reason
-  -- not to include these:
-  , Ext_superscript
+  , Ext_short_subsuperscripts
  , Ext_subscript
+  , Ext_superscript
  , Ext_backtick_code_blocks
  , Ext_spaced_reference_links
  -- So far only in dev version of mmd:
@ -464,6 +460,7 @@ getAllExtensions f = universalExtensions <> getAll f
       , Ext_gutenberg
       , Ext_smart
       , Ext_literate_haskell
+       , Ext_short_subsuperscripts
       , Ext_rebase_relative_paths
       ]
  getAll "markdown_strict"   = allMarkdownExtensions
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@ -1692,21 +1692,29 @@ strikeout = fmap B.strikeout <$>

 superscript :: PandocMonad m => MarkdownParser m (F Inlines)
 superscript = do
-  guardEnabled Ext_superscript
   fmap B.superscript <$> try (do
     char '^'
-    mconcat <$> many1Till (do notFollowedBy spaceChar
-                              notFollowedBy newline
-                              inline) (char '^'))
+    mconcat <$> (try regularSuperscript <|> mmdShortSuperscript))  -- enclosing 'try' already backtracks if the short form fails
+      where regularSuperscript = many1Till (do guardEnabled Ext_superscript  -- delimited ^...^ form, only under Ext_superscript
+                                               notFollowedBy spaceChar
+                                               notFollowedBy newline
+                                               inline) (char '^')
+            mmdShortSuperscript = do guardEnabled Ext_short_subsuperscripts  -- '^' plus a run of alphanumerics, no closing '^'
+                                     result <- take1WhileP isAlphaNum
+                                     return $ return $ return $ B.str result  -- wrap for the parser, the list fed to mconcat, and F

 subscript :: PandocMonad m => MarkdownParser m (F Inlines)
 subscript = do
-  guardEnabled Ext_subscript
   fmap B.subscript <$> try (do
     char '~'
-    mconcat <$> many1Till (do notFollowedBy spaceChar
-                              notFollowedBy newline
-                              inline) (char '~'))
+    mconcat <$> (try regularSubscript <|> mmdShortSubscript))  -- try the delimited form first, then fall back to the short form
+      where regularSubscript = many1Till (do guardEnabled Ext_subscript  -- delimited ~...~ form, only under Ext_subscript
+                                             notFollowedBy spaceChar
+                                             notFollowedBy newline
+                                             inline) (char '~')
+            mmdShortSubscript = do guardEnabled Ext_short_subsuperscripts  -- '~' plus a run of alphanumerics, no closing '~'
+                                   result <- take1WhileP isAlphaNum
+                                   return $ return $ return $ B.str result  -- wrap for the parser, the list fed to mconcat, and F

 whitespace :: PandocMonad m => MarkdownParser m (F Inlines)
 whitespace = spaceChar >> return <$> (lb <|> regsp) <?> "whitespace"
--- a/test/Tests/Readers/Markdown.hs
+++ b/test/Tests/Readers/Markdown.hs
@ -36,6 +36,9 @@ markdownGH :: Text -> Pandoc
 markdownGH = purely $ readMarkdown def {
                readerExtensions = githubMarkdownExtensions }

+markdownMMD :: Text -> Pandoc  -- Markdown reader whose extension set is multimarkdownExtensions
+markdownMMD = purely $ readMarkdown def {
+                 readerExtensions = multimarkdownExtensions }
 infix 4 =:
 (=:) :: ToString c
     => String -> (Text, c) -> TestTree
@ -360,6 +363,51 @@ tests = [ testGroup "inline code"
            ("**this should \"be bold**"
            =?> para (strong "this should \8220be bold"))
          ]
+        , testGroup "sub- and superscripts"  -- every case below is read with markdownMMD
+          [
+            test markdownMMD "normal subscript"
+            ("H~2~"
+            =?> para ("H" <> subscript "2"))
+          , test markdownMMD "normal superscript"
+            ("x^3^"
+            =?> para ("x" <> superscript "3"))
+          , test markdownMMD "short subscript delimited by space"
+            ("O~2 is dangerous"
+            =?> para ("O" <> subscript "2" <> space <> "is dangerous"))
+          , test markdownMMD "short subscript delimited by newline"
+            ("O~2\n"
+            =?> para ("O" <> subscript "2"))
+          , test markdownMMD "short subscript delimited by EOF"
+            ("O~2"
+            =?> para ("O" <> subscript "2"))
+          , test markdownMMD "short subscript delimited by punctuation"
+            ("O~2."
+            =?> para ("O" <> subscript "2" <> "."))
+          , test markdownMMD "short subscript delimited by emph"
+            ("O~2*combustible!*"
+            =?> para ("O" <> subscript "2" <> emph "combustible!"))
+          , test markdownMMD "no nesting in short subscripts"
+            ("y~*2*"
+            =?> para ("y~" <> emph "2"))
+          , test markdownMMD "short superscript delimited by space"
+            ("x^2 = y"
+            =?> para ("x" <> superscript "2" <> space <> "= y"))
+          , test markdownMMD "short superscript delimited by newline"
+            ("x^2\n"
+            =?> para ("x" <> superscript "2"))
+          , test markdownMMD "short superscript delimited by EOF"
+            ("x^2"
+            =?> para ("x" <> superscript "2"))
+          , test markdownMMD "short superscript delimited by punctuation"
+            ("x^2."
+            =?> para ("x" <> superscript "2" <> "."))
+          , test markdownMMD "short superscript delimited by emph"
+            ("x^2*combustible!*"
+            =?> para ("x" <> superscript "2" <> emph "combustible!"))
+          , test markdownMMD "no nesting in short superscripts"
+            ("y^*2*"
+            =?> para ("y^" <> emph "2"))
+          ]
        , testGroup "footnotes"
          [ "indent followed by newline and flush-left text" =:
            "[^1]\n\n[^1]: my note\n\n     \nnot in note\n"