Muse reader: refactor to avoid reparsing

Lists are parsed in linear instead of exponential time now. Contents of block tags, such as <quote>, is parsed directly, without storing it in a string and parsing with parseFromString. Fixed a bug: headers did not terminate lists.
2018-02-06 03:17:31 +03:00 · 2018-02-06 03:17:31 +03:00 · 8aed3652c2
commit 8aed3652c2
parent 10c8b9f4bb
2 changed files with 233 additions and 77 deletions
--- a/src/Text/Pandoc/Readers/Muse.hs
+++ b/src/Text/Pandoc/Readers/Muse.hs
@ -47,7 +47,7 @@ import Data.List (stripPrefix, intercalate)
 import Data.List.Split (splitOn)
 import qualified Data.Map as M
 import qualified Data.Set as Set
-import Data.Maybe (fromMaybe)
+import Data.Maybe (fromMaybe, isNothing)
 import Data.Text (Text, unpack)
 import System.FilePath (takeExtension)
 import Text.HTML.TagSoup
@ -82,6 +82,7 @@ data MuseState = MuseState { museMeta :: F Meta -- ^ Document metadata
                           , museLogMessages :: [LogMessage]
                           , museNotes :: M.Map String (SourcePos, F Blocks)
                           , museInLink :: Bool
+                           , museInPara :: Bool
                           }

 instance Default MuseState where
@ -96,6 +97,7 @@ defaultMuseState = MuseState { museMeta = return nullMeta
                             , museLogMessages = []
                             , museNotes = M.empty
                             , museInLink = False
+                             , museInPara = False
                             }

 type MuseParser = ParserT String MuseState
@ -149,6 +151,12 @@ htmlElement tag = try $ do
  where
    endtag = void $ htmlTag (~== TagClose tag)

+htmlBlock :: PandocMonad m => String -> MuseParser m (Attr, String)
+htmlBlock tag = try $ do
+  res <- htmlElement tag
+  manyTill spaceChar eol
+  return res
+
 htmlAttrToPandoc :: [Attribute String] -> Attr
 htmlAttrToPandoc attrs = (ident, classes, keyvals)
  where
@ -159,13 +167,13 @@ htmlAttrToPandoc attrs = (ident, classes, keyvals)
 parseHtmlContent :: PandocMonad m
                 => String -> MuseParser m (Attr, F Blocks)
 parseHtmlContent tag = do
-  (attr, content) <- htmlElement tag
-  parsedContent <- parseContent (content ++ "\n")
+  (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag [])
+  manyTill spaceChar eol
+  content <- parseBlocksTill (manyTill spaceChar endtag)
  manyTill spaceChar eol -- closing tag must be followed by optional whitespace and newline
-  return (attr, mconcat parsedContent)
+  return (htmlAttrToPandoc attr, content)
  where
-    parseContent = parseFromString $ manyTill parseBlock endOfContent
-    endOfContent = try $ skipMany blankline >> skipSpaces >> eof
+    endtag = void $ htmlTag (~== TagClose tag)

 commonPrefix :: String -> String -> String
 commonPrefix _ [] = []
@ -248,19 +256,85 @@ directive = do
 parseBlocks :: PandocMonad m
            => MuseParser m (F Blocks)
 parseBlocks =
-  try (mempty <$ eof) <|>
+  try parseEnd <|>
  try blockStart <|>
+  try listStart <|>
  try paraStart
  where
+    parseEnd = mempty <$ eof
    blockStart = do first <- header <|> blockElements <|> amuseNoteBlock <|> emacsNoteBlock
                    rest <- parseBlocks
                    return $ first B.<> rest
+    listStart = do
+      st <- getState
+      setState $ st{ museInPara = False }
+      (first, rest) <- anyListUntil parseBlocks
+      return $ first B.<> rest
    paraStart = do
      indent <- length <$> many spaceChar
-      (first, rest) <- paraUntil ((mempty <$ eof) <|> blockStart)
+      (first, rest) <- paraUntil parseBlocks
      let first' = if indent >= 2 && indent < 6 then B.blockQuote <$> first else first
      return $ first' B.<> rest

+parseBlocksTill :: PandocMonad m
+                => MuseParser m a
+                -> MuseParser m (F Blocks)
+parseBlocksTill end =
+  try parseEnd <|>
+  try blockStart <|>
+  try listStart <|>
+  try paraStart
+  where
+    parseEnd = mempty <$ end
+    blockStart = do first <- blockElements
+                    rest <- continuation
+                    return $ first B.<> rest
+    listStart = do
+      st <- getState
+      setState $ st{ museInPara = False }
+      (first, e) <- anyListUntil ((Left <$> end) <|> (Right <$> continuation))
+      case e of
+        Left _ -> return first
+        Right rest -> return $ first B.<> rest
+    paraStart = do (first, e) <- paraUntil ((Left <$> end) <|> (Right <$> continuation))
+                   case e of
+                     Left _ -> return $ first
+                     Right rest -> return $ first B.<> rest
+    continuation = parseBlocksTill end
+
+listItemContentsUntil :: PandocMonad m
+                      => Int
+                      -> MuseParser m a
+                      -> MuseParser m (F Blocks, a)
+listItemContentsUntil col end =
+  try blockStart <|>
+  try listStart <|>
+  try paraStart
+  where
+    parseEnd = do e <- end
+                  return (mempty, e)
+    paraStart = do
+      (first, e) <- paraUntil ((Right <$> continuation) <|> (Left <$> end))
+      case e of
+        Left ee -> return (first, ee)
+        Right (rest, ee) -> return (first B.<> rest, ee)
+    blockStart = do first <- blockElements
+                    (rest, e) <- continuation <|> parseEnd
+                    return (first B.<> rest, e)
+    listStart = do
+      st <- getState
+      setState $ st{ museInPara = False }
+      (first, e) <- anyListUntil ((Right <$> continuation) <|> (Left <$> end))
+      case e of
+        Left ee -> return (first, ee)
+        Right (rest, ee) -> return $ (first B.<> rest, ee)
+    continuation = try $ do blank <- optionMaybe blankline
+                            skipMany blankline
+                            indentWith col
+                            st <- getState
+                            setState $ st{ museInPara = museInPara st && isNothing blank }
+                            listItemContentsUntil col end
+
 parseBlock :: PandocMonad m => MuseParser m (F Blocks)
 parseBlock = do
  res <- blockElements <|> para
@ -269,7 +343,10 @@ parseBlock = do
  where para = fst <$> paraUntil (try (eof <|> void (lookAhead blockElements)))

 blockElements :: PandocMonad m => MuseParser m (F Blocks)
-blockElements = choice [ mempty <$ blankline
+blockElements = do
+  st <- getState
+  setState $ st{ museInPara = False }
+  choice [ mempty <$ blankline
         , comment
         , separator
         , example
@ -281,9 +358,6 @@ blockElements = choice [ mempty <$ blankline
         , divTag
         , verseTag
         , lineBlock
-                       , bulletList
-                       , orderedList
-                       , definitionList
         , table
         , commentTag
         ]
@ -343,13 +417,13 @@ dropSpacePrefix lns =
 exampleTag :: PandocMonad m => MuseParser m (F Blocks)
 exampleTag = try $ do
  many spaceChar
-  (attr, contents) <- htmlElement "example"
+  (attr, contents) <- htmlBlock "example"
  return $ return $ B.codeBlockWith attr $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop contents

 literalTag :: PandocMonad m => MuseParser m (F Blocks)
 literalTag = do
  guardDisabled Ext_amuse -- Text::Amuse does not support <literal>
-  (return . rawBlock) <$> htmlElement "literal"
+  (return . rawBlock) <$> htmlBlock "literal"
  where
    -- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML
    format (_, _, kvs)        = fromMaybe "html" $ lookup "style" kvs
@ -385,18 +459,22 @@ verseLines = do

 verseTag :: PandocMonad m => MuseParser m (F Blocks)
 verseTag = do
-  (_, content) <- htmlElement "verse"
+  (_, content) <- htmlBlock "verse"
  parseFromString verseLines (intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content)

 commentTag :: PandocMonad m => MuseParser m (F Blocks)
-commentTag = htmlElement "comment" >> return mempty
+commentTag = htmlBlock "comment" >> return mempty

 -- Indented paragraph is either center, right or quote
 paraUntil :: PandocMonad m
          => MuseParser m a
          -> MuseParser m (F Blocks, a)
 paraUntil end = do
+  state <- getState
+  guard $ not $ museInPara state
+  setState $ state{ museInPara = True }
  (l, e) <- someUntil inline $ try (manyTill spaceChar eol >> end)
+  updateState (\st -> st { museInPara = False })
  return (fmap (B.para) $ trimInlinesF $ mconcat l, e)

 noteMarker :: PandocMonad m => MuseParser m String
@ -413,6 +491,8 @@ amuseNoteBlock = try $ do
  guardEnabled Ext_amuse
  pos <- getPosition
  ref <- noteMarker <* spaceChar
+  st <- getState
+  setState $ st{ museInPara = False }
  content <- listItemContents
  oldnotes <- museNotes <$> getState
  case M.lookup ref oldnotes of
@ -465,35 +545,36 @@ lineBlock = try $ do
 -- lists
 --

-listItemContents' :: PandocMonad m => Int -> MuseParser m (F Blocks)
-listItemContents' col =
-  mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)
+bulletListItemsUntil :: PandocMonad m
+                     => Int
+                     -> MuseParser m a
+                     -> MuseParser m ([F Blocks], a)
+bulletListItemsUntil indent end = try $ do
+  char '-'
+  void spaceChar <|> lookAhead eol
+  st <- getState
+  setState $ st{ museInPara = False }
+  (x, e) <- listItemContentsUntil (indent + 2) ((Right <$> try (optional blankline >> indentWith indent >> bulletListItemsUntil indent end)) <|> (Left <$> end))
+  case e of
+    Left ee -> return ([x], ee)
+    Right (xs, ee) -> return (x:xs, ee)
+
+bulletListUntil :: PandocMonad m
+                => MuseParser m a
+                -> MuseParser m (F Blocks, a)
+bulletListUntil end = try $ do
+  many spaceChar
+  pos <- getPosition
+  let indent = sourceColumn pos - 1
+  guard $ indent /= 0
+  (items, e) <- bulletListItemsUntil indent end
+  return $ (B.bulletList <$> sequence items, e)

 listItemContents :: PandocMonad m => MuseParser m (F Blocks)
 listItemContents = do
  pos <- getPosition
  let col = sourceColumn pos - 1
-  listItemContents' col
-
-listItem :: PandocMonad m => Int -> MuseParser m a -> MuseParser m (F Blocks)
-listItem n p = try $ do
-  optional blankline
-  count n spaceChar
-  p
-  void spaceChar <|> lookAhead eol
-  listItemContents
-
-bulletList :: PandocMonad m => MuseParser m (F Blocks)
-bulletList = try $ do
-  many spaceChar
-  pos <- getPosition
-  let col = sourceColumn pos
-  guard $ col /= 1
-  char '-'
-  void spaceChar <|> lookAhead eol
-  first <- listItemContents
-  rest <- many $ listItem (col - 1) (char '-')
-  return $ B.bulletList <$> sequence (first : rest)
+  mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)

 -- | Parses an ordered list marker and returns list attributes.
 anyMuseOrderedListMarker :: PandocMonad m => MuseParser m ListAttributes
@ -516,38 +597,74 @@ museOrderedListMarker style = do
  char '.'
  return start

-orderedList :: PandocMonad m => MuseParser m (F Blocks)
-orderedList = try $ do
+orderedListItemsUntil :: PandocMonad m
+                      => Int
+                      -> ListNumberStyle
+                      -> MuseParser m a
+                      -> MuseParser m ([F Blocks], a)
+orderedListItemsUntil indent style end =
+  continuation
+  where
+    continuation = try $ do
+      pos <- getPosition
+      void spaceChar <|> lookAhead eol
+      st <- getState
+      setState $ st{ museInPara = False }
+      (x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optionMaybe blankline >> indentWith indent >> museOrderedListMarker style >> continuation)) <|> (Left <$> end))
+      case e of
+        Left ee -> return ([x], ee)
+        Right (xs, ee) -> return (x:xs, ee)
+
+orderedListUntil :: PandocMonad m
+                 => MuseParser m a
+                 -> MuseParser m (F Blocks, a)
+orderedListUntil end = try $ do
  many spaceChar
  pos <- getPosition
-  let col = sourceColumn pos
-  guard $ col /= 1
+  let indent = sourceColumn pos - 1
+  guard $ indent /= 0
  p@(_, style, _) <- anyMuseOrderedListMarker
  guard $ style `elem` [Decimal, LowerAlpha, UpperAlpha, LowerRoman, UpperRoman]
-  void spaceChar <|> lookAhead eol
-  first <- listItemContents
-  rest <- many $ listItem (col - 1) (museOrderedListMarker style)
-  return $ B.orderedListWith p <$> sequence (first : rest)
+  (items, e) <- orderedListItemsUntil indent style end
+  return $ (B.orderedListWith p <$> sequence items, e)

-definitionListItem :: PandocMonad m => MuseParser m (F (Inlines, [Blocks]))
-definitionListItem = try $ do
+definitionListItemsUntil :: PandocMonad m
+                         => Int
+                         -> MuseParser m a
+                         -> MuseParser m ([F (Inlines, [Blocks])], a)
+definitionListItemsUntil indent end =
+  continuation
+  where continuation = try $ do
          pos <- getPosition
          term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (string "::")
          void spaceChar <|> lookAhead eol
-  contents <- listItemContents' $ sourceColumn pos
-  pure $ do lineContent' <- contents
+          st <- getState
+          setState $ st{ museInPara = False }
+          (x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optional blankline >> count indent spaceChar >> continuation)) <|> (Left <$> end))
+          let xx = do
                term' <- term
-            pure (term', [lineContent'])
+                x' <- x
+                (return (term', [x']))::(F (Inlines, [Blocks]))
+          case e of
+            Left ee -> return $ ([xx], ee)
+            Right (xs, ee) -> return $ (xx : xs, ee)

-definitionList :: PandocMonad m => MuseParser m (F Blocks)
-definitionList = try $ do
+definitionListUntil :: PandocMonad m
+                    => MuseParser m a
+                    -> MuseParser m (F Blocks, a)
+definitionListUntil end = try $ do
  many spaceChar
  pos <- getPosition
  let indent = sourceColumn pos - 1
  guardDisabled Ext_amuse <|> guard (indent /= 0) -- Initial space is required by Amusewiki, but not Emacs Muse
-  first <- definitionListItem
-  rest <- many $ try (optional blankline >> count indent spaceChar >> definitionListItem)
-  return $ B.definitionList <$> sequence (first : rest)
+  (items, e) <- definitionListItemsUntil indent end
+  return (B.definitionList <$> sequence items, e)
+
+anyListUntil :: PandocMonad m
+             => MuseParser m a
+             -> MuseParser m (F Blocks, a)
+anyListUntil end =
+  bulletListUntil end <|> orderedListUntil end <|> definitionListUntil end

 --
 -- tables
--- a/test/Tests/Readers/Muse.hs
+++ b/test/Tests/Readers/Muse.hs
@ -313,6 +313,16 @@ tests =
                  , "</quote>"
                  ]
        =?> blockQuote (para $ text "Hello, world")
+      , "Nested quote tag" =:
+        T.unlines [ "<quote>"
+                  , "foo"
+                  , "<quote>"
+                  , "bar"
+                  , "</quote>"
+                  , "baz"
+                  , "</quote>"
+                  ] =?>
+        blockQuote (para "foo" <> blockQuote (para "bar") <> para "baz")
      , "Verse tag" =:
        T.unlines [ "<verse>"
                  , ""
@ -514,6 +524,12 @@ tests =
                    ] =?>
          header 2 "Foo" <>
          para (spanWith ("bar", [], []) mempty)
+        , "Headers terminate lists" =:
+          T.unlines [ " - foo"
+                    , "* bar"
+                    ] =?>
+          bulletList [ para "foo" ] <>
+          header 1 "bar"
        ]
      , testGroup "Directives"
        [ "Title" =:
@ -846,6 +862,15 @@ tests =
                              , para "c"
                              ]
                    ]
+      , "List continuation afeter nested list" =:
+         T.unlines
+           [ " - - foo"
+           , ""
+           , "   bar"
+           ] =?>
+         bulletList [ bulletList [ para "foo" ] <>
+                      para "bar"
+                    ]
      -- Emacs Muse allows to separate lists with two or more blank lines.
      -- Text::Amuse (Amusewiki engine) always creates a single list as of version 0.82.
      -- pandoc follows Emacs Muse behavior
@ -1087,7 +1112,21 @@ tests =
                             , para "* Bar"
                             ]
                   ]
-      , "List inside a tag" =:
+      , "Bullet list inside a tag" =:
+        T.unlines
+          [ "<quote>"
+          , " - First"
+          , ""
+          , " - Second"
+          , ""
+          , " - Third"
+          , "</quote>"
+          ] =?>
+        blockQuote (bulletList [ para "First"
+                               , para "Second"
+                               , para "Third"
+                               ])
+      , "Ordered list inside a tag" =:
        T.unlines
          [ "<quote>"
          , " 1. First"