Muse reader: refactor to avoid reparsing

Lists are parsed in linear instead of exponential time now.

Contents of block tags, such as <quote>, is parsed directly,
without storing it in a string and parsing with parseFromString.

Fixed a bug: headers did not terminate lists.
This commit is contained in:
Alexander Krotov 2018-02-06 03:17:31 +03:00
parent 10c8b9f4bb
commit 8aed3652c2
2 changed files with 233 additions and 77 deletions

View file

@ -47,7 +47,7 @@ import Data.List (stripPrefix, intercalate)
import Data.List.Split (splitOn)
import qualified Data.Map as M
import qualified Data.Set as Set
import Data.Maybe (fromMaybe)
import Data.Maybe (fromMaybe, isNothing)
import Data.Text (Text, unpack)
import System.FilePath (takeExtension)
import Text.HTML.TagSoup
@ -82,6 +82,7 @@ data MuseState = MuseState { museMeta :: F Meta -- ^ Document metadata
, museLogMessages :: [LogMessage]
, museNotes :: M.Map String (SourcePos, F Blocks)
, museInLink :: Bool
, museInPara :: Bool
}
instance Default MuseState where
@ -96,6 +97,7 @@ defaultMuseState = MuseState { museMeta = return nullMeta
, museLogMessages = []
, museNotes = M.empty
, museInLink = False
, museInPara = False
}
type MuseParser = ParserT String MuseState
@ -149,6 +151,12 @@ htmlElement tag = try $ do
where
endtag = void $ htmlTag (~== TagClose tag)
htmlBlock :: PandocMonad m => String -> MuseParser m (Attr, String)
htmlBlock tag = try $ do
res <- htmlElement tag
manyTill spaceChar eol
return res
htmlAttrToPandoc :: [Attribute String] -> Attr
htmlAttrToPandoc attrs = (ident, classes, keyvals)
where
@ -159,13 +167,13 @@ htmlAttrToPandoc attrs = (ident, classes, keyvals)
parseHtmlContent :: PandocMonad m
=> String -> MuseParser m (Attr, F Blocks)
parseHtmlContent tag = do
(attr, content) <- htmlElement tag
parsedContent <- parseContent (content ++ "\n")
(TagOpen _ attr, _) <- htmlTag (~== TagOpen tag [])
manyTill spaceChar eol
content <- parseBlocksTill (manyTill spaceChar endtag)
manyTill spaceChar eol -- closing tag must be followed by optional whitespace and newline
return (attr, mconcat parsedContent)
return (htmlAttrToPandoc attr, content)
where
parseContent = parseFromString $ manyTill parseBlock endOfContent
endOfContent = try $ skipMany blankline >> skipSpaces >> eof
endtag = void $ htmlTag (~== TagClose tag)
commonPrefix :: String -> String -> String
commonPrefix _ [] = []
@ -248,19 +256,85 @@ directive = do
parseBlocks :: PandocMonad m
=> MuseParser m (F Blocks)
parseBlocks =
try (mempty <$ eof) <|>
try parseEnd <|>
try blockStart <|>
try listStart <|>
try paraStart
where
parseEnd = mempty <$ eof
blockStart = do first <- header <|> blockElements <|> amuseNoteBlock <|> emacsNoteBlock
rest <- parseBlocks
return $ first B.<> rest
listStart = do
st <- getState
setState $ st{ museInPara = False }
(first, rest) <- anyListUntil parseBlocks
return $ first B.<> rest
paraStart = do
indent <- length <$> many spaceChar
(first, rest) <- paraUntil ((mempty <$ eof) <|> blockStart)
(first, rest) <- paraUntil parseBlocks
let first' = if indent >= 2 && indent < 6 then B.blockQuote <$> first else first
return $ first' B.<> rest
parseBlocksTill :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks)
parseBlocksTill end =
try parseEnd <|>
try blockStart <|>
try listStart <|>
try paraStart
where
parseEnd = mempty <$ end
blockStart = do first <- blockElements
rest <- continuation
return $ first B.<> rest
listStart = do
st <- getState
setState $ st{ museInPara = False }
(first, e) <- anyListUntil ((Left <$> end) <|> (Right <$> continuation))
case e of
Left _ -> return first
Right rest -> return $ first B.<> rest
paraStart = do (first, e) <- paraUntil ((Left <$> end) <|> (Right <$> continuation))
case e of
Left _ -> return $ first
Right rest -> return $ first B.<> rest
continuation = parseBlocksTill end
listItemContentsUntil :: PandocMonad m
=> Int
-> MuseParser m a
-> MuseParser m (F Blocks, a)
listItemContentsUntil col end =
try blockStart <|>
try listStart <|>
try paraStart
where
parseEnd = do e <- end
return (mempty, e)
paraStart = do
(first, e) <- paraUntil ((Right <$> continuation) <|> (Left <$> end))
case e of
Left ee -> return (first, ee)
Right (rest, ee) -> return (first B.<> rest, ee)
blockStart = do first <- blockElements
(rest, e) <- continuation <|> parseEnd
return (first B.<> rest, e)
listStart = do
st <- getState
setState $ st{ museInPara = False }
(first, e) <- anyListUntil ((Right <$> continuation) <|> (Left <$> end))
case e of
Left ee -> return (first, ee)
Right (rest, ee) -> return $ (first B.<> rest, ee)
continuation = try $ do blank <- optionMaybe blankline
skipMany blankline
indentWith col
st <- getState
setState $ st{ museInPara = museInPara st && isNothing blank }
listItemContentsUntil col end
parseBlock :: PandocMonad m => MuseParser m (F Blocks)
parseBlock = do
res <- blockElements <|> para
@ -269,7 +343,10 @@ parseBlock = do
where para = fst <$> paraUntil (try (eof <|> void (lookAhead blockElements)))
blockElements :: PandocMonad m => MuseParser m (F Blocks)
blockElements = choice [ mempty <$ blankline
blockElements = do
st <- getState
setState $ st{ museInPara = False }
choice [ mempty <$ blankline
, comment
, separator
, example
@ -281,9 +358,6 @@ blockElements = choice [ mempty <$ blankline
, divTag
, verseTag
, lineBlock
, bulletList
, orderedList
, definitionList
, table
, commentTag
]
@ -343,13 +417,13 @@ dropSpacePrefix lns =
exampleTag :: PandocMonad m => MuseParser m (F Blocks)
exampleTag = try $ do
many spaceChar
(attr, contents) <- htmlElement "example"
(attr, contents) <- htmlBlock "example"
return $ return $ B.codeBlockWith attr $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop contents
literalTag :: PandocMonad m => MuseParser m (F Blocks)
literalTag = do
guardDisabled Ext_amuse -- Text::Amuse does not support <literal>
(return . rawBlock) <$> htmlElement "literal"
(return . rawBlock) <$> htmlBlock "literal"
where
-- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML
format (_, _, kvs) = fromMaybe "html" $ lookup "style" kvs
@ -385,18 +459,22 @@ verseLines = do
verseTag :: PandocMonad m => MuseParser m (F Blocks)
verseTag = do
(_, content) <- htmlElement "verse"
(_, content) <- htmlBlock "verse"
parseFromString verseLines (intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content)
commentTag :: PandocMonad m => MuseParser m (F Blocks)
commentTag = htmlElement "comment" >> return mempty
commentTag = htmlBlock "comment" >> return mempty
-- Indented paragraph is either center, right or quote
paraUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
paraUntil end = do
state <- getState
guard $ not $ museInPara state
setState $ state{ museInPara = True }
(l, e) <- someUntil inline $ try (manyTill spaceChar eol >> end)
updateState (\st -> st { museInPara = False })
return (fmap (B.para) $ trimInlinesF $ mconcat l, e)
noteMarker :: PandocMonad m => MuseParser m String
@ -413,6 +491,8 @@ amuseNoteBlock = try $ do
guardEnabled Ext_amuse
pos <- getPosition
ref <- noteMarker <* spaceChar
st <- getState
setState $ st{ museInPara = False }
content <- listItemContents
oldnotes <- museNotes <$> getState
case M.lookup ref oldnotes of
@ -465,35 +545,36 @@ lineBlock = try $ do
-- lists
--
listItemContents' :: PandocMonad m => Int -> MuseParser m (F Blocks)
listItemContents' col =
mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)
bulletListItemsUntil :: PandocMonad m
=> Int
-> MuseParser m a
-> MuseParser m ([F Blocks], a)
bulletListItemsUntil indent end = try $ do
char '-'
void spaceChar <|> lookAhead eol
st <- getState
setState $ st{ museInPara = False }
(x, e) <- listItemContentsUntil (indent + 2) ((Right <$> try (optional blankline >> indentWith indent >> bulletListItemsUntil indent end)) <|> (Left <$> end))
case e of
Left ee -> return ([x], ee)
Right (xs, ee) -> return (x:xs, ee)
bulletListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
bulletListUntil end = try $ do
many spaceChar
pos <- getPosition
let indent = sourceColumn pos - 1
guard $ indent /= 0
(items, e) <- bulletListItemsUntil indent end
return $ (B.bulletList <$> sequence items, e)
listItemContents :: PandocMonad m => MuseParser m (F Blocks)
listItemContents = do
pos <- getPosition
let col = sourceColumn pos - 1
listItemContents' col
listItem :: PandocMonad m => Int -> MuseParser m a -> MuseParser m (F Blocks)
listItem n p = try $ do
optional blankline
count n spaceChar
p
void spaceChar <|> lookAhead eol
listItemContents
bulletList :: PandocMonad m => MuseParser m (F Blocks)
bulletList = try $ do
many spaceChar
pos <- getPosition
let col = sourceColumn pos
guard $ col /= 1
char '-'
void spaceChar <|> lookAhead eol
first <- listItemContents
rest <- many $ listItem (col - 1) (char '-')
return $ B.bulletList <$> sequence (first : rest)
mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)
-- | Parses an ordered list marker and returns list attributes.
anyMuseOrderedListMarker :: PandocMonad m => MuseParser m ListAttributes
@ -516,38 +597,74 @@ museOrderedListMarker style = do
char '.'
return start
orderedList :: PandocMonad m => MuseParser m (F Blocks)
orderedList = try $ do
orderedListItemsUntil :: PandocMonad m
=> Int
-> ListNumberStyle
-> MuseParser m a
-> MuseParser m ([F Blocks], a)
orderedListItemsUntil indent style end =
continuation
where
continuation = try $ do
pos <- getPosition
void spaceChar <|> lookAhead eol
st <- getState
setState $ st{ museInPara = False }
(x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optionMaybe blankline >> indentWith indent >> museOrderedListMarker style >> continuation)) <|> (Left <$> end))
case e of
Left ee -> return ([x], ee)
Right (xs, ee) -> return (x:xs, ee)
orderedListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
orderedListUntil end = try $ do
many spaceChar
pos <- getPosition
let col = sourceColumn pos
guard $ col /= 1
let indent = sourceColumn pos - 1
guard $ indent /= 0
p@(_, style, _) <- anyMuseOrderedListMarker
guard $ style `elem` [Decimal, LowerAlpha, UpperAlpha, LowerRoman, UpperRoman]
void spaceChar <|> lookAhead eol
first <- listItemContents
rest <- many $ listItem (col - 1) (museOrderedListMarker style)
return $ B.orderedListWith p <$> sequence (first : rest)
(items, e) <- orderedListItemsUntil indent style end
return $ (B.orderedListWith p <$> sequence items, e)
definitionListItem :: PandocMonad m => MuseParser m (F (Inlines, [Blocks]))
definitionListItem = try $ do
definitionListItemsUntil :: PandocMonad m
=> Int
-> MuseParser m a
-> MuseParser m ([F (Inlines, [Blocks])], a)
definitionListItemsUntil indent end =
continuation
where continuation = try $ do
pos <- getPosition
term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (string "::")
void spaceChar <|> lookAhead eol
contents <- listItemContents' $ sourceColumn pos
pure $ do lineContent' <- contents
st <- getState
setState $ st{ museInPara = False }
(x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optional blankline >> count indent spaceChar >> continuation)) <|> (Left <$> end))
let xx = do
term' <- term
pure (term', [lineContent'])
x' <- x
(return (term', [x']))::(F (Inlines, [Blocks]))
case e of
Left ee -> return $ ([xx], ee)
Right (xs, ee) -> return $ (xx : xs, ee)
definitionList :: PandocMonad m => MuseParser m (F Blocks)
definitionList = try $ do
definitionListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
definitionListUntil end = try $ do
many spaceChar
pos <- getPosition
let indent = sourceColumn pos - 1
guardDisabled Ext_amuse <|> guard (indent /= 0) -- Initial space is required by Amusewiki, but not Emacs Muse
first <- definitionListItem
rest <- many $ try (optional blankline >> count indent spaceChar >> definitionListItem)
return $ B.definitionList <$> sequence (first : rest)
(items, e) <- definitionListItemsUntil indent end
return (B.definitionList <$> sequence items, e)
anyListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
anyListUntil end =
bulletListUntil end <|> orderedListUntil end <|> definitionListUntil end
--
-- tables

View file

@ -313,6 +313,16 @@ tests =
, "</quote>"
]
=?> blockQuote (para $ text "Hello, world")
, "Nested quote tag" =:
T.unlines [ "<quote>"
, "foo"
, "<quote>"
, "bar"
, "</quote>"
, "baz"
, "</quote>"
] =?>
blockQuote (para "foo" <> blockQuote (para "bar") <> para "baz")
, "Verse tag" =:
T.unlines [ "<verse>"
, ""
@ -514,6 +524,12 @@ tests =
] =?>
header 2 "Foo" <>
para (spanWith ("bar", [], []) mempty)
, "Headers terminate lists" =:
T.unlines [ " - foo"
, "* bar"
] =?>
bulletList [ para "foo" ] <>
header 1 "bar"
]
, testGroup "Directives"
[ "Title" =:
@ -846,6 +862,15 @@ tests =
, para "c"
]
]
, "List continuation afeter nested list" =:
T.unlines
[ " - - foo"
, ""
, " bar"
] =?>
bulletList [ bulletList [ para "foo" ] <>
para "bar"
]
-- Emacs Muse allows to separate lists with two or more blank lines.
-- Text::Amuse (Amusewiki engine) always creates a single list as of version 0.82.
-- pandoc follows Emacs Muse behavior
@ -1087,7 +1112,21 @@ tests =
, para "* Bar"
]
]
, "List inside a tag" =:
, "Bullet list inside a tag" =:
T.unlines
[ "<quote>"
, " - First"
, ""
, " - Second"
, ""
, " - Third"
, "</quote>"
] =?>
blockQuote (bulletList [ para "First"
, para "Second"
, para "Third"
])
, "Ordered list inside a tag" =:
T.unlines
[ "<quote>"
, " 1. First"