Muse reader: refactor to avoid reparsing

Lists are parsed in linear instead of exponential time now.

Contents of block tags, such as <quote>, is parsed directly,
without storing it in a string and parsing with parseFromString.

Fixed a bug: headers did not terminate lists.
This commit is contained in:
Alexander Krotov 2018-02-06 03:17:31 +03:00
parent 10c8b9f4bb
commit 8aed3652c2
2 changed files with 233 additions and 77 deletions

View file

@ -47,7 +47,7 @@ import Data.List (stripPrefix, intercalate)
import Data.List.Split (splitOn) import Data.List.Split (splitOn)
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Set as Set import qualified Data.Set as Set
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe, isNothing)
import Data.Text (Text, unpack) import Data.Text (Text, unpack)
import System.FilePath (takeExtension) import System.FilePath (takeExtension)
import Text.HTML.TagSoup import Text.HTML.TagSoup
@ -82,6 +82,7 @@ data MuseState = MuseState { museMeta :: F Meta -- ^ Document metadata
, museLogMessages :: [LogMessage] , museLogMessages :: [LogMessage]
, museNotes :: M.Map String (SourcePos, F Blocks) , museNotes :: M.Map String (SourcePos, F Blocks)
, museInLink :: Bool , museInLink :: Bool
, museInPara :: Bool
} }
instance Default MuseState where instance Default MuseState where
@ -96,6 +97,7 @@ defaultMuseState = MuseState { museMeta = return nullMeta
, museLogMessages = [] , museLogMessages = []
, museNotes = M.empty , museNotes = M.empty
, museInLink = False , museInLink = False
, museInPara = False
} }
type MuseParser = ParserT String MuseState type MuseParser = ParserT String MuseState
@ -149,6 +151,12 @@ htmlElement tag = try $ do
where where
endtag = void $ htmlTag (~== TagClose tag) endtag = void $ htmlTag (~== TagClose tag)
htmlBlock :: PandocMonad m => String -> MuseParser m (Attr, String)
htmlBlock tag = try $ do
res <- htmlElement tag
manyTill spaceChar eol
return res
htmlAttrToPandoc :: [Attribute String] -> Attr htmlAttrToPandoc :: [Attribute String] -> Attr
htmlAttrToPandoc attrs = (ident, classes, keyvals) htmlAttrToPandoc attrs = (ident, classes, keyvals)
where where
@ -159,13 +167,13 @@ htmlAttrToPandoc attrs = (ident, classes, keyvals)
parseHtmlContent :: PandocMonad m parseHtmlContent :: PandocMonad m
=> String -> MuseParser m (Attr, F Blocks) => String -> MuseParser m (Attr, F Blocks)
parseHtmlContent tag = do parseHtmlContent tag = do
(attr, content) <- htmlElement tag (TagOpen _ attr, _) <- htmlTag (~== TagOpen tag [])
parsedContent <- parseContent (content ++ "\n") manyTill spaceChar eol
content <- parseBlocksTill (manyTill spaceChar endtag)
manyTill spaceChar eol -- closing tag must be followed by optional whitespace and newline manyTill spaceChar eol -- closing tag must be followed by optional whitespace and newline
return (attr, mconcat parsedContent) return (htmlAttrToPandoc attr, content)
where where
parseContent = parseFromString $ manyTill parseBlock endOfContent endtag = void $ htmlTag (~== TagClose tag)
endOfContent = try $ skipMany blankline >> skipSpaces >> eof
commonPrefix :: String -> String -> String commonPrefix :: String -> String -> String
commonPrefix _ [] = [] commonPrefix _ [] = []
@ -248,19 +256,85 @@ directive = do
parseBlocks :: PandocMonad m parseBlocks :: PandocMonad m
=> MuseParser m (F Blocks) => MuseParser m (F Blocks)
parseBlocks = parseBlocks =
try (mempty <$ eof) <|> try parseEnd <|>
try blockStart <|> try blockStart <|>
try listStart <|>
try paraStart try paraStart
where where
parseEnd = mempty <$ eof
blockStart = do first <- header <|> blockElements <|> amuseNoteBlock <|> emacsNoteBlock blockStart = do first <- header <|> blockElements <|> amuseNoteBlock <|> emacsNoteBlock
rest <- parseBlocks rest <- parseBlocks
return $ first B.<> rest return $ first B.<> rest
listStart = do
st <- getState
setState $ st{ museInPara = False }
(first, rest) <- anyListUntil parseBlocks
return $ first B.<> rest
paraStart = do paraStart = do
indent <- length <$> many spaceChar indent <- length <$> many spaceChar
(first, rest) <- paraUntil ((mempty <$ eof) <|> blockStart) (first, rest) <- paraUntil parseBlocks
let first' = if indent >= 2 && indent < 6 then B.blockQuote <$> first else first let first' = if indent >= 2 && indent < 6 then B.blockQuote <$> first else first
return $ first' B.<> rest return $ first' B.<> rest
parseBlocksTill :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks)
parseBlocksTill end =
try parseEnd <|>
try blockStart <|>
try listStart <|>
try paraStart
where
parseEnd = mempty <$ end
blockStart = do first <- blockElements
rest <- continuation
return $ first B.<> rest
listStart = do
st <- getState
setState $ st{ museInPara = False }
(first, e) <- anyListUntil ((Left <$> end) <|> (Right <$> continuation))
case e of
Left _ -> return first
Right rest -> return $ first B.<> rest
paraStart = do (first, e) <- paraUntil ((Left <$> end) <|> (Right <$> continuation))
case e of
Left _ -> return $ first
Right rest -> return $ first B.<> rest
continuation = parseBlocksTill end
listItemContentsUntil :: PandocMonad m
=> Int
-> MuseParser m a
-> MuseParser m (F Blocks, a)
listItemContentsUntil col end =
try blockStart <|>
try listStart <|>
try paraStart
where
parseEnd = do e <- end
return (mempty, e)
paraStart = do
(first, e) <- paraUntil ((Right <$> continuation) <|> (Left <$> end))
case e of
Left ee -> return (first, ee)
Right (rest, ee) -> return (first B.<> rest, ee)
blockStart = do first <- blockElements
(rest, e) <- continuation <|> parseEnd
return (first B.<> rest, e)
listStart = do
st <- getState
setState $ st{ museInPara = False }
(first, e) <- anyListUntil ((Right <$> continuation) <|> (Left <$> end))
case e of
Left ee -> return (first, ee)
Right (rest, ee) -> return $ (first B.<> rest, ee)
continuation = try $ do blank <- optionMaybe blankline
skipMany blankline
indentWith col
st <- getState
setState $ st{ museInPara = museInPara st && isNothing blank }
listItemContentsUntil col end
parseBlock :: PandocMonad m => MuseParser m (F Blocks) parseBlock :: PandocMonad m => MuseParser m (F Blocks)
parseBlock = do parseBlock = do
res <- blockElements <|> para res <- blockElements <|> para
@ -269,24 +343,24 @@ parseBlock = do
where para = fst <$> paraUntil (try (eof <|> void (lookAhead blockElements))) where para = fst <$> paraUntil (try (eof <|> void (lookAhead blockElements)))
blockElements :: PandocMonad m => MuseParser m (F Blocks) blockElements :: PandocMonad m => MuseParser m (F Blocks)
blockElements = choice [ mempty <$ blankline blockElements = do
, comment st <- getState
, separator setState $ st{ museInPara = False }
, example choice [ mempty <$ blankline
, exampleTag , comment
, literalTag , separator
, centerTag , example
, rightTag , exampleTag
, quoteTag , literalTag
, divTag , centerTag
, verseTag , rightTag
, lineBlock , quoteTag
, bulletList , divTag
, orderedList , verseTag
, definitionList , lineBlock
, table , table
, commentTag , commentTag
] ]
comment :: PandocMonad m => MuseParser m (F Blocks) comment :: PandocMonad m => MuseParser m (F Blocks)
comment = try $ do comment = try $ do
@ -343,13 +417,13 @@ dropSpacePrefix lns =
exampleTag :: PandocMonad m => MuseParser m (F Blocks) exampleTag :: PandocMonad m => MuseParser m (F Blocks)
exampleTag = try $ do exampleTag = try $ do
many spaceChar many spaceChar
(attr, contents) <- htmlElement "example" (attr, contents) <- htmlBlock "example"
return $ return $ B.codeBlockWith attr $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop contents return $ return $ B.codeBlockWith attr $ rchop $ intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop contents
literalTag :: PandocMonad m => MuseParser m (F Blocks) literalTag :: PandocMonad m => MuseParser m (F Blocks)
literalTag = do literalTag = do
guardDisabled Ext_amuse -- Text::Amuse does not support <literal> guardDisabled Ext_amuse -- Text::Amuse does not support <literal>
(return . rawBlock) <$> htmlElement "literal" (return . rawBlock) <$> htmlBlock "literal"
where where
-- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML -- FIXME: Emacs Muse inserts <literal> without style into all output formats, but we assume HTML
format (_, _, kvs) = fromMaybe "html" $ lookup "style" kvs format (_, _, kvs) = fromMaybe "html" $ lookup "style" kvs
@ -385,18 +459,22 @@ verseLines = do
verseTag :: PandocMonad m => MuseParser m (F Blocks) verseTag :: PandocMonad m => MuseParser m (F Blocks)
verseTag = do verseTag = do
(_, content) <- htmlElement "verse" (_, content) <- htmlBlock "verse"
parseFromString verseLines (intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content) parseFromString verseLines (intercalate "\n" $ dropSpacePrefix $ splitOn "\n" $ lchop content)
commentTag :: PandocMonad m => MuseParser m (F Blocks) commentTag :: PandocMonad m => MuseParser m (F Blocks)
commentTag = htmlElement "comment" >> return mempty commentTag = htmlBlock "comment" >> return mempty
-- Indented paragraph is either center, right or quote -- Indented paragraph is either center, right or quote
paraUntil :: PandocMonad m paraUntil :: PandocMonad m
=> MuseParser m a => MuseParser m a
-> MuseParser m (F Blocks, a) -> MuseParser m (F Blocks, a)
paraUntil end = do paraUntil end = do
state <- getState
guard $ not $ museInPara state
setState $ state{ museInPara = True }
(l, e) <- someUntil inline $ try (manyTill spaceChar eol >> end) (l, e) <- someUntil inline $ try (manyTill spaceChar eol >> end)
updateState (\st -> st { museInPara = False })
return (fmap (B.para) $ trimInlinesF $ mconcat l, e) return (fmap (B.para) $ trimInlinesF $ mconcat l, e)
noteMarker :: PandocMonad m => MuseParser m String noteMarker :: PandocMonad m => MuseParser m String
@ -413,6 +491,8 @@ amuseNoteBlock = try $ do
guardEnabled Ext_amuse guardEnabled Ext_amuse
pos <- getPosition pos <- getPosition
ref <- noteMarker <* spaceChar ref <- noteMarker <* spaceChar
st <- getState
setState $ st{ museInPara = False }
content <- listItemContents content <- listItemContents
oldnotes <- museNotes <$> getState oldnotes <- museNotes <$> getState
case M.lookup ref oldnotes of case M.lookup ref oldnotes of
@ -465,35 +545,36 @@ lineBlock = try $ do
-- lists -- lists
-- --
listItemContents' :: PandocMonad m => Int -> MuseParser m (F Blocks) bulletListItemsUntil :: PandocMonad m
listItemContents' col = => Int
mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col) -> MuseParser m a
-> MuseParser m ([F Blocks], a)
bulletListItemsUntil indent end = try $ do
char '-'
void spaceChar <|> lookAhead eol
st <- getState
setState $ st{ museInPara = False }
(x, e) <- listItemContentsUntil (indent + 2) ((Right <$> try (optional blankline >> indentWith indent >> bulletListItemsUntil indent end)) <|> (Left <$> end))
case e of
Left ee -> return ([x], ee)
Right (xs, ee) -> return (x:xs, ee)
bulletListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
bulletListUntil end = try $ do
many spaceChar
pos <- getPosition
let indent = sourceColumn pos - 1
guard $ indent /= 0
(items, e) <- bulletListItemsUntil indent end
return $ (B.bulletList <$> sequence items, e)
listItemContents :: PandocMonad m => MuseParser m (F Blocks) listItemContents :: PandocMonad m => MuseParser m (F Blocks)
listItemContents = do listItemContents = do
pos <- getPosition pos <- getPosition
let col = sourceColumn pos - 1 let col = sourceColumn pos - 1
listItemContents' col mconcat <$> parseBlock `sepBy1` try (skipMany blankline >> indentWith col)
listItem :: PandocMonad m => Int -> MuseParser m a -> MuseParser m (F Blocks)
listItem n p = try $ do
optional blankline
count n spaceChar
p
void spaceChar <|> lookAhead eol
listItemContents
bulletList :: PandocMonad m => MuseParser m (F Blocks)
bulletList = try $ do
many spaceChar
pos <- getPosition
let col = sourceColumn pos
guard $ col /= 1
char '-'
void spaceChar <|> lookAhead eol
first <- listItemContents
rest <- many $ listItem (col - 1) (char '-')
return $ B.bulletList <$> sequence (first : rest)
-- | Parses an ordered list marker and returns list attributes. -- | Parses an ordered list marker and returns list attributes.
anyMuseOrderedListMarker :: PandocMonad m => MuseParser m ListAttributes anyMuseOrderedListMarker :: PandocMonad m => MuseParser m ListAttributes
@ -516,38 +597,74 @@ museOrderedListMarker style = do
char '.' char '.'
return start return start
orderedList :: PandocMonad m => MuseParser m (F Blocks) orderedListItemsUntil :: PandocMonad m
orderedList = try $ do => Int
-> ListNumberStyle
-> MuseParser m a
-> MuseParser m ([F Blocks], a)
orderedListItemsUntil indent style end =
continuation
where
continuation = try $ do
pos <- getPosition
void spaceChar <|> lookAhead eol
st <- getState
setState $ st{ museInPara = False }
(x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optionMaybe blankline >> indentWith indent >> museOrderedListMarker style >> continuation)) <|> (Left <$> end))
case e of
Left ee -> return ([x], ee)
Right (xs, ee) -> return (x:xs, ee)
orderedListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
orderedListUntil end = try $ do
many spaceChar many spaceChar
pos <- getPosition pos <- getPosition
let col = sourceColumn pos let indent = sourceColumn pos - 1
guard $ col /= 1 guard $ indent /= 0
p@(_, style, _) <- anyMuseOrderedListMarker p@(_, style, _) <- anyMuseOrderedListMarker
guard $ style `elem` [Decimal, LowerAlpha, UpperAlpha, LowerRoman, UpperRoman] guard $ style `elem` [Decimal, LowerAlpha, UpperAlpha, LowerRoman, UpperRoman]
void spaceChar <|> lookAhead eol (items, e) <- orderedListItemsUntil indent style end
first <- listItemContents return $ (B.orderedListWith p <$> sequence items, e)
rest <- many $ listItem (col - 1) (museOrderedListMarker style)
return $ B.orderedListWith p <$> sequence (first : rest)
definitionListItem :: PandocMonad m => MuseParser m (F (Inlines, [Blocks])) definitionListItemsUntil :: PandocMonad m
definitionListItem = try $ do => Int
pos <- getPosition -> MuseParser m a
term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (string "::") -> MuseParser m ([F (Inlines, [Blocks])], a)
void spaceChar <|> lookAhead eol definitionListItemsUntil indent end =
contents <- listItemContents' $ sourceColumn pos continuation
pure $ do lineContent' <- contents where continuation = try $ do
term' <- term pos <- getPosition
pure (term', [lineContent']) term <- trimInlinesF . mconcat <$> manyTill (choice inlineList) (string "::")
void spaceChar <|> lookAhead eol
st <- getState
setState $ st{ museInPara = False }
(x, e) <- listItemContentsUntil (sourceColumn pos) ((Right <$> try (optional blankline >> count indent spaceChar >> continuation)) <|> (Left <$> end))
let xx = do
term' <- term
x' <- x
(return (term', [x']))::(F (Inlines, [Blocks]))
case e of
Left ee -> return $ ([xx], ee)
Right (xs, ee) -> return $ (xx : xs, ee)
definitionList :: PandocMonad m => MuseParser m (F Blocks) definitionListUntil :: PandocMonad m
definitionList = try $ do => MuseParser m a
-> MuseParser m (F Blocks, a)
definitionListUntil end = try $ do
many spaceChar many spaceChar
pos <- getPosition pos <- getPosition
let indent = sourceColumn pos - 1 let indent = sourceColumn pos - 1
guardDisabled Ext_amuse <|> guard (indent /= 0) -- Initial space is required by Amusewiki, but not Emacs Muse guardDisabled Ext_amuse <|> guard (indent /= 0) -- Initial space is required by Amusewiki, but not Emacs Muse
first <- definitionListItem (items, e) <- definitionListItemsUntil indent end
rest <- many $ try (optional blankline >> count indent spaceChar >> definitionListItem) return (B.definitionList <$> sequence items, e)
return $ B.definitionList <$> sequence (first : rest)
anyListUntil :: PandocMonad m
=> MuseParser m a
-> MuseParser m (F Blocks, a)
anyListUntil end =
bulletListUntil end <|> orderedListUntil end <|> definitionListUntil end
-- --
-- tables -- tables

View file

@ -313,6 +313,16 @@ tests =
, "</quote>" , "</quote>"
] ]
=?> blockQuote (para $ text "Hello, world") =?> blockQuote (para $ text "Hello, world")
, "Nested quote tag" =:
T.unlines [ "<quote>"
, "foo"
, "<quote>"
, "bar"
, "</quote>"
, "baz"
, "</quote>"
] =?>
blockQuote (para "foo" <> blockQuote (para "bar") <> para "baz")
, "Verse tag" =: , "Verse tag" =:
T.unlines [ "<verse>" T.unlines [ "<verse>"
, "" , ""
@ -514,6 +524,12 @@ tests =
] =?> ] =?>
header 2 "Foo" <> header 2 "Foo" <>
para (spanWith ("bar", [], []) mempty) para (spanWith ("bar", [], []) mempty)
, "Headers terminate lists" =:
T.unlines [ " - foo"
, "* bar"
] =?>
bulletList [ para "foo" ] <>
header 1 "bar"
] ]
, testGroup "Directives" , testGroup "Directives"
[ "Title" =: [ "Title" =:
@ -846,6 +862,15 @@ tests =
, para "c" , para "c"
] ]
] ]
, "List continuation afeter nested list" =:
T.unlines
[ " - - foo"
, ""
, " bar"
] =?>
bulletList [ bulletList [ para "foo" ] <>
para "bar"
]
-- Emacs Muse allows to separate lists with two or more blank lines. -- Emacs Muse allows to separate lists with two or more blank lines.
-- Text::Amuse (Amusewiki engine) always creates a single list as of version 0.82. -- Text::Amuse (Amusewiki engine) always creates a single list as of version 0.82.
-- pandoc follows Emacs Muse behavior -- pandoc follows Emacs Muse behavior
@ -1087,7 +1112,21 @@ tests =
, para "* Bar" , para "* Bar"
] ]
] ]
, "List inside a tag" =: , "Bullet list inside a tag" =:
T.unlines
[ "<quote>"
, " - First"
, ""
, " - Second"
, ""
, " - Third"
, "</quote>"
] =?>
blockQuote (bulletList [ para "First"
, para "Second"
, para "Third"
])
, "Ordered list inside a tag" =:
T.unlines T.unlines
[ "<quote>" [ "<quote>"
, " 1. First" , " 1. First"