LaTeX reader: use custom TokStream...

that keeps track of whether macros are expanded. This allows
us to improve performance a bit by avoiding unnecessary
runs of the macro expansion code (e.g. from 24 ms to 20 ms on
our standard benchmark).
This commit is contained in:
John MacFarlane 2022-03-11 19:51:59 -08:00
parent d523eca3d3
commit b423c17100
5 changed files with 67 additions and 36 deletions

View file

@ -84,7 +84,7 @@ readLaTeX :: (PandocMonad m, ToSources a)
readLaTeX opts ltx = do
let sources = toSources ltx
parsed <- runParserT parseLaTeX def{ sOptions = opts } "source"
(tokenizeSources sources)
(TokStream False (tokenizeSources sources))
case parsed of
Right result -> return result
Left e -> throwError $ PandocParsecError sources e
@ -516,11 +516,11 @@ ifToggle = do
spaces
no <- braced
toggles <- sToggles <$> getState
inp <- getInput
TokStream _ inp <- getInput
let name' = untokenize name
case M.lookup name' toggles of
Just True -> setInput (yes ++ inp)
Just False -> setInput (no ++ inp)
Just True -> setInput $ TokStream False (yes ++ inp)
Just False -> setInput $ TokStream False (no ++ inp)
Nothing -> do
pos <- getPosition
report $ UndefinedToggle name' pos
@ -532,9 +532,10 @@ ifstrequal = do
str2 <- tok
ifequal <- braced
ifnotequal <- braced
TokStream _ ts <- getInput
if str1 == str2
then getInput >>= setInput . (ifequal ++)
else getInput >>= setInput . (ifnotequal ++)
then setInput $ TokStream False (ifequal ++ ts)
else setInput $ TokStream False (ifnotequal ++ ts)
return mempty
coloredInline :: PandocMonad m => Text -> LP m Inlines
@ -602,7 +603,7 @@ lookupListDefault d = (fromMaybe d .) . lookupList
inline :: PandocMonad m => LP m Inlines
inline = do
Tok pos toktype t <- lookAhead anyTok
Tok pos toktype t <- peekTok
let symbolAsString = str . untoken <$> anySymbol
let unescapedSymbolAsString =
do s <- untoken <$> anySymbol
@ -652,7 +653,8 @@ opt = do
toks <- try (sp *> bracketedToks <* sp)
-- now parse the toks as inlines
st <- getState
parsed <- runParserT (mconcat <$> many inline) st "bracketed option" toks
parsed <- runParserT (mconcat <$> many inline) st "bracketed option"
(TokStream False toks)
case parsed of
Right result -> return result
Left e -> throwError $ PandocParsecError (toSources toks) e
@ -700,7 +702,7 @@ doSubfile = do
skipMany opt
f <- T.unpack . removeDoubleQuotes . T.strip . untokenize <$> braced
oldToks <- getInput
setInput []
setInput $ TokStream False []
insertIncluded ".tex" f
bs <- blocks
eof
@ -747,7 +749,8 @@ insertIncluded defaultExtension f' = do
Nothing -> do
report $ CouldNotLoadIncludeFile (T.pack f) pos
return ""
getInput >>= setInput . (tokenize (initialPos f) contents ++)
TokStream _ ts <- getInput
setInput $ TokStream False (tokenize (initialPos f) contents ++ ts)
updateState dropLatestIncludeFile
authors :: PandocMonad m => LP m ()
@ -1265,7 +1268,7 @@ orderedList' = try $ do
block :: PandocMonad m => LP m Blocks
block = do
Tok _ toktype _ <- lookAhead anyTok
Tok _ toktype _ <- peekTok
res <- (case toktype of
Newline -> mempty <$ spaces1
Spaces -> mempty <$ spaces1

View file

@ -117,7 +117,8 @@ simpleCiteArgs inline = try $ do
-- now parse the toks as inlines
st <- getState
parsed <- lift $
runParserT (mconcat <$> many inline) st "bracketed option" toks
runParserT (mconcat <$> many inline) st "bracketed option"
(TokStream False toks)
case parsed of
Right result -> return result
Left e -> throwError $ PandocParsecError (toSources toks) e

View file

@ -94,8 +94,9 @@ verbTok stopchar = do
Nothing -> return t
Just i -> do
let (t1, t2) = T.splitAt i txt
inp <- getInput
setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar)
TokStream macrosExpanded inp <- getInput
setInput $ TokStream macrosExpanded
$ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar)
: tokenize (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp
return $ Tok pos toktype t1

View file

@ -24,6 +24,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing
, LaTeXState(..)
, defaultLaTeXState
, LP
, TokStream(..)
, withVerbatimMode
, rawLaTeXParser
, applyMacros
@ -34,6 +35,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing
, untoken
, toksToString
, satisfyTok
, peekTok
, parseFromToks
, disablingWithRaw
, doMacros
@ -119,6 +121,7 @@ import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),
ArgSpec (..), Tok (..), TokType (..))
import Text.Pandoc.Shared
import Text.Parsec.Pos
import Text.Parsec (Stream(uncons))
import Text.Pandoc.Walk
newtype DottedNum = DottedNum [Int]
@ -243,7 +246,16 @@ instance HasMeta LaTeXState where
instance Default LaTeXState where
def = defaultLaTeXState
type LP m = ParserT [Tok] LaTeXState m
-- | Parser input for the LaTeX reader: a list of tokens paired with a
-- flag recording macro-expansion status.
-- The Boolean is True if macros have already been expanded,
-- False if they need expanding.
data TokStream = TokStream !Bool [Tok]
deriving (Show)
-- Parsec 'Stream' instance, so that a 'TokStream' can be used directly
-- as parser input yielding 'Tok' values.
instance Monad m => Stream TokStream m Tok where
-- An empty token list is end of input, whatever the flag says.
uncons (TokStream _ []) = return Nothing
-- Hand back the first token; the rest of the stream is always returned
-- with the flag reset to False, i.e. marked as still needing expansion.
uncons (TokStream _ (t:ts)) = return $ Just (t, TokStream False ts)
-- | Parser type used by the LaTeX reader: 'ParserT' over 'TokStream'
-- input with 'LaTeXState' as user state.
type LP m = ParserT TokStream LaTeXState m
withVerbatimMode :: PandocMonad m => LP m a -> LP m a
withVerbatimMode parser = do
@ -269,11 +281,12 @@ rawLaTeXParser toks parser valParser = do
let preparser = setStartPos >> parser
let rawparser = (,) <$> withRaw valParser <*> getState
res' <- lift $ runParserT (withRaw (preparser >> getPosition))
lstate "chunk" toks
lstate "chunk" $ TokStream False toks
case res' of
Left _ -> mzero
Right (endpos, toks') -> do
res <- lift $ runParserT rawparser lstate' "chunk" toks'
res <- lift $ runParserT rawparser lstate' "chunk"
$ TokStream False toks'
case res of
Left _ -> mzero
Right ((val, raw), st) -> do
@ -303,7 +316,8 @@ applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
pstate <- getState
let lstate = def{ sOptions = extractReaderOptions pstate
, sMacros = extractMacros pstate :| [] }
res <- runParserT retokenize lstate "math" (tokenize (initialPos "math") s)
res <- runParserT retokenize lstate "math" $
TokStream False (tokenize (initialPos "math") s)
case res of
Left e -> Prelude.fail (show e)
Right s' -> return s'
@ -458,7 +472,7 @@ toksToString = T.unpack . untokenize
parseFromToks :: PandocMonad m => LP m a -> [Tok] -> LP m a
parseFromToks parser toks = do
oldInput <- getInput
setInput toks
setInput $ TokStream False toks
oldpos <- getPosition
case toks of
Tok pos _ _ : _ -> setPosition pos
@ -487,15 +501,22 @@ satisfyTok f = do
return $! res
where matcher t | f t = Just t
| otherwise = Nothing
updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
updatePos _spos _ (Tok pos _ _ : _) = pos
updatePos spos (Tok _ _ t) [] = incSourceColumn spos (T.length t)
-- | Choose a source position from the current position, a token, and the
-- remaining input stream.
-- NOTE(review): presumably the token is the one just consumed and the
-- result is the parser's next position -- confirm against the caller.
updatePos :: SourcePos -> Tok -> TokStream -> SourcePos
-- If another token follows, use the position stored in that token.
updatePos _spos _ (TokStream _ (Tok pos _ _ : _)) = pos
-- Otherwise (no tokens left) advance the current column by the length of
-- the given token's text.
updatePos spos (Tok _ _ t) _ = incSourceColumn spos (T.length t)
-- | Return the next token without consuming it.
peekTok :: PandocMonad m => LP m Tok
peekTok = do
-- Run 'doMacros' first, so the token inspected below is taken from the
-- input as 'doMacros' leaves it.
doMacros
-- 'const True' accepts any token; wrapping in 'lookAhead' returns it
-- while leaving the input where it was.
lookAhead (satisfyTok (const True))
doMacros :: PandocMonad m => LP m ()
doMacros = do
st <- getState
unless (sVerbatimMode st) $
getInput >>= doMacros' 1 >>= setInput
TokStream macrosExpanded toks <- getInput
unless macrosExpanded $ do
st <- getState
unless (sVerbatimMode st) $
doMacros' 1 toks >>= setInput . TokStream True
doMacros' :: PandocMonad m => Int -> [Tok] -> LP m [Tok]
doMacros' n inp =
@ -568,10 +589,10 @@ doMacros' n inp =
Just o -> do
x <- option o bracketedToks
getargs (M.singleton 1 x) $ drop 1 argspecs
rest <- getInput
TokStream _ rest <- getInput
return (args, rest)
lstate <- getState
res <- lift $ runParserT getargs' lstate "args" ts
res <- lift $ runParserT getargs' lstate "args" $ TokStream False ts
case res of
Left _ -> Prelude.fail $ "Could not parse arguments for " ++
T.unpack name
@ -599,7 +620,8 @@ trySpecialMacro _ _ = mzero
handleIf :: PandocMonad m => Bool -> [Tok] -> LP m [Tok]
handleIf b ts = do
res' <- lift $ runParserT (ifParser b) defaultLaTeXState "tokens" ts
res' <- lift $ runParserT (ifParser b) defaultLaTeXState "tokens"
$ TokStream False ts
case res' of
Left _ -> Prelude.fail "Could not parse conditional"
Right ts' -> return ts'
@ -610,7 +632,7 @@ ifParser b = do
*> anyTok)
elseToks <- (controlSeq "else" >> manyTill anyTok (controlSeq "fi"))
<|> ([] <$ controlSeq "fi")
rest <- getInput
TokStream _ rest <- getInput
return $ (if b then ifToks else elseToks) ++ rest
startsWithAlphaNum :: Text -> Bool
@ -717,8 +739,9 @@ singleChar = singleCharTok <|> singleCharFromWord
singleCharFromWord = do
Tok pos toktype t <- disablingWithRaw $ satisfyTok isWordTok
let (t1, t2) = (T.take 1 t, T.drop 1 t)
inp <- getInput
setInput $ Tok pos toktype t1 : Tok (incSourceColumn pos 1) toktype t2 : inp
TokStream macrosExpanded inp <- getInput
setInput $ TokStream macrosExpanded
$ Tok pos toktype t1 : Tok (incSourceColumn pos 1) toktype t2 : inp
anyTok
specialChars :: Set.Set Char
@ -802,7 +825,8 @@ retokenizeComment = (do
Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1))
(sourceColumn pos)) toktype' txt'
let newtoks = map updPos $ tokenize pos $ T.tail txt
getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++))
TokStream macrosExpanded ts <- getInput
setInput $ TokStream macrosExpanded ((Tok pos Symbol "%" : newtoks) ++ ts))
<|> return ()
bracedOrToken :: PandocMonad m => LP m [Tok]

View file

@ -62,10 +62,11 @@ amp = symbol '&'
-- Split a Word into individual Symbols (for parseAligns)
splitWordTok :: PandocMonad m => LP m ()
splitWordTok = do
inp <- getInput
TokStream macrosExpanded inp <- getInput
case inp of
(Tok spos Word t : rest) ->
setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest
setInput $ TokStream macrosExpanded
$ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest
_ -> return ()
parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))]
@ -108,8 +109,9 @@ parseAligns = try $ do
spaces
spec <- braced
case safeRead ds of
Just n ->
getInput >>= setInput . (mconcat (replicate n spec) ++)
Just n -> do
TokStream _ ts <- getInput
setInput $ TokStream False (mconcat (replicate n spec) ++ ts)
Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number"
bgroup
spaces