DocBook, JATS, OPML readers: performance optimization.

With the new XML parser, we can avoid the expensive tree
normalization step we used to do.

This gives a significant speed boost in docbook and JATS
parsing (e.g. 9.7 to 6 ms).
This commit is contained in:
John MacFarlane 2021-02-18 21:24:31 -08:00
parent ef642e2bbc
commit 98d26c2345
3 changed files with 8 additions and 64 deletions

View file

@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
import Data.Char (isSpace, toUpper, isLetter)
import Data.Char (isSpace, isLetter)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
@ -540,8 +540,9 @@ instance Default DBState where
readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readDocBook _ inp = do
tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
tree <- either (throwError . PandocXMLError "") return $
parseXMLContents
(TL.fromStrict . handleInstructions $ crFilter inp)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
@ -571,25 +572,6 @@ getFigure e = do
modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty }
return res
-- normalize input, consolidating adjacent Text and CRef elements
normalizeTree :: [Content] -> [Content]
normalizeTree = everywhere (mkT go)
where go :: [Content] -> [Content]
go (Text (CData CDataRaw _ _):xs) = xs
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
Text (CData CDataText (s1 <> s2) z):xs
go (Text (CData CDataText s1 z):CRef r:xs) =
Text (CData CDataText (s1 <> convertEntity r) z):xs
go (CRef r:Text (CData CDataText s1 z):xs) =
Text (CData CDataText (convertEntity r <> s1) z):xs
go (CRef r1:CRef r2:xs) =
Text (CData CDataText (convertEntity r1 <>
convertEntity r2) Nothing):xs
go xs = xs
convertEntity :: Text -> Text
convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e)
-- convenience function to get an attribute value, defaulting to ""
attrValue :: Text -> Element -> Text
attrValue attr elt =

View file

@ -54,30 +54,11 @@ instance Default JATSState where
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readJATS _ inp = do
tree <- either (throwError . PandocXMLError "")
(return . normalizeTree) $
tree <- either (throwError . PandocXMLError "") return $
parseXMLContents (TL.fromStrict $ crFilter inp)
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
-- normalize input, consolidating adjacent Text and CRef elements
normalizeTree :: [Content] -> [Content]
normalizeTree = everywhere (mkT go)
where go :: [Content] -> [Content]
go (Text (CData CDataRaw _ _):xs) = xs
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
Text (CData CDataText (s1 <> s2) z):xs
go (Text (CData CDataText s1 z):CRef r:xs) =
Text (CData CDataText (s1 <> convertEntity r) z):xs
go (CRef r:Text (CData CDataText s1 z):xs) =
Text (CData CDataText (convertEntity r <> s1) z):xs
go (CRef r1:CRef r2:xs) =
Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
go xs = xs
convertEntity :: Text -> Text
convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e)
-- convenience function to get an attribute value, defaulting to ""
attrValue :: Text -> Element -> Text
attrValue attr =
@ -454,7 +435,8 @@ elementToStr x = x
parseInline :: PandocMonad m => Content -> JATS m Inlines
parseInline (Text (CData _ s _)) = return $ text s
parseInline (CRef ref) = return . text . convertEntity $ ref
parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack)
$ lookupEntity (T.unpack ref)
parseInline (Elem e) =
case qName (elName e) of
"italic" -> innerInlines emph

View file

@ -14,12 +14,10 @@ Conversion of OPML to 'Pandoc' document.
module Text.Pandoc.Readers.OPML ( readOPML ) where
import Control.Monad.State.Strict
import Data.Default
import Data.Generics
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
@ -53,7 +51,7 @@ readOPML opts inp = do
(bs, st') <- runStateT
(case parseXMLContents (TL.fromStrict (crFilter inp)) of
Left msg -> throwError $ PandocXMLError "" msg
Right ns -> mapM parseBlock $ normalizeTree ns)
Right ns -> mapM parseBlock ns)
def{ opmlOptions = opts }
return $
setTitle (opmlDocTitle st') $
@ -61,24 +59,6 @@ readOPML opts inp = do
setDate (opmlDocDate st') $
doc $ mconcat bs
-- normalize input, consolidating adjacent Text and CRef elements
normalizeTree :: [Content] -> [Content]
normalizeTree = everywhere (mkT go)
where go :: [Content] -> [Content]
go (Text (CData CDataRaw _ _):xs) = xs
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
Text (CData CDataText (s1 <> s2) z):xs
go (Text (CData CDataText s1 z):CRef r:xs) =
Text (CData CDataText (s1 <> convertEntity r) z):xs
go (CRef r:Text (CData CDataText s1 z):xs) =
Text (CData CDataText (convertEntity r <> s1) z):xs
go (CRef r1:CRef r2:xs) =
Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
go xs = xs
convertEntity :: Text -> Text
convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e))
-- convenience function to get an attribute value, defaulting to ""
attrValue :: Text -> Element -> Text
attrValue attr elt =