DocBook, JATS, OPML readers: performance optimization.
With the new XML parser, we can avoid the expensive tree normalization step we used to do. This gives a significant speed boost in docbook and JATS parsing (e.g. 9.7 to 6 ms).
This commit is contained in:
parent
ef642e2bbc
commit
98d26c2345
3 changed files with 8 additions and 64 deletions
|
@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
|
|||
-}
|
||||
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
|
||||
import Control.Monad.State.Strict
|
||||
import Data.Char (isSpace, toUpper, isLetter)
|
||||
import Data.Char (isSpace, isLetter)
|
||||
import Data.Default
|
||||
import Data.Either (rights)
|
||||
import Data.Foldable (asum)
|
||||
|
@ -540,8 +540,9 @@ instance Default DBState where
|
|||
|
||||
readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
|
||||
readDocBook _ inp = do
|
||||
tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
|
||||
parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
|
||||
tree <- either (throwError . PandocXMLError "") return $
|
||||
parseXMLContents
|
||||
(TL.fromStrict . handleInstructions $ crFilter inp)
|
||||
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
|
||||
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
|
||||
|
||||
|
@ -571,25 +572,6 @@ getFigure e = do
|
|||
modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty }
|
||||
return res
|
||||
|
||||
-- normalize input, consolidating adjacent Text and CRef elements
|
||||
normalizeTree :: [Content] -> [Content]
|
||||
normalizeTree = everywhere (mkT go)
|
||||
where go :: [Content] -> [Content]
|
||||
go (Text (CData CDataRaw _ _):xs) = xs
|
||||
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
|
||||
Text (CData CDataText (s1 <> s2) z):xs
|
||||
go (Text (CData CDataText s1 z):CRef r:xs) =
|
||||
Text (CData CDataText (s1 <> convertEntity r) z):xs
|
||||
go (CRef r:Text (CData CDataText s1 z):xs) =
|
||||
Text (CData CDataText (convertEntity r <> s1) z):xs
|
||||
go (CRef r1:CRef r2:xs) =
|
||||
Text (CData CDataText (convertEntity r1 <>
|
||||
convertEntity r2) Nothing):xs
|
||||
go xs = xs
|
||||
|
||||
convertEntity :: Text -> Text
|
||||
convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e)
|
||||
|
||||
-- convenience function to get an attribute value, defaulting to ""
|
||||
attrValue :: Text -> Element -> Text
|
||||
attrValue attr elt =
|
||||
|
|
|
@ -54,30 +54,11 @@ instance Default JATSState where
|
|||
|
||||
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
|
||||
readJATS _ inp = do
|
||||
tree <- either (throwError . PandocXMLError "")
|
||||
(return . normalizeTree) $
|
||||
tree <- either (throwError . PandocXMLError "") return $
|
||||
parseXMLContents (TL.fromStrict $ crFilter inp)
|
||||
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
|
||||
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
|
||||
|
||||
-- normalize input, consolidating adjacent Text and CRef elements
|
||||
normalizeTree :: [Content] -> [Content]
|
||||
normalizeTree = everywhere (mkT go)
|
||||
where go :: [Content] -> [Content]
|
||||
go (Text (CData CDataRaw _ _):xs) = xs
|
||||
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
|
||||
Text (CData CDataText (s1 <> s2) z):xs
|
||||
go (Text (CData CDataText s1 z):CRef r:xs) =
|
||||
Text (CData CDataText (s1 <> convertEntity r) z):xs
|
||||
go (CRef r:Text (CData CDataText s1 z):xs) =
|
||||
Text (CData CDataText (convertEntity r <> s1) z):xs
|
||||
go (CRef r1:CRef r2:xs) =
|
||||
Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
|
||||
go xs = xs
|
||||
|
||||
convertEntity :: Text -> Text
|
||||
convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e)
|
||||
|
||||
-- convenience function to get an attribute value, defaulting to ""
|
||||
attrValue :: Text -> Element -> Text
|
||||
attrValue attr =
|
||||
|
@ -454,7 +435,8 @@ elementToStr x = x
|
|||
|
||||
parseInline :: PandocMonad m => Content -> JATS m Inlines
|
||||
parseInline (Text (CData _ s _)) = return $ text s
|
||||
parseInline (CRef ref) = return . text . convertEntity $ ref
|
||||
parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack)
|
||||
$ lookupEntity (T.unpack ref)
|
||||
parseInline (Elem e) =
|
||||
case qName (elName e) of
|
||||
"italic" -> innerInlines emph
|
||||
|
|
|
@ -14,12 +14,10 @@ Conversion of OPML to 'Pandoc' document.
|
|||
module Text.Pandoc.Readers.OPML ( readOPML ) where
|
||||
import Control.Monad.State.Strict
|
||||
import Data.Default
|
||||
import Data.Generics
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Text.Lazy as TL
|
||||
import Text.HTML.TagSoup.Entity (lookupEntity)
|
||||
import Text.Pandoc.Builder
|
||||
import Text.Pandoc.Class.PandocMonad (PandocMonad)
|
||||
import Text.Pandoc.Options
|
||||
|
@ -53,7 +51,7 @@ readOPML opts inp = do
|
|||
(bs, st') <- runStateT
|
||||
(case parseXMLContents (TL.fromStrict (crFilter inp)) of
|
||||
Left msg -> throwError $ PandocXMLError "" msg
|
||||
Right ns -> mapM parseBlock $ normalizeTree ns)
|
||||
Right ns -> mapM parseBlock ns)
|
||||
def{ opmlOptions = opts }
|
||||
return $
|
||||
setTitle (opmlDocTitle st') $
|
||||
|
@ -61,24 +59,6 @@ readOPML opts inp = do
|
|||
setDate (opmlDocDate st') $
|
||||
doc $ mconcat bs
|
||||
|
||||
-- normalize input, consolidating adjacent Text and CRef elements
|
||||
normalizeTree :: [Content] -> [Content]
|
||||
normalizeTree = everywhere (mkT go)
|
||||
where go :: [Content] -> [Content]
|
||||
go (Text (CData CDataRaw _ _):xs) = xs
|
||||
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
|
||||
Text (CData CDataText (s1 <> s2) z):xs
|
||||
go (Text (CData CDataText s1 z):CRef r:xs) =
|
||||
Text (CData CDataText (s1 <> convertEntity r) z):xs
|
||||
go (CRef r:Text (CData CDataText s1 z):xs) =
|
||||
Text (CData CDataText (convertEntity r <> s1) z):xs
|
||||
go (CRef r1:CRef r2:xs) =
|
||||
Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
|
||||
go xs = xs
|
||||
|
||||
convertEntity :: Text -> Text
|
||||
convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e))
|
||||
|
||||
-- convenience function to get an attribute value, defaulting to ""
|
||||
attrValue :: Text -> Element -> Text
|
||||
attrValue attr elt =
|
||||
|
|
Loading…
Add table
Reference in a new issue