Add new unexported module T.P.XMLParser.

This exports functions that uses xml-conduit's parser to
produce an xml-light Element or [Content].  This allows
existing pandoc code to use a better parser without
much modification.

The new parser is used in all places where xml-light's
parser was previously used.  Benchmarks show a significant
performance improvement in parsing XML-based formats
(especially ODT and FB2).

Note that the xml-light types use String, so the
conversion from xml-conduit types involves a lot
of extra allocation.  It would be desirable to
avoid that in the future by gradually switching
to using xml-conduit directly. This can be done
module by module.

The new parser also reports errors, which we report
when possible.

A new constructor PandocXMLError has been added to
PandocError in T.P.Error [API change].

Closes #7091, which was the main stimulus.

These changes revealed the need for some changes
in the tests.  The docbook-reader.docbook test
lacked definitions for the entities it used; these
have been added. And the docx golden tests have been
updated, because the new parser does not preserve
the order of attributes.

Add entity defs to docbook-reader.docbook.

Update golden tests for docx.
This commit is contained in:
John MacFarlane 2021-02-08 23:35:19 -08:00
parent 9994ad977d
commit 8ca191604d
98 changed files with 238 additions and 91 deletions

View file

@ -1464,6 +1464,7 @@ Nonzero exit codes have the following meanings:
24 PandocCiteprocError
31 PandocEpubSubdirectoryError
43 PandocPDFError
44 PandocXMLError
47 PandocPDFProgramNotFoundError
61 PandocHttpError
62 PandocShouldNeverHappenError

View file

@ -493,6 +493,7 @@ library
unicode-transforms >= 0.3 && < 0.4,
unordered-containers >= 0.2 && < 0.3,
xml >= 1.3.12 && < 1.4,
xml-conduit >= 1.7 && < 1.10,
zip-archive >= 0.2.3.4 && < 0.5,
zlib >= 0.5 && < 0.7
if os(windows) && arch(i386)
@ -686,6 +687,7 @@ library
Text.Pandoc.Lua.PandocLua,
Text.Pandoc.Lua.Util,
Text.Pandoc.Lua.Walk,
Text.Pandoc.XMLParser,
Text.Pandoc.CSS,
Text.Pandoc.CSV,
Text.Pandoc.RoffChar,

View file

@ -48,6 +48,7 @@ data PandocError = PandocIOError Text IOError
| PandocFailOnWarningError
| PandocPDFProgramNotFoundError Text
| PandocPDFError Text
| PandocXMLError Text Text
| PandocFilterError Text Text
| PandocLuaError Text
| PandocCouldNotFindDataFileError Text
@ -103,6 +104,8 @@ handleError (Left e) =
PandocPDFProgramNotFoundError pdfprog -> err 47 $
pdfprog <> " not found. Please select a different --pdf-engine or install " <> pdfprog
PandocPDFError logmsg -> err 43 $ "Error producing PDF.\n" <> logmsg
PandocXMLError fp logmsg -> err 44 $ "Invalid XML" <>
(if T.null fp then "" else " in " <> fp) <> ":\n" <> logmsg
PandocFilterError filtername msg -> err 83 $ "Error running filter " <>
filtername <> ":\n" <> msg
PandocLuaError msg -> err 84 $ "Error running Lua:\n" <> msg

View file

@ -45,7 +45,9 @@ import Text.Pandoc.Definition
import Text.Pandoc.Options
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Text.XML.Light as Xml
import Text.Pandoc.XMLParser (parseXMLElement)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as TE
import Control.Applicative
import qualified Data.Attoparsec.ByteString.Char8 as A
@ -327,7 +329,8 @@ getSize img =
svgSize :: WriterOptions -> ByteString -> Maybe ImageSize
svgSize opts img = do
doc <- Xml.parseXMLDoc $ UTF8.toString img
doc <- either (const mzero) return $ parseXMLElement
$ TL.fromStrict $ UTF8.toText img
let viewboxSize = do
vb <- Xml.findAttrBy (== Xml.QName "viewBox" Nothing Nothing) doc
[_,_,w,h] <- mapM safeRead (T.words (T.pack vb))

View file

@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
import Data.Char (isSpace, toUpper)
import Data.Char (isSpace, toUpper, isLetter)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
@ -21,7 +21,10 @@ import Data.List (intersperse,elemIndex)
import Data.Maybe (fromMaybe,mapMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Control.Monad.Except (throwError)
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import Text.Pandoc.Options
@ -29,6 +32,7 @@ import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLContents)
{-
@ -537,22 +541,25 @@ instance Default DBState where
readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readDocBook _ inp = do
let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp
tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
-- We treat <?asciidoc-br?> specially (issue #1236), converting it
-- to <br/>, since xml-light doesn't parse the instruction correctly.
-- Other xml instructions are simply removed from the input stream.
-- We treat certain processing instructions by converting them to tags
-- beginning "pi-".
handleInstructions :: Text -> Text
handleInstructions = T.pack . handleInstructions' . T.unpack
handleInstructions' :: String -> String
handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs
handleInstructions' xs = case break (=='<') xs of
(ys, []) -> ys
([], '<':zs) -> '<' : handleInstructions' zs
(ys, zs) -> ys ++ handleInstructions' zs
handleInstructions t =
let (x,y) = T.breakOn "<?" t
in if T.null y
then x
else
let (w,z) = T.breakOn "?>" y
in (if T.takeWhile (\c -> isLetter c || c == '-')
(T.drop 2 w) `elem` ["asciidoc-br", "dbfo"]
then x <> "<pi-" <> T.drop 2 w <> "/>"
else x <> w <> T.take 2 z) <>
handleInstructions (T.drop 2 z)
getFigure :: PandocMonad m => Element -> DB m Blocks
getFigure e = do
@ -892,7 +899,11 @@ parseBlock (Elem e) =
"subtitle" -> return mempty -- handled in parent element
_ -> skip >> getBlocks e
where skip = do
lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
let qn = T.pack $ qName $ elName e
let name = if "pi-" `T.isPrefixOf` qn
then "<?" <> qn <> "?>"
else qn
lift $ report $ IgnoredElement name
return mempty
codeBlockWithLang = do
@ -964,7 +975,7 @@ parseBlock (Elem e) =
cs -> map toAlignment cs
let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9')
|| x == '.') s)
let textWidth = case filterChild (named "?dbfo") e of
let textWidth = case filterChild (named "pi-dbfo") e of
Just d -> case attrValue "table-width" d of
"" -> 1.0
w -> fromMaybe 100.0 (parseWidth w) / 100.0
@ -1165,12 +1176,15 @@ parseInline (Elem e) =
"title" -> return mempty
"affiliation" -> skip
-- Note: this isn't a real docbook tag; it's what we convert
-- <?asciidor-br?> to in handleInstructions, above. A kludge to
-- work around xml-light's inability to parse an instruction.
"br" -> return linebreak
-- <?asciidor-br?> to in handleInstructions, above.
"pi-asciidoc-br" -> return linebreak
_ -> skip >> innerInlines id
where skip = do
lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
let qn = T.pack $ qName $ elName e
let name = if "pi-" `T.isPrefixOf` qn
then "<?" <> qn <> "?>"
else qn
lift $ report $ IgnoredElement name
return mempty
innerInlines f = extractSpaces f . mconcat <$>

View file

@ -74,6 +74,7 @@ import Text.TeXMath.Readers.OMML (readOMML)
import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont)
import Text.XML.Light
import qualified Text.XML.Light.Cursor as XMLC
import Text.Pandoc.XMLParser (parseXMLElement)
data ReaderEnv = ReaderEnv { envNotes :: Notes
, envComments :: Comments
@ -343,10 +344,16 @@ archiveToDocxWithWarnings archive = do
Right doc -> Right (Docx doc, stateWarnings st)
Left e -> Left e
parseXMLFromEntry :: Entry -> Maybe Element
parseXMLFromEntry entry =
case parseXMLElement (UTF8.toTextLazy (fromEntry entry)) of
Left _ -> Nothing
Right el -> Just el
getDocumentXmlPath :: Archive -> Maybe FilePath
getDocumentXmlPath zf = do
entry <- findEntryByPath "_rels/.rels" zf
relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
relsElem <- parseXMLFromEntry entry
let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem
rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e ==
Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")
@ -362,7 +369,7 @@ archiveToDocument :: Archive -> D Document
archiveToDocument zf = do
docPath <- asks envDocXmlPath
entry <- maybeToD $ findEntryByPath docPath zf
docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
docElem <- maybeToD $ parseXMLFromEntry entry
let namespaces = elemToNameSpaces docElem
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem)
@ -401,9 +408,9 @@ constructBogusParStyleData stName = ParStyle
archiveToNotes :: Archive -> Notes
archiveToNotes zf =
let fnElem = findEntryByPath "word/footnotes.xml" zf
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
>>= parseXMLFromEntry
enElem = findEntryByPath "word/endnotes.xml" zf
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
>>= parseXMLFromEntry
fn_namespaces = maybe [] elemToNameSpaces fnElem
en_namespaces = maybe [] elemToNameSpaces enElem
ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
@ -415,7 +422,7 @@ archiveToNotes zf =
archiveToComments :: Archive -> Comments
archiveToComments zf =
let cmtsElem = findEntryByPath "word/comments.xml" zf
>>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
>>= parseXMLFromEntry
cmts_namespaces = maybe [] elemToNameSpaces cmtsElem
cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces)
in
@ -445,7 +452,7 @@ filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship]
filePathToRelationships ar docXmlPath fp
| Just relType <- filePathToRelType fp docXmlPath
, Just entry <- findEntryByPath fp ar
, Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry =
, Just relElems <- parseXMLFromEntry entry =
mapMaybe (relElemToRelationship relType) $ elChildren relElems
filePathToRelationships _ _ _ = []
@ -527,7 +534,7 @@ archiveToNumbering' zf =
case findEntryByPath "word/numbering.xml" zf of
Nothing -> Just $ Numbering [] [] []
Just entry -> do
numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
numberingElem <- parseXMLFromEntry entry
let namespaces = elemToNameSpaces numberingElem
numElems = findChildrenByName namespaces "w" "num" numberingElem
absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem

View file

@ -53,6 +53,7 @@ import Data.Coerce
import Text.Pandoc.Readers.Docx.Util
import qualified Text.Pandoc.UTF8 as UTF8
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLElement)
newtype CharStyleId = CharStyleId T.Text
deriving (Show, Eq, Ord, IsString, FromStyleId)
@ -135,19 +136,22 @@ defaultRunStyle = RunStyle { isBold = Nothing
, rParentStyle = Nothing
}
archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) =>
(a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
archiveToStyles'
:: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2)
=> (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
archiveToStyles' conv1 conv2 zf =
let stylesElem = findEntryByPath "word/styles.xml" zf >>=
(parseXMLDoc . UTF8.toStringLazy . fromEntry)
in
case stylesElem of
Nothing -> (M.empty, M.empty)
Just styElem ->
let namespaces = elemToNameSpaces styElem
in
( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing,
M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing)
case findEntryByPath "word/styles.xml" zf of
Nothing -> (M.empty, M.empty)
Just entry ->
case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of
Left _ -> (M.empty, M.empty)
Right styElem ->
let namespaces = elemToNameSpaces styElem
in
( M.fromList $ map (\r -> (conv1 r, r)) $
buildBasedOnList namespaces styElem Nothing,
M.fromList $ map (\p -> (conv2 p, p)) $
buildBasedOnList namespaces styElem Nothing)
isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool
isBasedOnStyle ns element parentStyle

View file

@ -17,7 +17,7 @@ module Text.Pandoc.Readers.EPUB
(readEPUB)
where
import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry,
import Codec.Archive.Zip (Archive (..), Entry(..), findEntryByPath, fromEntry,
toArchiveOrFail)
import Control.DeepSeq (NFData, deepseq)
import Control.Monad (guard, liftM, liftM2, mplus)
@ -41,9 +41,10 @@ import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.Options (ReaderOptions (..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI)
import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy)
import Text.Pandoc.Walk (query, walk)
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLElement)
type Items = M.Map String (FilePath, MimeType)
@ -181,7 +182,7 @@ renameMeta s = T.pack s
getManifest :: PandocMonad m => Archive -> m (String, Element)
getManifest archive = do
metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
docElem <- parseXMLDocE metaEntry
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
as <- fmap (map attrToPair . elAttribs)
@ -190,7 +191,7 @@ getManifest archive = do
let rootdir = dropFileName manifestFile
--mime <- lookup "media-type" as
manifest <- findEntryByPathE manifestFile archive
(rootdir,) <$> (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
(rootdir,) <$> parseXMLDocE manifest
-- Fixup
@ -284,8 +285,12 @@ findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry
findEntryByPathE (normalise . unEscapeString -> path) a =
mkE ("No entry on path: " ++ path) $ findEntryByPath path a
parseXMLDocE :: PandocMonad m => String -> m Element
parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
parseXMLDocE :: PandocMonad m => Entry -> m Element
parseXMLDocE entry =
either (throwError . PandocXMLError fp) return $ parseXMLElement doc
where
doc = UTF8.toTextLazy . fromEntry $ entry
fp = T.pack $ eRelativePath entry
findElementE :: PandocMonad m => QName -> Element -> m Element
findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x

View file

@ -32,6 +32,7 @@ import Data.List (intersperse)
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Default
import Data.Maybe
import Text.HTML.TagSoup.Entity (lookupEntity)
@ -42,6 +43,7 @@ import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter)
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLElement)
type FB2 m = StateT FB2State m
@ -64,10 +66,10 @@ instance HasMeta FB2State where
readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readFB2 _ inp =
case parseXMLDoc $ crFilter inp of
Nothing -> throwError $ PandocParseError "Not an XML document"
Just e -> do
(bs, st) <- runStateT (parseRootElement e) def
case parseXMLElement $ TL.fromStrict $ crFilter inp of
Left msg -> throwError $ PandocXMLError "" msg
Right el -> do
(bs, st) <- runStateT (parseRootElement el) def
let authors = if null $ fb2Authors st
then id
else setMeta "author" (map text $ reverse $ fb2Authors st)

View file

@ -14,6 +14,8 @@ Conversion of JATS XML to 'Pandoc' document.
module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict
import Control.Monad.Except (throwError)
import Text.Pandoc.Error (PandocError(..))
import Data.Char (isDigit, isSpace, toUpper)
import Data.Default
import Data.Generics
@ -22,6 +24,7 @@ import qualified Data.Map as Map
import Data.Maybe (maybeToList, fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
@ -29,6 +32,7 @@ import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLContents)
import qualified Data.Set as S (fromList, member)
import Data.Set ((\\))
@ -51,8 +55,9 @@ instance Default JATSState where
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readJATS _ inp = do
let tree = normalizeTree . parseXML
$ T.unpack $ crFilter inp
tree <- either (throwError . PandocXMLError "")
(return . normalizeTree) $
parseXMLContents (TL.fromStrict $ crFilter inp)
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)

View file

@ -19,14 +19,18 @@ import Data.Generics
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.Pandoc.Shared (crFilter, blocksToInlines')
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLContents)
import Control.Monad.Except (throwError)
type OPML m = StateT OPMLState m
@ -49,8 +53,10 @@ instance Default OPMLState where
readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readOPML opts inp = do
(bs, st') <- runStateT
(mapM parseBlock $ normalizeTree $
parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts }
(case parseXMLContents (TL.fromStrict (crFilter inp)) of
Left msg -> throwError $ PandocXMLError "" msg
Right ns -> mapM parseBlock $ normalizeTree ns)
def{ opmlOptions = opts }
return $
setTitle (opmlDocTitle st') $
setAuthors (opmlDocAuthors st') $

View file

@ -15,6 +15,7 @@ module Text.Pandoc.Readers.Odt ( readOdt ) where
import Codec.Archive.Zip
import qualified Text.XML.Light as XML
import Text.Pandoc.XMLParser (parseXMLElement)
import qualified Data.ByteString.Lazy as B
@ -66,18 +67,18 @@ bytesToOdt bytes = case toArchiveOrFail bytes of
--
archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag)
archiveToOdt archive = either (Left. PandocParseError) Right $ do
let onFailure msg Nothing = Left msg
archiveToOdt archive = do
let onFailure msg Nothing = Left $ PandocParseError msg
onFailure _ (Just x) = Right x
contentEntry <- onFailure "Could not find content.xml"
(findEntryByPath "content.xml" archive)
stylesEntry <- onFailure "Could not find styles.xml"
(findEntryByPath "styles.xml" archive)
contentElem <- onFailure "Could not find content element"
(entryToXmlElem contentEntry)
stylesElem <- onFailure "Could not find styles element"
(entryToXmlElem stylesEntry)
styles <- either (\_ -> Left "Could not read styles") Right
contentElem <- entryToXmlElem contentEntry
stylesElem <- entryToXmlElem stylesEntry
styles <- either
(\_ -> Left $ PandocParseError "Could not read styles")
Right
(chooseMax (readStylesAt stylesElem ) (readStylesAt contentElem))
let filePathIsOdtMedia :: FilePath -> Bool
filePathIsOdtMedia fp =
@ -85,10 +86,13 @@ archiveToOdt archive = either (Left. PandocParseError) Right $ do
in (dir == "Pictures/") || (dir /= "./" && name == "content.xml")
let media = filteredFilesFromArchive archive filePathIsOdtMedia
let startState = readerState styles media
either (\_ -> Left "Could not convert opendocument") Right
either (\_ -> Left $ PandocParseError "Could not convert opendocument") Right
(runConverter' read_body startState contentElem)
--
entryToXmlElem :: Entry -> Maybe XML.Element
entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry
entryToXmlElem :: Entry -> Either PandocError XML.Element
entryToXmlElem entry =
case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of
Right x -> Right x
Left msg -> Left $ PandocXMLError (T.pack $ eRelativePath entry) msg

View file

@ -55,8 +55,9 @@ import Text.Pandoc.Walk (query, walk, walkM)
import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB)
import Text.Printf (printf)
import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..),
add_attrs, lookupAttr, node, onlyElems, parseXML,
add_attrs, lookupAttr, node, onlyElems,
ppElement, showElement, strContent, unode, unqual)
import Text.Pandoc.XMLParser (parseXMLContents)
import Text.Pandoc.XML (escapeStringForXML)
import Text.DocTemplates (FromContext(lookupContext), Context(..),
ToContext(toVal), Val(..))
@ -160,7 +161,12 @@ mkEntry path content = do
getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata
getEPUBMetadata opts meta = do
let md = metadataFromMeta opts meta
let elts = maybe [] (onlyElems . parseXML) $ writerEpubMetadata opts
elts <- case writerEpubMetadata opts of
Nothing -> return []
Just t -> case parseXMLContents (TL.fromStrict t) of
Left msg -> throwError $
PandocXMLError "epub metadata" msg
Right ns -> return (onlyElems ns)
let md' = foldr addMetadataFromXML md elts
let addIdentifier m =
if null (epubIdentifier m)
@ -836,7 +842,8 @@ pandocToEPUB version opts doc = do
: case subs of
[] -> []
(_:_) -> [unode "ol" ! [("class","toc")] $ subs]
where titElements = parseXML titRendered
where titElements = either (const []) id $
parseXMLContents (TL.fromStrict titRendered)
titRendered = case P.runPure
(writeHtmlStringForEPUB version
opts{ writerTemplate = Nothing

View file

@ -19,7 +19,7 @@ FictionBook is an XML-based e-book format. For more information see:
module Text.Pandoc.Writers.FB2 (writeFB2) where
import Control.Monad (zipWithM)
import Control.Monad.Except (catchError)
import Control.Monad.Except (catchError, throwError)
import Control.Monad.State.Strict (StateT, evalStateT, get, gets, lift, liftM, modify)
import Data.ByteString.Base64 (encode)
import Data.Char (isAscii, isControl, isSpace)
@ -27,16 +27,18 @@ import Data.Either (lefts, rights)
import Data.List (intercalate)
import Data.Text (Text, pack)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as TE
import Network.HTTP (urlEncode)
import Text.XML.Light
import qualified Text.XML.Light as X
import qualified Text.XML.Light.Cursor as XC
import qualified Text.XML.Light.Input as XI
import Text.Pandoc.XMLParser (parseXMLContents)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Logging
import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def)
import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers,
@ -307,7 +309,10 @@ blockToXml (CodeBlock _ s) = return . spaceBeforeAfter .
map (el "p" . el "code" . T.unpack) . T.lines $ s
blockToXml (RawBlock f str) =
if f == Format "fb2"
then return $ XI.parseXML str
then
case parseXMLContents (TL.fromStrict str) of
Left msg -> throwError $ PandocXMLError "" msg
Right nds -> return nds
else return []
blockToXml (Div _ bs) = cMapM blockToXml bs
blockToXml (BlockQuote bs) = list . el "cite" <$> cMapM blockToXml bs

View file

@ -13,7 +13,7 @@ Conversion of 'Pandoc' documents to ODT.
-}
module Text.Pandoc.Writers.ODT ( writeODT ) where
import Codec.Archive.Zip
import Control.Monad.Except (catchError)
import Control.Monad.Except (catchError, throwError)
import Control.Monad.State.Strict
import qualified Data.ByteString.Lazy as B
import Data.Generics (everywhere', mkT)
@ -27,6 +27,7 @@ import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.ImageSize
import Text.Pandoc.Logging
import Text.Pandoc.MIME (extensionFromMimeType, getMimeType)
@ -35,10 +36,11 @@ import Text.DocLayout
import Text.Pandoc.Shared (stringify, pandocVersion, tshow)
import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks,
fixDisplayMath)
import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toStringLazy)
import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy)
import Text.Pandoc.Walk
import Text.Pandoc.Writers.OpenDocument (writeOpenDocument)
import Text.Pandoc.XML
import Text.Pandoc.XMLParser (parseXMLElement)
import Text.TeXMath
import Text.XML.Light
@ -172,17 +174,18 @@ updateStyleWithLang :: PandocMonad m => Maybe Lang -> Archive -> O m Archive
updateStyleWithLang Nothing arch = return arch
updateStyleWithLang (Just lang) arch = do
epochtime <- floor `fmap` lift P.getPOSIXTime
return arch{ zEntries = [if eRelativePath e == "styles.xml"
then case parseXMLDoc
(toStringLazy (fromEntry e)) of
Nothing -> e
Just d ->
toEntry "styles.xml" epochtime
( fromStringLazy
. ppTopElement
. addLang lang $ d )
else e
| e <- zEntries arch] }
entries <- mapM (\e -> if eRelativePath e == "styles.xml"
then case parseXMLElement
(toTextLazy (fromEntry e)) of
Left msg -> throwError $
PandocXMLError "styles.xml" msg
Right d -> return $
toEntry "styles.xml" epochtime
( fromStringLazy
. ppTopElement
. addLang lang $ d )
else return e) (zEntries arch)
return arch{ zEntries = entries }
addLang :: Lang -> Element -> Element
addLang lang = everywhere' (mkT updateLangAttr)

View file

@ -35,6 +35,7 @@ import qualified Data.Text as T
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.XML.Light as XML
import Text.Pandoc.XMLParser (parseXMLElement)
mknode :: Node t => String -> [(String,String)] -> t -> Element
mknode s attrs =
@ -62,10 +63,10 @@ parseXml refArchive distArchive relpath =
findEntryByPath relpath distArchive of
Nothing -> throwError $ PandocSomeError $
T.pack relpath <> " missing in reference file"
Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of
Nothing -> throwError $ PandocSomeError $
T.pack relpath <> " corrupt in reference file"
Just d -> return d
Just e -> case parseXMLElement . UTF8.toTextLazy . fromEntry $ e of
Left msg ->
throwError $ PandocXMLError (T.pack relpath) msg
Right d -> return d
-- Copied from Util

View file

@ -29,6 +29,7 @@ import Data.Time.Clock (UTCTime)
import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime)
import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension)
import Text.XML.Light
import Text.Pandoc.XMLParser (parseXMLElement)
import Text.Pandoc.Definition
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Class.PandocMonad (PandocMonad)
@ -77,7 +78,8 @@ getPresentationSize :: Archive -> Archive -> Maybe (Integer, Integer)
getPresentationSize refArchive distArchive = do
entry <- findEntryByPath "ppt/presentation.xml" refArchive `mplus`
findEntryByPath "ppt/presentation.xml" distArchive
presElement <- parseXMLDoc $ UTF8.toStringLazy $ fromEntry entry
presElement <- either (const Nothing) return $
parseXMLElement $ UTF8.toTextLazy $ fromEntry entry
let ns = elemToNameSpaces presElement
sldSize <- findChild (elemName ns "p" "sldSz") presElement
cxS <- findAttr (QName "cx" Nothing Nothing) sldSize

View file

@ -0,0 +1,66 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.XMLParser
Copyright : Copyright (C) 2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Bridge to allow using xml-conduit's parser with xml-light's types.
-}
module Text.Pandoc.XMLParser
( parseXMLElement
, parseXMLContents
, module Text.XML.Light.Types
) where
import qualified Control.Exception as E
import qualified Text.XML as Conduit
import Text.XML.Unresolved (InvalidEventStream(..))
import qualified Text.XML.Light as Light
import Text.XML.Light.Types
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Map as M
import Data.Maybe (mapMaybe)
-- Drop in replacement for parseXMLDoc in xml-light.
parseXMLElement :: TL.Text -> Either T.Text Light.Element
parseXMLElement t =
elementToElement . Conduit.documentRoot <$>
either (Left . T.pack . E.displayException) Right
(Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t)
parseXMLContents :: TL.Text -> Either T.Text [Light.Content]
parseXMLContents t =
case Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t of
Left e ->
case E.fromException e of
Just (ContentAfterRoot _) ->
elContent <$> parseXMLElement ("<wrapper>" <> t <> "</wrapper>")
_ -> Left . T.pack . E.displayException $ e
Right x -> Right [Light.Elem . elementToElement . Conduit.documentRoot $ x]
elementToElement :: Conduit.Element -> Light.Element
elementToElement (Conduit.Element name attribMap nodes) =
Light.Element (nameToQname name) attrs (mapMaybe nodeToContent nodes) Nothing
where
attrs = map (\(n,v) -> Light.Attr (nameToQname n) (T.unpack v)) $
M.toList attribMap
nameToQname (Conduit.Name localName mbns mbpref) =
case mbpref of
Nothing | "xmlns:" `T.isPrefixOf` localName ->
Light.QName (T.unpack $ T.drop 6 localName) (T.unpack <$> mbns)
(Just "xmlns")
_ -> Light.QName (T.unpack localName) (T.unpack <$> mbns)
(T.unpack <$> mbpref)
nodeToContent :: Conduit.Node -> Maybe Light.Content
nodeToContent (Conduit.NodeElement el) =
Just (Light.Elem (elementToElement el))
nodeToContent (Conduit.NodeContent t) =
Just (Light.Text (Light.CData Light.CDataText (T.unpack t) Nothing))
nodeToContent _ = Nothing

View file

@ -88,6 +88,7 @@ tests = [ testGroup "inline code"
"<p>\n\
\ <inline-formula><alternatives>\n\
\ <tex-math><![CDATA[\\sigma|_{\\{x\\}}]]></tex-math>\n\
\ </alternatives></inline-formula>\n\
\</p>"
=?> para (math "\\sigma|_{\\{x\\}}")
, test jats "math ml only" $

View file

@ -4,7 +4,7 @@
<caption>
<p>bar</p>
</caption>
<graphic xlink:href="foo.png" xlink:alt-text="baz">
<graphic xlink:href="foo.png" xlink:alt-text="baz" />
</fig>
^D
[Para [Image ("fig-1",[],[]) [Str "bar"] ("foo.png","fig:")]]
@ -17,7 +17,7 @@
<title>foo</title>
<p>bar</p>
</caption>
<graphic xlink:href="foo.png" xlink:alt-text="baz">
<graphic xlink:href="foo.png" xlink:alt-text="baz" />
</fig>
^D
[Para [Image ("fig-1",[],[]) [Str "foo",LineBreak,Str "bar"] ("foo.png","fig:")]]

View file

@ -1,6 +1,11 @@
<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"
[
<!ENTITY GHC "GHC" >
<!ENTITY let "LET" >
<!ENTITY case "CASE" >
]>
<article>
<articleinfo>
<title>Pandoc Test Suite</title>

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,4 +1,4 @@
Pandoc (Meta {unMeta = fromList [("author",MetaList [MetaInlines [Str "John",Space,Str "MacFarlane"]]),("title",MetaInlines [Str "Pandoc",Space,Str "Test",Space,Str "Suite"])]})
Pandoc (Meta {unMeta = fromList [("author",MetaList [MetaInlines [Str "John",Space,Str "MacFarlane"],MetaInlines [Str "Anonymous"]]),("title",MetaInlines [Str "Pandoc",Space,Str "Test",Space,Str "Suite"])]})
[Para [Str "This",Space,Str "is",Space,Str "a",Space,Str "set",Space,Str "of",Space,Str "tests",Space,Str "for",Space,Str "pandoc.",Space,Str "Most",Space,Str "of",Space,Str "them",Space,Str "are",Space,Str "adapted",Space,Str "from",Space,Str "John",Space,Str "Gruber's",Space,Str "markdown",Space,Str "test",Space,Str "suite."]
,Header 1 ("headers",[],[]) [Str "Headers"]
,Header 2 ("level-2-with-an-embedded-link",[],[]) [Str "Level",Space,Str "2",Space,Str "with",Space,Str "an",SoftBreak,Link ("",[],[]) [Str "embedded",SoftBreak,Str "link"] ("/url","")]

View file

@ -20,6 +20,7 @@
<surname>MacFarlane</surname>
<given-names>John</given-names>
</name>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Anonymous</surname>

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.