Docx Reader: preprocess Document body to unwrap "w:sdt" elements

We walk through the document (using the zipper in
Text.XML.Light.Cursor) to unwrap the sdt tags before doing the rest of
the parsing of the document. Note that the function is generically
named `walkDocument` in case we need to do any further preprocessing
in the future.

Closes #4190
This commit is contained in:
Jesse Rosenthal 2017-12-27 09:49:28 -05:00
parent 7e8cfc0990
commit dc3ee500a0

View file

@ -73,6 +73,7 @@ import Text.TeXMath (Exp)
import Text.TeXMath.Readers.OMML (readOMML)
import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont)
import Text.XML.Light
import qualified Text.XML.Light.Cursor as XMLC
data ReaderEnv = ReaderEnv { envNotes :: Notes
, envComments :: Comments
@ -117,6 +118,32 @@ mapD f xs =
in
concatMapM handler xs
unwrapSDT :: NameSpaces -> Content -> Content
unwrapSDT ns (Elem element)
| isElem ns "w" "sdt" element
, Just sdtContent <- findChildByName ns "w" "sdtContent" element
, child : _ <- elChildren sdtContent
= Elem child
unwrapSDT _ content = content
walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor
walkDocument' ns cur =
let modifiedCur = XMLC.modifyContent (unwrapSDT ns) cur
in
case XMLC.nextDF modifiedCur of
Just cur' -> walkDocument' ns cur'
Nothing -> XMLC.root modifiedCur
walkDocument :: NameSpaces -> Element -> Maybe Element
walkDocument ns element =
let cur = XMLC.fromContent (Elem element)
cur' = walkDocument' ns cur
in
case XMLC.toTree cur' of
Elem element' -> Just element'
_ -> Nothing
data Docx = Docx Document
deriving Show
@ -298,7 +325,10 @@ archiveToDocument zf = do
docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
let namespaces = elemToNameSpaces docElem
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
body <- elemToBody namespaces bodyElem
let bodyElem' = case walkDocument namespaces bodyElem of
Just e -> e
Nothing -> bodyElem
body <- elemToBody namespaces bodyElem'
return $ Document namespaces body
elemToBody :: NameSpaces -> Element -> D Body