Docx Reader: preprocess Document body to unwrap "w:sdt" elements
We walk through the document (using the zipper in Text.XML.Light.Cursor) to unwrap the sdt tags before doing the rest of the parsing of the document. Note that the function is generically named `walkDocument` in case we need to do any further preprocessing in the future. Closes #4190
This commit is contained in:
parent
7e8cfc0990
commit
dc3ee500a0
1 changed files with 31 additions and 1 deletions
|
@ -73,6 +73,7 @@ import Text.TeXMath (Exp)
|
|||
import Text.TeXMath.Readers.OMML (readOMML)
|
||||
import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont)
|
||||
import Text.XML.Light
|
||||
import qualified Text.XML.Light.Cursor as XMLC
|
||||
|
||||
data ReaderEnv = ReaderEnv { envNotes :: Notes
|
||||
, envComments :: Comments
|
||||
|
@ -117,6 +118,32 @@ mapD f xs =
|
|||
in
|
||||
concatMapM handler xs
|
||||
|
||||
unwrapSDT :: NameSpaces -> Content -> Content
|
||||
unwrapSDT ns (Elem element)
|
||||
| isElem ns "w" "sdt" element
|
||||
, Just sdtContent <- findChildByName ns "w" "sdtContent" element
|
||||
, child : _ <- elChildren sdtContent
|
||||
= Elem child
|
||||
unwrapSDT _ content = content
|
||||
|
||||
walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor
|
||||
walkDocument' ns cur =
|
||||
let modifiedCur = XMLC.modifyContent (unwrapSDT ns) cur
|
||||
in
|
||||
case XMLC.nextDF modifiedCur of
|
||||
Just cur' -> walkDocument' ns cur'
|
||||
Nothing -> XMLC.root modifiedCur
|
||||
|
||||
walkDocument :: NameSpaces -> Element -> Maybe Element
|
||||
walkDocument ns element =
|
||||
let cur = XMLC.fromContent (Elem element)
|
||||
cur' = walkDocument' ns cur
|
||||
in
|
||||
case XMLC.toTree cur' of
|
||||
Elem element' -> Just element'
|
||||
_ -> Nothing
|
||||
|
||||
|
||||
data Docx = Docx Document
|
||||
deriving Show
|
||||
|
||||
|
@ -298,7 +325,10 @@ archiveToDocument zf = do
|
|||
docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
|
||||
let namespaces = elemToNameSpaces docElem
|
||||
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
|
||||
body <- elemToBody namespaces bodyElem
|
||||
let bodyElem' = case walkDocument namespaces bodyElem of
|
||||
Just e -> e
|
||||
Nothing -> bodyElem
|
||||
body <- elemToBody namespaces bodyElem'
|
||||
return $ Document namespaces body
|
||||
|
||||
elemToBody :: NameSpaces -> Element -> D Body
|
||||
|
|
Loading…
Add table
Reference in a new issue