Docx reader: Dynamically determine document.xml path.

The desktop version of Word places the main document file at
"word/document.xml", but Word Online places it at
"word/document2.xml". The actual path is stated in the root
"_rels/.rels" file, in the "Target" attribute of the "Relationship"
element whose "Type" attribute ends in "/relationships/officeDocument".

Closes #5277
This commit is contained in:
Jesse Rosenthal 2019-02-06 21:06:14 -05:00
parent 5d3b8ede15
commit 4cce0efa48

View file

@ -359,12 +359,21 @@ archiveToDocxWithWarnings archive = do
Right doc -> Right (Docx doc, stateWarnings st) Right doc -> Right (Docx doc, stateWarnings st)
Left e -> Left e Left e -> Left e
-- | Determine the archive path of the main document part.
--
-- Reads the root relationships file @_rels/.rels@, picks the first
-- @Relationship@ child whose @Type@ attribute is the officeDocument
-- relationship type, and returns its @Target@ attribute.
--
-- Returns 'Nothing' if the @_rels/.rels@ entry is missing, if it does
-- not parse as XML, if no matching @Relationship@ element exists, or
-- if the matching element has no @Target@ attribute.
getDocumentPath :: Archive -> Maybe String
getDocumentPath zf = do
  entry <- findEntryByPath "_rels/.rels" zf
  relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
  let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem
  rel <- listToMaybe $
         filter (\e -> findAttr (QName "Type" Nothing Nothing) e ==
                       Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")
         rels
  target <- findAttr (QName "Target" Nothing Nothing) rel
  -- The target may be written as an absolute part name with a leading
  -- slash (e.g. "/word/document.xml"). The result is used as an archive
  -- entry path, and entries are looked up without a leading slash (as in
  -- "_rels/.rels" above), so drop one leading slash if present.
  return $ case target of
    '/' : relative -> relative
    _              -> target
archiveToDocument :: Archive -> D Document archiveToDocument :: Archive -> D Document
archiveToDocument zf = do archiveToDocument zf = do
entry <- maybeToD $ findEntryByPath "word/document.xml" zf docPath <- maybeToD $ getDocumentPath zf
`mplus` findEntryByPath "word/document2.xml" zf -- see #5277 entry <- maybeToD $ findEntryByPath docPath zf
docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
let namespaces = elemToNameSpaces docElem let namespaces = elemToNameSpaces docElem
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem