Handle Word files generated by Microsoft Word Online.

For some reason, Word in Office 365 Online stores the main content
in `document2.xml` instead of `document.xml`.  As a result, pandoc
cannot find the document body and fails to parse these docx files.

This quick fix has the parser check for both `document.xml`
and `document2.xml`.

Addresses #5277, but a more robust solution would be to
get the name of the main document dynamically (who knows
whether it might change again?).
This commit is contained in:
John MacFarlane 2019-02-06 09:01:26 -08:00
parent 59fa4eb17e
commit 2b003d4a6b

View file

@ -364,6 +364,7 @@ archiveToDocxWithWarnings archive = do
archiveToDocument :: Archive -> D Document
archiveToDocument zf = do
entry <- maybeToD $ findEntryByPath "word/document.xml" zf
`mplus` findEntryByPath "word/document2.xml" zf -- see #5277
docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
let namespaces = elemToNameSpaces docElem
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
@ -478,6 +479,7 @@ archiveToComments zf =
-- | Classify a relationships part by its path inside the docx archive.
--
-- Returns the 'DocumentLocation' associated with the given @.rels@ path,
-- or 'Nothing' when the path is not one of the recognised relationship
-- files.  Both @document.xml.rels@ and @document2.xml.rels@ map to
-- 'InDocument' (see #5277).
filePathToRelType :: FilePath -> Maybe DocumentLocation
filePathToRelType relPath =
  case relPath of
    "word/_rels/document.xml.rels"  -> Just InDocument
    "word/_rels/document2.xml.rels" -> Just InDocument
    "word/_rels/footnotes.xml.rels" -> Just InFootnote
    "word/_rels/endnotes.xml.rels"  -> Just InEndnote
    _                               -> Nothing