Improve docx reader's robustness in extracting images.

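The docx reader made a couple of assumptions about how docx
containers are laid out that do not always hold, with
the result that some images in documents were not
found or extracted. Specifically, it assumed that media files
always live directly in `word/media/` and that relationship
targets are always relative to `word/`.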

Closes #7511.
This commit is contained in:
John MacFarlane 2021-08-19 10:49:20 -07:00
parent 5159d6653b
commit ef4efa5373

View file

@@ -507,9 +507,7 @@ archiveToRelationships archive docXmlPath =
filePathIsMedia :: FilePath -> Bool
filePathIsMedia fp =
let (dir, _) = splitFileName fp
in
(dir == "word/media/")
"media" `elem` splitPath (takeDirectory fp)
lookupLevel :: T.Text -> T.Text -> Numbering -> Maybe Level
lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
@@ -774,8 +772,11 @@ expandDrawingId s = do
target <- asks (fmap T.unpack . lookupRelationship location s . envRelationships)
case target of
Just filepath -> do
bytes <- asks (lookup ("word/" ++ filepath) . envMedia)
case bytes of
media <- asks envMedia
let filepath' = case filepath of
('/':rest) -> rest
_ -> "word/" ++ filepath
case lookup filepath' media of
Just bs -> return (filepath, bs)
Nothing -> throwError DocxError
Nothing -> throwError DocxError