Improve docx reader's robustness in extracting images.

The docx reader made a couple of assumptions about how docx containers are laid out that do not always hold, with the result that some images in documents were not found or extracted. Closes #7511.
2021-08-19 10:49:20 -07:00 · 2021-08-19 10:49:20 -07:00 · ef4efa5373
commit ef4efa5373
parent 5159d6653b
1 changed files with 6 additions and 5 deletions
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@ -507,9 +507,7 @@ archiveToRelationships archive docXmlPath =

 filePathIsMedia :: FilePath -> Bool
 filePathIsMedia fp =
-  let (dir, _) = splitFileName fp
-  in
-   (dir == "word/media/")
+  "media" `elem` splitPath (takeDirectory fp)

 lookupLevel :: T.Text -> T.Text -> Numbering -> Maybe Level
 lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
@ -774,8 +772,11 @@ expandDrawingId s = do
  target <- asks (fmap T.unpack . lookupRelationship location s . envRelationships)
  case target of
    Just filepath -> do
-      bytes <- asks (lookup ("word/" ++ filepath) . envMedia)
-      case bytes of
+      media <- asks envMedia
+      let filepath' = case filepath of
+                        ('/':rest) -> rest
+                        _ -> "word/" ++ filepath
+      case lookup filepath' media of
        Just bs -> return (filepath, bs)
        Nothing -> throwError DocxError
    Nothing -> throwError DocxError