From 3776e828a83048697e5c64d9fb4bedc0145197dc Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 10 Jun 2021 16:47:02 -0700
Subject: [PATCH] Fix MediaBag regressions.

With the 2.14 release `--extract-media` stopped working as before;
there could be mismatches between the paths in the rendered document and
the extracted media.

This patch makes several changes (while keeping the same API).

The `mediaPath` in 2.14 was always constructed from the SHA1 hash of
the media contents.  Now, we preserve the original path unless it's
an absolute path or contains `..` segments (in that case we use a path
based on the SHA1 hash of the contents).

When constructing a path from the SHA1 hash, we always use the
original extension, if there is one. Otherwise we look up an
appropriate extension for the mime type.

`mediaDirectory` and `mediaItems` now use the `mediaPath`, rather
than the mediabag key, for the first component of the tuple.
This makes more sense, I think, and fits with the documentation
of these functions; eventually, though, we should rework the API so that
`mediaItems` returns both the keys and the MediaItems.

Rewriting of source paths in `extractMedia` has been fixed.

`fillMediaBag` has been modified so that it doesn't modify
image paths (that was part of the problem in #7345).

We now do path normalization (e.g. `\` separators on Windows) only
in writing the media; the paths are left unchanged in the image
links (sensibly, since they might be URLs and not file paths).

These changes should restore the original behavior from before 2.14.

Closes #7345.
---
 MANUAL.txt                           | 12 ++++----
 src/Text/Pandoc/Class/IO.hs          | 41 ++++++++++++++--------------
 src/Text/Pandoc/Class/PandocMonad.hs | 17 ++++--------
 src/Text/Pandoc/MediaBag.hs          | 25 +++++++++--------
 4 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/MANUAL.txt b/MANUAL.txt
index b3a1f95e2..ef569433a 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -675,12 +675,12 @@ header when requesting a document from a URL:
 :   Extract images and other media contained in or linked from
     the source document to the path *DIR*, creating it if
     necessary, and adjust the images references in the document
-    so they point to the extracted files.  If the source format is
-    a binary container (docx, epub, or odt), the media is
-    extracted from the container and the original
-    filenames are used. Otherwise the media is read from the
-    file system or downloaded, and new filenames are constructed
-    based on SHA1 hashes of the contents.
+    so they point to the extracted files.  Media are downloaded,
+    read from the file system, or extracted from a binary
+    container (e.g. docx), as needed.  The original file paths
+    are used if they are relative paths not containing `..`.
+    Otherwise filenames are constructed from the SHA1 hash of
+    the contents.
 
 `--abbreviations=`*FILE*
 
diff --git a/src/Text/Pandoc/Class/IO.hs b/src/Text/Pandoc/Class/IO.hs
index 6df39d4d0..169074860 100644
--- a/src/Text/Pandoc/Class/IO.hs
+++ b/src/Text/Pandoc/Class/IO.hs
@@ -62,7 +62,7 @@ import Text.Pandoc.Definition (Pandoc, Inline (Image))
 import Text.Pandoc.Error (PandocError (..))
 import Text.Pandoc.Logging (LogMessage (..), messageVerbosity, showLogMessage)
 import Text.Pandoc.MIME (MimeType)
-import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaDirectory)
+import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaItems)
 import Text.Pandoc.Walk (walk)
 import qualified Control.Exception as E
 import qualified Data.ByteString as B
@@ -200,31 +200,32 @@ alertIndent (l:ls) = do
 extractMedia :: (PandocMonad m, MonadIO m) => FilePath -> Pandoc -> m Pandoc
 extractMedia dir d = do
   media <- getMediaBag
-  case [fp | (fp, _, _) <- mediaDirectory media] of
-    []  -> return d
-    fps -> do
-      mapM_ (writeMedia dir media) fps
-      return $ walk (adjustImagePath dir fps) d
+  let items = mediaItems media
+  if null items
+    then return d
+    else do
+      mapM_ (writeMedia dir) items
+      return $ walk (adjustImagePath dir media) d
 
 -- | Write the contents of a media bag to a path.
 writeMedia :: (PandocMonad m, MonadIO m)
-           => FilePath -> MediaBag -> FilePath
+           => FilePath
+           -> (FilePath, MimeType, BL.ByteString)
            -> m ()
-writeMedia dir mediabag subpath = do
-  let mbcontents = lookupMedia subpath mediabag
-  case mbcontents of
-       Nothing -> throwError $ PandocResourceNotFound $ pack subpath
-       Just item -> do
-         -- we normalize to get proper path separators for the platform
-         let fullpath = dir </> normalise (mediaPath item)
-         liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
-         logIOError $ BL.writeFile fullpath $ mediaContents item
+writeMedia dir (fp, _mt, bs) = do
+  -- we normalize to get proper path separators for the platform
+  let fullpath = normalise $ dir </> fp
+  liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
+  logIOError $ BL.writeFile fullpath bs
 
 -- | If the given Inline element is an image with a @src@ path equal to
 -- one in the list of @paths@, then prepends @dir@ to the image source;
 -- returns the element unchanged otherwise.
-adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline
-adjustImagePath dir paths (Image attr lab (src, tit))
-   | unpack src `elem` paths
-     = Image attr lab (pack (normalise $ dir </> unpack src), tit)
+adjustImagePath :: FilePath -> MediaBag -> Inline -> Inline
+adjustImagePath dir mediabag (Image attr lab (src, tit)) =
+  case lookupMedia (T.unpack src) mediabag of
+    Nothing -> Image attr lab (src, tit)
+    Just item ->
+      let fullpath = dir </> mediaPath item
+      in  Image attr lab (T.pack fullpath, tit)
 adjustImagePath _ _ x = x
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 4eb80df29..439aec071 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -638,17 +638,12 @@ fillMediaBag d = walkM handleImage d
         handleImage (Image attr lab (src, tit)) = catchError
           (do mediabag <- getMediaBag
               let fp = T.unpack src
-              src' <- T.pack <$> case lookupMedia fp mediabag of
-                        Just item -> return $ mediaPath item
-                        Nothing -> do
-                          (bs, mt) <- fetchItem src
-                          insertMedia fp mt (BL.fromStrict bs)
-                          mediabag' <- getMediaBag
-                          case lookupMedia fp mediabag' of
-                             Just item -> return $ mediaPath item
-                             Nothing -> throwError $ PandocSomeError $
-                               src <> " not successfully inserted into MediaBag"
-              return $ Image attr lab (src', tit))
+              case lookupMedia fp mediabag of
+                Just _ -> return ()
+                Nothing -> do
+                  (bs, mt) <- fetchItem src
+                  insertMedia fp mt (BL.fromStrict bs)
+              return $ Image attr lab (src, tit))
           (\e ->
               case e of
                 PandocResourceNotFound _ -> do
diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs
index a65f315fc..06fba5632 100644
--- a/src/Text/Pandoc/MediaBag.hs
+++ b/src/Text/Pandoc/MediaBag.hs
@@ -71,16 +71,21 @@ insertMedia :: FilePath       -- ^ relative path and canonical name of resource
             -> MediaBag
             -> MediaBag
 insertMedia fp mbMime contents (MediaBag mediamap) =
-  MediaBag (M.insert (canonicalize fp) mediaItem mediamap)
-  where mediaItem = MediaItem{ mediaPath = showDigest (sha1 contents) <>
-                                 "." <> ext
+  MediaBag (M.insert fp' mediaItem mediamap)
+  where mediaItem = MediaItem{ mediaPath = newpath
                              , mediaContents = contents
                              , mediaMimeType = mt }
+        fp' = canonicalize fp
+        newpath = if isRelative fp && ".." `notElem` splitPath fp
+                     then T.unpack fp'
+                     else showDigest (sha1 contents) <> "." <> ext
         fallback = case takeExtension fp of
                         ".gz" -> getMimeTypeDef $ dropExtension fp
                         _     -> getMimeTypeDef fp
         mt = fromMaybe fallback mbMime
-        ext = maybe (takeExtension fp) T.unpack $ extensionFromMimeType mt
+        ext = case takeExtension fp of
+               '.':e  -> e
+               _      -> maybe "" T.unpack $ extensionFromMimeType mt
 
 
 -- | Lookup a media item in a 'MediaBag', returning mime type and contents.
@@ -92,13 +97,11 @@ lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap
 -- | Get a list of the file paths stored in a 'MediaBag', with
 -- their corresponding mime types and the lengths in bytes of the contents.
 mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)]
-mediaDirectory (MediaBag mediamap) =
-  M.foldrWithKey (\fp item ->
-      ((T.unpack fp, mediaMimeType item,
-        fromIntegral (BL.length (mediaContents item))):)) [] mediamap
+mediaDirectory mediabag =
+  map (\(fp, mt, bs) -> (fp, mt, fromIntegral (BL.length bs)))
+    (mediaItems mediabag)
 
 mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)]
 mediaItems (MediaBag mediamap) =
-  M.foldrWithKey (\fp item ->
-      ((T.unpack fp, mediaMimeType item, mediaContents item):))
-     [] mediamap
+  map (\item -> (mediaPath item, mediaMimeType item, mediaContents item))
+      (M.elems mediamap)