Fix MediaBag regressions.
With the 2.14 release `--extract-media` stopped working as before; there could be mismatches between the paths in the rendered document and the extracted media. This patch makes several changes (while keeping the same API). The `mediaPath` in 2.14 was always constructed from the SHA1 hash of the media contents. Now, we preserve the original path unless it's an absolute path or contains `..` segments (in that case we use a path based on the SHA1 hash of the contents). When constructing a path from the SHA1 hash, we always use the original extension, if there is one. Otherwise we look up an appropriate extension for the mime type. `mediaDirectory` and `mediaItems` now use the `mediaPath`, rather than the mediabag key, for the first component of the tuple. This makes more sense, I think, and fits with the documentation of these functions; eventually, though, we should rework the API so that `mediaItems` returns both the keys and the MediaItems. Rewriting of source paths in `extractMedia` has been fixed. `fillMediaBag` has been modified so that it doesn't modify image paths (that was part of the problem in #7345). We now do path normalization (e.g. `\` separators on Windows) only in writing the media; the paths are left unchanged in the image links (sensibly, since they might be URLs and not file paths). These changes should restore the original behavior from before 2.14. Closes #7345.
This commit is contained in:
parent
aa79b3035c
commit
3776e828a8
4 changed files with 47 additions and 48 deletions
12
MANUAL.txt
12
MANUAL.txt
|
@ -675,12 +675,12 @@ header when requesting a document from a URL:
|
|||
: Extract images and other media contained in or linked from
|
||||
the source document to the path *DIR*, creating it if
|
||||
necessary, and adjust the images references in the document
|
||||
so they point to the extracted files. If the source format is
|
||||
a binary container (docx, epub, or odt), the media is
|
||||
extracted from the container and the original
|
||||
filenames are used. Otherwise the media is read from the
|
||||
file system or downloaded, and new filenames are constructed
|
||||
based on SHA1 hashes of the contents.
|
||||
so they point to the extracted files. Media are downloaded,
|
||||
read from the file system, or extracted from a binary
|
||||
container (e.g. docx), as needed. The original file paths
|
||||
are used if they are relative paths not containing `..`.
|
||||
Otherwise filenames are constructed from the SHA1 hash of
|
||||
the contents.
|
||||
|
||||
`--abbreviations=`*FILE*
|
||||
|
||||
|
|
|
@ -62,7 +62,7 @@ import Text.Pandoc.Definition (Pandoc, Inline (Image))
|
|||
import Text.Pandoc.Error (PandocError (..))
|
||||
import Text.Pandoc.Logging (LogMessage (..), messageVerbosity, showLogMessage)
|
||||
import Text.Pandoc.MIME (MimeType)
|
||||
import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaDirectory)
|
||||
import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaItems)
|
||||
import Text.Pandoc.Walk (walk)
|
||||
import qualified Control.Exception as E
|
||||
import qualified Data.ByteString as B
|
||||
|
@ -200,31 +200,32 @@ alertIndent (l:ls) = do
|
|||
extractMedia :: (PandocMonad m, MonadIO m) => FilePath -> Pandoc -> m Pandoc
|
||||
extractMedia dir d = do
|
||||
media <- getMediaBag
|
||||
case [fp | (fp, _, _) <- mediaDirectory media] of
|
||||
[] -> return d
|
||||
fps -> do
|
||||
mapM_ (writeMedia dir media) fps
|
||||
return $ walk (adjustImagePath dir fps) d
|
||||
let items = mediaItems media
|
||||
if null items
|
||||
then return d
|
||||
else do
|
||||
mapM_ (writeMedia dir) items
|
||||
return $ walk (adjustImagePath dir media) d
|
||||
|
||||
-- | Write the contents of a media bag to a path.
|
||||
writeMedia :: (PandocMonad m, MonadIO m)
|
||||
=> FilePath -> MediaBag -> FilePath
|
||||
=> FilePath
|
||||
-> (FilePath, MimeType, BL.ByteString)
|
||||
-> m ()
|
||||
writeMedia dir mediabag subpath = do
|
||||
let mbcontents = lookupMedia subpath mediabag
|
||||
case mbcontents of
|
||||
Nothing -> throwError $ PandocResourceNotFound $ pack subpath
|
||||
Just item -> do
|
||||
-- we normalize to get proper path separators for the platform
|
||||
let fullpath = dir </> normalise (mediaPath item)
|
||||
liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
|
||||
logIOError $ BL.writeFile fullpath $ mediaContents item
|
||||
writeMedia dir (fp, _mt, bs) = do
|
||||
-- we normalize to get proper path separators for the platform
|
||||
let fullpath = normalise $ dir </> fp
|
||||
liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
|
||||
logIOError $ BL.writeFile fullpath bs
|
||||
|
||||
-- | If the given Inline element is an image with a @src@ path equal to
|
||||
-- one in the list of @paths@, then prepends @dir@ to the image source;
|
||||
-- returns the element unchanged otherwise.
|
||||
adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline
|
||||
adjustImagePath dir paths (Image attr lab (src, tit))
|
||||
| unpack src `elem` paths
|
||||
= Image attr lab (pack (normalise $ dir </> unpack src), tit)
|
||||
adjustImagePath :: FilePath -> MediaBag -> Inline -> Inline
|
||||
adjustImagePath dir mediabag (Image attr lab (src, tit)) =
|
||||
case lookupMedia (T.unpack src) mediabag of
|
||||
Nothing -> Image attr lab (src, tit)
|
||||
Just item ->
|
||||
let fullpath = dir </> mediaPath item
|
||||
in Image attr lab (T.pack fullpath, tit)
|
||||
adjustImagePath _ _ x = x
|
||||
|
|
|
@ -638,17 +638,12 @@ fillMediaBag d = walkM handleImage d
|
|||
handleImage (Image attr lab (src, tit)) = catchError
|
||||
(do mediabag <- getMediaBag
|
||||
let fp = T.unpack src
|
||||
src' <- T.pack <$> case lookupMedia fp mediabag of
|
||||
Just item -> return $ mediaPath item
|
||||
Nothing -> do
|
||||
(bs, mt) <- fetchItem src
|
||||
insertMedia fp mt (BL.fromStrict bs)
|
||||
mediabag' <- getMediaBag
|
||||
case lookupMedia fp mediabag' of
|
||||
Just item -> return $ mediaPath item
|
||||
Nothing -> throwError $ PandocSomeError $
|
||||
src <> " not successfully inserted into MediaBag"
|
||||
return $ Image attr lab (src', tit))
|
||||
case lookupMedia fp mediabag of
|
||||
Just _ -> return ()
|
||||
Nothing -> do
|
||||
(bs, mt) <- fetchItem src
|
||||
insertMedia fp mt (BL.fromStrict bs)
|
||||
return $ Image attr lab (src, tit))
|
||||
(\e ->
|
||||
case e of
|
||||
PandocResourceNotFound _ -> do
|
||||
|
|
|
@ -71,16 +71,21 @@ insertMedia :: FilePath -- ^ relative path and canonical name of resource
|
|||
-> MediaBag
|
||||
-> MediaBag
|
||||
insertMedia fp mbMime contents (MediaBag mediamap) =
|
||||
MediaBag (M.insert (canonicalize fp) mediaItem mediamap)
|
||||
where mediaItem = MediaItem{ mediaPath = showDigest (sha1 contents) <>
|
||||
"." <> ext
|
||||
MediaBag (M.insert fp' mediaItem mediamap)
|
||||
where mediaItem = MediaItem{ mediaPath = newpath
|
||||
, mediaContents = contents
|
||||
, mediaMimeType = mt }
|
||||
fp' = canonicalize fp
|
||||
newpath = if isRelative fp && ".." `notElem` splitPath fp
|
||||
then T.unpack fp'
|
||||
else showDigest (sha1 contents) <> "." <> ext
|
||||
fallback = case takeExtension fp of
|
||||
".gz" -> getMimeTypeDef $ dropExtension fp
|
||||
_ -> getMimeTypeDef fp
|
||||
mt = fromMaybe fallback mbMime
|
||||
ext = maybe (takeExtension fp) T.unpack $ extensionFromMimeType mt
|
||||
ext = case takeExtension fp of
|
||||
'.':e -> e
|
||||
_ -> maybe "" T.unpack $ extensionFromMimeType mt
|
||||
|
||||
|
||||
-- | Lookup a media item in a 'MediaBag', returning mime type and contents.
|
||||
|
@ -92,13 +97,11 @@ lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap
|
|||
-- | Get a list of the file paths stored in a 'MediaBag', with
|
||||
-- their corresponding mime types and the lengths in bytes of the contents.
|
||||
mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)]
|
||||
mediaDirectory (MediaBag mediamap) =
|
||||
M.foldrWithKey (\fp item ->
|
||||
((T.unpack fp, mediaMimeType item,
|
||||
fromIntegral (BL.length (mediaContents item))):)) [] mediamap
|
||||
mediaDirectory mediabag =
|
||||
map (\(fp, mt, bs) -> (fp, mt, fromIntegral (BL.length bs)))
|
||||
(mediaItems mediabag)
|
||||
|
||||
mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)]
|
||||
mediaItems (MediaBag mediamap) =
|
||||
M.foldrWithKey (\fp item ->
|
||||
((T.unpack fp, mediaMimeType item, mediaContents item):))
|
||||
[] mediamap
|
||||
map (\item -> (mediaPath item, mediaMimeType item, mediaContents item))
|
||||
(M.elems mediamap)
|
||||
|
|
Loading…
Add table
Reference in a new issue