MediaBag improvements.
In the current dev version, we will sometimes add a version of an image with a hashed name, keeping the original version with the original name, which would leave to undesirable duplication. This change separates the media's filename from the media's canonical name (which is the path of the link in the document itself). Filenames are based on SHA1 hashes and assigned automatically. In Text.Pandoc.MediaBag: - Export MediaItem type [API change]. - Change MediaBag type to a map from Text to MediaItem [API change]. - `lookupMedia` now returns a `MediaItem` [API change]. - Change `insertMedia` so it sets the `mediaPath` to a filename based on the SHA1 hash of the contents. This will be used when contents are extracted. In Text.Pandoc.Class.PandocMonad: - Remove `fetchMediaResource` [API change]. Lua MediaBag module has been changed minimally. In the future it would be better, probably, to give Lua access to the full MediaItem type.
This commit is contained in:
parent
58fbf56548
commit
8511f6fdf6
5 changed files with 55 additions and 48 deletions
|
@ -62,7 +62,7 @@ import Text.Pandoc.Definition (Pandoc, Inline (Image))
|
|||
import Text.Pandoc.Error (PandocError (..))
|
||||
import Text.Pandoc.Logging (LogMessage (..), messageVerbosity, showLogMessage)
|
||||
import Text.Pandoc.MIME (MimeType)
|
||||
import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory)
|
||||
import Text.Pandoc.MediaBag (MediaBag, MediaItem(..), lookupMedia, mediaDirectory)
|
||||
import Text.Pandoc.Walk (walk)
|
||||
import qualified Control.Exception as E
|
||||
import qualified Data.ByteString as B
|
||||
|
@ -213,14 +213,13 @@ writeMedia :: (PandocMonad m, MonadIO m)
|
|||
writeMedia dir mediabag subpath = do
|
||||
-- we join and split to convert a/b/c to a\b\c on Windows;
|
||||
-- in zip containers all paths use /
|
||||
let fullpath = dir </> unEscapeString (normalise subpath)
|
||||
let mbcontents = lookupMedia subpath mediabag
|
||||
case mbcontents of
|
||||
Nothing -> throwError $ PandocResourceNotFound $ pack subpath
|
||||
Just (_, bs) -> do
|
||||
report $ Extracting $ pack fullpath
|
||||
Just item -> do
|
||||
let fullpath = dir </> mediaPath item
|
||||
liftIOError (createDirectoryIfMissing True) (takeDirectory fullpath)
|
||||
logIOError $ BL.writeFile fullpath bs
|
||||
logIOError $ BL.writeFile fullpath $ mediaContents item
|
||||
|
||||
-- | If the given Inline element is an image with a @src@ path equal to
|
||||
-- one in the list of @paths@, then prepends @dir@ to the image source;
|
||||
|
|
|
@ -37,7 +37,6 @@ module Text.Pandoc.Class.PandocMonad
|
|||
, setUserDataDir
|
||||
, getUserDataDir
|
||||
, fetchItem
|
||||
, fetchMediaResource
|
||||
, getInputFiles
|
||||
, setInputFiles
|
||||
, getOutputFile
|
||||
|
@ -57,8 +56,6 @@ module Text.Pandoc.Class.PandocMonad
|
|||
import Codec.Archive.Zip
|
||||
import Control.Monad.Except (MonadError (catchError, throwError),
|
||||
MonadTrans, lift, when)
|
||||
import Data.Digest.Pure.SHA (sha1, showDigest)
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.List (foldl')
|
||||
import Data.Time (UTCTime)
|
||||
import Data.Time.Clock.POSIX (POSIXTime, utcTimeToPOSIXSeconds,
|
||||
|
@ -67,7 +64,7 @@ import Data.Time.LocalTime (TimeZone, ZonedTime, utcToZonedTime)
|
|||
import Network.URI ( escapeURIString, nonStrictRelativeTo,
|
||||
unEscapeString, parseURIReference, isAllowedInURI,
|
||||
parseURI, URI(..) )
|
||||
import System.FilePath ((</>), (<.>), takeExtension, dropExtension,
|
||||
import System.FilePath ((</>), takeExtension, dropExtension,
|
||||
isRelative, splitDirectories)
|
||||
import System.Random (StdGen)
|
||||
import Text.Collate.Lang (Lang(..), parseLang, renderLang)
|
||||
|
@ -75,8 +72,8 @@ import Text.Pandoc.Class.CommonState (CommonState (..))
|
|||
import Text.Pandoc.Definition
|
||||
import Text.Pandoc.Error
|
||||
import Text.Pandoc.Logging
|
||||
import Text.Pandoc.MIME (MimeType, getMimeType, extensionFromMimeType)
|
||||
import Text.Pandoc.MediaBag (MediaBag, lookupMedia)
|
||||
import Text.Pandoc.MIME (MimeType, getMimeType)
|
||||
import Text.Pandoc.MediaBag (MediaBag, lookupMedia, MediaItem(..))
|
||||
import Text.Pandoc.Shared (uriPathToPath, safeRead)
|
||||
import Text.Pandoc.Translations (Term(..), Translations, lookupTerm,
|
||||
readTranslations)
|
||||
|
@ -376,7 +373,8 @@ fetchItem :: PandocMonad m
|
|||
fetchItem s = do
|
||||
mediabag <- getMediaBag
|
||||
case lookupMedia (T.unpack s) mediabag of
|
||||
Just (mime, bs) -> return (BL.toStrict bs, Just mime)
|
||||
Just item -> return (BL.toStrict (mediaContents item),
|
||||
Just (mediaMimeType item))
|
||||
Nothing -> downloadOrRead s
|
||||
|
||||
-- | Returns the content and, if available, the MIME type of a resource.
|
||||
|
@ -629,19 +627,6 @@ withPaths (p:ps) action fp =
|
|||
catchError (action (p </> fp))
|
||||
(\_ -> withPaths ps action fp)
|
||||
|
||||
-- | Fetch local or remote resource (like an image) and provide data suitable
|
||||
-- for adding it to the MediaBag.
|
||||
fetchMediaResource :: PandocMonad m
|
||||
=> T.Text -> m (FilePath, Maybe MimeType, BL.ByteString)
|
||||
fetchMediaResource src = do
|
||||
(bs, mt) <- fetchItem src
|
||||
let ext = fromMaybe (T.pack $ takeExtension $ T.unpack src)
|
||||
(mt >>= extensionFromMimeType)
|
||||
let bs' = BL.fromChunks [bs]
|
||||
let basename = showDigest $ sha1 bs'
|
||||
let fname = basename <.> T.unpack ext
|
||||
return (fname, mt, bs')
|
||||
|
||||
-- | Traverse tree, filling media bag for any images that
|
||||
-- aren't already in the media bag.
|
||||
fillMediaBag :: PandocMonad m => Pandoc -> m Pandoc
|
||||
|
@ -649,12 +634,18 @@ fillMediaBag d = walkM handleImage d
|
|||
where handleImage :: PandocMonad m => Inline -> m Inline
|
||||
handleImage (Image attr lab (src, tit)) = catchError
|
||||
(do mediabag <- getMediaBag
|
||||
case lookupMedia (T.unpack src) mediabag of
|
||||
Just (_, _) -> return $ Image attr lab (src, tit)
|
||||
Nothing -> do
|
||||
(fname, mt, bs) <- fetchMediaResource src
|
||||
insertMedia fname mt bs
|
||||
return $ Image attr lab (T.pack fname, tit))
|
||||
let fp = T.unpack src
|
||||
src' <- T.pack <$> case lookupMedia fp mediabag of
|
||||
Just item -> return $ mediaPath item
|
||||
Nothing -> do
|
||||
(bs, mt) <- fetchItem src
|
||||
insertMedia fp mt (BL.fromStrict bs)
|
||||
mediabag' <- getMediaBag
|
||||
case lookupMedia fp mediabag' of
|
||||
Just item -> return $ mediaPath item
|
||||
Nothing -> throwError $ PandocSomeError $
|
||||
src <> " not successfully inserted into MediaBag"
|
||||
return $ Image attr lab (src', tit))
|
||||
(\e ->
|
||||
case e of
|
||||
PandocResourceNotFound _ -> do
|
||||
|
|
|
@ -73,9 +73,9 @@ lookup fp = do
|
|||
res <- MB.lookupMedia fp <$> getMediaBag
|
||||
liftPandocLua $ case res of
|
||||
Nothing -> 1 <$ Lua.pushnil
|
||||
Just (mimeType, contents) -> do
|
||||
Lua.push mimeType
|
||||
Lua.push contents
|
||||
Just item -> do
|
||||
Lua.push $ MB.mediaMimeType item
|
||||
Lua.push $ MB.mediaContents item
|
||||
return 2
|
||||
|
||||
list :: PandocLua NumResults
|
||||
|
|
|
@ -15,6 +15,7 @@ Definition of a MediaBag object to hold binary resources, and an
|
|||
interface for interacting with it.
|
||||
-}
|
||||
module Text.Pandoc.MediaBag (
|
||||
MediaItem(..),
|
||||
MediaBag,
|
||||
deleteMedia,
|
||||
lookupMedia,
|
||||
|
@ -28,15 +29,23 @@ import qualified Data.Map as M
|
|||
import Data.Maybe (fromMaybe)
|
||||
import Data.Typeable (Typeable)
|
||||
import System.FilePath
|
||||
import Text.Pandoc.MIME (MimeType, getMimeTypeDef)
|
||||
import Text.Pandoc.MIME (MimeType, getMimeTypeDef, extensionFromMimeType)
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Data.Digest.Pure.SHA (sha1, showDigest)
|
||||
|
||||
data MediaItem =
|
||||
MediaItem
|
||||
{ mediaMimeType :: MimeType
|
||||
, mediaPath :: FilePath
|
||||
, mediaContents :: BL.ByteString
|
||||
} deriving (Eq, Ord, Show, Data, Typeable)
|
||||
|
||||
-- | A container for a collection of binary resources, with names and
|
||||
-- mime types. Note that a 'MediaBag' is a Monoid, so 'mempty'
|
||||
-- can be used for an empty 'MediaBag', and '<>' can be used to append
|
||||
-- two 'MediaBag's.
|
||||
newtype MediaBag = MediaBag (M.Map Text (MimeType, BL.ByteString))
|
||||
newtype MediaBag = MediaBag (M.Map Text MediaItem)
|
||||
deriving (Semigroup, Monoid, Data, Typeable)
|
||||
|
||||
instance Show MediaBag where
|
||||
|
@ -62,26 +71,34 @@ insertMedia :: FilePath -- ^ relative path and canonical name of resource
|
|||
-> MediaBag
|
||||
-> MediaBag
|
||||
insertMedia fp mbMime contents (MediaBag mediamap) =
|
||||
MediaBag (M.insert (canonicalize fp) (mime, contents) mediamap)
|
||||
where mime = fromMaybe fallback mbMime
|
||||
MediaBag (M.insert (canonicalize fp) mediaItem mediamap)
|
||||
where mediaItem = MediaItem{ mediaPath = showDigest (sha1 contents) <>
|
||||
"." <> ext
|
||||
, mediaContents = contents
|
||||
, mediaMimeType = mt }
|
||||
fallback = case takeExtension fp of
|
||||
".gz" -> getMimeTypeDef $ dropExtension fp
|
||||
_ -> getMimeTypeDef fp
|
||||
mt = fromMaybe fallback mbMime
|
||||
ext = maybe (takeExtension fp) T.unpack $ extensionFromMimeType mt
|
||||
|
||||
|
||||
-- | Lookup a media item in a 'MediaBag', returning mime type and contents.
|
||||
lookupMedia :: FilePath
|
||||
-> MediaBag
|
||||
-> Maybe (MimeType, BL.ByteString)
|
||||
-> Maybe MediaItem
|
||||
lookupMedia fp (MediaBag mediamap) = M.lookup (canonicalize fp) mediamap
|
||||
|
||||
-- | Get a list of the file paths stored in a 'MediaBag', with
|
||||
-- their corresponding mime types and the lengths in bytes of the contents.
|
||||
mediaDirectory :: MediaBag -> [(FilePath, MimeType, Int)]
|
||||
mediaDirectory (MediaBag mediamap) =
|
||||
M.foldrWithKey (\fp (mime,contents) ->
|
||||
((T.unpack fp, mime, fromIntegral (BL.length contents)):)) [] mediamap
|
||||
M.foldrWithKey (\fp item ->
|
||||
((T.unpack fp, mediaMimeType item,
|
||||
fromIntegral (BL.length (mediaContents item))):)) [] mediamap
|
||||
|
||||
mediaItems :: MediaBag -> [(FilePath, MimeType, BL.ByteString)]
|
||||
mediaItems (MediaBag mediamap) =
|
||||
M.foldrWithKey (\fp (mime,contents) ->
|
||||
((T.unpack fp, mime, contents):)) [] mediamap
|
||||
M.foldrWithKey (\fp item ->
|
||||
((T.unpack fp, mediaMimeType item, mediaContents item):))
|
||||
[] mediamap
|
||||
|
|
|
@ -24,7 +24,7 @@ import Test.Tasty.HUnit
|
|||
import Tests.Helpers
|
||||
import Text.Pandoc
|
||||
import qualified Text.Pandoc.Class as P
|
||||
import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory)
|
||||
import qualified Text.Pandoc.MediaBag as MB
|
||||
import Text.Pandoc.UTF8 as UTF8
|
||||
|
||||
-- We define a wrapper around pandoc that doesn't normalize in the
|
||||
|
@ -91,11 +91,11 @@ getMedia :: FilePath -> FilePath -> IO (Maybe B.ByteString)
|
|||
getMedia archivePath mediaPath = fmap fromEntry . findEntryByPath
|
||||
("word/" ++ mediaPath) . toArchive <$> B.readFile archivePath
|
||||
|
||||
compareMediaPathIO :: FilePath -> MediaBag -> FilePath -> IO Bool
|
||||
compareMediaPathIO :: FilePath -> MB.MediaBag -> FilePath -> IO Bool
|
||||
compareMediaPathIO mediaPath mediaBag docxPath = do
|
||||
docxMedia <- getMedia docxPath mediaPath
|
||||
let mbBS = case lookupMedia mediaPath mediaBag of
|
||||
Just (_, bs) -> bs
|
||||
let mbBS = case MB.lookupMedia mediaPath mediaBag of
|
||||
Just item -> MB.mediaContents item
|
||||
Nothing -> error ("couldn't find " ++
|
||||
mediaPath ++
|
||||
" in media bag")
|
||||
|
@ -110,7 +110,7 @@ compareMediaBagIO docxFile = do
|
|||
mb <- runIOorExplode $ readDocx defopts df >> P.getMediaBag
|
||||
bools <- mapM
|
||||
(\(fp, _, _) -> compareMediaPathIO fp mb docxFile)
|
||||
(mediaDirectory mb)
|
||||
(MB.mediaDirectory mb)
|
||||
return $ and bools
|
||||
|
||||
testMediaBagIO :: String -> FilePath -> IO TestTree
|
||||
|
|
Loading…
Add table
Reference in a new issue