Allow --extract-media to work with non-binary input formats.

If `--extract-media` is supplied with a non-binary input format,
pandoc will attempt to extract the contents of all linked images,
whether in local files, data: uris, or external uris.

They will be named based on the sha1 hash of the contents.

Closes #1583, #2289.

Notes:

- One thing that is slightly subideal with this commit is that
  identical resources will be downloaded multiple times.  To improve
  this we could have mediabag store an original filename/url +
  a new name.

- We might think about reusing some of this code, since more or less the
  same thing is done in the Docx, EPUB, PDF writers (with slight
  variations).
This commit is contained in:
John MacFarlane 2017-05-07 11:45:33 +02:00
parent de0fd90051
commit 400fe3188e

View file

@ -45,6 +45,7 @@ import Data.Aeson (eitherDecode', encode)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
import Data.Char (toLower, toUpper) import Data.Char (toLower, toUpper)
import Data.Digest.Pure.SHA (sha1, showDigest)
import qualified Data.Set as Set import qualified Data.Set as Set
import Data.Foldable (foldrM) import Data.Foldable (foldrM)
import Data.List (intercalate, isPrefixOf, isSuffixOf, sort) import Data.List (intercalate, isPrefixOf, isSuffixOf, sort)
@ -68,17 +69,19 @@ import System.IO (stdout)
import System.IO.Error (isDoesNotExistError) import System.IO.Error (isDoesNotExistError)
import Text.Pandoc import Text.Pandoc
import Text.Pandoc.Builder (setMeta) import Text.Pandoc.Builder (setMeta)
import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag) import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag,
fetchItem, insertMedia)
import Text.Pandoc.Highlighting (highlightingStyles) import Text.Pandoc.Highlighting (highlightingStyles)
import Text.Pandoc.Lua ( runLuaFilter ) import Text.Pandoc.Lua ( runLuaFilter )
import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory) import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory)
import Text.Pandoc.MIME (extensionFromMimeType)
import Text.Pandoc.PDF (makePDF) import Text.Pandoc.PDF (makePDF)
import Text.Pandoc.Process (pipeProcess) import Text.Pandoc.Process (pipeProcess)
import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI) import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI)
import Text.Pandoc.Shared (headerShift, openURL, readDataFile, import Text.Pandoc.Shared (headerShift, openURL, readDataFile,
readDataFileUTF8, safeRead, tabFilter) readDataFileUTF8, safeRead, tabFilter)
import qualified Text.Pandoc.UTF8 as UTF8 import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Walk (walk) import Text.Pandoc.Walk (walkM, walk)
import Text.Pandoc.XML (toEntities) import Text.Pandoc.XML (toEntities)
import Text.Printf import Text.Printf
#ifndef _WINDOWS #ifndef _WINDOWS
@ -413,11 +416,15 @@ convertWithOpts opts = do
runIO' $ do runIO' $ do
(doc, media) <- withMediaBag $ sourceToDoc sources >>= (doc, media) <- withMediaBag $ sourceToDoc sources >>=
(maybe return extractMedia (optExtractMedia opts) ( (if isJust (optExtractMedia opts)
then fillMedia (writerSourceURL writerOptions)
else return)
>=> maybe return extractMedia (optExtractMedia opts)
>=> return . flip (foldr addMetadata) metadata >=> return . flip (foldr addMetadata) metadata
>=> applyTransforms transforms >=> applyTransforms transforms
>=> applyLuaFilters datadir (optLuaFilters opts) [format] >=> applyLuaFilters datadir (optLuaFilters opts) [format]
>=> applyFilters datadir filters' [format]) >=> applyFilters datadir filters' [format]
)
case writer of case writer of
ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile
@ -723,6 +730,21 @@ defaultWriterName x =
-- Transformations of a Pandoc document post-parsing: -- Transformations of a Pandoc document post-parsing:
-- | Traverse tree, filling media bag.
fillMedia :: Maybe String -> Pandoc -> PandocIO Pandoc
fillMedia sourceURL d = walkM handleImage d
where handleImage :: Inline -> PandocIO Inline
handleImage (Image attr lab (src, tit)) = do
(bs, mt) <- fetchItem sourceURL src
let ext = fromMaybe (takeExtension src)
(mt >>= extensionFromMimeType)
let bs' = B.fromChunks [bs]
let basename = showDigest $ sha1 bs'
let fname = basename <.> ext
insertMedia fname mt bs'
return $ Image attr lab (fname, tit)
handleImage x = return x
extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc
extractMedia dir d = do extractMedia dir d = do
media <- getMediaBag media <- getMediaBag