Allow --extract-media
to work with non-binary input formats.
If `--extract-media` is supplied with a non-binary input format, pandoc will attempt to extract the contents of all linked images, whether in local files, data: uris, or external uris. They will be named based on the sha1 hash of the contents. Closes #1583, #2289. Notes: - One thing that is slightly subideal with this commit is that identical resources will be downloaded multiple times. To improve this we could have mediabag store an original filename/url + a new name. - We might think about reusing some of this code, since more or less the same thing is done in the Docx, EPUB, PDF writers (with slight variations).
This commit is contained in:
parent
de0fd90051
commit
400fe3188e
1 changed files with 26 additions and 4 deletions
|
@ -45,6 +45,7 @@ import Data.Aeson (eitherDecode', encode)
|
||||||
import qualified Data.ByteString as BS
|
import qualified Data.ByteString as BS
|
||||||
import qualified Data.ByteString.Lazy as B
|
import qualified Data.ByteString.Lazy as B
|
||||||
import Data.Char (toLower, toUpper)
|
import Data.Char (toLower, toUpper)
|
||||||
|
import Data.Digest.Pure.SHA (sha1, showDigest)
|
||||||
import qualified Data.Set as Set
|
import qualified Data.Set as Set
|
||||||
import Data.Foldable (foldrM)
|
import Data.Foldable (foldrM)
|
||||||
import Data.List (intercalate, isPrefixOf, isSuffixOf, sort)
|
import Data.List (intercalate, isPrefixOf, isSuffixOf, sort)
|
||||||
|
@ -68,17 +69,19 @@ import System.IO (stdout)
|
||||||
import System.IO.Error (isDoesNotExistError)
|
import System.IO.Error (isDoesNotExistError)
|
||||||
import Text.Pandoc
|
import Text.Pandoc
|
||||||
import Text.Pandoc.Builder (setMeta)
|
import Text.Pandoc.Builder (setMeta)
|
||||||
import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag)
|
import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag,
|
||||||
|
fetchItem, insertMedia)
|
||||||
import Text.Pandoc.Highlighting (highlightingStyles)
|
import Text.Pandoc.Highlighting (highlightingStyles)
|
||||||
import Text.Pandoc.Lua ( runLuaFilter )
|
import Text.Pandoc.Lua ( runLuaFilter )
|
||||||
import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory)
|
import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory)
|
||||||
|
import Text.Pandoc.MIME (extensionFromMimeType)
|
||||||
import Text.Pandoc.PDF (makePDF)
|
import Text.Pandoc.PDF (makePDF)
|
||||||
import Text.Pandoc.Process (pipeProcess)
|
import Text.Pandoc.Process (pipeProcess)
|
||||||
import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI)
|
import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI)
|
||||||
import Text.Pandoc.Shared (headerShift, openURL, readDataFile,
|
import Text.Pandoc.Shared (headerShift, openURL, readDataFile,
|
||||||
readDataFileUTF8, safeRead, tabFilter)
|
readDataFileUTF8, safeRead, tabFilter)
|
||||||
import qualified Text.Pandoc.UTF8 as UTF8
|
import qualified Text.Pandoc.UTF8 as UTF8
|
||||||
import Text.Pandoc.Walk (walk)
|
import Text.Pandoc.Walk (walkM, walk)
|
||||||
import Text.Pandoc.XML (toEntities)
|
import Text.Pandoc.XML (toEntities)
|
||||||
import Text.Printf
|
import Text.Printf
|
||||||
#ifndef _WINDOWS
|
#ifndef _WINDOWS
|
||||||
|
@ -413,11 +416,15 @@ convertWithOpts opts = do
|
||||||
|
|
||||||
runIO' $ do
|
runIO' $ do
|
||||||
(doc, media) <- withMediaBag $ sourceToDoc sources >>=
|
(doc, media) <- withMediaBag $ sourceToDoc sources >>=
|
||||||
(maybe return extractMedia (optExtractMedia opts)
|
( (if isJust (optExtractMedia opts)
|
||||||
|
then fillMedia (writerSourceURL writerOptions)
|
||||||
|
else return)
|
||||||
|
>=> maybe return extractMedia (optExtractMedia opts)
|
||||||
>=> return . flip (foldr addMetadata) metadata
|
>=> return . flip (foldr addMetadata) metadata
|
||||||
>=> applyTransforms transforms
|
>=> applyTransforms transforms
|
||||||
>=> applyLuaFilters datadir (optLuaFilters opts) [format]
|
>=> applyLuaFilters datadir (optLuaFilters opts) [format]
|
||||||
>=> applyFilters datadir filters' [format])
|
>=> applyFilters datadir filters' [format]
|
||||||
|
)
|
||||||
|
|
||||||
case writer of
|
case writer of
|
||||||
ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile
|
ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile
|
||||||
|
@ -723,6 +730,21 @@ defaultWriterName x =
|
||||||
|
|
||||||
-- Transformations of a Pandoc document post-parsing:
|
-- Transformations of a Pandoc document post-parsing:
|
||||||
|
|
||||||
|
-- | Traverse tree, filling media bag.
|
||||||
|
fillMedia :: Maybe String -> Pandoc -> PandocIO Pandoc
|
||||||
|
fillMedia sourceURL d = walkM handleImage d
|
||||||
|
where handleImage :: Inline -> PandocIO Inline
|
||||||
|
handleImage (Image attr lab (src, tit)) = do
|
||||||
|
(bs, mt) <- fetchItem sourceURL src
|
||||||
|
let ext = fromMaybe (takeExtension src)
|
||||||
|
(mt >>= extensionFromMimeType)
|
||||||
|
let bs' = B.fromChunks [bs]
|
||||||
|
let basename = showDigest $ sha1 bs'
|
||||||
|
let fname = basename <.> ext
|
||||||
|
insertMedia fname mt bs'
|
||||||
|
return $ Image attr lab (fname, tit)
|
||||||
|
handleImage x = return x
|
||||||
|
|
||||||
extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc
|
extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc
|
||||||
extractMedia dir d = do
|
extractMedia dir d = do
|
||||||
media <- getMediaBag
|
media <- getMediaBag
|
||||||
|
|
Loading…
Reference in a new issue