From b5210bc175ca7915934082158d4197ab89efa9a2 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Wed, 30 Jul 2014 11:44:25 -0700
Subject: [PATCH] Added --extract-media option.

This has been documented to affect the epub and docx readers, so
we should either add the epub reader before the next release or
change the documentation.
---
 README    |  8 ++++++-
 pandoc.hs | 63 +++++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/README b/README
index 2dfd5bb3d..c5223543c 100644
--- a/README
+++ b/README
@@ -318,7 +318,13 @@ Reader options
     classes, respectively. The author and time of change is
     included. *all* is useful for scripting: only accepting changes
     from a certain reviewer, say, or before a certain date. This
-    option only affects the Docx reader.
+    option only affects the docx reader.
+
+`--extract-media=`*DIR*
+:   Extract images and other media contained in a docx or epub container
+    to the path *DIR*, creating it if necessary, and adjust the images
+    references in the document so they point to the extracted files.
+    This option only affects the docx and epub readers.
 
 General writer options
 ----------------------
diff --git a/pandoc.hs b/pandoc.hs
index 629c16c86..a1dedae3d 100644
--- a/pandoc.hs
+++ b/pandoc.hs
@@ -33,6 +33,7 @@ module Main where
 import Text.Pandoc
 import Text.Pandoc.Builder (setMeta)
 import Text.Pandoc.PDF (makePDF)
+import Text.Pandoc.Walk (walk)
 import Text.Pandoc.Readers.LaTeX (handleIncludes)
 import Text.Pandoc.Shared ( tabFilter, readDataFileUTF8, readDataFile,
                             safeRead, headerShift, normalize, err, warn,
@@ -49,7 +50,8 @@ import System.Console.GetOpt
 import Data.Char ( toLower )
 import Data.List ( intercalate, isPrefixOf, isSuffixOf, sort )
 import System.Directory ( getAppUserDataDirectory, findExecutable,
-                          doesFileExist, Permissions(..), getPermissions )
+                          doesFileExist, Permissions(..), getPermissions,
+                          createDirectoryIfMissing )
 import System.IO ( stdout, stderr )
 import System.IO.Error ( isDoesNotExistError )
 import qualified Control.Exception as E
@@ -182,6 +184,7 @@ data Opt = Opt
     , optAscii             :: Bool       -- ^ Use ascii characters only in html
     , optTeXLigatures      :: Bool       -- ^ Use TeX ligatures for quotes/dashes
     , optDefaultImageExtension :: String -- ^ Default image extension
+    , optExtractMedia      :: Maybe FilePath -- ^ Path to extract embedded media
     , optTrace             :: Bool       -- ^ Print debug information
     , optTrackChanges      :: TrackChanges -- ^ Accept or reject MS Word track-changes.
     }
@@ -239,6 +242,7 @@ defaultOpts = Opt
     , optAscii                 = False
     , optTeXLigatures          = True
     , optDefaultImageExtension = ""
+    , optExtractMedia          = Nothing
     , optTrace                 = False
     , optTrackChanges          = AcceptChanges
     }
@@ -343,6 +347,26 @@ options =
                   "NUMBER")
                  "" -- "Tab stop (default 4)"
 
+    , Option "" ["track-changes"]
+                 (ReqArg
+                  (\arg opt -> do
+                     action <- case arg of
+                            "accept" -> return AcceptChanges
+                            "reject" -> return RejectChanges
+                            "all"    -> return AllChanges
+                            _        -> err 6
+                               ("Unknown option for track-changes: " ++ arg)
+                     return opt { optTrackChanges = action })
+                  "accept|reject|all")
+                 "" -- "Accepting or reject MS Word track-changes.""
+
+    , Option "" ["extract-media"]
+                 (ReqArg
+                  (\arg opt -> do
+                    return opt { optExtractMedia = Just arg })
+                  "PATH")
+                 "" -- "Directory to which to extract embedded media"
+
     , Option "s" ["standalone"]
                  (NoArg
                   (\opt -> return opt { optStandalone = True }))
@@ -787,19 +811,6 @@ options =
                   (\opt -> return opt { optTrace = True }))
                  "" -- "Turn on diagnostic tracing in readers."
 
-    , Option "" ["track-changes"]
-                 (ReqArg
-                  (\arg opt -> do
-                     action <- case arg of
-                            "accept" -> return AcceptChanges
-                            "reject" -> return RejectChanges
-                            "all"    -> return AllChanges
-                            _        -> err 6
-                               ("Unknown option for track-changes: " ++ arg)
-                     return opt { optTrackChanges = action })
-                  "accept|reject|all")
-                 "" -- "Accepting or reject MS Word track-changes.""
-
     , Option "" ["dump-args"]
                  (NoArg
                   (\opt -> return opt { optDumpArgs = True }))
@@ -998,6 +1009,7 @@ main = do
               , optAscii                 = ascii
               , optTeXLigatures          = texLigatures
               , optDefaultImageExtension = defaultImageExtension
+              , optExtractMedia          = mbExtractMedia
               , optTrace                 = trace
               , optTrackChanges          = trackChanges
              } = opts
@@ -1196,13 +1208,32 @@ main = do
                            then handleIncludes
                            else return
 
+  let writeMedia :: FilePath -> (FilePath, B.ByteString) -> IO ()
+      writeMedia dir (subpath, bs) = do
+        -- we join and split to convert a/b/c to a\b\c on Windows;
+        -- in zip containers all paths use /
+        let fullpath = dir </> joinPath (splitPath subpath)
+        createDirectoryIfMissing True $ takeDirectory fullpath
+        warn $ "extracting " ++ fullpath
+        B.writeFile fullpath bs
+
+  let adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline
+      adjustImagePath dir paths (Image lab (src, tit))
+         | src `elem` paths = Image lab (dir ++ "/" ++ src, tit)
+      adjustImagePath _ _ x = x
+
   doc <- case reader of
           StringReader r->
             readSources sources >>=
               handleIncludes' . convertTabs . intercalate "\n" >>=
               r readerOpts
-          ByteStringReader r -> readFiles sources >>= r readerOpts >>=
-                                (return . fst)
+          ByteStringReader r -> do
+              (d, media) <- readFiles sources >>= r readerOpts
+              case mbExtractMedia of
+                   Just dir | not (M.null media) -> do
+                     mapM_ (writeMedia dir) $ M.toList media
+                     return $ walk (adjustImagePath dir (M.keys media)) d
+                   _  -> return d
 
 
   let doc0 = M.foldWithKey setMeta doc metadata