From d6d7c9620abddc5e5e45450c091bc8a73bac8f66 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sat, 21 Aug 2021 15:30:13 -0700
Subject: [PATCH] Add `--sandbox` option.

+ Add sandbox feature for readers.  When this option is used,
  readers and writers only have access to input files (and
  other files specified directly on command line).  This restriction
  is enforced in the type system.
+ Filters, PDF production, custom writers are unaffected.  This
  feature only insulates the actual readers and writers, not
  the pipeline around them in Text.Pandoc.App.
+ Note that when `--sandboxed` is specified, readers won't have
  access to the resource path, nor will anything have access to
  the user data directory.
+ Add module Text.Pandoc.Class.Sandbox, defining
  `sandbox`.  Exported via Text.Pandoc.Class. [API change]

Closes #5045.
---
 MANUAL.txt                                | 13 +++-
 pandoc.cabal                              |  1 +
 src/Text/Pandoc/App.hs                    | 29 +++++++--
 src/Text/Pandoc/App/CommandLineOptions.hs |  5 ++
 src/Text/Pandoc/App/Opt.hs                |  4 ++
 src/Text/Pandoc/App/OutputSettings.hs     | 22 ++++++-
 src/Text/Pandoc/Class.hs                  |  2 +
 src/Text/Pandoc/Class/PandocSandboxed.hs  | 79 -----------------------
 src/Text/Pandoc/Class/Sandbox.hs          | 50 ++++++++++++++
 9 files changed, 120 insertions(+), 85 deletions(-)
 delete mode 100644 src/Text/Pandoc/Class/PandocSandboxed.hs
 create mode 100644 src/Text/Pandoc/Class/Sandbox.hs

diff --git a/MANUAL.txt b/MANUAL.txt
index 5dc35c8ff..75e74f1cd 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -730,6 +730,16 @@ header when requesting a document from a URL:
     document in standalone mode. If no *VAL* is specified, the
     key will be given the value `true`.
 
+`--sandbox`
+
+:   Run pandoc in a sandbox, limiting IO operations in readers
+    and writers to reading the files specified on the command line.
+    Note that this option does not limit IO operations by
+    filters or in the production of PDF documents.  But it does
+    offer security against, for example, disclosure of files
+    through the use of `include` directives.  Anyone using
+    pandoc on untrusted user input should use this option.
+
 `-D` *FORMAT*, `--print-default-template=`*FORMAT*
 
 :   Print the system default template for an output *FORMAT*. (See `-t`
@@ -6543,7 +6553,8 @@ application, here are some things to keep in mind:
 2. Several input formats (including HTML, Org, and RST) support `include`
    directives that allow the contents of a file to be included in the
    output. An untrusted attacker could use these to view the contents of
-   files on the file system.
+   files on the file system.  (Using the `--sandbox` option can
+   protect against this threat.)
 
 3. If your application uses pandoc as a Haskell library (rather than
    shelling out to the executable), it is possible to use it in a mode
diff --git a/pandoc.cabal b/pandoc.cabal
index b90a61942..da53cb1cd 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -622,6 +622,7 @@ library
                    Text.Pandoc.Class.PandocMonad,
                    Text.Pandoc.Class.PandocIO,
                    Text.Pandoc.Class.PandocPure,
+                   Text.Pandoc.Class.Sandbox,
                    Text.Pandoc.Filter.JSON,
                    Text.Pandoc.Filter.Lua,
                    Text.Pandoc.Filter.Path,
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 15236896c..f7c1f218d 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -139,7 +139,26 @@ convertWithOpts opts = do
             <> "` instead of `pandoc " <> inputFile <> " -o " <> outputFile <> "`."
         _ -> return ()
 
-    (reader, readerExts) <- getReader readerName
+
+    let makeSandboxed pureReader =
+          let files = maybe id (:) (optReferenceDoc opts) .
+                      maybe id (:) (optEpubMetadata opts) .
+                      maybe id (:) (optEpubCoverImage opts) .
+                      maybe id (:) (optCSL opts) .
+                      maybe id (:) (optCitationAbbreviations opts) $
+                      optEpubFonts opts ++
+                      optBibliography opts
+           in  case pureReader of
+                 TextReader r -> TextReader $ \o t -> sandbox files (r o t)
+                 ByteStringReader r
+                            -> ByteStringReader $ \o t -> sandbox files (r o t)
+
+    (reader, readerExts) <-
+      if optSandbox opts
+         then case runPure (getReader readerName) of
+                Left e -> throwError e
+                Right (r, rexts) -> return (makeSandboxed r, rexts)
+         else getReader readerName
 
     outputSettings <- optToOutputSettings opts
     let format = outputFormat outputSettings
@@ -274,8 +293,9 @@ convertWithOpts opts = do
              ByteStringReader r ->
                mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs)
             >>=
-              (   (if isJust (optExtractMedia opts)
-                        || writerNameBase == "docx" -- for fallback png creation
+              (   (if not (optSandbox opts) &&
+                      (isJust (optExtractMedia opts)
+                       || writerNameBase == "docx") -- for fallback pngs
                       then fillMediaBag
                       else return)
               >=> return . adjustMetadata (metadataFromFile <>)
@@ -286,7 +306,8 @@ convertWithOpts opts = do
               >=> maybe return extractMedia (optExtractMedia opts)
               )
 
-    when (writerNameBase == "docx") $ do -- create fallback pngs for svgs
+    when (writerNameBase == "docx" && not (optSandbox opts)) $ do
+      -- create fallback pngs for svgs
       items <- mediaItems <$> getMediaBag
       forM_ items $ \(fp, mt, bs) ->
         case T.takeWhile (/=';') mt of
diff --git a/src/Text/Pandoc/App/CommandLineOptions.hs b/src/Text/Pandoc/App/CommandLineOptions.hs
index d2c12573c..99017000a 100644
--- a/src/Text/Pandoc/App/CommandLineOptions.hs
+++ b/src/Text/Pandoc/App/CommandLineOptions.hs
@@ -188,6 +188,11 @@ options =
                   (\opt -> return opt { optFileScope = True }))
                  "" -- "Parse input files before combining"
 
+    , Option "" ["sandbox"]
+                 (NoArg
+                  (\opt -> return opt { optSandbox = True }))
+                 ""
+
     , Option "s" ["standalone"]
                  (NoArg
                   (\opt -> return opt { optStandalone = True }))
diff --git a/src/Text/Pandoc/App/Opt.hs b/src/Text/Pandoc/App/Opt.hs
index d54d932b7..48eb15fdf 100644
--- a/src/Text/Pandoc/App/Opt.hs
+++ b/src/Text/Pandoc/App/Opt.hs
@@ -160,6 +160,7 @@ data Opt = Opt
     , optCSL                   :: Maybe FilePath -- ^ CSL stylesheet
     , optBibliography          :: [FilePath]  -- ^ Bibliography files
     , optCitationAbbreviations :: Maybe FilePath -- ^ Citation abbreviations
+    , optSandbox               :: Bool
     } deriving (Generic, Show)
 
 instance FromYAML (Opt -> Opt) where
@@ -595,6 +596,8 @@ doOpt (k',v) = do
       parseYAML v >>= \x -> return (\o -> o{ optEol = x })
     "strip-comments" ->
       parseYAML v >>= \x -> return (\o -> o  { optStripComments = x })
+    "sandbox" ->
+      parseYAML v >>= \x -> return (\o -> o  { optSandbox = x })
     _ -> failAtNode k' $ "Unknown option " ++ show k
 
 -- | Defaults for command-line options.
@@ -673,6 +676,7 @@ defaultOpts = Opt
     , optCSL                   = Nothing
     , optBibliography          = []
     , optCitationAbbreviations = Nothing
+    , optSandbox               = False
     }
 
 parseStringKey ::  Node Pos -> Parser Text
diff --git a/src/Text/Pandoc/App/OutputSettings.hs b/src/Text/Pandoc/App/OutputSettings.hs
index 3f83f4b21..7b057713b 100644
--- a/src/Text/Pandoc/App/OutputSettings.hs
+++ b/src/Text/Pandoc/App/OutputSettings.hs
@@ -90,11 +90,31 @@ optToOutputSettings opts = do
                   then writerName
                   else T.toLower $ baseWriterName writerName
 
+  let makeSandboxed pureWriter =
+          let files = maybe id (:) (optReferenceDoc opts) .
+                      maybe id (:) (optEpubMetadata opts) .
+                      maybe id (:) (optEpubCoverImage opts) .
+                      maybe id (:) (optCSL opts) .
+                      maybe id (:) (optCitationAbbreviations opts) $
+                      optEpubFonts opts ++
+                      optBibliography opts
+           in  case pureWriter of
+                 TextWriter w -> TextWriter $ \o d -> sandbox files (w o d)
+                 ByteStringWriter w
+                            -> ByteStringWriter $ \o d -> sandbox files (w o d)
+
+
   (writer, writerExts) <-
             if ".lua" `T.isSuffixOf` format
                then return (TextWriter
                        (\o d -> writeCustom (T.unpack writerName) o d), mempty)
-               else getWriter (T.toLower writerName)
+               else if optSandbox opts
+                       then
+                         case runPure (getWriter writerName) of
+                           Left e -> throwError e
+                           Right (w, wexts) ->
+                                  return (makeSandboxed w, wexts)
+                       else getWriter (T.toLower writerName)
 
   let standalone = optStandalone opts || not (isTextFormat format) || pdfOutput
 
diff --git a/src/Text/Pandoc/Class.hs b/src/Text/Pandoc/Class.hs
index 2f28ac4dd..6394df251 100644
--- a/src/Text/Pandoc/Class.hs
+++ b/src/Text/Pandoc/Class.hs
@@ -19,6 +19,7 @@ module Text.Pandoc.Class
   , module Text.Pandoc.Class.PandocIO
   , module Text.Pandoc.Class.PandocMonad
   , module Text.Pandoc.Class.PandocPure
+  , module Text.Pandoc.Class.Sandbox
   , Translations
   ) where
 
@@ -27,3 +28,4 @@ import Text.Pandoc.Class.PandocMonad
 import Text.Pandoc.Class.PandocIO
 import Text.Pandoc.Class.PandocPure
 import Text.Pandoc.Translations (Translations)
+import Text.Pandoc.Class.Sandbox
diff --git a/src/Text/Pandoc/Class/PandocSandboxed.hs b/src/Text/Pandoc/Class/PandocSandboxed.hs
deleted file mode 100644
index 61ee1f1c6..000000000
--- a/src/Text/Pandoc/Class/PandocSandboxed.hs
+++ /dev/null
@@ -1,79 +0,0 @@
-{-# LANGUAGE DeriveFunctor #-}
-{-# LANGUAGE GeneralizedNewtypeDeriving #-}
-{- |
-Module      : Text.Pandoc.Class.PandocIO
-Copyright   : Copyright (C) 2016-2020 Jesse Rosenthal, John MacFarlane
-License     : GNU GPL, version 2 or above
-
-Maintainer  : Jesse Rosenthal <jrosenthal@jhu.edu>
-Stability   : alpha
-Portability : portable
-
-This module defines @'PandocIO'@, an IO-based instance of the
-@'PandocMonad'@ type class. File, data, and network access all are run
-using IO operators.
--}
-module Text.Pandoc.Class.PandocIO
-  ( PandocIO(..)
-  , runIO
-  , runIOorExplode
-  , extractMedia
- ) where
-
-import Control.Monad.Except (ExceptT, MonadError, runExceptT)
-import Control.Monad.IO.Class (MonadIO)
-import Control.Monad.State (StateT, evalStateT, lift, get, put)
-import Data.Default (Default (def))
-import Text.Pandoc.Class.CommonState (CommonState (..))
-import Text.Pandoc.Class.PandocMonad
-import Text.Pandoc.Definition
-import Text.Pandoc.Error
-import qualified Text.Pandoc.Class.IO as IO
-import Control.Monad.Catch (MonadCatch, MonadMask, MonadThrow)
-
--- | Evaluate a 'PandocIO' operation.
-runIO :: PandocIO a -> IO (Either PandocError a)
-runIO ma = flip evalStateT def $ runExceptT $ unPandocIO ma
-
--- | Evaluate a 'PandocIO' operation, handling any errors
--- by exiting with an appropriate message and error status.
-runIOorExplode :: PandocIO a -> IO a
-runIOorExplode ma = runIO ma >>= handleError
-
-newtype PandocIO a = PandocIO {
-  unPandocIO :: ExceptT PandocError (StateT CommonState IO) a
-  } deriving ( MonadIO
-             , Functor
-             , Applicative
-             , Monad
-             , MonadCatch
-             , MonadMask
-             , MonadThrow
-             , MonadError PandocError
-             )
-
-instance PandocMonad PandocIO where
-  lookupEnv = IO.lookupEnv
-  getCurrentTime = IO.getCurrentTime
-  getCurrentTimeZone = IO.getCurrentTimeZone
-  newStdGen = IO.newStdGen
-  newUniqueHash = IO.newUniqueHash
-
-  openURL = IO.openURL
-  readFileLazy = IO.readFileLazy
-  readFileStrict = IO.readFileStrict
-  readStdinStrict = IO.readStdinStrict
-
-  glob = IO.glob
-  fileExists = IO.fileExists
-  getDataFileName = IO.getDataFileName
-  getModificationTime = IO.getModificationTime
-
-  getCommonState = PandocIO $ lift get
-  putCommonState = PandocIO . lift . put
-
-  logOutput = IO.logOutput
-
--- | Extract media from the mediabag into a directory.
-extractMedia :: (PandocMonad m, MonadIO m) => FilePath -> Pandoc -> m Pandoc
-extractMedia = IO.extractMedia
diff --git a/src/Text/Pandoc/Class/Sandbox.hs b/src/Text/Pandoc/Class/Sandbox.hs
new file mode 100644
index 000000000..8bc0f1e77
--- /dev/null
+++ b/src/Text/Pandoc/Class/Sandbox.hs
@@ -0,0 +1,50 @@
+{- |
+Module      : Text.Pandoc.Class.Sandbox
+Copyright   : Copyright (C) 2021 John MacFarlane
+License     : GNU GPL, version 2 or above
+
+Maintainer  : John MacFarlane (<jgm@berkeley.edu>)
+Stability   : alpha
+Portability : portable
+
+This module provides a way to run PandocMonad actions in a sandbox
+(pure context, with no IO allowed and access only to designated files).
+-}
+
+module Text.Pandoc.Class.Sandbox
+  ( sandbox )
+where
+
+import Control.Monad (foldM)
+import Control.Monad.Except (throwError)
+import Control.Monad.IO.Class (MonadIO (liftIO))
+import Text.Pandoc.Class.PandocMonad
+import Text.Pandoc.Class.PandocPure
+import Text.Pandoc.Class.CommonState (CommonState(..))
+import Text.Pandoc.Logging (messageVerbosity)
+
+-- | Lift a PandocPure action into any instance of PandocMonad.
+-- The main computation is done purely, but CommonState is preserved
+-- continuously, and warnings are emitted after the action completes.
+-- The parameter is a list of FilePaths which will be added to the
+-- ersatz file system and be available for reading.
+sandbox :: (PandocMonad m, MonadIO m) => [FilePath] -> PandocPure a -> m a
+sandbox files action = do
+  oldState <- getCommonState
+  tree <- liftIO $ foldM addToFileTree mempty files
+  case runPure (do putCommonState oldState
+                   modifyPureState $ \ps -> ps{ stFiles = tree }
+                   result <- action
+                   st <- getCommonState
+                   return (st, result)) of
+          Left e -> throwError e
+          Right (st, result) -> do
+            putCommonState st
+            let verbosity = stVerbosity st
+            -- emit warnings, since these are not printed in runPure
+            let newMessages = reverse $ take
+                  (length (stLog st) - length (stLog oldState)) (stLog st)
+            mapM_ logOutput
+              (filter ((<= verbosity) . messageVerbosity) newMessages)
+            return result
+