From 77faccb505992c944cd1b92f50e4e00d2927682b Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 28 Feb 2019 20:28:16 -0800
Subject: [PATCH] Shared: add filterIpynbOutput. [API change]

Add command line option `--ipynb-output=all|none|best`.

Closes #5339.
---
 MANUAL.txt                                |  9 +++++++
 src/Text/Pandoc/App.hs                    | 29 ++++++++++++++++-------
 src/Text/Pandoc/App/CommandLineOptions.hs | 12 +++++++++-
 src/Text/Pandoc/App/Opt.hs                |  2 ++
 src/Text/Pandoc/Shared.hs                 | 29 ++++++++++++++++++++++-
 5 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/MANUAL.txt b/MANUAL.txt
index 66e2d25f8..d92a7afe5 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -1192,6 +1192,15 @@ Options affecting specific writers {.options}
     the EPUB-specific contents.  The default is `EPUB`.  To put
     the EPUB contents in the top level, use an empty string.
 
+`--ipynb-output=all|none|best`
+
+:   Determines how ipynb output cells are treated. `all` means
+    that all of the data formats included in the original are
+    preserved.  `none` means that the contents of data cells
+    are omitted.  `best` causes pandoc to try to pick the
+    richest data block in each output cell that is compatible
+    with the output format.  The default is `best`.
+
 `--pdf-engine=`*PROGRAM*
 
 :   Use the specified engine when producing PDF output.
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 4e4e3211c..cf70f3971 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -73,7 +73,7 @@ import Text.Pandoc.PDF (makePDF)
 import Text.Pandoc.Readers.Markdown (yamlToMeta)
 import Text.Pandoc.SelfContained (makeDataURI, makeSelfContained)
 import Text.Pandoc.Shared (eastAsianLineBreakFilter, stripEmptyParagraphs,
-         headerShift, isURI, tabFilter, uriPathToPath)
+         headerShift, isURI, tabFilter, uriPathToPath, filterIpynbOutput)
 import qualified Text.Pandoc.UTF8 as UTF8
 #ifndef _WINDOWS
 import System.Posix.IO (stdOutput)
@@ -247,8 +247,20 @@ convertWithOpts opts = do
                               (writerExtensions writerOptions) &&
                               writerWrapText writerOptions == WrapPreserve)
                          then (eastAsianLineBreakFilter :)
-                         else id) $
-                     []
+                         else id) .
+                     (case optIpynbOutput opts of
+                       "all"    -> id
+                       "none"   -> (filterIpynbOutput Nothing :)
+                       "best"   -> (filterIpynbOutput (Just $
+                                     if htmlFormat writerName
+                                        then Format "html"
+                                        else
+                                          case writerName of
+                                            "latex"  -> Format "latex"
+                                            "beamer" -> Format "latex"
+                                            _        -> Format writerName) :)
+                       _  -> id)  -- should not happen
+                     $ []
 
     let sourceToDoc :: [FilePath] -> PandocIO Pandoc
         sourceToDoc sources' =
@@ -293,15 +305,12 @@ convertWithOpts opts = do
                                      TL.unpack (TE.decodeUtf8With TE.lenientDecode err')
 
         Nothing -> do
-                let htmlFormat = format `elem`
-                      ["html","html4","html5","s5","slidy",
-                       "slideous","dzslides","revealjs"]
-                    addNl = if standalone
+                let addNl = if standalone
                                then id
                                else (<> T.singleton '\n')
                 output <- addNl <$> f writerOptions doc
                 writerFn eol outputFile =<<
-                  if optSelfContained opts && htmlFormat
+                  if optSelfContained opts && htmlFormat writerName
                      -- TODO not maximally efficient; change type
                      -- of makeSelfContained so it works w/ Text
                      then T.pack <$> makeSelfContained (T.unpack output)
@@ -309,6 +318,10 @@ convertWithOpts opts = do
 
 type Transform = Pandoc -> Pandoc
 
+htmlFormat :: String -> Bool
+htmlFormat = (`elem` ["html","html4","html5","s5","slidy",
+                      "slideous","dzslides","revealjs"])
+
 isTextFormat :: String -> Bool
 isTextFormat s = s `notElem` ["odt","docx","epub2","epub3","epub","pptx"]
 
diff --git a/src/Text/Pandoc/App/CommandLineOptions.hs b/src/Text/Pandoc/App/CommandLineOptions.hs
index 6ae167ebf..be93357cb 100644
--- a/src/Text/Pandoc/App/CommandLineOptions.hs
+++ b/src/Text/Pandoc/App/CommandLineOptions.hs
@@ -643,7 +643,17 @@ options =
                  "NUMBER")
                  "" -- "Header level at which to split chapters in EPUB"
 
-    , Option "" ["pdf-engine"]
+    , Option "" ["ipynb-output"]
+                 (ReqArg
+                  (\arg opt ->
+                    if arg `notElem` ["all","none","best"]
+                       then E.throwIO $ PandocOptionError $
+                             "ipynb-output must be all, none, or best"
+                       else return opt { optIpynbOutput = arg })
+                 "all|none|best")
+                 "" -- "Starting number for sections, subsections, etc."
+
+     , Option "" ["pdf-engine"]
                  (ReqArg
                   (\arg opt -> do
                      let b = takeBaseName arg
diff --git a/src/Text/Pandoc/App/Opt.hs b/src/Text/Pandoc/App/Opt.hs
index 698fdc96b..59405cbeb 100644
--- a/src/Text/Pandoc/App/Opt.hs
+++ b/src/Text/Pandoc/App/Opt.hs
@@ -123,6 +123,7 @@ data Opt = Opt
     , optFileScope             :: Bool         -- ^ Parse input files before combining
     , optTitlePrefix           :: Maybe String     -- ^ Prefix for title
     , optCss                   :: [FilePath]       -- ^ CSS files to link to
+    , optIpynbOutput           :: String           -- ^ Maybe f to use best data; Nothing to omit
     , optIncludeBeforeBody     :: [FilePath]       -- ^ Files to include before
     , optIncludeAfterBody      :: [FilePath]       -- ^ Files to include after body
     , optIncludeInHeader       :: [FilePath]       -- ^ Files to include in header
@@ -196,6 +197,7 @@ defaultOpts = Opt
     , optFileScope             = False
     , optTitlePrefix           = Nothing
     , optCss                   = []
+    , optIpynbOutput           = "best"
     , optIncludeBeforeBody     = []
     , optIncludeAfterBody      = []
     , optIncludeInHeader       = []
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index db00d5aa4..992e57b6a 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -86,6 +86,7 @@ module Text.Pandoc.Shared (
                      eastAsianLineBreakFilter,
                      underlineSpan,
                      splitSentences,
+                     filterIpynbOutput,
                      -- * TagSoup HTML handling
                      renderTags',
                      -- * File handling
@@ -122,12 +123,13 @@ import Data.Char (isAlpha, isLower, isSpace, isUpper, toLower, isAlphaNum,
                   generalCategory, GeneralCategory(NonSpacingMark,
                   SpacingCombiningMark, EnclosingMark, ConnectorPunctuation))
 import Data.Data (Data, Typeable)
-import Data.List (find, intercalate, intersperse, stripPrefix)
+import Data.List (find, intercalate, intersperse, stripPrefix, sortBy)
 import qualified Data.Map as M
 import Data.Maybe (mapMaybe)
 import Data.Sequence (ViewL (..), ViewR (..), viewl, viewr)
 import qualified Data.Set as Set
 import qualified Data.Text as T
+import Data.Ord (comparing)
 import Data.Version (showVersion)
 import Network.URI (URI (uriScheme), escapeURIString, parseURI)
 import Paths_pandoc (version)
@@ -689,6 +691,31 @@ splitSentences xs =
   let (sent, rest) = breakSentence xs
   in  if null rest then [sent] else sent : splitSentences rest
 
+-- | Process ipynb output cells.  If mode is Nothing,
+-- remove all output.  If mode is Just format, select
+-- best output for the format.
+filterIpynbOutput :: Maybe Format -> Pandoc -> Pandoc
+filterIpynbOutput mode = walk go
+  where go (Div (ident, ("output":os), kvs) bs) =
+          case mode of
+            Nothing  -> Div (ident, ("output":os), kvs) []
+            Just fmt -> Div (ident, ("output":os), kvs) $
+              take 1 $ sortBy (comparing rank) bs
+                where
+                  rank (RawBlock (Format "html") _)
+                    | fmt == Format "html" = (1 :: Int)
+                    | fmt == Format "markdown" = 2
+                    | otherwise = 3
+                  rank (RawBlock (Format "latex") _)
+                    | fmt == Format "latex" = 1
+                    | fmt == Format "markdown" = 2
+                    | otherwise = 3
+                  rank (RawBlock f _)
+                    | fmt == f = 1
+                    | otherwise = 3
+                  rank _ = 2
+        go x = x
+
 --
 -- TagSoup HTML handling
 --