Introduce file-scope parsing (parse-before-combine)

Traditionally pandoc operates on multiple files by first concetenating them (around extra line breaks) and then processing the joined file. So it only parses a multi-file document at the document scope. This has the benefit that footnotes and links can be in different files, but it also introduces a couple of difficulties: - it is difficult to join files with footnotes without some sort of preprocessing, which makes it difficult to write academic documents in small pieces. - it makes it impossible to process multiple binary input files, which can't be catted. - it makes it impossible to process files from different input formats. This commit introduces alternative method. Instead of catting the files first, it parses the files first, and then combines the parsed output. This makes it impossible to have links across multiple files, and auto-identified headers won't work correctly if headers in multiple files have the same name. On the other hand, footnotes across multiple files will work correctly and will allow more freedom for input formats. Since ByteStringReaders can currently only read one binary file, and will ignore subsequent files, we also changes the behavior to automatically parse before combining if using the ByteStringReader. If we use one file, it will work as normal. If there is more than one file it will combine them after parsing (assuming that the format is the same). Note that this is intended to be an optional method, defaulting to off. Turn it on with `--file-scope`.
2016-02-20 21:27:08 -05:00 · 2016-02-20 21:27:08 -05:00 · 5c055b4cf3
commit 5c055b4cf3
parent 68fd333ec4
2 changed files with 25 additions and 4 deletions
--- a/pandoc.hs
+++ b/pandoc.hs
@ -215,6 +215,7 @@ data Opt = Opt
    , optExtractMedia      :: Maybe FilePath -- ^ Path to extract embedded media
    , optTrace             :: Bool       -- ^ Print debug information
    , optTrackChanges      :: TrackChanges -- ^ Accept or reject MS Word track-changes.
+    , optFileScope        :: Bool         -- ^ Parse input files before combining
    , optKaTeXStylesheet   :: Maybe String     -- ^ Path to stylesheet for KaTeX
    , optKaTeXJS           :: Maybe String     -- ^ Path to js file for KaTeX
    }
@ -278,6 +279,7 @@ defaultOpts = Opt
    , optExtractMedia          = Nothing
    , optTrace                 = False
    , optTrackChanges          = AcceptChanges
+    , optFileScope            = False
    , optKaTeXStylesheet       = Nothing
    , optKaTeXJS               = Nothing
    }
@ -387,6 +389,11 @@ options =
                  "accept|reject|all")
                 "" -- "Accepting or reject MS Word track-changes.""

+    , Option "" ["file-scope"]
+                 (NoArg
+                  (\opt -> return opt { optFileScope = True }))
+                 "" -- "Parse input files before combining"
+
    , Option "" ["extract-media"]
                 (ReqArg
                  (\arg opt ->
@ -1117,6 +1124,7 @@ convertWithOpts opts args = do
              , optExtractMedia          = mbExtractMedia
              , optTrace                 = trace
              , optTrackChanges          = trackChanges
+              , optFileScope            = fileScope
              , optKaTeXStylesheet       = katexStylesheet
              , optKaTeXJS               = katexJS
             } = opts
@ -1269,6 +1277,7 @@ convertWithOpts opts args = do
                      , readerDefaultImageExtension = defaultImageExtension
                      , readerTrace = trace
                      , readerTrackChanges = trackChanges
+                      , readerFileScope   = fileScope
                      }

  when (not (isTextFormat format) && outputFile == "-") $
@ -1301,13 +1310,23 @@ convertWithOpts opts args = do
                               then handleIncludes
                               else return . Right

-  (doc, media) <- fmap handleError $
+  let sourceToDoc :: [FilePath] -> IO (Pandoc, MediaBag)
+      sourceToDoc sources' = fmap handleError $
        case reader of
          StringReader r-> do
-            srcs <- convertTabs . intercalate "\n" <$> readSources sources
+            srcs <- convertTabs . intercalate "\n" <$> readSources sources'
            doc <- handleIncludes' srcs
            either (return . Left) (\s -> fmap (,mempty) <$> r readerOpts s) doc
-          ByteStringReader r -> readFiles sources >>= r readerOpts
+          ByteStringReader r -> readFiles sources' >>= r readerOpts
+
+  -- We parse first if fileScope is set OR if the reader is a
+  -- BSReader. So, if it's a StringReader AND not fileScope, we
+  -- don't.
+  (doc, media) <- case reader of
+    (StringReader _) | not fileScope -> sourceToDoc sources
+    _                                   -> do
+      pairs <- mapM (\s -> sourceToDoc [s]) sources
+      return (mconcat $ map fst pairs, mconcat $ map snd pairs)

  let writerOptions = def { writerStandalone       = standalone',
                            writerTemplate         = templ,
--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@ -264,6 +264,7 @@ data ReaderOptions = ReaderOptions{
       , readerDefaultImageExtension :: String -- ^ Default extension for images
       , readerTrace           :: Bool -- ^ Print debugging info
       , readerTrackChanges    :: TrackChanges
+       , readerFileScope      :: Bool -- ^ Parse before combining
 } deriving (Show, Read, Data, Typeable, Generic)

 instance Default ReaderOptions
@ -280,6 +281,7 @@ instance Default ReaderOptions
               , readerDefaultImageExtension = ""
               , readerTrace                 = False
               , readerTrackChanges          = AcceptChanges
+               , readerFileScope             = False
               }

 --