Introduce file-scope parsing (parse-before-combine)

Traditionally pandoc operates on multiple files by first concetenating
them (around extra line breaks) and then processing the joined file. So
it only parses a multi-file document at the document scope. This has the
benefit that footnotes and links can be in different files, but it also
introduces a couple of difficulties:

  - it is difficult to join files with footnotes without some sort of
    preprocessing, which makes it difficult to write academic documents
    in small pieces.

  - it makes it impossible to process multiple binary input files, which
    can't be catted.

  - it makes it impossible to process files from different input
    formats.

This commit introduces alternative method. Instead of catting the files
first, it parses the files first, and then combines the parsed
output. This makes it impossible to have links across multiple files,
and auto-identified headers won't work correctly if headers in multiple
files have the same name. On the other hand, footnotes across multiple
files will work correctly and will allow more freedom for input formats.

Since ByteStringReaders can currently only read one binary file, and
will ignore subsequent files, we also changes the behavior to
automatically parse before combining if using the ByteStringReader. If
we use one file, it will work as normal. If there is more than one file
it will combine them after parsing (assuming that the format is the
same).

Note that this is intended to be an optional method, defaulting to
off. Turn it on with `--file-scope`.
This commit is contained in:
Jesse Rosenthal 2016-02-20 21:27:08 -05:00
parent 68fd333ec4
commit 5c055b4cf3
2 changed files with 25 additions and 4 deletions

View file

@ -215,6 +215,7 @@ data Opt = Opt
, optExtractMedia :: Maybe FilePath -- ^ Path to extract embedded media
, optTrace :: Bool -- ^ Print debug information
, optTrackChanges :: TrackChanges -- ^ Accept or reject MS Word track-changes.
, optFileScope :: Bool -- ^ Parse input files before combining
, optKaTeXStylesheet :: Maybe String -- ^ Path to stylesheet for KaTeX
, optKaTeXJS :: Maybe String -- ^ Path to js file for KaTeX
}
@ -278,6 +279,7 @@ defaultOpts = Opt
, optExtractMedia = Nothing
, optTrace = False
, optTrackChanges = AcceptChanges
, optFileScope = False
, optKaTeXStylesheet = Nothing
, optKaTeXJS = Nothing
}
@ -387,6 +389,11 @@ options =
"accept|reject|all")
"" -- "Accepting or reject MS Word track-changes.""
, Option "" ["file-scope"]
(NoArg
(\opt -> return opt { optFileScope = True }))
"" -- "Parse input files before combining"
, Option "" ["extract-media"]
(ReqArg
(\arg opt ->
@ -1117,6 +1124,7 @@ convertWithOpts opts args = do
, optExtractMedia = mbExtractMedia
, optTrace = trace
, optTrackChanges = trackChanges
, optFileScope = fileScope
, optKaTeXStylesheet = katexStylesheet
, optKaTeXJS = katexJS
} = opts
@ -1269,6 +1277,7 @@ convertWithOpts opts args = do
, readerDefaultImageExtension = defaultImageExtension
, readerTrace = trace
, readerTrackChanges = trackChanges
, readerFileScope = fileScope
}
when (not (isTextFormat format) && outputFile == "-") $
@ -1301,13 +1310,23 @@ convertWithOpts opts args = do
then handleIncludes
else return . Right
(doc, media) <- fmap handleError $
let sourceToDoc :: [FilePath] -> IO (Pandoc, MediaBag)
sourceToDoc sources' = fmap handleError $
case reader of
StringReader r-> do
srcs <- convertTabs . intercalate "\n" <$> readSources sources
srcs <- convertTabs . intercalate "\n" <$> readSources sources'
doc <- handleIncludes' srcs
either (return . Left) (\s -> fmap (,mempty) <$> r readerOpts s) doc
ByteStringReader r -> readFiles sources >>= r readerOpts
ByteStringReader r -> readFiles sources' >>= r readerOpts
-- We parse first if fileScope is set OR if the reader is a
-- BSReader. So, if it's a StringReader AND not fileScope, we
-- don't.
(doc, media) <- case reader of
(StringReader _) | not fileScope -> sourceToDoc sources
_ -> do
pairs <- mapM (\s -> sourceToDoc [s]) sources
return (mconcat $ map fst pairs, mconcat $ map snd pairs)
let writerOptions = def { writerStandalone = standalone',
writerTemplate = templ,

View file

@ -264,6 +264,7 @@ data ReaderOptions = ReaderOptions{
, readerDefaultImageExtension :: String -- ^ Default extension for images
, readerTrace :: Bool -- ^ Print debugging info
, readerTrackChanges :: TrackChanges
, readerFileScope :: Bool -- ^ Parse before combining
} deriving (Show, Read, Data, Typeable, Generic)
instance Default ReaderOptions
@ -280,6 +281,7 @@ instance Default ReaderOptions
, readerDefaultImageExtension = ""
, readerTrace = False
, readerTrackChanges = AcceptChanges
, readerFileScope = False
}
--