2018-03-18 10:46:28 -07:00
{-# LANGUAGE NoImplicitPrelude #-}
2017-03-04 13:03:41 +01:00
{-# LANGUAGE ScopedTypeVariables #-}
2017-02-05 21:58:45 +01:00
{- |
Module : Text.Pandoc.App
2019-02-04 22:52:31 +01:00
Copyright : Copyright (C) 2006-2019 John MacFarlane
2017-02-05 21:58:45 +01:00
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Does a pandoc conversion based on command-line options.
module Text.Pandoc.App (
, Opt(..)
2017-12-26 10:11:19 -08:00
, LineEnding(..)
, Filter(..)
2017-02-05 21:58:45 +01:00
, defaultOpts
2017-02-06 14:46:48 +01:00
, parseOptions
, options
2017-10-24 22:46:06 -07:00
, applyFilters
2017-02-05 21:58:45 +01:00
) where
2018-03-18 10:46:28 -07:00
import Prelude
2017-02-05 21:58:45 +01:00
import qualified Control.Exception as E
2017-02-06 14:46:48 +01:00
import Control.Monad
2017-02-06 17:03:03 +01:00
import Control.Monad.Trans
2019-07-14 11:35:17 -07:00
import Control.Monad.Except (throwError)
2017-02-06 14:46:48 +01:00
import qualified Data.ByteString as BS
2019-07-20 12:26:24 -07:00
import qualified Data.ByteString.Lazy as BL
2017-10-21 23:00:49 +02:00
import Data.Char (toLower)
2017-03-04 13:03:41 +01:00
import Data.Maybe (fromMaybe, isJust, isNothing)
2017-06-01 15:09:38 +02:00
import qualified Data.Set as Set
2017-06-10 15:55:18 +02:00
import Data.Text (Text)
2017-03-04 13:03:41 +01:00
import qualified Data.Text as T
2017-12-27 11:36:08 -08:00
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TE
import qualified Data.Text.Encoding.Error as TE
2018-11-29 15:59:33 -08:00
import qualified Data.Text.Encoding.Error as TSE
2017-05-23 09:48:11 +02:00
import Network.URI (URI (..), parseURI)
2019-03-02 15:03:59 -08:00
import System.Directory (doesDirectoryExist)
2018-01-10 22:26:12 +01:00
import System.Exit (exitSuccess)
2017-03-04 13:03:41 +01:00
import System.FilePath
2017-06-01 15:09:38 +02:00
import System.IO (nativeNewline, stdout)
2017-08-11 11:56:54 -07:00
import qualified System.IO as IO (Newline (..))
2017-03-04 13:03:41 +01:00
import Text.Pandoc
2018-11-07 21:29:48 +01:00
import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths)
2018-11-06 21:25:14 +01:00
import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts)
import Text.Pandoc.App.CommandLineOptions (parseOptions, options)
2018-11-03 07:33:04 +01:00
import Text.Pandoc.App.OutputSettings (OutputSettings (..), optToOutputSettings)
2017-10-27 20:28:29 -07:00
import Text.Pandoc.BCP47 (Lang (..), parseBCP47)
2017-11-08 21:54:23 -08:00
import Text.Pandoc.Builder (setMeta, deleteMeta)
2018-01-10 22:26:12 +01:00
import Text.Pandoc.Filter (Filter (JSONFilter, LuaFilter), applyFilters)
2017-03-04 13:03:41 +01:00
import Text.Pandoc.PDF (makePDF)
2018-03-30 21:48:14 +02:00
import Text.Pandoc.Readers.Markdown (yamlToMeta)
2017-06-01 15:09:38 +02:00
import Text.Pandoc.SelfContained (makeDataURI, makeSelfContained)
2017-12-02 15:21:59 -08:00
import Text.Pandoc.Shared (eastAsianLineBreakFilter, stripEmptyParagraphs,
2019-03-02 15:03:59 -08:00
headerShift, isURI, tabFilter, uriPathToPath, filterIpynbOutput,
2017-03-04 13:03:41 +01:00
import qualified Text.Pandoc.UTF8 as UTF8
2017-08-16 10:39:34 -07:00
#ifndef _WINDOWS
import System.Posix.IO (stdOutput)
import System.Posix.Terminal (queryTerminal)
2017-02-06 14:46:48 +01:00
2017-09-12 05:18:42 +02:00
2017-02-06 14:52:16 +01:00
-- | Run one complete conversion as described by an 'Opt' record: choose the
-- reader and the output settings, read every input source, merge metadata,
-- apply transforms and filters, and write the result to the output file
-- (or to "-" when no output file was requested).
convertWithOpts :: Opt -> IO ()
convertWithOpts opts = do
2017-08-16 09:45:12 -07:00
-- "-" is used as the output name when optOutputFile is Nothing.
let outputFile = fromMaybe "-" (optOutputFile opts)
2017-02-05 21:58:45 +01:00
let filters = optFilters opts
let verbosity = optVerbosity opts
-- With optDumpArgs set, print the output file name and then each input
-- file name to stdout.
when (optDumpArgs opts) $
do UTF8.hPutStrLn stdout outputFile
2017-09-30 16:07:47 -05:00
mapM_ (UTF8.hPutStrLn stdout) (optInputFiles opts)
2017-02-05 21:58:45 +01:00
2017-12-26 10:11:19 -08:00
let isPandocCiteproc (JSONFilter f) = takeBaseName f == "pandoc-citeproc"
isPandocCiteproc _ = False
2017-02-05 21:58:45 +01:00
-- --bibliography implies -F pandoc-citeproc for backwards compatibility:
let needsCiteproc = isJust (lookup "bibliography" (optMetadata opts)) &&
optCiteMethod opts `notElem` [Natbib, Biblatex] &&
2017-12-26 10:11:19 -08:00
all (not . isPandocCiteproc) filters
let filters' = if needsCiteproc then JSONFilter "pandoc-citeproc" : filters
2017-02-05 21:58:45 +01:00
else filters
2017-09-30 16:07:47 -05:00
-- The source list collapses to ["-"] when no input files are given or
-- when optIgnoreArgs is set.
let sources = case optInputFiles opts of
2017-02-05 21:58:45 +01:00
[] -> ["-"]
xs | optIgnoreArgs opts -> ["-"]
| otherwise -> xs
-- An explicit optDataDir is used as is; otherwise the first directory
-- from defaultUserDataDirs that exists on disk is chosen (Nothing if
-- none of them exists).
datadir <- case optDataDir opts of
2019-03-02 15:03:59 -08:00
Nothing -> do
ds <- defaultUserDataDirs
let selectUserDataDir [] = return Nothing
selectUserDataDir (dir:dirs) = do
exists <- doesDirectoryExist dir
if exists
then return (Just dir)
else selectUserDataDir dirs
selectUserDataDir ds
2017-02-05 21:58:45 +01:00
Just _ -> return $ optDataDir opts
-- runIO' runs a PandocIO action with the requested trace and verbosity
-- settings, writes the collected log messages to optLogFile when one is
-- set, and throws PandocFailOnWarningError when optFailIfWarnings is set
-- and any logged message has WARNING verbosity.
let runIO' :: PandocIO a -> IO a
runIO' f = do
(res, reports) <- runIOorExplode $ do
2017-06-19 22:04:01 +02:00
setTrace (optTrace opts)
2017-02-05 21:58:45 +01:00
setVerbosity verbosity
x <- f
rs <- getLog
return (x, rs)
2017-02-11 09:59:54 +01:00
case optLogFile opts of
2017-03-04 13:03:41 +01:00
Nothing -> return ()
2019-07-20 12:26:24 -07:00
Just logfile -> BL.writeFile logfile (encodeLogMessages reports)
2017-02-10 23:59:47 +01:00
let isWarning msg = messageVerbosity msg == WARNING
2017-02-05 21:58:45 +01:00
when (optFailIfWarnings opts && any isWarning reports) $
2017-04-13 17:02:30 +02:00
E.throwIO PandocFailOnWarningError
2017-02-05 21:58:45 +01:00
return res
2017-05-21 11:42:50 +02:00
let eol = case optEol opts of
2017-05-22 10:10:04 +02:00
Native -> nativeNewline
2019-07-20 12:51:03 -07:00
-- istty is hard-wired to True under _WINDOWS; the other definition asks
-- queryTerminal about stdOutput.
#ifdef _WINDOWS
let istty = True
istty <- liftIO $ queryTerminal stdOutput
2017-05-17 15:13:35 +02:00
2017-02-05 21:58:45 +01:00
runIO' $ do
2017-08-12 12:15:40 -07:00
setUserDataDir datadir
2017-09-30 16:07:47 -05:00
setInputFiles (optInputFiles opts)
setOutputFile (optOutputFile opts)
2017-08-12 12:15:40 -07:00
2019-07-14 11:35:17 -07:00
-- assign reader and writer based on options and filenames
readerName <- case optReader opts of
Just f -> return f
Nothing -> case formatFromFilePaths sources of
Just f' -> return f'
Nothing | sources == ["-"] -> return "markdown"
| any isURI sources -> return "html"
| otherwise -> do
report $ UnknownExtensions
(map takeExtension sources) "markdown"
return "markdown"
-- PDF output is recognised only by the output file's extension,
-- compared case-insensitively.
let pdfOutput = map toLower (takeExtension outputFile) == ".pdf"
(reader, readerExts) <-
case getReader readerName of
Right (r, es) -> return (r :: Reader PandocIO, es)
Left e -> throwError $ PandocAppError e'
where e' = case readerName of
"pdf" -> e ++
"\nPandoc can convert to PDF, but not from PDF."
"doc" -> e ++
"\nPandoc can convert from DOCX, but not from DOC.\nTry using Word to save your DOC file as DOCX, and convert that with pandoc."
_ -> e
-- tabFilter receives 0 when optPreserveTabs is set or the reader is
-- "t2t" or "man", and optTabStop otherwise.
-- NOTE(review): presumably a tab stop of 0 leaves tabs untouched --
-- confirm against tabFilter.
let convertTabs = tabFilter (if optPreserveTabs opts ||
readerName == "t2t" ||
readerName == "man"
then 0
else optTabStop opts)
-- readSources reads every source, joins the texts with "\n", and then
-- applies convertTabs to the joined text.
let readSources :: [FilePath] -> PandocIO Text
readSources srcs = convertTabs . T.intercalate (T.pack "\n") <$>
mapM readSource srcs
2018-11-03 07:33:04 +01:00
outputSettings <- optToOutputSettings opts
let format = outputFormat outputSettings
let writer = outputWriter outputSettings
let writerName = outputWriterName outputSettings
let writerOptions = outputWriterOptions outputSettings
let standalone = optStandalone opts || not (isTextFormat format) || pdfOutput
-- We don't want to send output to the terminal if the user
-- does 'pandoc -t docx input.txt'; though we allow them to
-- force this with '-o -'. On posix systems, we detect
-- when stdout is being piped and allow output to stdout
-- in that case, but on Windows we can't.
when (not (isTextFormat format) && istty && isNothing ( optOutputFile opts)) $
2019-07-14 11:35:17 -07:00
throwError $ PandocAppError $
2018-11-03 07:33:04 +01:00
"Cannot write " ++ format ++ " output to terminal.\n" ++
"Specify an output file using the -o option, or " ++
"use '-o -' to force output to stdout."
2017-08-11 11:56:54 -07:00
2017-10-21 23:00:49 +02:00
-- The abbreviation set is built from the non-empty lines of the file
-- named by optAbbreviations, or of the "abbreviations" data file when
-- that option is unset.
abbrevs <- Set.fromList . filter (not . null) . lines <$>
2017-08-11 11:56:54 -07:00
case optAbbreviations opts of
Nothing -> UTF8.toString <$> readDataFile "abbreviations"
Just f -> UTF8.toString <$> readFileStrict f
-- For "jats" output with neither "csl" nor "citation-style" in the
-- option metadata, the "jats.csl" data file is added under "csl" as an
-- application/xml data URI.
metadata <- if format == "jats" &&
isNothing (lookup "csl" (optMetadata opts)) &&
isNothing (lookup "citation-style" (optMetadata opts))
then do
jatsCSL <- readDataFile "jats.csl"
let jatsEncoded = makeDataURI
("application/xml", jatsCSL)
return $ ("csl", jatsEncoded) : optMetadata opts
else return $ optMetadata opts
-- Translations follow the "lang" option metadata when it parses with
-- parseBCP47 (a parse failure is ignored); with no "lang" entry,
-- Lang "en" "" "US" [] is used.
case lookup "lang" (optMetadata opts) of
Just l -> case parseBCP47 l of
2017-10-27 20:28:29 -07:00
Left _ -> return ()
2017-08-11 11:56:54 -07:00
Right l' -> setTranslations l'
Nothing -> setTranslations $ Lang "en" "" "US" []
let readerOpts = def{
readerStandalone = standalone
, readerColumns = optColumns opts
, readerTabStop = optTabStop opts
, readerIndentedCodeClasses = optIndentedCodeClasses opts
, readerDefaultImageExtension =
optDefaultImageExtension opts
, readerTrackChanges = optTrackChanges opts
, readerAbbreviations = abbrevs
, readerExtensions = readerExts
2017-09-17 12:49:15 -07:00
, readerStripComments = optStripComments opts
2017-08-11 11:56:54 -07:00
2019-03-01 06:52:15 +01:00
-- Metadata from optMetadataFile is parsed with yamlToMeta; it is empty
-- when no metadata file was given.
metadataFromFile <-
case optMetadataFile opts of
Nothing -> return mempty
Just file -> readFileLazy file >>= yamlToMeta readerOpts
2017-08-11 11:56:54 -07:00
-- The transform list is assembled from composed stages applied to [];
-- each stage either conses one Transform onto the list or is id.
let transforms = (case optBaseHeaderLevel opts of
x | x > 1 -> (headerShift (x - 1) :)
2017-12-02 15:21:59 -08:00
| otherwise -> id) .
(if optStripEmptyParagraphs opts
then (stripEmptyParagraphs :)
else id) .
2017-08-11 11:56:54 -07:00
(if extensionEnabled Ext_east_asian_line_breaks
readerExts &&
not (extensionEnabled Ext_east_asian_line_breaks
2018-11-03 07:33:04 +01:00
(writerExtensions writerOptions) &&
2017-08-11 11:56:54 -07:00
writerWrapText writerOptions == WrapPreserve)
then (eastAsianLineBreakFilter :)
2019-02-28 20:28:16 -08:00
else id) .
(case optIpynbOutput opts of
"all" -> id
"none" -> (filterIpynbOutput Nothing :)
"best" -> (filterIpynbOutput (Just $
2019-04-25 21:52:27 -04:00
if htmlFormat format
2019-02-28 20:28:16 -08:00
then Format "html"
2019-04-25 21:52:27 -04:00
case format of
2019-02-28 20:28:16 -08:00
"latex" -> Format "latex"
"beamer" -> Format "latex"
2019-04-25 21:52:27 -04:00
_ -> Format format) :)
2019-02-28 20:28:16 -08:00
_ -> id) -- should not happen
$ []
2017-08-11 11:56:54 -07:00
-- sourceToDoc: a text reader parses each source separately and
-- concatenates the results when optFileScope is set or the reader is
-- "json"; otherwise the sources are joined by readSources and parsed
-- once. A ByteString reader always parses each source separately.
let sourceToDoc :: [FilePath] -> PandocIO Pandoc
sourceToDoc sources' =
case reader of
TextReader r
| optFileScope opts || readerName == "json" ->
2019-04-02 16:33:59 -07:00
mconcat <$> mapM (readSource >=> r readerOpts) sources'
2017-08-11 11:56:54 -07:00
| otherwise ->
readSources sources' >>= r readerOpts
ByteStringReader r ->
2019-04-02 16:33:59 -07:00
mconcat <$> mapM (readFile' >=> r readerOpts) sources'
2017-08-11 11:56:54 -07:00
2017-08-07 22:34:38 -07:00
when (readerName == "markdown_github" ||
writerName == "markdown_github") $
report $ Deprecated "markdown_github" "Use gfm instead."
2017-10-15 22:11:43 -07:00
2017-05-21 08:59:06 +02:00
setResourcePath (optResourcePath opts)
2017-10-29 12:58:41 -07:00
mapM_ (uncurry setRequestHeader) (optRequestHeaders opts)
2017-10-15 22:11:43 -07:00
2017-07-29 20:54:25 +02:00
-- Document pipeline, in order: fill the media bag (only when
-- optExtractMedia is set), merge metadata from the metadata file, merge
-- the option metadata, apply transforms, apply filters, and extract
-- media (only when optExtractMedia is set).
doc <- sourceToDoc sources >>=
2017-05-07 11:45:33 +02:00
( (if isJust (optExtractMedia opts)
2017-09-30 16:07:47 -05:00
then fillMediaBag
2017-05-07 11:45:33 +02:00
else return)
2018-03-30 21:48:14 +02:00
>=> return . addNonPresentMetadata metadataFromFile
2017-11-08 21:54:23 -08:00
>=> return . addMetadata metadata
2017-09-29 21:05:24 +02:00
>=> applyTransforms transforms
2017-12-26 10:11:19 -08:00
>=> applyFilters readerOpts filters' [format]
>=> maybe return extractMedia (optExtractMedia opts)
2017-05-07 11:45:33 +02:00
2017-02-05 21:58:45 +01:00
-- ByteString writers and PDF results are written with writeFnBinary;
-- other text writer output is written with writerFn.
case writer of
2017-05-07 10:34:04 +02:00
ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile
2018-11-03 07:33:04 +01:00
TextWriter f -> case outputPdfProgram outputSettings of
2017-09-12 05:18:42 +02:00
Just pdfProg -> do
2017-10-26 11:11:04 -07:00
res <- makePDF pdfProg (optPdfEngineArgs opts) f
writerOptions doc
2017-02-05 21:58:45 +01:00
case res of
Right pdf -> writeFnBinary outputFile pdf
2019-07-20 12:53:44 -07:00
Left err' -> throwError $ PandocPDFError $
2017-12-27 11:36:08 -08:00
TL.unpack (TE.decodeUtf8With TE.lenientDecode err')
2017-09-12 05:18:42 +02:00
Nothing -> do
2019-02-28 20:28:16 -08:00
-- A trailing newline is appended only when the output is not
-- standalone.
let addNl = if standalone
2017-06-10 23:39:49 +02:00
then id
else (<> T.singleton '\n')
Implement `--ascii` (`writerPreferAscii`) in writers, not App.
Now the `write*` functions for Docbook, HTML, ICML, JATS,
Man, Ms, OPML are sensitive to `writerPreferAscii`. Previously
the to-ascii translation was done in Text.Pandoc.App, and
thus not available to those using the writer functions
In addition, the LaTeX writer is now sensitive to
`writerPreferAscii` and to `--ascii`. 100% ASCII
output can't be guaranteed, but the writer will use
commands like `\"{a}` and `\l` whenever possible,
to avoid emiting a non-ASCII character.
A new unexported module, Text.Pandoc.Groff, has been
added to store functions used in the different groff-based
2018-09-30 22:32:00 -07:00
output <- addNl <$> f writerOptions doc
2017-06-10 23:39:49 +02:00
writerFn eol outputFile =<<
2019-04-25 21:52:27 -04:00
if optSelfContained opts && htmlFormat format
2017-06-10 23:39:49 +02:00
-- TODO not maximally efficient; change type
-- of makeSelfContained so it works w/ Text
2017-09-30 16:07:47 -05:00
then T.pack <$> makeSelfContained (T.unpack output)
2017-06-10 23:39:49 +02:00
else return output
2017-02-05 21:58:45 +01:00
-- | A pure rewrite of a whole 'Pandoc' document; 'applyTransforms' runs a
-- list of these after parsing.
type Transform = Pandoc -> Pandoc
2019-02-28 20:28:16 -08:00
-- | Membership test: 'True' when the given format name is one of the names
-- in the literal list below ("html", "html4", "html5", "s5", "slidy", ...).
-- The comparison is an exact, case-sensitive string match.
htmlFormat :: String -> Bool
htmlFormat = (`elem` ["html","html4","html5","s5","slidy",
2017-02-05 21:58:45 +01:00
-- | Decide whether a writer format is treated as textual output.  Every
-- format name counts as textual unless it appears (exact, case-sensitive
-- match) in @nonTextFormats@.
isTextFormat :: String -> Bool
isTextFormat = (`notElem` nonTextFormats)
  where
    nonTextFormats = ["odt","docx","epub2","epub3","epub","pptx"]
2017-02-05 21:58:45 +01:00
2018-03-30 21:48:14 +02:00
-- | Combine extra metadata into a document by appending it to the
-- document's own metadata with '<>'; the block list is left untouched.
--
-- NOTE(review): which side wins when both carry the same key is decided by
-- the Semigroup instance of 'Text.Pandoc.Meta' -- confirm that the
-- document's values take precedence, as the function name suggests.
addNonPresentMetadata :: Text.Pandoc.Meta -> Pandoc -> Pandoc
addNonPresentMetadata extraMeta (Pandoc docMeta blocks) =
  Pandoc combined blocks
  where
    combined = docMeta <> extraMeta
2017-11-08 21:54:23 -08:00
-- | Install key/value metadata pairs into a document.  Every key that
-- occurs in the list is first deleted from the document's metadata, then
-- the pairs are added with 'addMeta'.  The fold runs from the right, so the
-- last pair in the list is applied first.
addMetadata :: [(String, String)] -> Pandoc -> Pandoc
addMetadata kvs pdc = foldr addMeta cleared kvs
  where
    -- The document with all keys about to be set removed.
    cleared = removeMetaKeys kvs pdc
-- | Add a single key/value pair to a document's metadata.  The raw string
-- is converted with 'readMetaValue'.  If the key is absent the value is
-- stored directly; if it already holds a 'MetaList' the value is appended
-- to that list; any other existing value is paired with the new one in a
-- two-element 'MetaList'.
addMeta :: (String, String) -> Pandoc -> Pandoc
addMeta (key, raw) (Pandoc meta blocks) =
  Pandoc (setMeta key merged meta) blocks
  where
    parsed = readMetaValue raw
    merged = case lookupMeta key meta of
      Nothing            -> parsed
      Just (MetaList xs) -> MetaList (xs ++ [parsed])
      Just existing      -> MetaList [existing, parsed]
2017-11-08 21:54:23 -08:00
-- | Delete from a document's metadata every key that appears as the first
-- component of a pair in the given list; the second components are ignored.
removeMetaKeys :: [(String,String)] -> Pandoc -> Pandoc
removeMetaKeys kvs pdc = foldr deleteMeta pdc keys
  where
    keys = map fst kvs
2017-02-05 21:58:45 +01:00
-- | Interpret a raw metadata string.  The spellings @true@, @True@ and
-- @TRUE@ become @MetaBool True@; @false@, @False@ and @FALSE@ become
-- @MetaBool False@; every other string is kept verbatim as a 'MetaString'.
readMetaValue :: String -> MetaValue
readMetaValue s
  | s `elem` ["true", "True", "TRUE"]    = MetaBool True
  | s `elem` ["false", "False", "FALSE"] = MetaBool False
  | otherwise                            = MetaString s
2017-02-05 21:58:45 +01:00
-- Transformations of a Pandoc document post-parsing.
-- The transforms are composed right to left: the last element of the list
-- is applied to the document first, the head of the list last.  The result
-- is returned in the caller's monad without performing any effects.
applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc
applyTransforms transforms = return . pipeline
  where
    pipeline = foldr (.) id transforms
2017-06-10 15:55:18 +02:00
-- | Read one input source as 'Text'.
--
-- * A source that parses as a URI with scheme @http:@ or @https:@ is
--   fetched with 'readURI'.
-- * A source that parses as a @file:@ URI is read from the path obtained
--   from its URI path via 'uriPathToPath'.
-- * Anything else is treated as a file path, except that the source @"-"@
--   is read from standard input.
--
-- The bytes are decoded with 'UTF8.toText'.  A decoding error that carries
-- the offending byte is rethrown as 'PandocUTF8DecodingError'; any other
-- decoding error is rethrown as 'PandocAppError'.
readSource :: FilePath -> PandocIO Text
readSource src =
  case parseURI src of
    Just u
      | uriScheme u `elem` ["http:","https:"] -> readURI src
      | uriScheme u == "file:" ->
          liftIO (readTextFile (uriPathToPath (uriPath u)))
    _ -> liftIO (readTextFile src)
  where
    readTextFile :: FilePath -> IO Text
    readTextFile fp = do
      bytes <- if src == "-" then BS.getContents else BS.readFile fp
      -- ($!) forces the decoded text here so that a decoding exception is
      -- raised inside the 'E.catch' rather than later.
      (return $! UTF8.toText bytes) `E.catch` decodeFailure fp bytes

    -- Translate a Unicode decoding exception into a pandoc error.  The
    -- reported offset is the index of the first occurrence of the offending
    -- byte value in the input, or 0 if that byte is not found.
    decodeFailure :: FilePath -> BS.ByteString -> TSE.UnicodeException -> IO Text
    decodeFailure fp bytes err =
      case err of
        TSE.DecodeError _ (Just w) ->
          E.throwIO
            (PandocUTF8DecodingError fp (fromMaybe 0 (BS.elemIndex w bytes)) w)
        _ -> E.throwIO (PandocAppError (show err))
2017-02-05 21:58:45 +01:00
2017-06-10 15:55:18 +02:00
-- | Fetch a URL with 'openURL' and decode the first component of its result
-- with 'UTF8.toText'; the second component is discarded.
readURI :: FilePath -> PandocIO Text
readURI = fmap decodeBody . openURL
  where
    decodeBody = UTF8.toText . fst
2017-02-05 21:58:45 +01:00
2019-07-20 12:26:24 -07:00
-- | Read a lazy 'BL.ByteString' from the given path; the path @"-"@ means
-- standard input.
readFile' :: MonadIO m => FilePath -> m BL.ByteString
readFile' path = liftIO $ case path of
  "-" -> BL.getContents
  _   -> BL.readFile path
2017-02-05 21:58:45 +01:00
2019-07-20 12:26:24 -07:00
-- | Write a lazy 'BL.ByteString' to the given path; the path @"-"@ means
-- standard output.  Any other path is passed through 'UTF8.encodePath'
-- before the file is written.
writeFnBinary :: MonadIO m => FilePath -> BL.ByteString -> m ()
writeFnBinary path contents = liftIO $ case path of
  "-" -> BL.putStr contents
  _   -> BL.writeFile (UTF8.encodePath path) contents
2017-02-05 21:58:45 +01:00
2017-06-10 23:39:49 +02:00
-- | Write 'Text' with the given newline mode to the given path; the path
-- @"-"@ means standard output.
writerFn :: MonadIO m => IO.Newline -> FilePath -> Text -> m ()
-- TODO this implementation isn't maximally efficient:
-- the text is unpacked to a String before it is written.
writerFn eol path txt = liftIO $ case path of
  "-" -> UTF8.putStrWith eol str
  _   -> UTF8.writeFileWith eol path str
  where
    str = T.unpack txt