Reorganize App to make it easier to limit IO in main loop.

Previously we used liftIO fairly liberally.  The code has
been restructured to avoid this.

A small behavior change is that pandoc will now fall back
to latin1 encoding for inputs that can't be read as UTF-8.
This is what it did previously for content fetched from
the web and not marked as to content type. It makes sense
to do the same for local files.
This commit is contained in:
John MacFarlane 2021-08-22 17:47:47 -07:00
parent c39ddeb8f8
commit fd7c140cde

View file

@ -1,3 +1,4 @@
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-} {-# LANGUAGE CPP #-}
@ -27,7 +28,7 @@ module Text.Pandoc.App (
import qualified Control.Exception as E import qualified Control.Exception as E
import Control.Monad ( (>=>), when ) import Control.Monad ( (>=>), when )
import Control.Monad.Trans ( MonadIO(..) ) import Control.Monad.Trans ( MonadIO(..) )
import Control.Monad.Except (throwError) import Control.Monad.Except (throwError, catchError)
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
@ -38,6 +39,7 @@ import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TE import qualified Data.Text.Lazy.Encoding as TE
import qualified Data.Text.Encoding as TSE
import qualified Data.Text.Encoding.Error as TE import qualified Data.Text.Encoding.Error as TE
import qualified Data.Text.Encoding.Error as TSE import qualified Data.Text.Encoding.Error as TSE
import Network.URI (URI (..), parseURI) import Network.URI (URI (..), parseURI)
@ -48,7 +50,7 @@ import System.IO (nativeNewline, stdout)
import qualified System.IO as IO (Newline (..)) import qualified System.IO as IO (Newline (..))
import Text.Pandoc import Text.Pandoc
import Text.Pandoc.Builder (setMeta) import Text.Pandoc.Builder (setMeta)
import Text.Pandoc.MIME (getCharset) import Text.Pandoc.MIME (getCharset, MimeType)
import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths) import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths)
import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts, import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts,
IpynbOutput (..)) IpynbOutput (..))
@ -69,6 +71,7 @@ import qualified Text.Pandoc.UTF8 as UTF8
import System.Posix.IO (stdOutput) import System.Posix.IO (stdOutput)
import System.Posix.Terminal (queryTerminal) import System.Posix.Terminal (queryTerminal)
#endif #endif
import Debug.Trace
convertWithOpts :: Opt -> IO () convertWithOpts :: Opt -> IO ()
convertWithOpts opts = do convertWithOpts opts = do
@ -94,14 +97,14 @@ convertWithOpts opts = do
let sources = case optInputFiles opts of let sources = case optInputFiles opts of
Just xs | not (optIgnoreArgs opts) -> xs Just xs | not (optIgnoreArgs opts) -> xs
_ -> ["-"] _ -> ["-"]
#ifdef _WINDOWS #ifdef _WINDOWS
let istty = True let istty = True
#else #else
istty <- liftIO $ queryTerminal stdOutput istty <- liftIO $ queryTerminal stdOutput
#endif #endif
(output, reports) <- runIOorExplode $ do res <- runIO $ do
setTrace (optTrace opts) setTrace (optTrace opts)
setVerbosity verbosity setVerbosity verbosity
setUserDataDir datadir setUserDataDir datadir
@ -110,6 +113,8 @@ convertWithOpts opts = do
setInputFiles (fromMaybe ["-"] (optInputFiles opts)) setInputFiles (fromMaybe ["-"] (optInputFiles opts))
setOutputFile (optOutputFile opts) setOutputFile (optOutputFile opts)
inputs <- readSources sources
-- assign reader and writer based on options and filenames -- assign reader and writer based on options and filenames
readerName <- case optFrom opts of readerName <- case optFrom opts of
Just f -> return f Just f -> return f
@ -135,20 +140,6 @@ convertWithOpts opts = do
(reader, readerExts) <- getReader readerName (reader, readerExts) <- getReader readerName
let convertTabs = tabFilter (if optPreserveTabs opts ||
readerNameBase == "t2t" ||
readerNameBase == "man"
then 0
else optTabStop opts)
let readSources :: [FilePath] -> PandocIO [(FilePath, Text)]
readSources srcs =
mapM (\fp -> do
t <- readSource fp
return (if fp == "-" then "" else fp, convertTabs t)) srcs
outputSettings <- optToOutputSettings opts outputSettings <- optToOutputSettings opts
let format = outputFormat outputSettings let format = outputFormat outputSettings
let writer = outputWriter outputSettings let writer = outputWriter outputSettings
@ -236,20 +227,11 @@ convertWithOpts opts = do
_ -> Format format) :)) _ -> Format format) :))
$ [] $ []
let sourceToDoc :: [FilePath] -> PandocIO Pandoc let convertTabs = tabFilter (if optPreserveTabs opts ||
sourceToDoc sources' = readerNameBase == "t2t" ||
case reader of readerNameBase == "man"
TextReader r then 0
| readerNameBase == "json" -> else optTabStop opts)
mconcat <$> mapM (readSource >=> r readerOpts) sources'
| optFileScope opts ->
-- Read source and convert tabs (see #6709)
let readSource' = fmap convertTabs . readSource
in mconcat <$> mapM (readSource' >=> r readerOpts) sources'
| otherwise ->
readSources sources' >>= r readerOpts
ByteStringReader r ->
mconcat <$> mapM (readFile' >=> r readerOpts) sources'
when (readerNameBase == "markdown_github" || when (readerNameBase == "markdown_github" ||
@ -275,7 +257,22 @@ convertWithOpts opts = do
maybe id (setMeta "citation-abbreviations") maybe id (setMeta "citation-abbreviations")
(optCitationAbbreviations opts) $ mempty (optCitationAbbreviations opts) $ mempty
doc <- sourceToDoc sources >>= doc <- (case reader of
TextReader r
| readerNameBase == "json" ->
mconcat <$>
mapM (inputToText convertTabs
>=> r readerOpts . (:[])) inputs
| optFileScope opts ->
mconcat <$> mapM
(inputToText convertTabs
>=> r readerOpts . (:[]))
inputs
| otherwise -> mapM (inputToText convertTabs) inputs
>>= r readerOpts
ByteStringReader r ->
mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs)
>>=
( (if isJust (optExtractMedia opts) ( (if isJust (optExtractMedia opts)
then fillMediaBag then fillMediaBag
else return) else return)
@ -310,19 +307,22 @@ convertWithOpts opts = do
reports <- getLog reports <- getLog
return (output, reports) return (output, reports)
case optLogFile opts of case res of
Nothing -> return () Left e -> E.throwIO e
Just logfile -> BL.writeFile logfile (encodeLogMessages reports) Right (output, reports) -> do
let isWarning msg = messageVerbosity msg == WARNING case optLogFile opts of
when (optFailIfWarnings opts && any isWarning reports) $ Nothing -> return ()
E.throwIO PandocFailOnWarningError Just logfile -> BL.writeFile logfile (encodeLogMessages reports)
let eol = case optEol opts of let isWarning msg = messageVerbosity msg == WARNING
CRLF -> IO.CRLF when (optFailIfWarnings opts && any isWarning reports) $
LF -> IO.LF E.throwIO PandocFailOnWarningError
Native -> nativeNewline let eol = case optEol opts of
case output of CRLF -> IO.CRLF
TextOutput t -> writerFn eol outputFile t LF -> IO.LF
BinaryOutput bs -> writeFnBinary outputFile bs Native -> nativeNewline
case output of
TextOutput t -> writerFn eol outputFile t
BinaryOutput bs -> writeFnBinary outputFile bs
data PandocOutput = TextOutput Text | BinaryOutput BL.ByteString data PandocOutput = TextOutput Text | BinaryOutput BL.ByteString
deriving (Show) deriving (Show)
@ -344,49 +344,64 @@ adjustMetadata f (Pandoc meta bs) = Pandoc (f meta) bs
applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc
applyTransforms transforms d = return $ foldr ($) d transforms applyTransforms transforms d = return $ foldr ($) d transforms
readSource :: FilePath -> PandocIO Text readSources :: (PandocMonad m, MonadIO m)
readSource src = case parseURI src of => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))]
Just u | uriScheme u `elem` ["http:","https:"] -> readSources srcs =
readURI src mapM (\fp -> do t <- readSource fp
| uriScheme u == "file:" -> liftIO $ return (if fp == "-" then "" else fp, t)) srcs
readTextFile (uriPathToPath $ T.pack $ uriPath u)
_ -> liftIO $ readTextFile src
where readTextFile :: FilePath -> IO Text
readTextFile fp = do
bs <- if src == "-"
then BS.getContents
else BS.readFile fp
E.catch (return $! UTF8.toText bs)
(\e -> E.throwIO $ case e of
TSE.DecodeError _ (Just w) ->
case BS.elemIndex w bs of
Just offset ->
PandocUTF8DecodingError (T.pack fp) offset w
_ -> PandocUTF8DecodingError (T.pack fp) 0 w
_ -> PandocAppError (tshow e))
readURI :: FilePath -> PandocIO Text readSource :: (PandocMonad m, MonadIO m)
readURI src = do => FilePath -> m (BS.ByteString, Maybe MimeType)
(bs, mt) <- openURL (T.pack src) readSource "-" = (,Nothing) <$> readStdinStrict
readSource src =
case parseURI src of
Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src)
| uriScheme u == "file:" ->
(,Nothing) <$>
readFileStrict (uriPathToPath $ T.pack $ uriPath u)
_ -> (,Nothing) <$> readFileStrict src
utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
utf8ToText fp bs =
case TSE.decodeUtf8' . dropBOM $ bs of
Left (TSE.DecodeError _ (Just w)) ->
case BS.elemIndex w bs of
Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
Left e -> throwError $ PandocAppError (tshow e)
Right t -> return t
where
dropBOM bs' =
if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
then BS.drop 3 bs'
else bs'
inputToText :: PandocMonad m
=> (Text -> Text)
-> (FilePath, (BS.ByteString, Maybe MimeType))
-> m (FilePath, Text)
inputToText convTabs (fp, (bs,mt)) =
(fp,) . convTabs . T.filter (/='\r') <$>
case mt >>= getCharset of case mt >>= getCharset of
Just "UTF-8" -> return $ UTF8.toText bs Just "UTF-8" -> utf8ToText fp bs
Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
Just charset -> throwError $ PandocUnsupportedCharsetError charset Just charset -> throwError $ PandocUnsupportedCharsetError charset
Nothing -> liftIO $ -- try first as UTF-8, then as latin1 Nothing -> catchError
E.catch (return $! UTF8.toText bs) (utf8ToText fp bs)
(\case (\case
TSE.DecodeError{} -> PandocUTF8DecodingError{} ->
return $ T.pack $ B8.unpack bs return $ T.pack $ B8.unpack bs
e -> E.throwIO e) e -> throwError e)
readFile' :: MonadIO m => FilePath -> m BL.ByteString inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType))
readFile' "-" = liftIO BL.getContents -> BL.ByteString
readFile' f = liftIO $ BL.readFile f inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs
writeFnBinary :: MonadIO m => FilePath -> BL.ByteString -> m () writeFnBinary :: FilePath -> BL.ByteString -> IO ()
writeFnBinary "-" = liftIO . BL.putStr writeFnBinary "-" = BL.putStr
writeFnBinary f = liftIO . BL.writeFile (UTF8.encodePath f) writeFnBinary f = BL.writeFile (UTF8.encodePath f)
writerFn :: MonadIO m => IO.Newline -> FilePath -> Text -> m () writerFn :: IO.Newline -> FilePath -> Text -> IO ()
writerFn eol "-" = liftIO . UTF8.putStrWith eol writerFn eol "-" = UTF8.putStrWith eol
writerFn eol f = liftIO . UTF8.writeFileWith eol f writerFn eol f = UTF8.writeFileWith eol f