From 16f0316fbaa4d667ba40772969ab8e28fea6a493 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 24 Apr 2022 12:25:04 -0700 Subject: [PATCH] Add tsv (tab separated values) as an input format. We use the simple spec at https://www.iana.org/assignments/media-types/text/tab-separated-values. API change: Text.Pandoc.Readers.CSV now exports `readTSV`. Internal change: In Text.Pandoc.CSV, CSVOptions has changed so that csvQuote takes a Maybe value. Closes #7974. --- MANUAL.txt | 2 ++ src/Text/Pandoc/App.hs | 3 ++- src/Text/Pandoc/App/FormatHeuristics.hs | 1 + src/Text/Pandoc/CSV.hs | 30 +++++++++++++--------- src/Text/Pandoc/Readers.hs | 2 ++ src/Text/Pandoc/Readers/CSV.hs | 33 +++++++++++++++++++++---- src/Text/Pandoc/Readers/RST.hs | 4 +-- 7 files changed, 55 insertions(+), 20 deletions(-) diff --git a/MANUAL.txt b/MANUAL.txt index 6429bfda7..77c0182f2 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -232,6 +232,7 @@ header when requesting a document from a URL: - `creole` ([Creole 1.0]) - `csljson` ([CSL JSON] bibliography) - `csv` ([CSV] table) + - `tsv` ([TSV] table) - `docbook` ([DocBook]) - `docx` ([Word docx]) - `dokuwiki` ([DokuWiki markup]) @@ -484,6 +485,7 @@ header when requesting a document from a URL: [Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html [Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0 [CSV]: https://tools.ietf.org/html/rfc4180 +[TSV]: https://www.iana.org/assignments/media-types/text/tab-separated-values [roff man]: https://man.cx/groff_man(7) [roff ms]: https://man.cx/groff_ms(7) [Haskell]: https://www.haskell.org diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs index 94b242cb4..1a694abb0 100644 --- a/src/Text/Pandoc/App.hs +++ b/src/Text/Pandoc/App.hs @@ -258,7 +258,8 @@ convertWithOpts opts = do let convertTabs = tabFilter (if optPreserveTabs opts || readerNameBase == "t2t" || - readerNameBase == "man" + readerNameBase == "man" || + readerNameBase == "tsv" then 0 else optTabStop opts) diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs 
b/src/Text/Pandoc/App/FormatHeuristics.hs index ebf8db4c5..c6f187363 100644 --- a/src/Text/Pandoc/App/FormatHeuristics.hs +++ b/src/Text/Pandoc/App/FormatHeuristics.hs @@ -86,6 +86,7 @@ formatFromFilePath x = ".xhtml" -> Just "html" ".ipynb" -> Just "ipynb" ".csv" -> Just "csv" + ".tsv" -> Just "tsv" ".bib" -> Just "biblatex" ['.',y] | y `elem` ['1'..'9'] -> Just "man" _ -> Nothing diff --git a/src/Text/Pandoc/CSV.hs b/src/Text/Pandoc/CSV.hs index 858dd5f6d..963fead0d 100644 --- a/src/Text/Pandoc/CSV.hs +++ b/src/Text/Pandoc/CSV.hs @@ -16,7 +16,7 @@ module Text.Pandoc.CSV ( ParseError ) where -import Control.Monad (unless, void) +import Control.Monad (unless, void, mzero) import Data.Text (Text) import qualified Data.Text as T import Text.Parsec @@ -24,7 +24,7 @@ import Text.Parsec.Text (Parser) data CSVOptions = CSVOptions{ csvDelim :: Char - , csvQuote :: Char + , csvQuote :: Maybe Char , csvKeepSpace :: Bool -- treat whitespace following delim as significant , csvEscape :: Maybe Char -- default is to double up quote } deriving (Read, Show) @@ -32,7 +32,7 @@ data CSVOptions = CSVOptions{ defaultCSVOptions :: CSVOptions defaultCSVOptions = CSVOptions{ csvDelim = ',' - , csvQuote = '"' + , csvQuote = Just '"' , csvKeepSpace = False , csvEscape = Nothing } @@ -53,18 +53,24 @@ pCSVCell :: CSVOptions -> Parser Text pCSVCell opts = pCSVQuotedCell opts <|> pCSVUnquotedCell opts pCSVQuotedCell :: CSVOptions -> Parser Text -pCSVQuotedCell opts = do - char (csvQuote opts) - res <- many (satisfy (\c -> c /= csvQuote opts && - Just c /= csvEscape opts) <|> escaped opts) - char (csvQuote opts) - return $ T.pack res +pCSVQuotedCell opts = + case csvQuote opts of + Nothing -> mzero + Just quotechar -> do + char quotechar + res <- many (satisfy (\c -> c /= quotechar && + Just c /= csvEscape opts) <|> escaped opts) + char quotechar + return $ T.pack res escaped :: CSVOptions -> Parser Char -escaped opts = try $ +escaped opts = case csvEscape opts of - Nothing -> char (csvQuote 
opts) >> char (csvQuote opts) - Just c -> char c >> noneOf "\r\n" + Nothing -> + case csvQuote opts of + Nothing -> mzero + Just q -> try $ char q >> char q + Just c -> try $ char c >> noneOf "\r\n" pCSVUnquotedCell :: CSVOptions -> Parser Text pCSVUnquotedCell opts = T.pack <$> diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 95f5f5b61..7abd1d024 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -52,6 +52,7 @@ module Text.Pandoc.Readers , readFB2 , readIpynb , readCSV + , readTSV , readCslJson , readBibTeX , readBibLaTeX @@ -152,6 +153,7 @@ readers = [("native" , TextReader readNative) ,("fb2" , TextReader readFB2) ,("ipynb" , TextReader readIpynb) ,("csv" , TextReader readCSV) + ,("tsv" , TextReader readTSV) ,("csljson" , TextReader readCslJson) ,("bibtex" , TextReader readBibTeX) ,("biblatex" , TextReader readBibLaTeX) diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs index 0fcf4bc35..23e0f7448 100644 --- a/src/Text/Pandoc/Readers/CSV.hs +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -10,11 +10,14 @@ Stability : alpha Portability : portable -Conversion from CSV to a 'Pandoc' table. +Conversion from CSV or TSV to a 'Pandoc' table. 
-} -module Text.Pandoc.Readers.CSV ( readCSV ) where +module Text.Pandoc.Readers.CSV ( + readCSV, + readTSV +) where import qualified Data.Text as T -import Text.Pandoc.CSV (parseCSV, defaultCSVOptions) +import Text.Pandoc.CSV (parseCSV, defaultCSVOptions, CSVOptions(..)) import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class (PandocMonad) @@ -22,14 +25,34 @@ import Text.Pandoc.Error import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.Pandoc.Options (ReaderOptions) import Control.Monad.Except (throwError) +import Data.Text (Text) readCSV :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options -> a -> m Pandoc readCSV _opts s = do - let txt = sourcesToText $ toSources s - case parseCSV defaultCSVOptions txt of + readCSVWith defaultCSVOptions $ sourcesToText $ toSources s + +readTSV :: (PandocMonad m, ToSources a) + => ReaderOptions -- ^ Reader options + -> a + -> m Pandoc +readTSV _opts s = do + readCSVWith tsvOpts $ sourcesToText $ toSources s + where + tsvOpts = CSVOptions{ + csvDelim = '\t', + csvQuote = Nothing, + csvKeepSpace = False, + csvEscape = Nothing } + +readCSVWith :: PandocMonad m + => CSVOptions + -> Text + -> m Pandoc +readCSVWith csvopts txt = do + case parseCSV csvopts txt of Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) (TableHead nullAttr hdrs) diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs index f13b70738..b87c0ab71 100644 --- a/src/Text/Pandoc/Readers/RST.hs +++ b/src/Text/Pandoc/Readers/RST.hs @@ -845,8 +845,8 @@ csvTableDirective top fields rawcsv = do _ -> ',' , csvQuote = case trim <$> lookup "quote" fields of Just (T.unpack -> [c]) - -> c - _ -> '"' + -> Just c + _ -> Just '"' , csvEscape = case trim <$> lookup "escape" fields of Just (T.unpack -> [c]) -> Just c