From 16f0316fbaa4d667ba40772969ab8e28fea6a493 Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 24 Apr 2022 12:25:04 -0700
Subject: [PATCH] Add tsv (tab separated values) as an input format.

We use the simple spec at
<https://www.iana.org/assignments/media-types/text/tab-separated-values>.

API change: Text.Pandoc.Readers.CSV now exports `readTSV`.

Internal change: In Text.Pandoc.CSV, CSVOptions has changed
so that csvQuote takes a Maybe value.

Closes #7974.
---
 MANUAL.txt                              |  2 ++
 src/Text/Pandoc/App.hs                  |  3 ++-
 src/Text/Pandoc/App/FormatHeuristics.hs |  1 +
 src/Text/Pandoc/CSV.hs                  | 30 +++++++++++++---------
 src/Text/Pandoc/Readers.hs              |  2 ++
 src/Text/Pandoc/Readers/CSV.hs          | 33 +++++++++++++++++++++----
 src/Text/Pandoc/Readers/RST.hs          |  4 +--
 7 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/MANUAL.txt b/MANUAL.txt
index 6429bfda7..77c0182f2 100644
--- a/MANUAL.txt
+++ b/MANUAL.txt
@@ -232,6 +232,7 @@ header when requesting a document from a URL:
     - `creole` ([Creole 1.0])
     - `csljson` ([CSL JSON] bibliography)
     - `csv` ([CSV] table)
+    - `tsv` ([TSV] table)
     - `docbook` ([DocBook])
     - `docx` ([Word docx])
     - `dokuwiki` ([DokuWiki markup])
@@ -484,6 +485,7 @@ header when requesting a document from a URL:
 [Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html
 [Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0
 [CSV]: https://tools.ietf.org/html/rfc4180
+[TSV]: https://www.iana.org/assignments/media-types/text/tab-separated-values
 [roff man]: https://man.cx/groff_man(7)
 [roff ms]: https://man.cx/groff_ms(7)
 [Haskell]: https://www.haskell.org
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 94b242cb4..1a694abb0 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -258,7 +258,8 @@ convertWithOpts opts = do
 
     let convertTabs = tabFilter (if optPreserveTabs opts ||
                                       readerNameBase == "t2t" ||
-                                      readerNameBase == "man"
+                                      readerNameBase == "man" ||
+                                      readerNameBase == "tsv"
                                     then 0
                                     else optTabStop opts)
 
diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs
index ebf8db4c5..c6f187363 100644
--- a/src/Text/Pandoc/App/FormatHeuristics.hs
+++ b/src/Text/Pandoc/App/FormatHeuristics.hs
@@ -86,6 +86,7 @@ formatFromFilePath x =
     ".xhtml"    -> Just "html"
     ".ipynb"    -> Just "ipynb"
     ".csv"      -> Just "csv"
+    ".tsv"      -> Just "tsv"
     ".bib"      -> Just "biblatex"
     ['.',y]     | y `elem` ['1'..'9'] -> Just "man"
     _           -> Nothing
diff --git a/src/Text/Pandoc/CSV.hs b/src/Text/Pandoc/CSV.hs
index 858dd5f6d..963fead0d 100644
--- a/src/Text/Pandoc/CSV.hs
+++ b/src/Text/Pandoc/CSV.hs
@@ -16,7 +16,7 @@ module Text.Pandoc.CSV (
   ParseError
 ) where
 
-import Control.Monad (unless, void)
+import Control.Monad (unless, void, mzero)
 import Data.Text (Text)
 import qualified Data.Text as T
 import Text.Parsec
@@ -24,7 +24,7 @@ import Text.Parsec.Text (Parser)
 
 data CSVOptions = CSVOptions{
     csvDelim     :: Char
-  , csvQuote     :: Char
+  , csvQuote     :: Maybe Char
   , csvKeepSpace :: Bool -- treat whitespace following delim as significant
   , csvEscape    :: Maybe Char -- default is to double up quote
 } deriving (Read, Show)
@@ -32,7 +32,7 @@ data CSVOptions = CSVOptions{
 defaultCSVOptions :: CSVOptions
 defaultCSVOptions = CSVOptions{
     csvDelim = ','
-  , csvQuote = '"'
+  , csvQuote = Just '"'
   , csvKeepSpace = False
   , csvEscape = Nothing }
 
@@ -53,18 +53,24 @@ pCSVCell :: CSVOptions -> Parser Text
 pCSVCell opts = pCSVQuotedCell opts <|> pCSVUnquotedCell opts
 
 pCSVQuotedCell :: CSVOptions -> Parser Text
-pCSVQuotedCell opts = do
-  char (csvQuote opts)
-  res <- many (satisfy (\c -> c /= csvQuote opts &&
-                              Just c /= csvEscape opts) <|> escaped opts)
-  char (csvQuote opts)
-  return $ T.pack res
+pCSVQuotedCell opts =
+  case csvQuote opts of
+    Nothing -> mzero
+    Just quotechar -> do
+      char quotechar
+      res <- many (satisfy (\c -> c /= quotechar &&
+                                  Just c /= csvEscape opts) <|> escaped opts)
+      char quotechar
+      return $ T.pack res
 
 escaped :: CSVOptions -> Parser Char
-escaped opts = try $
+escaped opts =
   case csvEscape opts of
-       Nothing -> char (csvQuote opts) >> char (csvQuote opts)
-       Just c  -> char c >> noneOf "\r\n"
+    Nothing ->
+      case csvQuote opts of
+        Nothing -> mzero
+        Just q -> try $ char q >> char q
+    Just c  -> try $ char c >> noneOf "\r\n"
 
 pCSVUnquotedCell :: CSVOptions -> Parser Text
 pCSVUnquotedCell opts = T.pack <$>
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 95f5f5b61..7abd1d024 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -52,6 +52,7 @@ module Text.Pandoc.Readers
   , readFB2
   , readIpynb
   , readCSV
+  , readTSV
   , readCslJson
   , readBibTeX
   , readBibLaTeX
@@ -152,6 +153,7 @@ readers = [("native"       , TextReader readNative)
           ,("fb2"          , TextReader readFB2)
           ,("ipynb"        , TextReader readIpynb)
           ,("csv"          , TextReader readCSV)
+          ,("tsv"          , TextReader readTSV)
           ,("csljson"      , TextReader readCslJson)
           ,("bibtex"       , TextReader readBibTeX)
           ,("biblatex"     , TextReader readBibLaTeX)
diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs
index 0fcf4bc35..23e0f7448 100644
--- a/src/Text/Pandoc/Readers/CSV.hs
+++ b/src/Text/Pandoc/Readers/CSV.hs
@@ -10,11 +10,14 @@
    Stability   : alpha
    Portability : portable
 
-Conversion from CSV to a 'Pandoc' table.
+Conversion from CSV or TSV to a 'Pandoc' table.
 -}
-module Text.Pandoc.Readers.CSV ( readCSV ) where
+module Text.Pandoc.Readers.CSV (
+  readCSV,
+  readTSV
+) where
 import qualified Data.Text as T
-import Text.Pandoc.CSV (parseCSV, defaultCSVOptions)
+import Text.Pandoc.CSV (parseCSV, defaultCSVOptions, CSVOptions(..))
 import Text.Pandoc.Definition
 import qualified Text.Pandoc.Builder as B
 import Text.Pandoc.Class (PandocMonad)
@@ -22,14 +25,34 @@ import Text.Pandoc.Error
 import Text.Pandoc.Sources (ToSources(..), sourcesToText)
 import Text.Pandoc.Options (ReaderOptions)
 import Control.Monad.Except (throwError)
+import Data.Text (Text)
 
 readCSV :: (PandocMonad m, ToSources a)
         => ReaderOptions -- ^ Reader options
         -> a
         -> m Pandoc
 readCSV _opts s = do
-  let txt = sourcesToText $ toSources s
-  case parseCSV defaultCSVOptions txt of
+  readCSVWith defaultCSVOptions $ sourcesToText $ toSources s
+
+readTSV :: (PandocMonad m, ToSources a)
+        => ReaderOptions -- ^ Reader options
+        -> a
+        -> m Pandoc
+readTSV _opts s = do
+  readCSVWith tsvOpts $ sourcesToText $ toSources s
+ where
+  tsvOpts = CSVOptions{
+    csvDelim = '\t',
+    csvQuote = Nothing,
+    csvKeepSpace = False,
+    csvEscape = Nothing }
+
+readCSVWith :: PandocMonad m
+            => CSVOptions
+            -> Text
+            -> m Pandoc
+readCSVWith csvopts txt = do
+  case parseCSV csvopts txt of
     Right (r:rs) -> return $ B.doc $ B.table capt
                                              (zip aligns widths)
                                              (TableHead nullAttr hdrs)
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
index f13b70738..b87c0ab71 100644
--- a/src/Text/Pandoc/Readers/RST.hs
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -845,8 +845,8 @@ csvTableDirective top fields rawcsv = do
                                 _            -> ','
               , csvQuote = case trim <$> lookup "quote" fields of
                                 Just (T.unpack -> [c])
-                                  -> c
-                                _ -> '"'
+                                  -> Just c
+                                _ -> Just '"'
               , csvEscape = case trim <$> lookup "escape" fields of
                                 Just (T.unpack -> [c])
                                   -> Just c