Add tsv (tab separated values) as an input format.

We us ethe simple spec at
<https://www.iana.org/assignments/media-types/text/tab-separated-values>.

API change: Text.Pandoc.Readers.CSV now exports `readTSV`.

Internal change:  In Text.Pandoc.CSV, CSVOptions has changed
so that csvQuote takes a Maybe value.

Closes #7974.
This commit is contained in:
John MacFarlane 2022-04-24 12:25:04 -07:00
parent b1990b0657
commit 16f0316fba
7 changed files with 55 additions and 20 deletions

View file

@ -232,6 +232,7 @@ header when requesting a document from a URL:
- `creole` ([Creole 1.0]) - `creole` ([Creole 1.0])
- `csljson` ([CSL JSON] bibliography) - `csljson` ([CSL JSON] bibliography)
- `csv` ([CSV] table) - `csv` ([CSV] table)
- `tsv` ([TSV] table)
- `docbook` ([DocBook]) - `docbook` ([DocBook])
- `docx` ([Word docx]) - `docx` ([Word docx])
- `dokuwiki` ([DokuWiki markup]) - `dokuwiki` ([DokuWiki markup])
@ -484,6 +485,7 @@ header when requesting a document from a URL:
[Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html [Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html
[Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0 [Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0
[CSV]: https://tools.ietf.org/html/rfc4180 [CSV]: https://tools.ietf.org/html/rfc4180
[TSV]: https://www.iana.org/assignments/media-types/text/tab-separated-values
[roff man]: https://man.cx/groff_man(7) [roff man]: https://man.cx/groff_man(7)
[roff ms]: https://man.cx/groff_ms(7) [roff ms]: https://man.cx/groff_ms(7)
[Haskell]: https://www.haskell.org [Haskell]: https://www.haskell.org

View file

@ -258,7 +258,8 @@ convertWithOpts opts = do
let convertTabs = tabFilter (if optPreserveTabs opts || let convertTabs = tabFilter (if optPreserveTabs opts ||
readerNameBase == "t2t" || readerNameBase == "t2t" ||
readerNameBase == "man" readerNameBase == "man" ||
readerNameBase == "tsv"
then 0 then 0
else optTabStop opts) else optTabStop opts)

View file

@ -86,6 +86,7 @@ formatFromFilePath x =
".xhtml" -> Just "html" ".xhtml" -> Just "html"
".ipynb" -> Just "ipynb" ".ipynb" -> Just "ipynb"
".csv" -> Just "csv" ".csv" -> Just "csv"
".tsv" -> Just "tsv"
".bib" -> Just "biblatex" ".bib" -> Just "biblatex"
['.',y] | y `elem` ['1'..'9'] -> Just "man" ['.',y] | y `elem` ['1'..'9'] -> Just "man"
_ -> Nothing _ -> Nothing

View file

@ -16,7 +16,7 @@ module Text.Pandoc.CSV (
ParseError ParseError
) where ) where
import Control.Monad (unless, void) import Control.Monad (unless, void, mzero)
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
import Text.Parsec import Text.Parsec
@ -24,7 +24,7 @@ import Text.Parsec.Text (Parser)
data CSVOptions = CSVOptions{ data CSVOptions = CSVOptions{
csvDelim :: Char csvDelim :: Char
, csvQuote :: Char , csvQuote :: Maybe Char
, csvKeepSpace :: Bool -- treat whitespace following delim as significant , csvKeepSpace :: Bool -- treat whitespace following delim as significant
, csvEscape :: Maybe Char -- default is to double up quote , csvEscape :: Maybe Char -- default is to double up quote
} deriving (Read, Show) } deriving (Read, Show)
@ -32,7 +32,7 @@ data CSVOptions = CSVOptions{
defaultCSVOptions :: CSVOptions defaultCSVOptions :: CSVOptions
defaultCSVOptions = CSVOptions{ defaultCSVOptions = CSVOptions{
csvDelim = ',' csvDelim = ','
, csvQuote = '"' , csvQuote = Just '"'
, csvKeepSpace = False , csvKeepSpace = False
, csvEscape = Nothing } , csvEscape = Nothing }
@ -53,18 +53,24 @@ pCSVCell :: CSVOptions -> Parser Text
pCSVCell opts = pCSVQuotedCell opts <|> pCSVUnquotedCell opts pCSVCell opts = pCSVQuotedCell opts <|> pCSVUnquotedCell opts
pCSVQuotedCell :: CSVOptions -> Parser Text pCSVQuotedCell :: CSVOptions -> Parser Text
pCSVQuotedCell opts = do pCSVQuotedCell opts =
char (csvQuote opts) case csvQuote opts of
res <- many (satisfy (\c -> c /= csvQuote opts && Nothing -> mzero
Just c /= csvEscape opts) <|> escaped opts) Just quotechar -> do
char (csvQuote opts) char quotechar
return $ T.pack res res <- many (satisfy (\c -> c /= quotechar &&
Just c /= csvEscape opts) <|> escaped opts)
char quotechar
return $ T.pack res
escaped :: CSVOptions -> Parser Char escaped :: CSVOptions -> Parser Char
escaped opts = try $ escaped opts =
case csvEscape opts of case csvEscape opts of
Nothing -> char (csvQuote opts) >> char (csvQuote opts) Nothing ->
Just c -> char c >> noneOf "\r\n" case csvQuote opts of
Nothing -> mzero
Just q -> try $ char q >> char q
Just c -> try $ char c >> noneOf "\r\n"
pCSVUnquotedCell :: CSVOptions -> Parser Text pCSVUnquotedCell :: CSVOptions -> Parser Text
pCSVUnquotedCell opts = T.pack <$> pCSVUnquotedCell opts = T.pack <$>

View file

@ -52,6 +52,7 @@ module Text.Pandoc.Readers
, readFB2 , readFB2
, readIpynb , readIpynb
, readCSV , readCSV
, readTSV
, readCslJson , readCslJson
, readBibTeX , readBibTeX
, readBibLaTeX , readBibLaTeX
@ -152,6 +153,7 @@ readers = [("native" , TextReader readNative)
,("fb2" , TextReader readFB2) ,("fb2" , TextReader readFB2)
,("ipynb" , TextReader readIpynb) ,("ipynb" , TextReader readIpynb)
,("csv" , TextReader readCSV) ,("csv" , TextReader readCSV)
,("tsv" , TextReader readTSV)
,("csljson" , TextReader readCslJson) ,("csljson" , TextReader readCslJson)
,("bibtex" , TextReader readBibTeX) ,("bibtex" , TextReader readBibTeX)
,("biblatex" , TextReader readBibLaTeX) ,("biblatex" , TextReader readBibLaTeX)

View file

@ -10,11 +10,14 @@
Stability : alpha Stability : alpha
Portability : portable Portability : portable
Conversion from CSV to a 'Pandoc' table. Conversion from CSV or TSV to a 'Pandoc' table.
-} -}
module Text.Pandoc.Readers.CSV ( readCSV ) where module Text.Pandoc.Readers.CSV (
readCSV,
readTSV
) where
import qualified Data.Text as T import qualified Data.Text as T
import Text.Pandoc.CSV (parseCSV, defaultCSVOptions) import Text.Pandoc.CSV (parseCSV, defaultCSVOptions, CSVOptions(..))
import Text.Pandoc.Definition import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class (PandocMonad) import Text.Pandoc.Class (PandocMonad)
@ -22,14 +25,34 @@ import Text.Pandoc.Error
import Text.Pandoc.Sources (ToSources(..), sourcesToText) import Text.Pandoc.Sources (ToSources(..), sourcesToText)
import Text.Pandoc.Options (ReaderOptions) import Text.Pandoc.Options (ReaderOptions)
import Control.Monad.Except (throwError) import Control.Monad.Except (throwError)
import Data.Text (Text)
readCSV :: (PandocMonad m, ToSources a) readCSV :: (PandocMonad m, ToSources a)
=> ReaderOptions -- ^ Reader options => ReaderOptions -- ^ Reader options
-> a -> a
-> m Pandoc -> m Pandoc
readCSV _opts s = do readCSV _opts s = do
let txt = sourcesToText $ toSources s readCSVWith defaultCSVOptions $ sourcesToText $ toSources s
case parseCSV defaultCSVOptions txt of
readTSV :: (PandocMonad m, ToSources a)
=> ReaderOptions -- ^ Reader options
-> a
-> m Pandoc
readTSV _opts s = do
readCSVWith tsvOpts $ sourcesToText $ toSources s
where
tsvOpts = CSVOptions{
csvDelim = '\t',
csvQuote = Nothing,
csvKeepSpace = False,
csvEscape = Nothing }
readCSVWith :: PandocMonad m
=> CSVOptions
-> Text
-> m Pandoc
readCSVWith csvopts txt = do
case parseCSV csvopts txt of
Right (r:rs) -> return $ B.doc $ B.table capt Right (r:rs) -> return $ B.doc $ B.table capt
(zip aligns widths) (zip aligns widths)
(TableHead nullAttr hdrs) (TableHead nullAttr hdrs)

View file

@ -845,8 +845,8 @@ csvTableDirective top fields rawcsv = do
_ -> ',' _ -> ','
, csvQuote = case trim <$> lookup "quote" fields of , csvQuote = case trim <$> lookup "quote" fields of
Just (T.unpack -> [c]) Just (T.unpack -> [c])
-> c -> Just c
_ -> '"' _ -> Just '"'
, csvEscape = case trim <$> lookup "escape" fields of , csvEscape = case trim <$> lookup "escape" fields of
Just (T.unpack -> [c]) Just (T.unpack -> [c])
-> Just c -> Just c