From f9514ccb9e6efbd33ad84fcc1dfef20603affc4c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 31 Jan 2020 21:14:21 -0800 Subject: [PATCH] Add Text.Pandoc.Readers.CSV (readCSV). This adds csv as an input format. The CSV table is converted into a pandoc simple table. Closes #6100. --- MANUAL.txt | 2 + pandoc.cabal | 5 +- src/Text/Pandoc/App/FormatHeuristics.hs | 1 + src/Text/Pandoc/Readers.hs | 3 + src/Text/Pandoc/Readers/CSV.hs | 108 ++++++++++++++++++++++++ test/command/csv.md | 17 ++++ 6 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 src/Text/Pandoc/Readers/CSV.hs create mode 100644 test/command/csv.md diff --git a/MANUAL.txt b/MANUAL.txt index a776fab64..c90ac7b01 100644 --- a/MANUAL.txt +++ b/MANUAL.txt @@ -227,6 +227,7 @@ header when requesting a document from a URL: ::: {#input-formats} - `commonmark` ([CommonMark] Markdown) - `creole` ([Creole 1.0]) + - `csv` ([CSV] table) - `docbook` ([DocBook]) - `docx` ([Word docx]) - `dokuwiki` ([DokuWiki markup]) @@ -467,6 +468,7 @@ header when requesting a document from a URL: [TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax [Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html [Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0 +[CSV]: https://tools.ietf.org/html/rfc4180 [roff man]: https://man.cx/groff_man(7) [roff ms]: https://man.cx/groff_ms(7) [Haskell]: https://www.haskell.org diff --git a/pandoc.cabal b/pandoc.cabal index 1b50ddd7c..3b8e84634 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -1,5 +1,5 @@ name: pandoc -version: 2.9.1.1 +version: 2.9.2 cabal-version: 2.0 build-type: Simple license: GPL-2 @@ -21,7 +21,7 @@ description: Pandoc is a Haskell library for converting from one markup TikiWiki markup, Jira markup, Creole 1.0, Haddock markup, OPML, Emacs Org-Mode, Emacs Muse, txt2tags, ipynb (Jupyter notebooks), Vimwiki, Word Docx, ODT, EPUB, FictionBook2, - roff man, and Textile, and it can write Markdown, + roff man, Textile, and CSV, and it can write Markdown, reStructuredText, XHTML, HTML 5, LaTeX, ConTeXt, DocBook, JATS, OPML, TEI, OpenDocument, ODT, Word docx, PowerPoint pptx, RTF, MediaWiki, DokuWiki, XWiki, @@ -503,6 +503,7 @@ library Text.Pandoc.Readers.FB2, Text.Pandoc.Readers.DokuWiki, Text.Pandoc.Readers.Ipynb, + Text.Pandoc.Readers.CSV, Text.Pandoc.Writers, Text.Pandoc.Writers.Native, Text.Pandoc.Writers.Docbook, diff --git a/src/Text/Pandoc/App/FormatHeuristics.hs b/src/Text/Pandoc/App/FormatHeuristics.hs index 25e0a303e..21de09f3d 100644 --- a/src/Text/Pandoc/App/FormatHeuristics.hs +++ b/src/Text/Pandoc/App/FormatHeuristics.hs @@ -75,5 +75,6 @@ formatFromFilePath x = ".wiki" -> Just "mediawiki" ".xhtml" -> Just "html" ".ipynb" -> Just "ipynb" + ".csv" -> Just "csv" ['.',y] | y `elem` ['1'..'9'] -> Just "man" _ -> Nothing diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index 990e78f35..28c7c6810 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -51,6 +51,7 @@ module Text.Pandoc.Readers , readMuse , readFB2 , readIpynb + , readCSV -- * Miscellaneous , getReader , getDefaultExtensions @@ -95,6 +96,7 @@ import Text.Pandoc.Readers.TWiki import Text.Pandoc.Readers.Txt2Tags import Text.Pandoc.Readers.Vimwiki import Text.Pandoc.Readers.Man +import Text.Pandoc.Readers.CSV import qualified Text.Pandoc.UTF8 as UTF8 import Text.Parsec.Error @@ -136,6 +138,7 @@ readers = [ ("native" , TextReader readNative) ,("man" , TextReader readMan) ,("fb2" , TextReader readFB2) ,("ipynb" , TextReader readIpynb) + ,("csv" , TextReader readCSV) ] -- | Retrieve reader, extensions based on formatSpec (format+extensions). diff --git a/src/Text/Pandoc/Readers/CSV.hs b/src/Text/Pandoc/Readers/CSV.hs new file mode 100644 index 000000000..3eea9ccbd --- /dev/null +++ b/src/Text/Pandoc/Readers/CSV.hs @@ -0,0 +1,108 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.RST + Copyright : Copyright (C) 2006-2019 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion from CSV to a 'Pandoc' table. +-} +module Text.Pandoc.Readers.CSV ( readCSV ) where +import Prelude +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Definition +import qualified Text.Pandoc.Builder as B +import Text.Parsec +import Text.Parsec.Text (Parser) +import Text.Pandoc.Class (PandocMonad) +import Text.Pandoc.Shared (crFilter) +import Text.Pandoc.Error +import Text.Pandoc.Options (ReaderOptions(..)) +import Control.Monad.Except (throwError) + +readCSV :: PandocMonad m + => ReaderOptions -- ^ Reader options + -> Text -- ^ Text to parse (assuming @'\n'@ line endings) + -> m Pandoc +readCSV opts s = do + let columns = readerColumns opts + case parse pCSV "input" (crFilter s) of + Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) hdrs rows + where capt = mempty + numcols = length r + hdrs = map (B.plain . B.text) r + rows = map (map (B.plain . B.text)) rs + maximum' [] = 0 + maximum' xs = maximum xs + aligns = replicate numcols AlignDefault + widths = replicate numcols 0 + Right [] -> return $ B.doc mempty + Left e -> throwError $ PandocParsecError s e + +{- from RFC 4180 + + The ABNF grammar [2] appears as follows: + + file = [header CRLF] record *(CRLF record) [CRLF] + + header = name *(COMMA name) + + record = field *(COMMA field) + + name = field + + field = (escaped / non-escaped) + + escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE + + non-escaped = *TEXTDATA + + COMMA = %x2C + + CR = %x0D ;as per section 6.1 of RFC 2234 [2] + + DQUOTE = %x22 ;as per section 6.1 of RFC 2234 [2] + + LF = %x0A ;as per section 6.1 of RFC 2234 [2] + + CRLF = CR LF ;as per section 6.1 of RFC 2234 [2] + + TEXTDATA = %x20-21 / %x23-2B / %x2D-7E +-} + +pCSV :: Parser [[Text]] +pCSV = many pRecord + +pRecord :: Parser [Text] +pRecord = do + x <- pField + xs <- many $ pComma >> pField + () <$ newline <|> eof + return (x:xs) + +pField :: Parser Text +pField = pEscaped <|> pUnescaped + +pComma :: Parser Char +pComma = char ',' + +pUnescaped :: Parser Text +pUnescaped = T.strip . T.pack <$> many1 (noneOf "\n\r\",") + +pEscaped :: Parser Text +pEscaped = do + char '"' + t <- T.pack <$> many (pDoubledQuote <|> noneOf "\"") + char '"' + return t + +pDoubledQuote :: Parser Char +pDoubledQuote = try $ char '"' >> char '"' diff --git a/test/command/csv.md b/test/command/csv.md new file mode 100644 index 000000000..4723bc4f6 --- /dev/null +++ b/test/command/csv.md @@ -0,0 +1,17 @@ +``` +% pandoc -f csv -t native +Fruit,Price,Quantity +Apple,25 cents,33 +"""Navel"" Orange","35 cents",22 +^D +[Table [] [AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0] + [[Plain [Str "Fruit"]] + ,[Plain [Str "Price"]] + ,[Plain [Str "Quantity"]]] + [[[Plain [Str "Apple"]] + ,[Plain [Str "25",Space,Str "cents"]] + ,[Plain [Str "33"]]] + ,[[Plain [Str "\"Navel\"",Space,Str "Orange"]] + ,[Plain [Str "35",Space,Str "cents"]] + ,[Plain [Str "22"]]]]] +```