Add Text.Pandoc.Readers.CSV (readCSV).

This adds csv as an input format. The CSV table is converted into a pandoc simple table. Closes #6100.
2020-01-31 21:14:21 -08:00 · 2020-01-31 21:14:21 -08:00 · f9514ccb9e
commit f9514ccb9e
parent cef30e7384
6 changed files with 134 additions and 2 deletions
--- a/MANUAL.txt
+++ b/MANUAL.txt
@ -227,6 +227,7 @@ header when requesting a document from a URL:
    ::: {#input-formats}
    - `commonmark` ([CommonMark] Markdown)
    - `creole` ([Creole 1.0])
    - `csv` ([CSV] table)
    - `docbook` ([DocBook])
    - `docx` ([Word docx])
    - `dokuwiki` ([DokuWiki markup])
@ -467,6 +468,7 @@ header when requesting a document from a URL:
 [TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax
 [Haddock markup]: https://www.haskell.org/haddock/doc/html/ch03s08.html
 [Creole 1.0]: http://www.wikicreole.org/wiki/Creole1.0
 [CSV]: https://tools.ietf.org/html/rfc4180
 [roff man]: https://man.cx/groff_man(7)
 [roff ms]: https://man.cx/groff_ms(7)
 [Haskell]: https://www.haskell.org
--- a/pandoc.cabal
+++ b/pandoc.cabal
@ -1,5 +1,5 @@
 name:            pandoc
-version:         2.9.1.1
+version:         2.9.2
 cabal-version:   2.0
 build-type:      Simple
 license:         GPL-2
@ -21,7 +21,7 @@ description:     Pandoc is a Haskell library for converting from one markup
                 TikiWiki markup, Jira markup, Creole 1.0, Haddock markup,
                 OPML, Emacs Org-Mode, Emacs Muse, txt2tags, ipynb (Jupyter
                 notebooks), Vimwiki, Word Docx, ODT, EPUB, FictionBook2,
-                 roff man, and Textile, and it can write Markdown,
+                 roff man, Textile, and CSV, and it can write Markdown,
                 reStructuredText, XHTML, HTML 5, LaTeX, ConTeXt, DocBook,
                 JATS, OPML, TEI, OpenDocument, ODT, Word docx,
                 PowerPoint pptx, RTF, MediaWiki, DokuWiki, XWiki,
@ -503,6 +503,7 @@ library
                   Text.Pandoc.Readers.FB2,
                   Text.Pandoc.Readers.DokuWiki,
                   Text.Pandoc.Readers.Ipynb,
                   Text.Pandoc.Readers.CSV,
                   Text.Pandoc.Writers,
                   Text.Pandoc.Writers.Native,
                   Text.Pandoc.Writers.Docbook,
--- a/src/Text/Pandoc/App/FormatHeuristics.hs
+++ b/src/Text/Pandoc/App/FormatHeuristics.hs
@ -75,5 +75,6 @@ formatFromFilePath x =
    ".wiki"     -> Just "mediawiki"
    ".xhtml"    -> Just "html"
    ".ipynb"    -> Just "ipynb"
    ".csv"      -> Just "csv"
    ['.',y]     | y `elem` ['1'..'9'] -> Just "man"
    _           -> Nothing
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@ -51,6 +51,7 @@ module Text.Pandoc.Readers
  , readMuse
  , readFB2
  , readIpynb
  , readCSV
  -- * Miscellaneous
  , getReader
  , getDefaultExtensions
@ -95,6 +96,7 @@ import Text.Pandoc.Readers.TWiki
 import Text.Pandoc.Readers.Txt2Tags
 import Text.Pandoc.Readers.Vimwiki
 import Text.Pandoc.Readers.Man
 import Text.Pandoc.Readers.CSV
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Parsec.Error
@ -136,6 +138,7 @@ readers = [ ("native"       , TextReader readNative)
           ,("man"          , TextReader readMan)
           ,("fb2"          , TextReader readFB2)
           ,("ipynb"        , TextReader readIpynb)
           ,("csv"          , TextReader readCSV)
           ]
 -- | Retrieve reader, extensions based on formatSpec (format+extensions).
--- a/src/Text/Pandoc/Readers/CSV.hs
+++ b/src/Text/Pandoc/Readers/CSV.hs
@ -0,0 +1,108 @@
 {-# LANGUAGE NoImplicitPrelude   #-}
 {-# LANGUAGE FlexibleContexts    #-}
 {-# LANGUAGE OverloadedStrings   #-}
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE ViewPatterns        #-}
 {- |
   Module      : Text.Pandoc.Readers.RST
   Copyright   : Copyright (C) 2006-2019 John MacFarlane
   License     : GNU GPL, version 2 or above
   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable
 Conversion from CSV to a 'Pandoc' table.
 -}
 module Text.Pandoc.Readers.CSV ( readCSV ) where
 import Prelude
 import Data.Text (Text)
 import qualified Data.Text as T
 import Text.Pandoc.Definition
 import qualified Text.Pandoc.Builder as B
 import Text.Parsec
 import Text.Parsec.Text (Parser)
 import Text.Pandoc.Class (PandocMonad)
 import Text.Pandoc.Shared (crFilter)
 import Text.Pandoc.Error
 import Text.Pandoc.Options (ReaderOptions(..))
 import Control.Monad.Except (throwError)
 readCSV :: PandocMonad m
        => ReaderOptions -- ^ Reader options
        -> Text          -- ^ Text to parse (assuming @'\n'@ line endings)
        -> m Pandoc
 readCSV opts s = do
  let columns = readerColumns opts
  case parse pCSV "input" (crFilter s) of
    Right (r:rs) -> return $ B.doc $ B.table capt (zip aligns widths) hdrs rows
       where capt = mempty
             numcols = length r
             hdrs = map (B.plain . B.text) r
             rows = map (map (B.plain . B.text)) rs
             maximum' [] = 0
             maximum' xs = maximum xs
             aligns = replicate numcols AlignDefault
             widths = replicate numcols 0
    Right []     -> return $ B.doc mempty
    Left e       -> throwError $ PandocParsecError s e
 {- from RFC 4180
   The ABNF grammar [2] appears as follows:
   file = [header CRLF] record *(CRLF record) [CRLF]
   header = name *(COMMA name)
   record = field *(COMMA field)
   name = field
   field = (escaped / non-escaped)
   escaped = DQUOTE *(TEXTDATA / COMMA / CR / LF / 2DQUOTE) DQUOTE
   non-escaped = *TEXTDATA
   COMMA = %x2C
   CR = %x0D ;as per section 6.1 of RFC 2234 [2]
   DQUOTE =  %x22 ;as per section 6.1 of RFC 2234 [2]
   LF = %x0A ;as per section 6.1 of RFC 2234 [2]
   CRLF = CR LF ;as per section 6.1 of RFC 2234 [2]
   TEXTDATA =  %x20-21 / %x23-2B / %x2D-7E
 -}
 pCSV :: Parser [[Text]]
 pCSV = many pRecord
 pRecord :: Parser [Text]
 pRecord = do
  x <- pField
  xs <- many $ pComma >> pField
  () <$ newline <|> eof
  return (x:xs)
 pField :: Parser Text
 pField = pEscaped <|> pUnescaped
 pComma :: Parser Char
 pComma = char ','
 pUnescaped :: Parser Text
 pUnescaped = T.strip . T.pack <$> many1 (noneOf "\n\r\",")
 pEscaped :: Parser Text
 pEscaped = do
  char '"'
  t <- T.pack <$> many (pDoubledQuote <|> noneOf "\"")
  char '"'
  return t
 pDoubledQuote :: Parser Char
 pDoubledQuote = try $ char '"' >> char '"'
--- a/test/command/csv.md
+++ b/test/command/csv.md
@ -0,0 +1,17 @@
 ```
 % pandoc -f csv -t native
 Fruit,Price,Quantity
 Apple,25 cents,33
 """Navel"" Orange","35 cents",22
 ^D
 [Table [] [AlignDefault,AlignDefault,AlignDefault] [0.0,0.0,0.0]
 [[Plain [Str "Fruit"]]
 ,[Plain [Str "Price"]]
 ,[Plain [Str "Quantity"]]]
 [[[Plain [Str "Apple"]]
  ,[Plain [Str "25",Space,Str "cents"]]
  ,[Plain [Str "33"]]]
 ,[[Plain [Str "\"Navel\"",Space,Str "Orange"]]
  ,[Plain [Str "35",Space,Str "cents"]]
  ,[Plain [Str "22"]]]]]
 ```