From b9eeb77df552e39148b02c16cbb65e6b1c7a248d Mon Sep 17 00:00:00 2001
From: Albert Krewinkel <albert@zeitkraut.de>
Date: Thu, 24 Mar 2022 19:59:20 +0100
Subject: [PATCH] [API change] Unify grid table parsing (#7971)

Grid table parsing in Markdown and rst are updated use the same
functions. Functions are generalized to meet requirements for both
formats.

This change also lays the ground for further generalizations in table
parsers, including support for advanced table features.

API changes in Text.Pandoc.Parsing:

- Parse results of functions `tableWith'` and `gridTableWith'` are now a
  `mf TableComponents` instead of a quadruple of alignments, column
  widths, header rows and body rows.

Additional exports from Text.Pandoc.Parsing:

- `tableWith'`
- `TableComponents`
- `TableNormalization`
- `toTableComponents`
- `toTableComponents'`
---
 src/Text/Pandoc/Parsing.hs           |   5 +
 src/Text/Pandoc/Parsing/GridTable.hs | 138 +++++++++++++++++++--------
 src/Text/Pandoc/Readers/Markdown.hs  |  75 +++++----------
 src/Text/Pandoc/Readers/MediaWiki.hs |   2 +-
 src/Text/Pandoc/Readers/TWiki.hs     |   4 +-
 5 files changed, 130 insertions(+), 94 deletions(-)

diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index b663f7fa9..fdf5aa332 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -59,9 +59,14 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
                              charRef,
                              lineBlockLines,
                              tableWith,
+                             tableWith',
                              widthsFromIndices,
                              gridTableWith,
                              gridTableWith',
+                             TableComponents (..),
+                             TableNormalization (..),
+                             toTableComponents,
+                             toTableComponents',
                              readWith,
                              readWithM,
                              testStringWith,
diff --git a/src/Text/Pandoc/Parsing/GridTable.hs b/src/Text/Pandoc/Parsing/GridTable.hs
index bdfcb2bb3..1c029df8a 100644
--- a/src/Text/Pandoc/Parsing/GridTable.hs
+++ b/src/Text/Pandoc/Parsing/GridTable.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE FlexibleContexts           #-}
+{-# LANGUAGE LambdaCase                 #-}
 {-# LANGUAGE OverloadedStrings          #-}
 {- |
    Module      : Text.Pandoc.Parsing.GridTable
@@ -14,6 +15,11 @@ module Text.Pandoc.Parsing.GridTable
   , tableWith
   , tableWith'
   , widthsFromIndices
+    -- * Components of a plain-text table
+  , TableComponents (..)
+  , TableNormalization (..)
+  , toTableComponents
+  , toTableComponents'
   )
 where
 
@@ -34,6 +40,65 @@ import Text.Parsec
 import qualified Data.Text as T
 import qualified Text.Pandoc.Builder as B
 
+-- | Collection of components making up a Table block.
+data TableComponents = TableComponents
+  { tableAttr     :: Attr
+  , tableCaption  :: Caption
+  , tableColSpecs :: [ColSpec]
+  , tableHead     :: TableHead
+  , tableBodies   :: [TableBody]
+  , tableFoot     :: TableFoot
+  }
+
+-- | Creates a table block from the collection of table parts.
+tableFromComponents :: TableComponents -> Blocks
+tableFromComponents (TableComponents attr capt colspecs th tb tf) =
+  B.tableWith attr capt colspecs th tb tf
+
+-- | Bundles basic table components into a single value.
+toTableComponents :: [Alignment] -> [Double] -> [Blocks] -> [[Blocks]]
+                  -> TableComponents
+toTableComponents = toTableComponents' NoNormalization
+
+-- | Bundles basic table components into a single value, performing
+-- normalizations as necessary.
+toTableComponents' :: TableNormalization
+                   -> [Alignment] -> [Double] -> [Blocks] -> [[Blocks]]
+                   -> TableComponents
+toTableComponents' normalization aligns widths heads rows =
+  let th = TableHead nullAttr (toHeaderRow normalization heads)
+      tb = TableBody nullAttr 0 [] (map toRow rows)
+      tf = TableFoot nullAttr []
+      colspecs = toColSpecs aligns widths
+  in TableComponents nullAttr B.emptyCaption colspecs th [tb] tf
+
+-- | Combine a list of column alignments and column widths into a list
+-- of column specifiers. Both input lists should have the same length.
+toColSpecs :: [Alignment]   -- ^ column alignments
+           -> [Double]      -- ^ column widths
+           -> [ColSpec]
+toColSpecs aligns widths = zip aligns (map fromWidth widths')
+  where
+    fromWidth n
+      | n > 0     = ColWidth n
+      | otherwise = ColWidthDefault
+
+    -- renormalize widths if greater than 100%:
+    totalWidth = sum widths
+    widths' = if totalWidth < 1
+              then widths
+              else map (/ totalWidth) widths
+
+-- | Whether the table header should be normalized, i.e., whether an header row
+-- with only empty cells should be omitted.
+data TableNormalization
+  = NoNormalization
+  | NormalizeHeader
+
+--
+-- Grid Tables
+--
+
 -- | Parse a grid table: starts with row of '-' on top, then header
 -- (which may be grid), then the rows, which may be grid, separated by
 -- blank lines, and ending with a footer (dashed line followed by blank
@@ -50,11 +115,13 @@ gridTableWith blocks headless =
 -- Table.
 gridTableWith' :: (Monad m, Monad mf,
                    HasReaderOptions st, HasLastStrPosition st)
-               => ParserT Sources st m (mf Blocks) -- ^ Block list parser
+               => TableNormalization
+               -> ParserT Sources st m (mf Blocks) -- ^ Block list parser
                -> Bool                             -- ^ Headerless table
-               -> ParserT Sources st m (TableComponents mf)
-gridTableWith' blocks headless =
-  tableWith' (gridTableHeader headless blocks) (gridTableRow blocks)
+               -> ParserT Sources st m (mf TableComponents)
+gridTableWith' normalization blocks headless =
+  tableWith' normalization
+             (gridTableHeader headless blocks) (gridTableRow blocks)
              (gridTableSep '-') gridTableFooter
 
 gridTableSplitLine :: [Int] -> Text -> [Text]
@@ -162,44 +229,39 @@ gridTableFooter = optional blanklines
 -- 'lineParser', and 'footerParser'.
 tableWith :: (Stream s m Char, UpdateSourcePos s Char,
               HasReaderOptions st, Monad mf)
-          => ParserT s st m (mf [Blocks], [Alignment], [Int])
-          -> ([Int] -> ParserT s st m (mf [Blocks]))
-          -> ParserT s st m sep
-          -> ParserT s st m end
+          => ParserT s st m (mf [Blocks], [Alignment], [Int]) -- ^ header parser
+          -> ([Int] -> ParserT s st m (mf [Blocks]))  -- ^ row parser
+          -> ParserT s st m sep                       -- ^ line parser
+          -> ParserT s st m end                       -- ^ footer parser
           -> ParserT s st m (mf Blocks)
-tableWith headerParser rowParser lineParser footerParser = try $ do
-  (aligns, widths, heads, rows) <- tableWith' headerParser rowParser
-                                                lineParser footerParser
-  let th = TableHead nullAttr <$> heads
-      tb = (:[]) . TableBody nullAttr 0 [] <$> rows
-      tf = pure $ TableFoot nullAttr []
-      colspecs = zip aligns (map fromWidth widths)
-  return $ B.table B.emptyCaption colspecs <$> th <*> tb <*> tf
-  where
-    fromWidth n
-      | n > 0     = ColWidth n
-      | otherwise = ColWidthDefault
-
-type TableComponents mf = ([Alignment], [Double], mf [Row], mf [Row])
+tableWith hp rp lp fp = fmap tableFromComponents <$>
+  tableWith' NoNormalization hp rp lp fp
 
 tableWith' :: (Stream s m Char, UpdateSourcePos s Char,
                HasReaderOptions st, Monad mf)
-           => ParserT s st m (mf [Blocks], [Alignment], [Int])
-           -> ([Int] -> ParserT s st m (mf [Blocks]))
-           -> ParserT s st m sep
-           -> ParserT s st m end
-           -> ParserT s st m (TableComponents mf)
-tableWith' headerParser rowParser lineParser footerParser = try $ do
-    (heads, aligns, indices) <- headerParser
-    lines' <- sequence <$> rowParser indices `sepEndBy1` lineParser
-    footerParser
-    numColumns <- getOption readerColumns
-    let widths = if null indices
-                    then replicate (length aligns) 0.0
-                    else widthsFromIndices numColumns indices
-    let toRow =  Row nullAttr . map B.simpleCell
-        toHeaderRow l = [toRow l | not (null l)]
-    return (aligns, widths, toHeaderRow <$> heads, map toRow <$> lines')
+           => TableNormalization
+           -> ParserT s st m (mf [Blocks], [Alignment], [Int]) -- ^ header parser
+           -> ([Int] -> ParserT s st m (mf [Blocks]))  -- ^ row parser
+           -> ParserT s st m sep                       -- ^ line parser
+           -> ParserT s st m end                       -- ^ footer parser
+           -> ParserT s st m (mf TableComponents)
+tableWith' n11n headerParser rowParser lineParser footerParser = try $ do
+  (heads, aligns, indices) <- headerParser
+  lines' <- sequence <$> rowParser indices `sepEndBy1` lineParser
+  footerParser
+  numColumns <- getOption readerColumns
+  let widths = if null indices
+               then replicate (length aligns) 0.0
+               else widthsFromIndices numColumns indices
+  return $ toTableComponents' n11n aligns widths <$> heads <*> lines'
+
+toRow :: [Blocks] -> Row
+toRow =  Row nullAttr . map B.simpleCell
+
+toHeaderRow :: TableNormalization -> [Blocks] -> [Row]
+toHeaderRow = \case
+  NoNormalization -> \l -> [toRow l | not (null l)]
+  NormalizeHeader -> \l -> [toRow l | not (null l) && not (all null l)]
 
 -- Calculate relative widths of table columns, based on indices
 widthsFromIndices :: Int      -- Number of columns on terminal
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 307d09a12..4e0a1fa6a 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -21,6 +21,7 @@ module Text.Pandoc.Readers.Markdown (
 
 import Control.Monad
 import Control.Monad.Except (throwError)
+import Data.Bifunctor (second)
 import Data.Char (isAlphaNum, isPunctuation, isSpace)
 import Text.DocLayout (realLength)
 import Data.List (transpose, elemIndex, sortOn, foldl')
@@ -44,7 +45,7 @@ import Safe.Foldable (maximumBounded)
 import Text.Pandoc.Logging
 import Text.Pandoc.Options
 import Text.Pandoc.Walk (walk)
-import Text.Pandoc.Parsing hiding (tableWith)
+import Text.Pandoc.Parsing hiding (tableCaption)
 import Text.Pandoc.Readers.HTML (htmlInBalanced, htmlTag, isBlockTag,
                                  isCommentTag, isInlineTag, isTextTag)
 import Text.Pandoc.Readers.LaTeX (applyMacros, rawLaTeXBlock, rawLaTeXInline)
@@ -1298,14 +1299,18 @@ tableCaption = do
 -- Parse a simple table with '---' header and one line per row.
 simpleTable :: PandocMonad m
             => Bool  -- ^ Headerless table
-            -> MarkdownParser m ([Alignment], [Double], F [Row], F [Row])
+            -> MarkdownParser m (F TableComponents)
 simpleTable headless = do
-  (aligns, _widths, heads', lines') <-
-       tableWith (simpleTableHeader headless) tableLine
+  tableComponents <-
+       tableWith' NormalizeHeader
+              (simpleTableHeader headless) tableLine
               (return ())
               (if headless then tableFooter else tableFooter <|> blanklines')
-  -- Simple tables get 0s for relative column widths (i.e., use default)
-  return (aligns, replicate (length aligns) 0, heads', lines')
+  -- All columns in simple tables have default widths.
+  let useDefaultColumnWidths tc =
+        let cs' = map (second (const ColWidthDefault)) $ tableColSpecs tc
+        in tc {tableColSpecs = cs'}
+  return $ useDefaultColumnWidths <$> tableComponents
 
 -- Parse a multiline table:  starts with row of '-' on top, then header
 -- (which may be multiline), then the rows,
@@ -1313,9 +1318,10 @@ simpleTable headless = do
 -- ending with a footer (dashed line followed by blank line).
 multilineTable :: PandocMonad m
                => Bool -- ^ Headerless table
-               -> MarkdownParser m ([Alignment], [Double], F [Row], F [Row])
+               -> MarkdownParser m (F TableComponents)
 multilineTable headless =
-  tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter
+  tableWith' NormalizeHeader (multilineTableHeader headless)
+             multilineRow blanklines tableFooter
 
 multilineTableHeader :: PandocMonad m
                      => Bool -- ^ Headerless table
@@ -1355,8 +1361,8 @@ multilineTableHeader headless = try $ do
 -- which may be grid, separated by blank lines, and
 -- ending with a footer (dashed line followed by blank line).
 gridTable :: PandocMonad m => Bool -- ^ Headerless table
-          -> MarkdownParser m ([Alignment], [Double], F [Row], F [Row])
-gridTable headless = gridTableWith' parseBlocks headless
+          -> MarkdownParser m (F TableComponents)
+gridTable headless = gridTableWith' NormalizeHeader parseBlocks headless
 
 pipeBreak :: PandocMonad m => MarkdownParser m ([Alignment], [Int])
 pipeBreak = try $ do
@@ -1370,7 +1376,7 @@ pipeBreak = try $ do
   blankline
   return $ unzip (first:rest)
 
-pipeTable :: PandocMonad m => MarkdownParser m ([Alignment], [Double], F [Row], F [Row])
+pipeTable :: PandocMonad m => MarkdownParser m (F TableComponents)
 pipeTable = try $ do
   nonindentSpaces
   lookAhead nonspaceChar
@@ -1390,7 +1396,8 @@ pipeTable = try $ do
                   else replicate (length aligns) 0.0
   (headCells :: F [Blocks]) <- sequence <$> mapM cellContents heads'
   (rows :: F [[Blocks]]) <- sequence <$> mapM (fmap sequence . mapM cellContents) lines''
-  return (aligns, widths, toHeaderRow <$> headCells, map toRow <$> rows)
+  return $
+    toTableComponents' NormalizeHeader aligns widths <$> headCells <*> rows
 
 sepPipe :: PandocMonad m => MarkdownParser m ()
 sepPipe = try $ do
@@ -1446,29 +1453,10 @@ scanForPipe = do
        (_, T.uncons -> Just ('|', _)) -> return ()
        _                              -> mzero
 
--- | Parse a table using 'headerParser', 'rowParser',
--- 'lineParser', and 'footerParser'.  Variant of the version in
--- Text.Pandoc.Parsing.
-tableWith :: PandocMonad m
-          => MarkdownParser m (F [Blocks], [Alignment], [Int])
-          -> ([Int] -> MarkdownParser m (F [Blocks]))
-          -> MarkdownParser m sep
-          -> MarkdownParser m end
-          -> MarkdownParser m ([Alignment], [Double], F [Row], F [Row])
-tableWith headerParser rowParser lineParser footerParser = try $ do
-    (heads, aligns, indices) <- headerParser
-    lines' <- fmap sequence $ rowParser indices `sepEndBy1` lineParser
-    footerParser
-    numColumns <- getOption readerColumns
-    let widths = if null indices
-                    then replicate (length aligns) 0.0
-                    else widthsFromIndices numColumns indices
-    return (aligns, widths, toHeaderRow <$> heads, map toRow <$> lines')
-
 table :: PandocMonad m => MarkdownParser m (F Blocks)
 table = try $ do
   frontCaption <- option Nothing (Just <$> tableCaption)
-  (aligns, widths, heads, lns) <-
+  tableComponents <-
          (guardEnabled Ext_pipe_tables >> try (scanForPipe >> pipeTable)) <|>
          (guardEnabled Ext_multiline_tables >> try (multilineTable False)) <|>
          (guardEnabled Ext_simple_tables >>
@@ -1481,23 +1469,10 @@ table = try $ do
   caption <- case frontCaption of
                   Nothing -> option (return mempty) tableCaption
                   Just c  -> return c
-  -- renormalize widths if greater than 100%:
-  let totalWidth = sum widths
-  let widths' = if totalWidth < 1
-                   then widths
-                   else map (/ totalWidth) widths
-  let strictPos w
-        | w > 0     = ColWidth w
-        | otherwise = ColWidthDefault
   return $ do
     caption' <- caption
-    heads' <- heads
-    lns' <- lns
-    return $ B.table (B.simpleCaption $ B.plain caption')
-                     (zip aligns (strictPos <$> widths'))
-                     (TableHead nullAttr heads')
-                     [TableBody nullAttr 0 [] lns']
-                     (TableFoot nullAttr [])
+    (TableComponents _attr _capt colspecs th tb tf) <- tableComponents
+    return $ B.table (B.simpleCaption $ B.plain caption') colspecs th tb tf
 
 --
 -- inline
@@ -2283,9 +2258,3 @@ doubleQuoted = do
     fmap B.doubleQuoted . trimInlinesF . mconcat <$>
       many1Till inline doubleQuoteEnd))
     <|> (return (return (B.str "\8220")))
-
-toRow :: [Blocks] -> Row
-toRow = Row nullAttr . map B.simpleCell
-
-toHeaderRow :: [Blocks] -> [Row]
-toHeaderRow l = [toRow l | not (null l) && not (all null l)]
diff --git a/src/Text/Pandoc/Readers/MediaWiki.hs b/src/Text/Pandoc/Readers/MediaWiki.hs
index b0101213b..7a406ec4b 100644
--- a/src/Text/Pandoc/Readers/MediaWiki.hs
+++ b/src/Text/Pandoc/Readers/MediaWiki.hs
@@ -34,7 +34,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
 import Text.Pandoc.Definition
 import Text.Pandoc.Logging
 import Text.Pandoc.Options
-import Text.Pandoc.Parsing hiding (nested)
+import Text.Pandoc.Parsing hiding (nested, tableCaption)
 import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag)
 import Text.Pandoc.Shared (safeRead, stringify, stripTrailingNewlines,
                            trim, splitTextBy, tshow)
diff --git a/src/Text/Pandoc/Readers/TWiki.hs b/src/Text/Pandoc/Readers/TWiki.hs
index 276d28aaa..7ce4e593c 100644
--- a/src/Text/Pandoc/Readers/TWiki.hs
+++ b/src/Text/Pandoc/Readers/TWiki.hs
@@ -221,9 +221,9 @@ listItemLine prefix marker = mconcat <$> (lineContent >>= parseContent)
 
 table :: PandocMonad m => TWParser m B.Blocks
 table = try $ do
-  tableHead <- optionMaybe (unzip <$> many1Till tableParseHeader newline)
+  thead <- optionMaybe (unzip <$> many1Till tableParseHeader newline)
   rows <- many1 tableParseRow
-  return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) tableHead
+  return $ buildTable mempty rows $ fromMaybe (align rows, columns rows) thead
   where
     buildTable caption rows (aligns, heads)
                     = B.table (B.simpleCaption $ B.plain caption)