HTML reader: extract table parsing into separate module

This commit is contained in:
Albert Krewinkel 2020-11-23 12:32:37 +01:00
parent 2f110265ff
commit 41237fcc0e
No known key found for this signature in database
GPG key ID: 388DC0B21F631124
4 changed files with 141 additions and 95 deletions

View file

@ -600,6 +600,7 @@ library
Text.Pandoc.Readers.Docx.Util,
Text.Pandoc.Readers.Docx.Fields,
Text.Pandoc.Readers.HTML.Parsing,
Text.Pandoc.Readers.HTML.Table,
Text.Pandoc.Readers.HTML.TagCategories,
Text.Pandoc.Readers.HTML.Types,
Text.Pandoc.Readers.LaTeX.Parsing,

View file

@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Definition
import Text.Pandoc.Readers.HTML.Parsing
import Text.Pandoc.Readers.HTML.Table (pTable')
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.Options (
@ -64,7 +64,7 @@ import Text.Pandoc.Options (
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
onlySimpleTableCells, safeRead, tshow)
safeRead, tshow)
import Text.Pandoc.Walk
import Text.Parsec.Error
import Text.TeXMath (readMathML, writeTeX)
@ -474,79 +474,7 @@ pHrule = do
return B.horizontalRule
pTable :: PandocMonad m => TagParser m Blocks
pTable = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
let attribs = toAttr attribs'
skipMany pBlank
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
let pTh = option [] $ pInTags "tr" (pCell "th")
pTr = try $ skipMany pBlank >>
pInTags "tr" (pCell "td" <|> pCell "th")
pTBody = pInTag True "tbody" $ many1 pTr
head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
head' <- map snd <$>
pInTag True "tbody"
(if null head'' then pTh else return head'')
topfoot <- option [] $ pInTag False "tfoot" $ many pTr
rowsLs <- many pTBody
bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
TagClose _ <- pSatisfy (matchTagClose "table")
let rows'' = concat rowsLs <> topfoot <> bottomfoot
let rows''' = map (map snd) rows''
-- fail on empty table
guard $ not $ null head' && null rows'''
let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
let cols = if null head'
then maximum (map length rows''')
else length head'
-- add empty cells to short rows
let addEmpties r = case cols - length r of
n | n > 0 -> r <> replicate n mempty
| otherwise -> r
let rows = map addEmpties rows'''
let aligns = case rows'' of
(cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
_ -> replicate cols AlignDefault
let widths = if null widths'
then if isSimple
then replicate cols ColWidthDefault
else replicate cols (ColWidth (1.0 / fromIntegral cols))
else widths'
let toRow = Row nullAttr . map B.simpleCell
toHeaderRow l = [toRow l | not (null l)]
return $ B.tableWith attribs
(B.simpleCaption $ B.plain caption)
(zip aligns widths)
(TableHead nullAttr $ toHeaderRow head')
[TableBody nullAttr 0 [] $ map toRow rows]
(TableFoot nullAttr [])
pCol :: PandocMonad m => TagParser m ColWidth
pCol = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
let attribs = toStringAttr attribs'
skipMany pBlank
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
let width = case lookup "width" attribs of
Nothing -> case lookup "style" attribs of
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
fromMaybe 0.0 $ safeRead (T.filter
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
_ -> 0.0
Just (T.unsnoc -> Just (xs, '%')) ->
fromMaybe 0.0 $ safeRead xs
_ -> 0.0
if width > 0.0
then return $ ColWidth $ width / 100.0
else return ColWidthDefault
pColgroup :: PandocMonad m => TagParser m [ColWidth]
pColgroup = try $ do
pSatisfy (matchTagOpen "colgroup" [])
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
pTable = pTable' inline pCell
noColOrRowSpans :: Tag Text -> Bool
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
@ -847,16 +775,6 @@ pRawHtmlInline = do
mathMLToTeXMath :: Text -> Either Text Text
mathMLToTeXMath s = writeTeX <$> readMathML s
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = map go
where
go (x,y) =
case T.stripPrefix "data-" x of
Just x' | x' `Set.notMember` (html5Attributes <>
html4Attributes <> rdfaAttributes)
-> (x',y)
_ -> (x,y)
pScriptMath :: PandocMonad m => TagParser m Inlines
pScriptMath = try $ do
TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True)
@ -1151,16 +1069,6 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
mkAttr :: [(Text, Text)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
toAttr :: [(Text, Text)] -> Attr
toAttr = mkAttr . toStringAttr
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix

View file

@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing
, matchTagClose
, matchTagOpen
, isSpace
, toAttr
, toStringAttr
)
where
import Control.Monad (guard, void, mzero)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Definition (Attr)
import Text.Pandoc.Parsing
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
, skipMany, setPosition, token, try)
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Shared (tshow)
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import qualified Data.Set as Set
import qualified Data.Text as T
@ -154,3 +160,23 @@ t1 `closes` t2 |
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
toStringAttr = map go
where
go (x,y) =
case T.stripPrefix "data-" x of
Just x' | x' `Set.notMember` (html5Attributes <>
html4Attributes <> rdfaAttributes)
-> (x',y)
_ -> (x,y)
mkAttr :: [(Text, Text)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
toAttr :: [(Text, Text)] -> Attr
toAttr = mkAttr . toStringAttr

View file

@ -0,0 +1,111 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{- |
Module : Text.Pandoc.Readers.HTML.Table
Copyright : © 2006-2020 John MacFarlane,
2020 Albert Krewinkel
License : GNU GPL, version 2 or above
Maintainer : Albert Krewinkel <albert@zeitkraut.de>
Stability : alpha
Portability : portable
HTML table parser.
-}
module Text.Pandoc.Readers.HTML.Table (pTable') where
import Control.Monad (guard)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
import Text.Pandoc.Builder (Blocks, Inlines)
import Text.Pandoc.Definition
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Parsing
( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
import Text.Pandoc.Readers.HTML.Parsing
import Text.Pandoc.Readers.HTML.Types (TagParser)
import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B
-- | Parses a @<col>@ element, returning the column's width. Defaults to
-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
pCol :: PandocMonad m => TagParser m ColWidth
pCol = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
let attribs = toStringAttr attribs'
skipMany pBlank
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
let width = case lookup "width" attribs of
Nothing -> case lookup "style" attribs of
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
fromMaybe 0.0 $ safeRead (T.filter
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
_ -> 0.0
Just (T.unsnoc -> Just (xs, '%')) ->
fromMaybe 0.0 $ safeRead xs
_ -> 0.0
if width > 0.0
then return $ ColWidth $ width / 100.0
else return ColWidthDefault
pColgroup :: PandocMonad m => TagParser m [ColWidth]
pColgroup = try $ do
pSatisfy (matchTagOpen "colgroup" [])
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
-- | Parses a simple HTML table
pTable' :: PandocMonad m
=> TagParser m Inlines -- ^ Caption parser
-> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
-> TagParser m Blocks
pTable' inline pCell = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
let attribs = toAttr attribs'
skipMany pBlank
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
let pTh = option [] $ pInTags "tr" (pCell "th")
pTr = try $ skipMany pBlank >>
pInTags "tr" (pCell "td" <|> pCell "th")
pTBody = pInTag True "tbody" $ many1 pTr
head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
head' <- map snd <$>
pInTag True "tbody"
(if null head'' then pTh else return head'')
topfoot <- option [] $ pInTag False "tfoot" $ many pTr
rowsLs <- many pTBody
bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
TagClose _ <- pSatisfy (matchTagClose "table")
let rows'' = concat rowsLs <> topfoot <> bottomfoot
let rows''' = map (map snd) rows''
-- fail on empty table
guard $ not $ null head' && null rows'''
let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
let cols = if null head'
then maximum (map length rows''')
else length head'
-- add empty cells to short rows
let addEmpties r = case cols - length r of
n | n > 0 -> r <> replicate n mempty
| otherwise -> r
let rows = map addEmpties rows'''
let aligns = case rows'' of
(cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
_ -> replicate cols AlignDefault
let widths = if null widths'
then if isSimple
then replicate cols ColWidthDefault
else replicate cols (ColWidth (1.0 / fromIntegral cols))
else widths'
let toRow = Row nullAttr . map B.simpleCell
toHeaderRow l = [toRow l | not (null l)]
return $ B.tableWith attribs
(B.simpleCaption $ B.plain caption)
(zip aligns widths)
(TableHead nullAttr $ toHeaderRow head')
[TableBody nullAttr 0 [] $ map toRow rows]
(TableFoot nullAttr [])