HTML reader: extract table parsing into separate module
This commit is contained in:
parent
2f110265ff
commit
41237fcc0e
4 changed files with 141 additions and 95 deletions
|
@ -600,6 +600,7 @@ library
|
|||
Text.Pandoc.Readers.Docx.Util,
|
||||
Text.Pandoc.Readers.Docx.Fields,
|
||||
Text.Pandoc.Readers.HTML.Parsing,
|
||||
Text.Pandoc.Readers.HTML.Table,
|
||||
Text.Pandoc.Readers.HTML.TagCategories,
|
||||
Text.Pandoc.Readers.HTML.Types,
|
||||
Text.Pandoc.Readers.LaTeX.Parsing,
|
||||
|
|
|
@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
|
|||
import qualified Text.Pandoc.UTF8 as UTF8
|
||||
import Text.Pandoc.Definition
|
||||
import Text.Pandoc.Readers.HTML.Parsing
|
||||
import Text.Pandoc.Readers.HTML.Table (pTable')
|
||||
import Text.Pandoc.Readers.HTML.TagCategories
|
||||
import Text.Pandoc.Readers.HTML.Types
|
||||
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
|
||||
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
|
||||
import Text.Pandoc.Error
|
||||
import Text.Pandoc.Logging
|
||||
import Text.Pandoc.Options (
|
||||
|
@ -64,7 +64,7 @@ import Text.Pandoc.Options (
|
|||
import Text.Pandoc.Parsing hiding ((<|>))
|
||||
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
|
||||
extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
|
||||
onlySimpleTableCells, safeRead, tshow)
|
||||
safeRead, tshow)
|
||||
import Text.Pandoc.Walk
|
||||
import Text.Parsec.Error
|
||||
import Text.TeXMath (readMathML, writeTeX)
|
||||
|
@ -474,79 +474,7 @@ pHrule = do
|
|||
return B.horizontalRule
|
||||
|
||||
pTable :: PandocMonad m => TagParser m Blocks
|
||||
pTable = try $ do
|
||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
|
||||
let attribs = toAttr attribs'
|
||||
skipMany pBlank
|
||||
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
|
||||
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
|
||||
let pTh = option [] $ pInTags "tr" (pCell "th")
|
||||
pTr = try $ skipMany pBlank >>
|
||||
pInTags "tr" (pCell "td" <|> pCell "th")
|
||||
pTBody = pInTag True "tbody" $ many1 pTr
|
||||
head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
|
||||
head' <- map snd <$>
|
||||
pInTag True "tbody"
|
||||
(if null head'' then pTh else return head'')
|
||||
topfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
||||
rowsLs <- many pTBody
|
||||
bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
||||
TagClose _ <- pSatisfy (matchTagClose "table")
|
||||
let rows'' = concat rowsLs <> topfoot <> bottomfoot
|
||||
let rows''' = map (map snd) rows''
|
||||
-- fail on empty table
|
||||
guard $ not $ null head' && null rows'''
|
||||
let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
|
||||
let cols = if null head'
|
||||
then maximum (map length rows''')
|
||||
else length head'
|
||||
-- add empty cells to short rows
|
||||
let addEmpties r = case cols - length r of
|
||||
n | n > 0 -> r <> replicate n mempty
|
||||
| otherwise -> r
|
||||
let rows = map addEmpties rows'''
|
||||
let aligns = case rows'' of
|
||||
(cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
|
||||
_ -> replicate cols AlignDefault
|
||||
let widths = if null widths'
|
||||
then if isSimple
|
||||
then replicate cols ColWidthDefault
|
||||
else replicate cols (ColWidth (1.0 / fromIntegral cols))
|
||||
else widths'
|
||||
let toRow = Row nullAttr . map B.simpleCell
|
||||
toHeaderRow l = [toRow l | not (null l)]
|
||||
return $ B.tableWith attribs
|
||||
(B.simpleCaption $ B.plain caption)
|
||||
(zip aligns widths)
|
||||
(TableHead nullAttr $ toHeaderRow head')
|
||||
[TableBody nullAttr 0 [] $ map toRow rows]
|
||||
(TableFoot nullAttr [])
|
||||
|
||||
pCol :: PandocMonad m => TagParser m ColWidth
|
||||
pCol = try $ do
|
||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
|
||||
let attribs = toStringAttr attribs'
|
||||
skipMany pBlank
|
||||
optional $ pSatisfy (matchTagClose "col")
|
||||
skipMany pBlank
|
||||
let width = case lookup "width" attribs of
|
||||
Nothing -> case lookup "style" attribs of
|
||||
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
|
||||
fromMaybe 0.0 $ safeRead (T.filter
|
||||
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
|
||||
_ -> 0.0
|
||||
Just (T.unsnoc -> Just (xs, '%')) ->
|
||||
fromMaybe 0.0 $ safeRead xs
|
||||
_ -> 0.0
|
||||
if width > 0.0
|
||||
then return $ ColWidth $ width / 100.0
|
||||
else return ColWidthDefault
|
||||
|
||||
pColgroup :: PandocMonad m => TagParser m [ColWidth]
|
||||
pColgroup = try $ do
|
||||
pSatisfy (matchTagOpen "colgroup" [])
|
||||
skipMany pBlank
|
||||
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
|
||||
pTable = pTable' inline pCell
|
||||
|
||||
noColOrRowSpans :: Tag Text -> Bool
|
||||
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
|
||||
|
@ -847,16 +775,6 @@ pRawHtmlInline = do
|
|||
mathMLToTeXMath :: Text -> Either Text Text
|
||||
mathMLToTeXMath s = writeTeX <$> readMathML s
|
||||
|
||||
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
|
||||
toStringAttr = map go
|
||||
where
|
||||
go (x,y) =
|
||||
case T.stripPrefix "data-" x of
|
||||
Just x' | x' `Set.notMember` (html5Attributes <>
|
||||
html4Attributes <> rdfaAttributes)
|
||||
-> (x',y)
|
||||
_ -> (x,y)
|
||||
|
||||
pScriptMath :: PandocMonad m => TagParser m Inlines
|
||||
pScriptMath = try $ do
|
||||
TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True)
|
||||
|
@ -1151,16 +1069,6 @@ htmlTag f = try $ do
|
|||
handleTag tagname
|
||||
_ -> mzero
|
||||
|
||||
mkAttr :: [(Text, Text)] -> Attr
|
||||
mkAttr attr = (attribsId, attribsClasses, attribsKV)
|
||||
where attribsId = fromMaybe "" $ lookup "id" attr
|
||||
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
|
||||
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
|
||||
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
|
||||
|
||||
toAttr :: [(Text, Text)] -> Attr
|
||||
toAttr = mkAttr . toStringAttr
|
||||
|
||||
-- Strip namespace prefixes
|
||||
stripPrefixes :: [Tag Text] -> [Tag Text]
|
||||
stripPrefixes = map stripPrefix
|
||||
|
|
|
@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing
|
|||
, matchTagClose
|
||||
, matchTagOpen
|
||||
, isSpace
|
||||
, toAttr
|
||||
, toStringAttr
|
||||
)
|
||||
where
|
||||
|
||||
import Control.Monad (guard, void, mzero)
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Text (Text)
|
||||
import Text.HTML.TagSoup
|
||||
( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
|
||||
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||
import Text.Pandoc.Definition (Attr)
|
||||
import Text.Pandoc.Parsing
|
||||
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
|
||||
, skipMany, setPosition, token, try)
|
||||
import Text.Pandoc.Readers.HTML.TagCategories
|
||||
import Text.Pandoc.Readers.HTML.Types
|
||||
import Text.Pandoc.Shared (tshow)
|
||||
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
|
||||
import qualified Data.Set as Set
|
||||
import qualified Data.Text as T
|
||||
|
||||
|
@ -154,3 +160,23 @@ t1 `closes` t2 |
|
|||
t2 `Set.notMember` blockTags &&
|
||||
t2 `Set.notMember` eitherBlockOrInline = True
|
||||
_ `closes` _ = False
|
||||
|
||||
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
|
||||
toStringAttr = map go
|
||||
where
|
||||
go (x,y) =
|
||||
case T.stripPrefix "data-" x of
|
||||
Just x' | x' `Set.notMember` (html5Attributes <>
|
||||
html4Attributes <> rdfaAttributes)
|
||||
-> (x',y)
|
||||
_ -> (x,y)
|
||||
|
||||
mkAttr :: [(Text, Text)] -> Attr
|
||||
mkAttr attr = (attribsId, attribsClasses, attribsKV)
|
||||
where attribsId = fromMaybe "" $ lookup "id" attr
|
||||
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
|
||||
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
|
||||
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
|
||||
|
||||
toAttr :: [(Text, Text)] -> Attr
|
||||
toAttr = mkAttr . toStringAttr
|
||||
|
|
111
src/Text/Pandoc/Readers/HTML/Table.hs
Normal file
111
src/Text/Pandoc/Readers/HTML/Table.hs
Normal file
|
@ -0,0 +1,111 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE ViewPatterns #-}
|
||||
{- |
|
||||
Module : Text.Pandoc.Readers.HTML.Table
|
||||
Copyright : © 2006-2020 John MacFarlane,
|
||||
2020 Albert Krewinkel
|
||||
License : GNU GPL, version 2 or above
|
||||
|
||||
Maintainer : Albert Krewinkel <albert@zeitkraut.de>
|
||||
Stability : alpha
|
||||
Portability : portable
|
||||
|
||||
HTML table parser.
|
||||
-}
|
||||
module Text.Pandoc.Readers.HTML.Table (pTable') where
|
||||
|
||||
import Control.Monad (guard)
|
||||
import Data.Maybe (fromMaybe)
|
||||
import Data.Text (Text)
|
||||
import Text.HTML.TagSoup
|
||||
import Text.Pandoc.Builder (Blocks, Inlines)
|
||||
import Text.Pandoc.Definition
|
||||
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||
import Text.Pandoc.Parsing
|
||||
( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
|
||||
import Text.Pandoc.Readers.HTML.Parsing
|
||||
import Text.Pandoc.Readers.HTML.Types (TagParser)
|
||||
import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
|
||||
import qualified Data.Text as T
|
||||
import qualified Text.Pandoc.Builder as B
|
||||
|
||||
-- | Parses a @<col>@ element, returning the column's width. Defaults to
|
||||
-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
|
||||
pCol :: PandocMonad m => TagParser m ColWidth
|
||||
pCol = try $ do
|
||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
|
||||
let attribs = toStringAttr attribs'
|
||||
skipMany pBlank
|
||||
optional $ pSatisfy (matchTagClose "col")
|
||||
skipMany pBlank
|
||||
let width = case lookup "width" attribs of
|
||||
Nothing -> case lookup "style" attribs of
|
||||
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
|
||||
fromMaybe 0.0 $ safeRead (T.filter
|
||||
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
|
||||
_ -> 0.0
|
||||
Just (T.unsnoc -> Just (xs, '%')) ->
|
||||
fromMaybe 0.0 $ safeRead xs
|
||||
_ -> 0.0
|
||||
if width > 0.0
|
||||
then return $ ColWidth $ width / 100.0
|
||||
else return ColWidthDefault
|
||||
|
||||
pColgroup :: PandocMonad m => TagParser m [ColWidth]
|
||||
pColgroup = try $ do
|
||||
pSatisfy (matchTagOpen "colgroup" [])
|
||||
skipMany pBlank
|
||||
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
|
||||
|
||||
-- | Parses a simple HTML table
|
||||
pTable' :: PandocMonad m
|
||||
=> TagParser m Inlines -- ^ Caption parser
|
||||
-> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
|
||||
-> TagParser m Blocks
|
||||
pTable' inline pCell = try $ do
|
||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
|
||||
let attribs = toAttr attribs'
|
||||
skipMany pBlank
|
||||
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
|
||||
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
|
||||
let pTh = option [] $ pInTags "tr" (pCell "th")
|
||||
pTr = try $ skipMany pBlank >>
|
||||
pInTags "tr" (pCell "td" <|> pCell "th")
|
||||
pTBody = pInTag True "tbody" $ many1 pTr
|
||||
head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
|
||||
head' <- map snd <$>
|
||||
pInTag True "tbody"
|
||||
(if null head'' then pTh else return head'')
|
||||
topfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
||||
rowsLs <- many pTBody
|
||||
bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
||||
TagClose _ <- pSatisfy (matchTagClose "table")
|
||||
let rows'' = concat rowsLs <> topfoot <> bottomfoot
|
||||
let rows''' = map (map snd) rows''
|
||||
-- fail on empty table
|
||||
guard $ not $ null head' && null rows'''
|
||||
let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
|
||||
let cols = if null head'
|
||||
then maximum (map length rows''')
|
||||
else length head'
|
||||
-- add empty cells to short rows
|
||||
let addEmpties r = case cols - length r of
|
||||
n | n > 0 -> r <> replicate n mempty
|
||||
| otherwise -> r
|
||||
let rows = map addEmpties rows'''
|
||||
let aligns = case rows'' of
|
||||
(cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
|
||||
_ -> replicate cols AlignDefault
|
||||
let widths = if null widths'
|
||||
then if isSimple
|
||||
then replicate cols ColWidthDefault
|
||||
else replicate cols (ColWidth (1.0 / fromIntegral cols))
|
||||
else widths'
|
||||
let toRow = Row nullAttr . map B.simpleCell
|
||||
toHeaderRow l = [toRow l | not (null l)]
|
||||
return $ B.tableWith attribs
|
||||
(B.simpleCaption $ B.plain caption)
|
||||
(zip aligns widths)
|
||||
(TableHead nullAttr $ toHeaderRow head')
|
||||
[TableBody nullAttr 0 [] $ map toRow rows]
|
||||
(TableFoot nullAttr [])
|
Loading…
Reference in a new issue