HTML reader: extract table parsing into separate module
This commit is contained in:
parent
2f110265ff
commit
41237fcc0e
4 changed files with 141 additions and 95 deletions
|
@ -600,6 +600,7 @@ library
|
||||||
Text.Pandoc.Readers.Docx.Util,
|
Text.Pandoc.Readers.Docx.Util,
|
||||||
Text.Pandoc.Readers.Docx.Fields,
|
Text.Pandoc.Readers.Docx.Fields,
|
||||||
Text.Pandoc.Readers.HTML.Parsing,
|
Text.Pandoc.Readers.HTML.Parsing,
|
||||||
|
Text.Pandoc.Readers.HTML.Table,
|
||||||
Text.Pandoc.Readers.HTML.TagCategories,
|
Text.Pandoc.Readers.HTML.TagCategories,
|
||||||
Text.Pandoc.Readers.HTML.Types,
|
Text.Pandoc.Readers.HTML.Types,
|
||||||
Text.Pandoc.Readers.LaTeX.Parsing,
|
Text.Pandoc.Readers.LaTeX.Parsing,
|
||||||
|
|
|
@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
|
||||||
import qualified Text.Pandoc.UTF8 as UTF8
|
import qualified Text.Pandoc.UTF8 as UTF8
|
||||||
import Text.Pandoc.Definition
|
import Text.Pandoc.Definition
|
||||||
import Text.Pandoc.Readers.HTML.Parsing
|
import Text.Pandoc.Readers.HTML.Parsing
|
||||||
|
import Text.Pandoc.Readers.HTML.Table (pTable')
|
||||||
import Text.Pandoc.Readers.HTML.TagCategories
|
import Text.Pandoc.Readers.HTML.TagCategories
|
||||||
import Text.Pandoc.Readers.HTML.Types
|
import Text.Pandoc.Readers.HTML.Types
|
||||||
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
|
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
|
||||||
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
|
|
||||||
import Text.Pandoc.Error
|
import Text.Pandoc.Error
|
||||||
import Text.Pandoc.Logging
|
import Text.Pandoc.Logging
|
||||||
import Text.Pandoc.Options (
|
import Text.Pandoc.Options (
|
||||||
|
@ -64,7 +64,7 @@ import Text.Pandoc.Options (
|
||||||
import Text.Pandoc.Parsing hiding ((<|>))
|
import Text.Pandoc.Parsing hiding ((<|>))
|
||||||
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
|
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
|
||||||
extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
|
extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
|
||||||
onlySimpleTableCells, safeRead, tshow)
|
safeRead, tshow)
|
||||||
import Text.Pandoc.Walk
|
import Text.Pandoc.Walk
|
||||||
import Text.Parsec.Error
|
import Text.Parsec.Error
|
||||||
import Text.TeXMath (readMathML, writeTeX)
|
import Text.TeXMath (readMathML, writeTeX)
|
||||||
|
@ -474,79 +474,7 @@ pHrule = do
|
||||||
return B.horizontalRule
|
return B.horizontalRule
|
||||||
|
|
||||||
pTable :: PandocMonad m => TagParser m Blocks
|
pTable :: PandocMonad m => TagParser m Blocks
|
||||||
pTable = try $ do
|
pTable = pTable' inline pCell
|
||||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
|
|
||||||
let attribs = toAttr attribs'
|
|
||||||
skipMany pBlank
|
|
||||||
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
|
|
||||||
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
|
|
||||||
let pTh = option [] $ pInTags "tr" (pCell "th")
|
|
||||||
pTr = try $ skipMany pBlank >>
|
|
||||||
pInTags "tr" (pCell "td" <|> pCell "th")
|
|
||||||
pTBody = pInTag True "tbody" $ many1 pTr
|
|
||||||
head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
|
|
||||||
head' <- map snd <$>
|
|
||||||
pInTag True "tbody"
|
|
||||||
(if null head'' then pTh else return head'')
|
|
||||||
topfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
|
||||||
rowsLs <- many pTBody
|
|
||||||
bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
|
||||||
TagClose _ <- pSatisfy (matchTagClose "table")
|
|
||||||
let rows'' = concat rowsLs <> topfoot <> bottomfoot
|
|
||||||
let rows''' = map (map snd) rows''
|
|
||||||
-- fail on empty table
|
|
||||||
guard $ not $ null head' && null rows'''
|
|
||||||
let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
|
|
||||||
let cols = if null head'
|
|
||||||
then maximum (map length rows''')
|
|
||||||
else length head'
|
|
||||||
-- add empty cells to short rows
|
|
||||||
let addEmpties r = case cols - length r of
|
|
||||||
n | n > 0 -> r <> replicate n mempty
|
|
||||||
| otherwise -> r
|
|
||||||
let rows = map addEmpties rows'''
|
|
||||||
let aligns = case rows'' of
|
|
||||||
(cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
|
|
||||||
_ -> replicate cols AlignDefault
|
|
||||||
let widths = if null widths'
|
|
||||||
then if isSimple
|
|
||||||
then replicate cols ColWidthDefault
|
|
||||||
else replicate cols (ColWidth (1.0 / fromIntegral cols))
|
|
||||||
else widths'
|
|
||||||
let toRow = Row nullAttr . map B.simpleCell
|
|
||||||
toHeaderRow l = [toRow l | not (null l)]
|
|
||||||
return $ B.tableWith attribs
|
|
||||||
(B.simpleCaption $ B.plain caption)
|
|
||||||
(zip aligns widths)
|
|
||||||
(TableHead nullAttr $ toHeaderRow head')
|
|
||||||
[TableBody nullAttr 0 [] $ map toRow rows]
|
|
||||||
(TableFoot nullAttr [])
|
|
||||||
|
|
||||||
pCol :: PandocMonad m => TagParser m ColWidth
|
|
||||||
pCol = try $ do
|
|
||||||
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
|
|
||||||
let attribs = toStringAttr attribs'
|
|
||||||
skipMany pBlank
|
|
||||||
optional $ pSatisfy (matchTagClose "col")
|
|
||||||
skipMany pBlank
|
|
||||||
let width = case lookup "width" attribs of
|
|
||||||
Nothing -> case lookup "style" attribs of
|
|
||||||
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
|
|
||||||
fromMaybe 0.0 $ safeRead (T.filter
|
|
||||||
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
|
|
||||||
_ -> 0.0
|
|
||||||
Just (T.unsnoc -> Just (xs, '%')) ->
|
|
||||||
fromMaybe 0.0 $ safeRead xs
|
|
||||||
_ -> 0.0
|
|
||||||
if width > 0.0
|
|
||||||
then return $ ColWidth $ width / 100.0
|
|
||||||
else return ColWidthDefault
|
|
||||||
|
|
||||||
pColgroup :: PandocMonad m => TagParser m [ColWidth]
|
|
||||||
pColgroup = try $ do
|
|
||||||
pSatisfy (matchTagOpen "colgroup" [])
|
|
||||||
skipMany pBlank
|
|
||||||
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
|
|
||||||
|
|
||||||
noColOrRowSpans :: Tag Text -> Bool
|
noColOrRowSpans :: Tag Text -> Bool
|
||||||
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
|
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
|
||||||
|
@ -847,16 +775,6 @@ pRawHtmlInline = do
|
||||||
mathMLToTeXMath :: Text -> Either Text Text
|
mathMLToTeXMath :: Text -> Either Text Text
|
||||||
mathMLToTeXMath s = writeTeX <$> readMathML s
|
mathMLToTeXMath s = writeTeX <$> readMathML s
|
||||||
|
|
||||||
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
|
|
||||||
toStringAttr = map go
|
|
||||||
where
|
|
||||||
go (x,y) =
|
|
||||||
case T.stripPrefix "data-" x of
|
|
||||||
Just x' | x' `Set.notMember` (html5Attributes <>
|
|
||||||
html4Attributes <> rdfaAttributes)
|
|
||||||
-> (x',y)
|
|
||||||
_ -> (x,y)
|
|
||||||
|
|
||||||
pScriptMath :: PandocMonad m => TagParser m Inlines
|
pScriptMath :: PandocMonad m => TagParser m Inlines
|
||||||
pScriptMath = try $ do
|
pScriptMath = try $ do
|
||||||
TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True)
|
TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True)
|
||||||
|
@ -1151,16 +1069,6 @@ htmlTag f = try $ do
|
||||||
handleTag tagname
|
handleTag tagname
|
||||||
_ -> mzero
|
_ -> mzero
|
||||||
|
|
||||||
mkAttr :: [(Text, Text)] -> Attr
|
|
||||||
mkAttr attr = (attribsId, attribsClasses, attribsKV)
|
|
||||||
where attribsId = fromMaybe "" $ lookup "id" attr
|
|
||||||
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
|
|
||||||
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
|
|
||||||
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
|
|
||||||
|
|
||||||
toAttr :: [(Text, Text)] -> Attr
|
|
||||||
toAttr = mkAttr . toStringAttr
|
|
||||||
|
|
||||||
-- Strip namespace prefixes
|
-- Strip namespace prefixes
|
||||||
stripPrefixes :: [Tag Text] -> [Tag Text]
|
stripPrefixes :: [Tag Text] -> [Tag Text]
|
||||||
stripPrefixes = map stripPrefix
|
stripPrefixes = map stripPrefix
|
||||||
|
|
|
@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing
|
||||||
, matchTagClose
|
, matchTagClose
|
||||||
, matchTagOpen
|
, matchTagOpen
|
||||||
, isSpace
|
, isSpace
|
||||||
|
, toAttr
|
||||||
|
, toStringAttr
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
|
|
||||||
import Control.Monad (guard, void, mzero)
|
import Control.Monad (guard, void, mzero)
|
||||||
|
import Data.Maybe (fromMaybe)
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
import Text.HTML.TagSoup
|
import Text.HTML.TagSoup
|
||||||
|
( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
|
||||||
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||||
|
import Text.Pandoc.Definition (Attr)
|
||||||
import Text.Pandoc.Parsing
|
import Text.Pandoc.Parsing
|
||||||
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
|
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
|
||||||
, skipMany, setPosition, token, try)
|
, skipMany, setPosition, token, try)
|
||||||
import Text.Pandoc.Readers.HTML.TagCategories
|
import Text.Pandoc.Readers.HTML.TagCategories
|
||||||
import Text.Pandoc.Readers.HTML.Types
|
import Text.Pandoc.Readers.HTML.Types
|
||||||
import Text.Pandoc.Shared (tshow)
|
import Text.Pandoc.Shared (tshow)
|
||||||
|
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
|
||||||
import qualified Data.Set as Set
|
import qualified Data.Set as Set
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
|
|
||||||
|
@ -154,3 +160,23 @@ t1 `closes` t2 |
|
||||||
t2 `Set.notMember` blockTags &&
|
t2 `Set.notMember` blockTags &&
|
||||||
t2 `Set.notMember` eitherBlockOrInline = True
|
t2 `Set.notMember` eitherBlockOrInline = True
|
||||||
_ `closes` _ = False
|
_ `closes` _ = False
|
||||||
|
|
||||||
|
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
|
||||||
|
toStringAttr = map go
|
||||||
|
where
|
||||||
|
go (x,y) =
|
||||||
|
case T.stripPrefix "data-" x of
|
||||||
|
Just x' | x' `Set.notMember` (html5Attributes <>
|
||||||
|
html4Attributes <> rdfaAttributes)
|
||||||
|
-> (x',y)
|
||||||
|
_ -> (x,y)
|
||||||
|
|
||||||
|
mkAttr :: [(Text, Text)] -> Attr
|
||||||
|
mkAttr attr = (attribsId, attribsClasses, attribsKV)
|
||||||
|
where attribsId = fromMaybe "" $ lookup "id" attr
|
||||||
|
attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
|
||||||
|
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
|
||||||
|
epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
|
||||||
|
|
||||||
|
toAttr :: [(Text, Text)] -> Attr
|
||||||
|
toAttr = mkAttr . toStringAttr
|
||||||
|
|
111
src/Text/Pandoc/Readers/HTML/Table.hs
Normal file
111
src/Text/Pandoc/Readers/HTML/Table.hs
Normal file
|
@ -0,0 +1,111 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE ViewPatterns #-}
|
||||||
|
{- |
|
||||||
|
Module : Text.Pandoc.Readers.HTML.Table
|
||||||
|
Copyright : © 2006-2020 John MacFarlane,
|
||||||
|
2020 Albert Krewinkel
|
||||||
|
License : GNU GPL, version 2 or above
|
||||||
|
|
||||||
|
Maintainer : Albert Krewinkel <albert@zeitkraut.de>
|
||||||
|
Stability : alpha
|
||||||
|
Portability : portable
|
||||||
|
|
||||||
|
HTML table parser.
|
||||||
|
-}
|
||||||
|
module Text.Pandoc.Readers.HTML.Table (pTable') where
|
||||||
|
|
||||||
|
import Control.Monad (guard)
|
||||||
|
import Data.Maybe (fromMaybe)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import Text.HTML.TagSoup
|
||||||
|
import Text.Pandoc.Builder (Blocks, Inlines)
|
||||||
|
import Text.Pandoc.Definition
|
||||||
|
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
|
||||||
|
import Text.Pandoc.Parsing
|
||||||
|
( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
|
||||||
|
import Text.Pandoc.Readers.HTML.Parsing
|
||||||
|
import Text.Pandoc.Readers.HTML.Types (TagParser)
|
||||||
|
import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import qualified Text.Pandoc.Builder as B
|
||||||
|
|
||||||
|
-- | Parses a @<col>@ element, returning the column's width. Defaults to
|
||||||
|
-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
|
||||||
|
pCol :: PandocMonad m => TagParser m ColWidth
|
||||||
|
pCol = try $ do
|
||||||
|
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
|
||||||
|
let attribs = toStringAttr attribs'
|
||||||
|
skipMany pBlank
|
||||||
|
optional $ pSatisfy (matchTagClose "col")
|
||||||
|
skipMany pBlank
|
||||||
|
let width = case lookup "width" attribs of
|
||||||
|
Nothing -> case lookup "style" attribs of
|
||||||
|
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
|
||||||
|
fromMaybe 0.0 $ safeRead (T.filter
|
||||||
|
(`notElem` (" \t\r\n%'\";" :: [Char])) xs)
|
||||||
|
_ -> 0.0
|
||||||
|
Just (T.unsnoc -> Just (xs, '%')) ->
|
||||||
|
fromMaybe 0.0 $ safeRead xs
|
||||||
|
_ -> 0.0
|
||||||
|
if width > 0.0
|
||||||
|
then return $ ColWidth $ width / 100.0
|
||||||
|
else return ColWidthDefault
|
||||||
|
|
||||||
|
pColgroup :: PandocMonad m => TagParser m [ColWidth]
|
||||||
|
pColgroup = try $ do
|
||||||
|
pSatisfy (matchTagOpen "colgroup" [])
|
||||||
|
skipMany pBlank
|
||||||
|
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
|
||||||
|
|
||||||
|
-- | Parses a simple HTML table
|
||||||
|
pTable' :: PandocMonad m
|
||||||
|
=> TagParser m Inlines -- ^ Caption parser
|
||||||
|
-> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
|
||||||
|
-> TagParser m Blocks
|
||||||
|
pTable' inline pCell = try $ do
|
||||||
|
TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
|
||||||
|
let attribs = toAttr attribs'
|
||||||
|
skipMany pBlank
|
||||||
|
caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
|
||||||
|
widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
|
||||||
|
let pTh = option [] $ pInTags "tr" (pCell "th")
|
||||||
|
pTr = try $ skipMany pBlank >>
|
||||||
|
pInTags "tr" (pCell "td" <|> pCell "th")
|
||||||
|
pTBody = pInTag True "tbody" $ many1 pTr
|
||||||
|
head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
|
||||||
|
head' <- map snd <$>
|
||||||
|
pInTag True "tbody"
|
||||||
|
(if null head'' then pTh else return head'')
|
||||||
|
topfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
||||||
|
rowsLs <- many pTBody
|
||||||
|
bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
|
||||||
|
TagClose _ <- pSatisfy (matchTagClose "table")
|
||||||
|
let rows'' = concat rowsLs <> topfoot <> bottomfoot
|
||||||
|
let rows''' = map (map snd) rows''
|
||||||
|
-- fail on empty table
|
||||||
|
guard $ not $ null head' && null rows'''
|
||||||
|
let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
|
||||||
|
let cols = if null head'
|
||||||
|
then maximum (map length rows''')
|
||||||
|
else length head'
|
||||||
|
-- add empty cells to short rows
|
||||||
|
let addEmpties r = case cols - length r of
|
||||||
|
n | n > 0 -> r <> replicate n mempty
|
||||||
|
| otherwise -> r
|
||||||
|
let rows = map addEmpties rows'''
|
||||||
|
let aligns = case rows'' of
|
||||||
|
(cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
|
||||||
|
_ -> replicate cols AlignDefault
|
||||||
|
let widths = if null widths'
|
||||||
|
then if isSimple
|
||||||
|
then replicate cols ColWidthDefault
|
||||||
|
else replicate cols (ColWidth (1.0 / fromIntegral cols))
|
||||||
|
else widths'
|
||||||
|
let toRow = Row nullAttr . map B.simpleCell
|
||||||
|
toHeaderRow l = [toRow l | not (null l)]
|
||||||
|
return $ B.tableWith attribs
|
||||||
|
(B.simpleCaption $ B.plain caption)
|
||||||
|
(zip aligns widths)
|
||||||
|
(TableHead nullAttr $ toHeaderRow head')
|
||||||
|
[TableBody nullAttr 0 [] $ map toRow rows]
|
||||||
|
(TableFoot nullAttr [])
|
Loading…
Reference in a new issue