diff --git a/pandoc.cabal b/pandoc.cabal index c3ef00dcd..673df76eb 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -600,6 +600,7 @@ library Text.Pandoc.Readers.Docx.Util, Text.Pandoc.Readers.Docx.Fields, Text.Pandoc.Readers.HTML.Parsing, + Text.Pandoc.Readers.HTML.Table, Text.Pandoc.Readers.HTML.TagCategories, Text.Pandoc.Readers.HTML.Types, Text.Pandoc.Readers.LaTeX.Parsing, diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 9e84bedab..e9fefb9c0 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Definition import Text.Pandoc.Readers.HTML.Parsing +import Text.Pandoc.Readers.HTML.Table (pTable') import Text.Pandoc.Readers.HTML.TagCategories import Text.Pandoc.Readers.HTML.Types import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) -import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options ( @@ -64,7 +64,7 @@ import Text.Pandoc.Options ( import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, extractSpaces, htmlSpanLikeElements, elemText, splitTextBy, - onlySimpleTableCells, safeRead, tshow) + safeRead, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) @@ -474,79 +474,7 @@ pHrule = do return B.horizontalRule pTable :: PandocMonad m => TagParser m Blocks -pTable = try $ do - TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) - let attribs = toAttr attribs' - skipMany pBlank - caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank - widths' <- (mconcat <$> many1 pColgroup) <|> many pCol - let pTh = option [] $ pInTags "tr" (pCell "th") - pTr = try $ skipMany pBlank >> - pInTags "tr" (pCell "td" <|> pCell "th") - pTBody = pInTag True "tbody" $ many1 pTr - head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh - head' <- map snd <$> - pInTag True "tbody" - (if null head'' then pTh else return head'') - topfoot <- option [] $ pInTag False "tfoot" $ many pTr - rowsLs <- many pTBody - bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr - TagClose _ <- pSatisfy (matchTagClose "table") - let rows'' = concat rowsLs <> topfoot <> bottomfoot - let rows''' = map (map snd) rows'' - -- fail on empty table - guard $ not $ null head' && null rows''' - let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows''' - let cols = if null head' - then maximum (map length rows''') - else length head' - -- add empty cells to short rows - let addEmpties r = case cols - length r of - n | n > 0 -> r <> replicate n mempty - | otherwise -> r - let rows = map addEmpties rows''' - let aligns = case rows'' of - (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault - _ -> replicate cols AlignDefault - let widths = if null widths' - then if isSimple - then replicate cols ColWidthDefault - else replicate cols (ColWidth (1.0 / fromIntegral cols)) - else widths' - let toRow = Row nullAttr . map B.simpleCell - toHeaderRow l = [toRow l | not (null l)] - return $ B.tableWith attribs - (B.simpleCaption $ B.plain caption) - (zip aligns widths) - (TableHead nullAttr $ toHeaderRow head') - [TableBody nullAttr 0 [] $ map toRow rows] - (TableFoot nullAttr []) - -pCol :: PandocMonad m => TagParser m ColWidth -pCol = try $ do - TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) - let attribs = toStringAttr attribs' - skipMany pBlank - optional $ pSatisfy (matchTagClose "col") - skipMany pBlank - let width = case lookup "width" attribs of - Nothing -> case lookup "style" attribs of - Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> - fromMaybe 0.0 $ safeRead (T.filter - (`notElem` (" \t\r\n%'\";" :: [Char])) xs) - _ -> 0.0 - Just (T.unsnoc -> Just (xs, '%')) -> - fromMaybe 0.0 $ safeRead xs - _ -> 0.0 - if width > 0.0 - then return $ ColWidth $ width / 100.0 - else return ColWidthDefault - -pColgroup :: PandocMonad m => TagParser m [ColWidth] -pColgroup = try $ do - pSatisfy (matchTagOpen "colgroup" []) - skipMany pBlank - manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank +pTable = pTable' inline pCell noColOrRowSpans :: Tag Text -> Bool noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" @@ -847,16 +775,6 @@ pRawHtmlInline = do mathMLToTeXMath :: Text -> Either Text Text mathMLToTeXMath s = writeTeX <$> readMathML s -toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go - where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) - pScriptMath :: PandocMonad m => TagParser m Inlines pScriptMath = try $ do TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True) @@ -1151,16 +1069,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero -mkAttr :: [(Text, Text)] -> Attr -mkAttr attr = (attribsId, attribsClasses, attribsKV) - where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes - attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr - epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr - -toAttr :: [(Text, Text)] -> Attr -toAttr = mkAttr . toStringAttr - -- Strip namespace prefixes stripPrefixes :: [Tag Text] -> [Tag Text] stripPrefixes = map stripPrefix diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 7fda066b5..e68e43b25 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing , matchTagClose , matchTagOpen , isSpace + , toAttr + , toStringAttr ) where import Control.Monad (guard, void, mzero) +import Data.Maybe (fromMaybe) import Data.Text (Text) import Text.HTML.TagSoup + ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose ) import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Definition (Attr) import Text.Pandoc.Parsing ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional , skipMany, setPosition, token, try) import Text.Pandoc.Readers.HTML.TagCategories import Text.Pandoc.Readers.HTML.Types import Text.Pandoc.Shared (tshow) +import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes) import qualified Data.Set as Set import qualified Data.Text as T @@ -154,3 +160,23 @@ t1 `closes` t2 | t2 `Set.notMember` blockTags && t2 `Set.notMember` eitherBlockOrInline = True _ `closes` _ = False + +toStringAttr :: [(Text, Text)] -> [(Text, Text)] +toStringAttr = map go + where + go (x,y) = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> (x',y) + _ -> (x,y) + +mkAttr :: [(Text, Text)] -> Attr +mkAttr attr = (attribsId, attribsClasses, attribsKV) + where attribsId = fromMaybe "" $ lookup "id" attr + attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes + attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr + +toAttr :: [(Text, Text)] -> Attr +toAttr = mkAttr . toStringAttr diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs new file mode 100644 index 000000000..bad39bd2d --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -0,0 +1,111 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.HTML.Table + Copyright : © 2006-2020 John MacFarlane, + 2020 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + Stability : alpha + Portability : portable + +HTML table parser. +-} +module Text.Pandoc.Readers.HTML.Table (pTable') where + +import Control.Monad (guard) +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, Inlines) +import Text.Pandoc.Definition +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Parsing + ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try) +import Text.Pandoc.Readers.HTML.Parsing +import Text.Pandoc.Readers.HTML.Types (TagParser) +import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B + +-- | Parses a @@ element, returning the column's width. Defaults to +-- @'ColWidthDefault'@ if the width is not set or cannot be determined. +pCol :: PandocMonad m => TagParser m ColWidth +pCol = try $ do + TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) + let attribs = toStringAttr attribs' + skipMany pBlank + optional $ pSatisfy (matchTagClose "col") + skipMany pBlank + let width = case lookup "width" attribs of + Nothing -> case lookup "style" attribs of + Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> + fromMaybe 0.0 $ safeRead (T.filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) + _ -> 0.0 + Just (T.unsnoc -> Just (xs, '%')) -> + fromMaybe 0.0 $ safeRead xs + _ -> 0.0 + if width > 0.0 + then return $ ColWidth $ width / 100.0 + else return ColWidthDefault + +pColgroup :: PandocMonad m => TagParser m [ColWidth] +pColgroup = try $ do + pSatisfy (matchTagOpen "colgroup" []) + skipMany pBlank + manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank + +-- | Parses a simple HTML table +pTable' :: PandocMonad m + => TagParser m Inlines -- ^ Caption parser + -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser + -> TagParser m Blocks +pTable' inline pCell = try $ do + TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) + let attribs = toAttr attribs' + skipMany pBlank + caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank + widths' <- (mconcat <$> many1 pColgroup) <|> many pCol + let pTh = option [] $ pInTags "tr" (pCell "th") + pTr = try $ skipMany pBlank >> + pInTags "tr" (pCell "td" <|> pCell "th") + pTBody = pInTag True "tbody" $ many1 pTr + head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh + head' <- map snd <$> + pInTag True "tbody" + (if null head'' then pTh else return head'') + topfoot <- option [] $ pInTag False "tfoot" $ many pTr + rowsLs <- many pTBody + bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr + TagClose _ <- pSatisfy (matchTagClose "table") + let rows'' = concat rowsLs <> topfoot <> bottomfoot + let rows''' = map (map snd) rows'' + -- fail on empty table + guard $ not $ null head' && null rows''' + let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows''' + let cols = if null head' + then maximum (map length rows''') + else length head' + -- add empty cells to short rows + let addEmpties r = case cols - length r of + n | n > 0 -> r <> replicate n mempty + | otherwise -> r + let rows = map addEmpties rows''' + let aligns = case rows'' of + (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault + _ -> replicate cols AlignDefault + let widths = if null widths' + then if isSimple + then replicate cols ColWidthDefault + else replicate cols (ColWidth (1.0 / fromIntegral cols)) + else widths' + let toRow = Row nullAttr . map B.simpleCell + toHeaderRow l = [toRow l | not (null l)] + return $ B.tableWith attribs + (B.simpleCaption $ B.plain caption) + (zip aligns widths) + (TableHead nullAttr $ toHeaderRow head') + [TableBody nullAttr 0 [] $ map toRow rows] + (TableFoot nullAttr [])