From f9258371dd20e0a9569c04923188a91f6c2e489e Mon Sep 17 00:00:00 2001 From: Albert Krewinkel <albert@zeitkraut.de> Date: Sun, 22 Nov 2020 22:30:47 +0100 Subject: [PATCH] HTML reader: extract submodules Reducing module size should reduce memory use during compilation. This is preparatory work to tackle support for more table features. --- pandoc.cabal | 3 + src/Text/Pandoc/Readers/HTML.hs | 250 +----------------- src/Text/Pandoc/Readers/HTML/Parsing.hs | 156 +++++++++++ src/Text/Pandoc/Readers/HTML/TagCategories.hs | 78 ++++++ src/Text/Pandoc/Readers/HTML/Types.hs | 97 +++++++ 5 files changed, 345 insertions(+), 239 deletions(-) create mode 100644 src/Text/Pandoc/Readers/HTML/Parsing.hs create mode 100644 src/Text/Pandoc/Readers/HTML/TagCategories.hs create mode 100644 src/Text/Pandoc/Readers/HTML/Types.hs diff --git a/pandoc.cabal b/pandoc.cabal index a065d7459..c3ef00dcd 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -599,6 +599,9 @@ library Text.Pandoc.Readers.Docx.Parse.Styles, Text.Pandoc.Readers.Docx.Util, Text.Pandoc.Readers.Docx.Fields, + Text.Pandoc.Readers.HTML.Parsing, + Text.Pandoc.Readers.HTML.TagCategories, + Text.Pandoc.Readers.HTML.Types, Text.Pandoc.Readers.LaTeX.Parsing, Text.Pandoc.Readers.LaTeX.Lang, Text.Pandoc.Readers.LaTeX.SIunitx, diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 7eab27cef..9e84bedab 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,7 +1,6 @@ -{-# LANGUAGE LambdaCase #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {- | @@ -29,7 +28,8 @@ import Control.Applicative ((<|>)) import Control.Arrow (first) import Control.Monad (guard, mplus, msum, mzero, unless, void) import Control.Monad.Except (throwError) -import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT) +import Control.Monad.Reader (ask, asks, lift, local, runReaderT) +import Data.ByteString.Base64 (encode) import Data.Char (isAlphaNum, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) @@ -40,17 +40,19 @@ import Data.Monoid (First (..)) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T -import Network.URI (URI, nonStrictRelativeTo, parseURIReference) +import Network.URI (nonStrictRelativeTo, parseURIReference) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match -import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Definition +import Text.Pandoc.Readers.HTML.Parsing +import Text.Pandoc.Readers.HTML.TagCategories +import Text.Pandoc.Readers.HTML.Types import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) -import Text.Pandoc.Readers.LaTeX.Types (Macro) import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes) import Text.Pandoc.Error import Text.Pandoc.Logging @@ -66,7 +68,6 @@ import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) -import Data.ByteString.Base64 (encode) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: PandocMonad m @@ -105,32 +106,12 @@ replaceNotes' noteTbl (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) $ lookup ref noteTbl replaceNotes' _ x = x -data HTMLState = - HTMLState - { parserState :: ParserState, - noteTable :: [(Text, Blocks)], - baseHref :: Maybe URI, - identifiers :: Set.Set Text, - logMessages :: [LogMessage], - macros :: M.Map Text Macro, - readerOpts :: ReaderOptions - } - -data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext - , inChapter :: Bool -- ^ Set if in chapter section - , inPlain :: Bool -- ^ Set if in pPlain - } - setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInChapter = local (\s -> s {inChapter = True}) setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) -type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) - -type TagParser m = HTMLParser m [Tag Text] - pHtml :: PandocMonad m => TagParser m Blocks pHtml = try $ do (TagOpen "html" attr) <- lookAhead pAny @@ -681,22 +662,6 @@ inline = choice , pRawHtmlInline ] -pLocation :: PandocMonad m => TagParser m () -pLocation = do - (TagPosition r c) <- pSat isTagPosition - setPosition $ newPos "input" r c - -pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) -pSat f = do - pos <- getPosition - token tshow (const pos) (\x -> if f x then Just x else Nothing) - -pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) -pSatisfy f = try $ optional pLocation >> pSat f - -pAny :: PandocMonad m => TagParser m (Tag Text) -pAny = pSatisfy (const True) - pSelfClosing :: PandocMonad m => (Text -> Bool) -> ([Attribute Text] -> Bool) -> TagParser m (Tag Text) @@ -924,49 +889,6 @@ pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines) -> TagParser m Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a -pInTags tagtype parser = pInTags' tagtype (const True) parser - -pInTags' :: (PandocMonad m, Monoid a) - => Text - -> (Tag Text -> Bool) - -> TagParser m a - -> TagParser m a -pInTags' tagtype tagtest parser = try $ do - pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) - mconcat <$> manyTill parser (pCloses tagtype <|> eof) - --- parses p, preceded by an opening tag (optional if tagsOptional) --- and followed by a closing tag (optional if tagsOptional) -pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a -pInTag tagsOptional tagtype p = try $ do - skipMany pBlank - (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype []) - skipMany pBlank - x <- p - skipMany pBlank - (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype) - skipMany pBlank - return x - -pCloses :: PandocMonad m => Text -> TagParser m () -pCloses tagtype = try $ do - t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag - case t of - (TagClose t') | t' == tagtype -> void pAny - (TagOpen t' _) | t' `closes` tagtype -> return () - (TagClose "ul") | tagtype == "li" -> return () - (TagClose "ol") | tagtype == "li" -> return () - (TagClose "dl") | tagtype == "dd" -> return () - (TagClose "table") | tagtype == "td" -> return () - (TagClose "table") | tagtype == "th" -> return () - (TagClose "table") | tagtype == "tr" -> return () - (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () - (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () - (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags - -> return () -- see #3794 - _ -> mzero - pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do (TagText str) <- pSatisfy isTagText @@ -975,14 +897,10 @@ pTagText = try $ do parsed <- lift $ lift $ flip runReaderT qu $ runParserT (many pTagContents) st "text" str case parsed of - Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'" + Left _ -> throwError $ PandocParseError $ + "Could not parse `" <> str <> "'" Right result -> return $ mconcat result -pBlank :: PandocMonad m => TagParser m () -pBlank = try $ do - (TagText str) <- pSatisfy isTagText - guard $ T.all isSpace str - type InlinesParser m = HTMLParser m Text pTagContents :: PandocMonad m => InlinesParser m Inlines @@ -1077,54 +995,6 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> then return B.softbreak else return B.space --- --- Constants --- - -eitherBlockOrInline :: Set.Set Text -eitherBlockOrInline = Set.fromList - ["audio", "applet", "button", "iframe", "embed", - "del", "ins", "progress", "map", "area", "noscript", "script", - "object", "svg", "video", "source"] - -blockHtmlTags :: Set.Set Text -blockHtmlTags = Set.fromList - ["?xml", "!DOCTYPE", "address", "article", "aside", - "blockquote", "body", "canvas", - "caption", "center", "col", "colgroup", "dd", "details", - "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", - "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", - "isindex", "main", "menu", "meta", "noframes", "nav", - "ol", "output", "p", "pre", - "section", "summary", "table", "tbody", "textarea", - "thead", "tfoot", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style"] - --- We want to allow raw docbook in markdown documents, so we --- include docbook block tags here too. -blockDocBookTags :: Set.Set Text -blockDocBookTags = Set.fromList - ["calloutlist", "bibliolist", "glosslist", "itemizedlist", - "orderedlist", "segmentedlist", "simplelist", - "variablelist", "caution", "important", "note", "tip", - "warning", "address", "literallayout", "programlisting", - "programlistingco", "screen", "screenco", "screenshot", - "synopsis", "example", "informalexample", "figure", - "informalfigure", "table", "informaltable", "para", - "simpara", "formalpara", "equation", "informalequation", - "figure", "screenshot", "mediaobject", "qandaset", - "procedure", "task", "cmdsynopsis", "funcsynopsis", - "classsynopsis", "blockquote", "epigraph", "msgset", - "sidebar", "title"] - -epubTags :: Set.Set Text -epubTags = Set.fromList ["case", "switch", "default"] - -blockTags :: Set.Set Text -blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags] - class NamedTag a where getTagName :: a -> Maybe Text @@ -1162,47 +1032,6 @@ isTextTag = tagText (const True) isCommentTag :: Tag a -> Bool isCommentTag = tagComment (const True) --- taken from HXT and extended --- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags -closes :: Text -> Text -> Bool -_ `closes` "body" = False -_ `closes` "html" = False -"body" `closes` "head" = True -"a" `closes` "a" = True -"li" `closes` "li" = True -"th" `closes` t | t `elem` ["th","td"] = True -"td" `closes` t | t `elem` ["th","td"] = True -"tr" `closes` t | t `elem` ["th","td","tr"] = True -"dd" `closes` t | t `elem` ["dt", "dd"] = True -"dt" `closes` t | t `elem` ["dt","dd"] = True -"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True -"optgroup" `closes` "optgroup" = True -"optgroup" `closes` "option" = True -"option" `closes` "option" = True --- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags -x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", - "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", - "table", "ul"] = True -_ `closes` "meta" = True -"form" `closes` "form" = True -"label" `closes` "label" = True -"map" `closes` "map" = True -"object" `closes` "object" = True -_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True -t `closes` "select" | t /= "option" = True -"thead" `closes` "colgroup" = True -"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True -"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True -t `closes` t2 | - t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && - t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" -t1 `closes` t2 | - t1 `Set.member` blockTags && - t2 `Set.notMember` blockTags && - t2 `Set.notMember` eitherBlockOrInline = True -_ `closes` _ = False - --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. @@ -1347,13 +1176,6 @@ stripPrefix' s = if T.null t then s else T.drop 1 t where (_, t) = T.span (/= ':') s -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace '\r' = True -isSpace _ = False - -- Utilities -- | Adjusts a url according to the document's base URL. @@ -1364,41 +1186,6 @@ canonicalizeUrl url = do (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) _ -> url - --- Instances - -instance HasMacros HTMLState where - extractMacros = macros - updateMacros f st = st{ macros = f $ macros st } - -instance HasIdentifierList HTMLState where - extractIdentifierList = identifiers - updateIdentifierList f s = s{ identifiers = f (identifiers s) } - -instance HasLogMessages HTMLState where - addLogMessage m s = s{ logMessages = m : logMessages s } - getLogMessages = reverse . logMessages - --- This signature should be more general --- MonadReader HTMLLocal m => HasQuoteContext st m -instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where - getQuoteContext = asks quoteContext - withQuoteContext q = local (\s -> s{quoteContext = q}) - -instance HasReaderOptions HTMLState where - extractReaderOptions = extractReaderOptions . parserState - -instance HasMeta HTMLState where - setMeta s b st = st {parserState = setMeta s b $ parserState st} - deleteMeta s st = st {parserState = deleteMeta s $ parserState st} - -instance Default HTMLLocal where - def = HTMLLocal NoQuote False False - -instance HasLastStrPosition HTMLState where - setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} - getLastStrPos = getLastStrPos . parserState - -- For now we need a special version here; the one in Shared has String type renderTags' :: [Tag Text] -> Text renderTags' = renderTagsOptions @@ -1411,21 +1198,6 @@ renderTags' = renderTagsOptions -- EPUB Specific -- -- -sectioningContent :: [Text] -sectioningContent = ["article", "aside", "nav", "section"] - - -groupingContent :: [Text] -groupingContent = ["p", "hr", "pre", "blockquote", "ol" - , "ul", "li", "dl", "dt", "dt", "dd" - , "figure", "figcaption", "div", "main"] - -matchTagClose :: Text -> (Tag Text -> Bool) -matchTagClose t = (~== TagClose t) - -matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) -matchTagOpen t as = (~== TagOpen t as) - {- types :: [(String, ([String], Int))] diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs new file mode 100644 index 000000000..7fda066b5 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -0,0 +1,156 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.HTML.Parsing + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Parsing functions and utilities. +-} +module Text.Pandoc.Readers.HTML.Parsing + ( pInTags + , pInTags' + , pInTag + , pAny + , pCloses + , pSatisfy + , pBlank + , matchTagClose + , matchTagOpen + , isSpace + ) +where + +import Control.Monad (guard, void, mzero) +import Data.Text (Text) +import Text.HTML.TagSoup +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Parsing + ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional + , skipMany, setPosition, token, try) +import Text.Pandoc.Readers.HTML.TagCategories +import Text.Pandoc.Readers.HTML.Types +import Text.Pandoc.Shared (tshow) +import qualified Data.Set as Set +import qualified Data.Text as T + +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a +pInTags tagtype parser = pInTags' tagtype (const True) parser + +pInTags' :: (PandocMonad m, Monoid a) + => Text + -> (Tag Text -> Bool) + -> TagParser m a + -> TagParser m a +pInTags' tagtype tagtest parser = try $ do + pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) + +-- parses p, preceded by an opening tag (optional if tagsOptional) +-- and followed by a closing tag (optional if tagsOptional) +pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a +pInTag tagsOptional tagtype p = try $ do + skipMany pBlank + (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype []) + skipMany pBlank + x <- p + skipMany pBlank + (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype) + skipMany pBlank + return x + +pCloses :: PandocMonad m => Text -> TagParser m () +pCloses tagtype = try $ do + t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag + case t of + (TagClose t') | t' == tagtype -> void pAny + (TagOpen t' _) | t' `closes` tagtype -> return () + (TagClose "ul") | tagtype == "li" -> return () + (TagClose "ol") | tagtype == "li" -> return () + (TagClose "dl") | tagtype == "dd" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "th" -> return () + (TagClose "table") | tagtype == "tr" -> return () + (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags + -> return () -- see #3794 + _ -> mzero + +pBlank :: PandocMonad m => TagParser m () +pBlank = try $ do + (TagText str) <- pSatisfy isTagText + guard $ T.all isSpace str + +pLocation :: PandocMonad m => TagParser m () +pLocation = do + (TagPosition r c) <- pSat isTagPosition + setPosition $ newPos "input" r c + +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) +pSat f = do + pos <- getPosition + token tshow (const pos) (\x -> if f x then Just x else Nothing) + +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) +pSatisfy f = try $ optional pLocation >> pSat f + +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as) + +pAny :: PandocMonad m => TagParser m (Tag Text) +pAny = pSatisfy (const True) + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- taken from HXT and extended +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags +closes :: Text -> Text -> Bool +_ `closes` "body" = False +_ `closes` "html" = False +"body" `closes` "head" = True +"a" `closes` "a" = True +"li" `closes` "li" = True +"th" `closes` t | t `elem` ["th","td"] = True +"td" `closes` t | t `elem` ["th","td"] = True +"tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True +"dt" `closes` t | t `elem` ["dt","dd"] = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True +_ `closes` "meta" = True +"form" `closes` "form" = True +"label" `closes` "label" = True +"map" `closes` "map" = True +"object" `closes` "object" = True +_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True +t `closes` "select" | t /= "option" = True +"thead" `closes` "colgroup" = True +"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True +"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True +t `closes` t2 | + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" +t1 `closes` t2 | + t1 `Set.member` blockTags && + t2 `Set.notMember` blockTags && + t2 `Set.notMember` eitherBlockOrInline = True +_ `closes` _ = False diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs new file mode 100644 index 000000000..4f82a1831 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs @@ -0,0 +1,78 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.HTML.TagCategories + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Categories of tags. +-} +module Text.Pandoc.Readers.HTML.TagCategories + ( blockHtmlTags + , blockDocBookTags + , eitherBlockOrInline + , epubTags + , blockTags + , sectioningContent + , groupingContent + ) +where + +import Data.Set (Set, fromList, unions) +import Data.Text (Text) + +eitherBlockOrInline :: Set Text +eitherBlockOrInline = fromList + ["audio", "applet", "button", "iframe", "embed", + "del", "ins", "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] + +blockHtmlTags :: Set Text +blockHtmlTags = fromList + ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "canvas", + "caption", "center", "col", "colgroup", "dd", "details", + "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "main", "menu", "meta", "noframes", "nav", + "ol", "output", "p", "pre", + "section", "summary", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr", "script", "style"] + +-- We want to allow raw docbook in markdown documents, so we +-- include docbook block tags here too. +blockDocBookTags :: Set Text +blockDocBookTags = fromList + ["calloutlist", "bibliolist", "glosslist", "itemizedlist", + "orderedlist", "segmentedlist", "simplelist", + "variablelist", "caution", "important", "note", "tip", + "warning", "address", "literallayout", "programlisting", + "programlistingco", "screen", "screenco", "screenshot", + "synopsis", "example", "informalexample", "figure", + "informalfigure", "table", "informaltable", "para", + "simpara", "formalpara", "equation", "informalequation", + "figure", "screenshot", "mediaobject", "qandaset", + "procedure", "task", "cmdsynopsis", "funcsynopsis", + "classsynopsis", "blockquote", "epigraph", "msgset", + "sidebar", "title"] + +epubTags :: Set Text +epubTags = fromList ["case", "switch", "default"] + +blockTags :: Set Text +blockTags = unions [blockHtmlTags, blockDocBookTags, epubTags] + +sectioningContent :: [Text] +sectioningContent = ["article", "aside", "nav", "section"] + + +groupingContent :: [Text] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs new file mode 100644 index 000000000..a94eeb828 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -0,0 +1,97 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{- | + Module : Text.Pandoc.Readers.HTML.Types + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Types for pandoc's HTML reader. +-} +module Text.Pandoc.Readers.HTML.Types + ( TagParser + , HTMLParser + , HTMLState (..) + , HTMLLocal (..) + ) +where + +import Control.Monad.Reader (ReaderT, asks, local) +import Data.Default (Default (def)) +import Data.Map (Map) +import Data.Set (Set) +import Data.Text (Text) +import Network.URI (URI) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, HasMeta (..)) +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Logging (LogMessage) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Parsing + ( HasIdentifierList (..), HasLastStrPosition (..), HasLogMessages (..) + , HasMacros (..), HasQuoteContext (..), HasReaderOptions (..) + , ParserT, ParserState, QuoteContext (NoQuote) + ) +import Text.Pandoc.Readers.LaTeX.Types (Macro) + +-- | HTML parser type +type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) + +-- | HTML parser, expecting @Tag Text@ as tokens. +type TagParser m = HTMLParser m [Tag Text] + +-- | Global HTML parser state +data HTMLState = HTMLState + { parserState :: ParserState + , noteTable :: [(Text, Blocks)] + , baseHref :: Maybe URI + , identifiers :: Set Text + , logMessages :: [LogMessage] + , macros :: Map Text Macro + , readerOpts :: ReaderOptions + } + +-- | Local HTML parser state +data HTMLLocal = HTMLLocal + { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain + } + + +-- Instances + +instance HasMacros HTMLState where + extractMacros = macros + updateMacros f st = st{ macros = f $ macros st } + +instance HasIdentifierList HTMLState where + extractIdentifierList = identifiers + updateIdentifierList f s = s{ identifiers = f (identifiers s) } + +instance HasLogMessages HTMLState where + addLogMessage m s = s{ logMessages = m : logMessages s } + getLogMessages = reverse . logMessages + +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) + +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote False False + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState