HTML reader: extract submodules

Reducing module size should reduce memory use during compilation.

This is preparatory work to tackle support for more table features.
This commit is contained in:
Albert Krewinkel 2020-11-22 22:30:47 +01:00 committed by Albert Krewinkel
parent 75c881e2d9
commit f9258371dd
5 changed files with 345 additions and 239 deletions

View file

@ -599,6 +599,9 @@ library
Text.Pandoc.Readers.Docx.Parse.Styles,
Text.Pandoc.Readers.Docx.Util,
Text.Pandoc.Readers.Docx.Fields,
Text.Pandoc.Readers.HTML.Parsing,
Text.Pandoc.Readers.HTML.TagCategories,
Text.Pandoc.Readers.HTML.Types,
Text.Pandoc.Readers.LaTeX.Parsing,
Text.Pandoc.Readers.LaTeX.Lang,
Text.Pandoc.Readers.LaTeX.SIunitx,

View file

@ -1,7 +1,6 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{- |
@ -29,7 +28,8 @@ import Control.Applicative ((<|>))
import Control.Arrow (first)
import Control.Monad (guard, mplus, msum, mzero, unless, void)
import Control.Monad.Except (throwError)
import Control.Monad.Reader (ReaderT, ask, asks, lift, local, runReaderT)
import Control.Monad.Reader (ask, asks, lift, local, runReaderT)
import Data.ByteString.Base64 (encode)
import Data.Char (isAlphaNum, isLetter)
import Data.Default (Default (..), def)
import Data.Foldable (for_)
@ -40,17 +40,19 @@ import Data.Monoid (First (..))
import qualified Data.Set as Set
import Data.Text (Text)
import qualified Data.Text as T
import Network.URI (URI, nonStrictRelativeTo, parseURIReference)
import Network.URI (nonStrictRelativeTo, parseURIReference)
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Builder (Blocks, HasMeta (..), Inlines, trimInlines)
import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
import qualified Text.Pandoc.Builder as B
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Definition
import Text.Pandoc.Readers.HTML.Parsing
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
import Text.Pandoc.Readers.LaTeX.Types (Macro)
import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import Text.Pandoc.Error
import Text.Pandoc.Logging
@ -66,7 +68,6 @@ import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
import Text.Pandoc.Walk
import Text.Parsec.Error
import Text.TeXMath (readMathML, writeTeX)
import Data.ByteString.Base64 (encode)
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: PandocMonad m
@ -105,32 +106,12 @@ replaceNotes' noteTbl (RawInline (Format "noteref") ref) =
maybe (Str "") (Note . B.toList) $ lookup ref noteTbl
replaceNotes' _ x = x
data HTMLState =
HTMLState
{ parserState :: ParserState,
noteTable :: [(Text, Blocks)],
baseHref :: Maybe URI,
identifiers :: Set.Set Text,
logMessages :: [LogMessage],
macros :: M.Map Text Macro,
readerOpts :: ReaderOptions
}
data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
, inChapter :: Bool -- ^ Set if in chapter section
, inPlain :: Bool -- ^ Set if in pPlain
}
setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInChapter = local (\s -> s {inChapter = True})
setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
setInPlain = local (\s -> s {inPlain = True})
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
type TagParser m = HTMLParser m [Tag Text]
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
@ -681,22 +662,6 @@ inline = choice
, pRawHtmlInline
]
pLocation :: PandocMonad m => TagParser m ()
pLocation = do
(TagPosition r c) <- pSat isTagPosition
setPosition $ newPos "input" r c
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat f = do
pos <- getPosition
token tshow (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy f = try $ optional pLocation >> pSat f
pAny :: PandocMonad m => TagParser m (Tag Text)
pAny = pSatisfy (const True)
pSelfClosing :: PandocMonad m
=> (Text -> Bool) -> ([Attribute Text] -> Bool)
-> TagParser m (Tag Text)
@ -924,49 +889,6 @@ pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines)
-> TagParser m Inlines
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
pInTags tagtype parser = pInTags' tagtype (const True) parser
pInTags' :: (PandocMonad m, Monoid a)
=> Text
-> (Tag Text -> Bool)
-> TagParser m a
-> TagParser m a
pInTags' tagtype tagtest parser = try $ do
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
-- parses p, preceded by an opening tag (optional if tagsOptional)
-- and followed by a closing tag (optional if tagsOptional)
pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a
pInTag tagsOptional tagtype p = try $ do
skipMany pBlank
(if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype [])
skipMany pBlank
x <- p
skipMany pBlank
(if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype)
skipMany pBlank
return x
pCloses :: PandocMonad m => Text -> TagParser m ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
(TagClose t') | t' == tagtype -> void pAny
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "dd" -> return ()
(TagClose "table") | tagtype == "td" -> return ()
(TagClose "table") | tagtype == "th" -> return ()
(TagClose "table") | tagtype == "tr" -> return ()
(TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
-> return () -- see #3794
_ -> mzero
pTagText :: PandocMonad m => TagParser m Inlines
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
@ -975,14 +897,10 @@ pTagText = try $ do
parsed <- lift $ lift $
flip runReaderT qu $ runParserT (many pTagContents) st "text" str
case parsed of
Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'"
Left _ -> throwError $ PandocParseError $
"Could not parse `" <> str <> "'"
Right result -> return $ mconcat result
pBlank :: PandocMonad m => TagParser m ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ T.all isSpace str
type InlinesParser m = HTMLParser m Text
pTagContents :: PandocMonad m => InlinesParser m Inlines
@ -1077,54 +995,6 @@ pSpace = many1 (satisfy isSpace) >>= \xs ->
then return B.softbreak
else return B.space
--
-- Constants
--
eitherBlockOrInline :: Set.Set Text
eitherBlockOrInline = Set.fromList
["audio", "applet", "button", "iframe", "embed",
"del", "ins", "progress", "map", "area", "noscript", "script",
"object", "svg", "video", "source"]
blockHtmlTags :: Set.Set Text
blockHtmlTags = Set.fromList
["?xml", "!DOCTYPE", "address", "article", "aside",
"blockquote", "body", "canvas",
"caption", "center", "col", "colgroup", "dd", "details",
"dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
"footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "head", "header", "hgroup", "hr", "html",
"isindex", "main", "menu", "meta", "noframes", "nav",
"ol", "output", "p", "pre",
"section", "summary", "table", "tbody", "textarea",
"thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
-- We want to allow raw docbook in markdown documents, so we
-- include docbook block tags here too.
blockDocBookTags :: Set.Set Text
blockDocBookTags = Set.fromList
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"orderedlist", "segmentedlist", "simplelist",
"variablelist", "caution", "important", "note", "tip",
"warning", "address", "literallayout", "programlisting",
"programlistingco", "screen", "screenco", "screenshot",
"synopsis", "example", "informalexample", "figure",
"informalfigure", "table", "informaltable", "para",
"simpara", "formalpara", "equation", "informalequation",
"figure", "screenshot", "mediaobject", "qandaset",
"procedure", "task", "cmdsynopsis", "funcsynopsis",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
epubTags :: Set.Set Text
epubTags = Set.fromList ["case", "switch", "default"]
blockTags :: Set.Set Text
blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags]
class NamedTag a where
getTagName :: a -> Maybe Text
@ -1162,47 +1032,6 @@ isTextTag = tagText (const True)
isCommentTag :: Tag a -> Bool
isCommentTag = tagComment (const True)
-- taken from HXT and extended
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
closes :: Text -> Text -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
"body" `closes` "head" = True
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
"optgroup" `closes` "optgroup" = True
"optgroup" `closes` "option" = True
"option" `closes` "option" = True
-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
"table", "ul"] = True
_ `closes` "meta" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
"object" `closes` "object" = True
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
t `closes` "select" | t /= "option" = True
"thead" `closes` "colgroup" = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
t `closes` t2 |
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
t1 `closes` t2 |
t1 `Set.member` blockTags &&
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False
--- parsers for use in markdown, textile readers
-- | Matches a stretch of HTML in balanced tags.
@ -1347,13 +1176,6 @@ stripPrefix' s =
if T.null t then s else T.drop 1 t
where (_, t) = T.span (/= ':') s
isSpace :: Char -> Bool
isSpace ' ' = True
isSpace '\t' = True
isSpace '\n' = True
isSpace '\r' = True
isSpace _ = False
-- Utilities
-- | Adjusts a url according to the document's base URL.
@ -1364,41 +1186,6 @@ canonicalizeUrl url = do
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url
-- Instances
instance HasMacros HTMLState where
extractMacros = macros
updateMacros f st = st{ macros = f $ macros st }
instance HasIdentifierList HTMLState where
extractIdentifierList = identifiers
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
instance HasLogMessages HTMLState where
addLogMessage m s = s{ logMessages = m : logMessages s }
getLogMessages = reverse . logMessages
-- This signature should be more general
-- MonadReader HTMLLocal m => HasQuoteContext st m
instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
getQuoteContext = asks quoteContext
withQuoteContext q = local (\s -> s{quoteContext = q})
instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
instance Default HTMLLocal where
def = HTMLLocal NoQuote False False
instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
getLastStrPos = getLastStrPos . parserState
-- For now we need a special version here; the one in Shared has String type
renderTags' :: [Tag Text] -> Text
renderTags' = renderTagsOptions
@ -1411,21 +1198,6 @@ renderTags' = renderTagsOptions
-- EPUB Specific
--
--
sectioningContent :: [Text]
sectioningContent = ["article", "aside", "nav", "section"]
groupingContent :: [Text]
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
, "ul", "li", "dl", "dt", "dt", "dd"
, "figure", "figcaption", "div", "main"]
matchTagClose :: Text -> (Tag Text -> Bool)
matchTagClose t = (~== TagClose t)
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
matchTagOpen t as = (~== TagOpen t as)
{-
types :: [(String, ([String], Int))]

View file

@ -0,0 +1,156 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.HTML.Parsing
Copyright : Copyright (C) 2006-2020 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Parsing functions and utilities.
-}
module Text.Pandoc.Readers.HTML.Parsing
( pInTags
, pInTags'
, pInTag
, pAny
, pCloses
, pSatisfy
, pBlank
, matchTagClose
, matchTagOpen
, isSpace
)
where
import Control.Monad (guard, void, mzero)
import Data.Text (Text)
import Text.HTML.TagSoup
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Parsing
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
, skipMany, setPosition, token, try)
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Shared (tshow)
import qualified Data.Set as Set
import qualified Data.Text as T
pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a
pInTags tagtype parser = pInTags' tagtype (const True) parser
pInTags' :: (PandocMonad m, Monoid a)
=> Text
-> (Tag Text -> Bool)
-> TagParser m a
-> TagParser m a
pInTags' tagtype tagtest parser = try $ do
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
-- parses p, preceded by an opening tag (optional if tagsOptional)
-- and followed by a closing tag (optional if tagsOptional)
pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a
pInTag tagsOptional tagtype p = try $ do
skipMany pBlank
(if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype [])
skipMany pBlank
x <- p
skipMany pBlank
(if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype)
skipMany pBlank
return x
pCloses :: PandocMonad m => Text -> TagParser m ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
(TagClose t') | t' == tagtype -> void pAny
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "dd" -> return ()
(TagClose "table") | tagtype == "td" -> return ()
(TagClose "table") | tagtype == "th" -> return ()
(TagClose "table") | tagtype == "tr" -> return ()
(TagClose "td") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose "th") | tagtype `Set.member` blockHtmlTags -> return ()
(TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags
-> return () -- see #3794
_ -> mzero
pBlank :: PandocMonad m => TagParser m ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ T.all isSpace str
pLocation :: PandocMonad m => TagParser m ()
pLocation = do
(TagPosition r c) <- pSat isTagPosition
setPosition $ newPos "input" r c
pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSat f = do
pos <- getPosition
token tshow (const pos) (\x -> if f x then Just x else Nothing)
pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text)
pSatisfy f = try $ optional pLocation >> pSat f
matchTagClose :: Text -> (Tag Text -> Bool)
matchTagClose t = (~== TagClose t)
matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool)
matchTagOpen t as = (~== TagOpen t as)
pAny :: PandocMonad m => TagParser m (Tag Text)
pAny = pSatisfy (const True)
isSpace :: Char -> Bool
isSpace ' ' = True
isSpace '\t' = True
isSpace '\n' = True
isSpace '\r' = True
isSpace _ = False
-- taken from HXT and extended
-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
closes :: Text -> Text -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
"body" `closes` "head" = True
"a" `closes` "a" = True
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
"optgroup" `closes` "optgroup" = True
"optgroup" `closes` "option" = True
"option" `closes` "option" = True
-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags
x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
"dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section",
"table", "ul"] = True
_ `closes` "meta" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
"object" `closes` "object" = True
_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
t `closes` "select" | t /= "option" = True
"thead" `closes` "colgroup" = True
"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
t `closes` t2 |
t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] &&
t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main"
t1 `closes` t2 |
t1 `Set.member` blockTags &&
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False

View file

@ -0,0 +1,78 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.HTML.TagCategories
Copyright : Copyright (C) 2006-2020 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Categories of tags.
-}
module Text.Pandoc.Readers.HTML.TagCategories
( blockHtmlTags
, blockDocBookTags
, eitherBlockOrInline
, epubTags
, blockTags
, sectioningContent
, groupingContent
)
where
import Data.Set (Set, fromList, unions)
import Data.Text (Text)
eitherBlockOrInline :: Set Text
eitherBlockOrInline = fromList
["audio", "applet", "button", "iframe", "embed",
"del", "ins", "progress", "map", "area", "noscript", "script",
"object", "svg", "video", "source"]
blockHtmlTags :: Set Text
blockHtmlTags = fromList
["?xml", "!DOCTYPE", "address", "article", "aside",
"blockquote", "body", "canvas",
"caption", "center", "col", "colgroup", "dd", "details",
"dir", "div", "dl", "dt", "fieldset", "figcaption", "figure",
"footer", "form", "h1", "h2", "h3", "h4",
"h5", "h6", "head", "header", "hgroup", "hr", "html",
"isindex", "main", "menu", "meta", "noframes", "nav",
"ol", "output", "p", "pre",
"section", "summary", "table", "tbody", "textarea",
"thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
-- We want to allow raw docbook in markdown documents, so we
-- include docbook block tags here too.
blockDocBookTags :: Set Text
blockDocBookTags = fromList
["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"orderedlist", "segmentedlist", "simplelist",
"variablelist", "caution", "important", "note", "tip",
"warning", "address", "literallayout", "programlisting",
"programlistingco", "screen", "screenco", "screenshot",
"synopsis", "example", "informalexample", "figure",
"informalfigure", "table", "informaltable", "para",
"simpara", "formalpara", "equation", "informalequation",
"figure", "screenshot", "mediaobject", "qandaset",
"procedure", "task", "cmdsynopsis", "funcsynopsis",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
epubTags :: Set Text
epubTags = fromList ["case", "switch", "default"]
blockTags :: Set Text
blockTags = unions [blockHtmlTags, blockDocBookTags, epubTags]
sectioningContent :: [Text]
sectioningContent = ["article", "aside", "nav", "section"]
groupingContent :: [Text]
groupingContent = ["p", "hr", "pre", "blockquote", "ol"
, "ul", "li", "dl", "dt", "dt", "dd"
, "figure", "figcaption", "div", "main"]

View file

@ -0,0 +1,97 @@
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{- |
Module : Text.Pandoc.Readers.HTML.Types
Copyright : Copyright (C) 2006-2020 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Types for pandoc's HTML reader.
-}
module Text.Pandoc.Readers.HTML.Types
( TagParser
, HTMLParser
, HTMLState (..)
, HTMLLocal (..)
)
where
import Control.Monad.Reader (ReaderT, asks, local)
import Data.Default (Default (def))
import Data.Map (Map)
import Data.Set (Set)
import Data.Text (Text)
import Network.URI (URI)
import Text.HTML.TagSoup
import Text.Pandoc.Builder (Blocks, HasMeta (..))
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Logging (LogMessage)
import Text.Pandoc.Options (ReaderOptions)
import Text.Pandoc.Parsing
( HasIdentifierList (..), HasLastStrPosition (..), HasLogMessages (..)
, HasMacros (..), HasQuoteContext (..), HasReaderOptions (..)
, ParserT, ParserState, QuoteContext (NoQuote)
)
import Text.Pandoc.Readers.LaTeX.Types (Macro)
-- | HTML parser type
type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
-- | HTML parser, expecting @Tag Text@ as tokens.
type TagParser m = HTMLParser m [Tag Text]
-- | Global HTML parser state
data HTMLState = HTMLState
{ parserState :: ParserState
, noteTable :: [(Text, Blocks)]
, baseHref :: Maybe URI
, identifiers :: Set Text
, logMessages :: [LogMessage]
, macros :: Map Text Macro
, readerOpts :: ReaderOptions
}
-- | Local HTML parser state
data HTMLLocal = HTMLLocal
{ quoteContext :: QuoteContext
, inChapter :: Bool -- ^ Set if in chapter section
, inPlain :: Bool -- ^ Set if in pPlain
}
-- Instances
instance HasMacros HTMLState where
extractMacros = macros
updateMacros f st = st{ macros = f $ macros st }
instance HasIdentifierList HTMLState where
extractIdentifierList = identifiers
updateIdentifierList f s = s{ identifiers = f (identifiers s) }
instance HasLogMessages HTMLState where
addLogMessage m s = s{ logMessages = m : logMessages s }
getLogMessages = reverse . logMessages
-- This signature should be more general
-- MonadReader HTMLLocal m => HasQuoteContext st m
instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
getQuoteContext = asks quoteContext
withQuoteContext q = local (\s -> s{quoteContext = q})
instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
instance Default HTMLLocal where
def = HTMLLocal NoQuote False False
instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
getLastStrPos = getLastStrPos . parserState